Imports System.IO
Imports System.Net
Imports System
Imports System.Text
Imports System.Text.RegularExpressions
Public Class HTMLContentParser
'Public Function Return_HTMLContent(ByVal sURL As String)
' Dim sStream As Stream
' Dim URLReq As HttpWebRequest
' Dim URLRes As HttpWebResponse
' Dim gethtmlcode1 As String
' Try
' URLReq = WebRequest.Create(sURL)
' URLRes = URLReq.GetResponse()
' sStream = URLRes.GetResponseStream()
' Return New StreamReader(sStream).ReadToEnd()
' Catch ex As Exception
' gethtmlcode1 =
' Return ex.Message
' End Try
'Hi U need to write an application
'First u bring the Total web page in txt format by using the below function.
Public Function Return_HTMLContent(ByVal URL1 As String) As String
Dim t1 As TextBox
'Dim Tmp As T
Try
Dim request1 As WebRequest = WebRequest.Create(URL1)
Dim response1 As WebResponse = request1.GetResponse()
Dim reader1 As StreamReader = New StreamReader(response1.GetResponseStream())
Dim gethtmlcode1 As String
gethtmlcode1 = reader1.ReadToEnd
Return_HTMLContent = gethtmlcode1
Catch e As Exception
Return_HTMLContent = ""
MsgBox(e.Message, MsgBoxStyle.OKOnly, "current News")
Application.Exit()
End Try
'End Function
'In the above function will get all the code in gethtmlcode1.
'after that u need to write a seach function for capturing the particular content like phone number, fax etc.
'just like
' 'Get everything within body
't1 = InStr(1, Tmp, "<body", vbTextCompare)
'If t1 > 0 Then Tmp = Mid(Tmp, t1 - 1)
't1 = InStr(1, Tmp, "</body>", vbTextCompare)
'If t1 > 0 Then Tmp = Mid(Tmp, 1, t1 - 1)
't1 = InStr(1, Tmp, "</head>", vbTextCompare)
'If t1 > 0 Then Tmp = Mid(Tmp, t1 + 7)
'then u get those phone, fax in variable fields. and proceeed further
End Function
Public Function ParseHTMLLinks(ByVal sHTMLContent As String, ByVal sURL As String) As ArrayList
Dim rRegEx As Regex
Dim mMatch As Match
Dim aMatch As New ArrayList
rRegEx = New Regex("a.*href\s*=\s*(?:""(?<1>[^""]*)""|(?<1>\S+))", RegexOptions.IgnoreCase Or RegexOptions.Compiled)
mMatch = rRegEx.Match(sHTMLContent)
While mMatch.Success
Dim sMatch As String
sMatch = ProcessURL(mMatch.Groups(1).ToString, sURL)
aMatch.Add(sMatch)
mMatch = mMatch.NextMatch()
End While
Return aMatch
End Function
Public Function ParseHTMLImages(ByVal sHTMLContent As String, ByVal sURL As String) As ArrayList
Dim rRegEx As Regex
Dim mMatch As Match
Dim aMatch As New ArrayList
rRegEx = New Regex("img.*src\s*=\s*(?:""(?<1>[^""]*)""|(?<1>\S+))", RegexOptions.IgnoreCase Or RegexOptions.Compiled)
mMatch = rRegEx.Match(sHTMLContent)
While mMatch.Success
Dim sMatch As String
sMatch = ProcessURL(mMatch.Groups(1).ToString, sURL)
aMatch.Add(sMatch)
mMatch = mMatch.NextMatch()
End While
Return aMatch
End Function
Private Function ProcessURL(ByVal sInput As String, ByVal sURL As String)
'Find out if the sURL has a "/" after the Domain Name 'If not, give a "/" at the end 'First, check out for any slash after the 'Double Dashes of the http:// 'If there is NO slash, then end the sURL string with a SLASH If InStr(8, sURL, "/") = 0 Then
sURL += "/"
'FILTERING
'Filter down to the Domain Name Directory from the Right
Dim iCount As Integer
For iCount = sURL.Length To 1 Step -1
If Mid(sURL, iCount, 1) = "/" Then
sURL = Left(sURL, iCount)
Exit For
End If
Next
'Filter out the ">" from the Left
For iCount = 1 To sInput.Length
If Mid(sInput, iCount, 4) = ">" Then
sInput = Left(sInput, iCount - 1) 'Stop and Take the Char before
Exit For
End If
Next
'Filter out unnecessary Characters
sInput = sInput.Replace("<", Chr(39))
sInput = sInput.Replace(">", Chr(39))
sInput = sInput.Replace("""", "")
sInput = sInput.Replace("'", "")
If (sInput.IndexOf("http://") < 0) Then
If (Not (sInput.StartsWith("/")) And Not (sURL.EndsWith("/"))) Then
Return sURL & "/" & sInput
Else
If (sInput.StartsWith("/")) And (sURL.EndsWith("/")) Then
Return sURL.Substring(0, sURL.Length - 1) + sInput
Else
Return sURL + sInput
End If
End If
Else
Return sInput
End If
End Function
End Class