extract the urls,phone no,faxes

**ramesh.mavin** · Jul 14th, 2006, 08:02 AM

hai freinds
my project is web data extractor(windows application) ,this deals with extract the urls and pnone no,faxes,metatags,inative sites.how to extract the these are all,give me a valluble suggestion,i have some code ,in that have some problmes
freinds any body knows plz

VB Code:

Imports System.IO
Imports System.Net
Imports System
Imports System.Text
Imports System.Text.RegularExpressions
Public Class HTMLContentParser
    'Public Function Return_HTMLContent(ByVal sURL As String)
    '    Dim sStream As Stream
    '    Dim URLReq As HttpWebRequest
    '    Dim URLRes As HttpWebResponse
    '    Dim gethtmlcode1 As String
    '    Try
    '        URLReq = WebRequest.Create(sURL)
    '        URLRes = URLReq.GetResponse()
    '        sStream = URLRes.GetResponseStream()
    '        Return New StreamReader(sStream).ReadToEnd()
    '    Catch ex As Exception
    '        gethtmlcode1 =
    '        Return ex.Message
    '    End Try
        'Hi U need to write an application 
        'First u bring the Total web page in txt format by using the below function.
    Public Function Return_HTMLContent(ByVal URL1 As String) As String
        Dim t1 As TextBox
        'Dim Tmp As T
 
 
 
        Try
 
            Dim request1 As WebRequest = WebRequest.Create(URL1)
 
            Dim response1 As WebResponse = request1.GetResponse()
 
            Dim reader1 As StreamReader = New StreamReader(response1.GetResponseStream())
 
            Dim gethtmlcode1 As String
 
 
 
            gethtmlcode1 = reader1.ReadToEnd
 
            Return_HTMLContent = gethtmlcode1
 
        Catch e As Exception
 
            Return_HTMLContent = ""
 
            MsgBox(e.Message, MsgBoxStyle.OKOnly, "current News")
 
            Application.Exit()
 
        End Try
 
        'End Function
 
 
        'In the above function will get all the code in gethtmlcode1.
        'after that u need to write a seach function for capturing the particular content like phone number, fax etc.
        'just like
 
 
        '    'Get everything within body
 
        't1 = InStr(1, Tmp, "<body", vbTextCompare)
 
        'If t1 > 0 Then Tmp = Mid(Tmp, t1 - 1)
 
        't1 = InStr(1, Tmp, "</body>", vbTextCompare)
 
        'If t1 > 0 Then Tmp = Mid(Tmp, 1, t1 - 1)
 
        't1 = InStr(1, Tmp, "</head>", vbTextCompare)
 
        'If t1 > 0 Then Tmp = Mid(Tmp, t1 + 7)
 
        'then u get those phone, fax in variable fields. and  proceeed further
 
 
    End Function
    Public Function ParseHTMLLinks(ByVal sHTMLContent As String, ByVal sURL As String) As ArrayList
        Dim rRegEx As Regex
        Dim mMatch As Match
        Dim aMatch As New ArrayList
        rRegEx = New Regex("a.*href\s*=\s*(?:""(?<1>[^""]*)""|(?<1>\S+))", RegexOptions.IgnoreCase Or RegexOptions.Compiled)
        mMatch = rRegEx.Match(sHTMLContent)
        While mMatch.Success
            Dim sMatch As String
            sMatch = ProcessURL(mMatch.Groups(1).ToString, sURL)
            aMatch.Add(sMatch)
            mMatch = mMatch.NextMatch()
        End While
        Return aMatch
    End Function
    Public Function ParseHTMLImages(ByVal sHTMLContent As String, ByVal sURL As String) As ArrayList
        Dim rRegEx As Regex
        Dim mMatch As Match
        Dim aMatch As New ArrayList
        rRegEx = New Regex("img.*src\s*=\s*(?:""(?<1>[^""]*)""|(?<1>\S+))", RegexOptions.IgnoreCase Or RegexOptions.Compiled)
        mMatch = rRegEx.Match(sHTMLContent)
        While mMatch.Success
            Dim sMatch As String
            sMatch = ProcessURL(mMatch.Groups(1).ToString, sURL)
            aMatch.Add(sMatch)
            mMatch = mMatch.NextMatch()
        End While
        Return aMatch
    End Function
    Private Function ProcessURL(ByVal sInput As String, ByVal sURL As String)
        'Find out if the sURL has a "/" after the Domain Name 'If not, give a "/" at the end 'First, check out for any slash after the 'Double Dashes of the http:// 'If there is NO slash, then end the sURL string with a SLASH If InStr(8, sURL, "/") = 0 Then
        sURL += "/"
        'FILTERING
        'Filter down to the Domain Name Directory from the Right
        Dim iCount As Integer
        For iCount = sURL.Length To 1 Step -1
            If Mid(sURL, iCount, 1) = "/" Then
                sURL = Left(sURL, iCount)
                Exit For
            End If
        Next
        'Filter out the ">" from the Left
        For iCount = 1 To sInput.Length
            If Mid(sInput, iCount, 4) = ">" Then
                sInput = Left(sInput, iCount - 1) 'Stop and Take the Char before
                Exit For
            End If
        Next
        'Filter out unnecessary Characters
        sInput = sInput.Replace("<", Chr(39))
        sInput = sInput.Replace(">", Chr(39))
        sInput = sInput.Replace("""", "")
        sInput = sInput.Replace("'", "")
        If (sInput.IndexOf("http://") < 0) Then
            If (Not (sInput.StartsWith("/")) And Not (sURL.EndsWith("/"))) Then
                Return sURL & "/" & sInput
            Else
                If (sInput.StartsWith("/")) And (sURL.EndsWith("/")) Then
                    Return sURL.Substring(0, sURL.Length - 1) + sInput
                Else
                    Return sURL + sInput
                End If
            End If
        Else
            Return sInput
        End If
    End Function
End Class

**si_the_geek** · Jul 14th, 2006, 09:32 AM

Moved from CodeBank forum (which is for code examples, not questions)

**gigemboy** · Jul 14th, 2006, 12:07 PM

You would need to look into the world of Regex.... it allows you to match on patterns of text, and can be quite powerful when done right.

A good tutorial: http://www.codeproject.com/dotnet/RegexTutorial.asp

I have also posted several Regex examples in this forum if you search for Regex and my username...

**TokersBall_CDXX** · Jul 14th, 2006, 01:05 PM

hmm.. you may want to consider editing the post to include vbcode tags so that the code doesn't look so ugly.

secondly, what was given to you was merely an example. you will have to work the rest yourself.

we cannot write the entire application for you, unless you have a specific question I'm not quite sure anyone can do much more for you.

**mendhak** · Jul 15th, 2006, 07:18 AM

Ah, I didn't see this. Hiding on the 2nd page. Good example for why you shouldn't crosspost. You get half the answers, scattered about, and you piss a few of us off. Like me.

**ramesh.mavin** · Jul 15th, 2006, 08:28 AM

thanks to everybody for replying

Thread: extract the urls,phone no,faxes

Thread Tools

Display

extract the urls,phone no,faxes

Re: extract the urls,phone no,faxes

Re: extract the urls,phone no,faxes

Re: extract the urls,phone no,faxes

Re: extract the urls,phone no,faxes

Re: extract the urls,phone no,faxes

Posting Permissions