Results 1 to 6 of 6

Thread: extract the urls,phone no,faxes

  1. #1

    Thread Starter
    Junior Member
    Join Date
    Jul 2006
    Posts
    18

    extract the urls,phone no,faxes

    Hi friends,
    My project is a web data extractor (Windows application). It deals with extracting URLs, phone numbers, fax numbers, meta tags and inactive sites. How do I extract all of these? Please give me a valuable suggestion. I have some code, but it has some problems.
    Friends, if anybody knows, please help.


    VB Code:
    1. Imports System.IO
    2. Imports System.Net
    3. Imports System
    4. Imports System.Text
    5. Imports System.Text.RegularExpressions
    6. Public Class HTMLContentParser
    7.     ' Downloads a web page as text and extracts absolute link and image URLs from it.
    8.     ' Typical use: fetch the page with Return_HTMLContent, then pass the text to
    9.     ' ParseHTMLLinks / ParseHTMLImages. Phone numbers, fax numbers and meta tags can be
    10.     ' captured from the same text with further regular expressions; to search only the
    11.     ' page body, first cut the text down to the part between "<body" and "</body>".
    12. 
    13.     ' href value of an <a> tag: double-quoted, single-quoted or unquoted.
    14.     ' "[^>]*?" keeps the match inside one tag, so every link on a line is found
    15.     ' (the earlier greedy "a.*href" kept only the last link on each line).
    16.     Private Shared ReadOnly rLinks As New Regex("<a\b[^>]*?\bhref\s*=\s*(?:""(?<1>[^""]*)""|'(?<1>[^']*)'|(?<1>[^\s>]+))", RegexOptions.IgnoreCase Or RegexOptions.Compiled)
    17. 
    18.     ' src value of an <img> tag, same three quoting forms.
    19.     Private Shared ReadOnly rImages As New Regex("<img\b[^>]*?\bsrc\s*=\s*(?:""(?<1>[^""]*)""|'(?<1>[^']*)'|(?<1>[^\s>]+))", RegexOptions.IgnoreCase Or RegexOptions.Compiled)
    20. 
    21.     ' Downloads URL1 and returns the whole response body as a string.
    22.     '   URL1    - address of the page to fetch.
    23.     '   Returns - the page text, or "" when the download fails.
    24.     ' On failure the error message is shown in a message box and Application.Exit()
    25.     ' is called, exactly as before, so existing callers see the same behaviour.
    26.     Public Function Return_HTMLContent(ByVal URL1 As String) As String
    27.         Dim response1 As WebResponse = Nothing
    28.         Dim reader1 As StreamReader = Nothing
    29.         Try
    30.             Dim request1 As WebRequest = WebRequest.Create(URL1)
    31.             response1 = request1.GetResponse()
    32.             reader1 = New StreamReader(response1.GetResponseStream())
    33.             Return reader1.ReadToEnd()
    34.         Catch e As Exception
    35.             MsgBox(e.Message, MsgBoxStyle.OKOnly, "current News")
    36.             Application.Exit()
    37.             Return ""
    38.         Finally
    39.             ' Always release the connection; it was previously left open.
    40.             If Not reader1 Is Nothing Then reader1.Close()
    41.             If Not response1 Is Nothing Then response1.Close()
    42.         End Try
    43.     End Function
    44. 
    45.     ' Returns the absolute URL of every <a href=...> found in sHTMLContent.
    46.     '   sHTMLContent - HTML text to scan.
    47.     '   sURL         - address of the page, used to resolve relative links.
    48.     '   Returns      - ArrayList of String, in document order (empty if none).
    49.     Public Function ParseHTMLLinks(ByVal sHTMLContent As String, ByVal sURL As String) As ArrayList
    50.         Return CollectURLs(rLinks, sHTMLContent, sURL)
    51.     End Function
    52. 
    53.     ' Returns the absolute URL of every <img src=...> found in sHTMLContent.
    54.     '   sHTMLContent - HTML text to scan.
    55.     '   sURL         - address of the page, used to resolve relative paths.
    56.     '   Returns      - ArrayList of String, in document order (empty if none).
    57.     Public Function ParseHTMLImages(ByVal sHTMLContent As String, ByVal sURL As String) As ArrayList
    58.         Return CollectURLs(rImages, sHTMLContent, sURL)
    59.     End Function
    60. 
    61.     ' Runs rRegEx over the HTML and resolves capture group 1 of each match against sURL.
    62.     Private Function CollectURLs(ByVal rRegEx As Regex, ByVal sHTMLContent As String, ByVal sURL As String) As ArrayList
    63.         Dim aMatch As New ArrayList
    64.         If sHTMLContent Is Nothing Then Return aMatch
    65.         Dim mMatch As Match = rRegEx.Match(sHTMLContent)
    66.         While mMatch.Success
    67.             aMatch.Add(ProcessURL(mMatch.Groups(1).Value, sURL))
    68.             mMatch = mMatch.NextMatch()
    69.         End While
    70.         Return aMatch
    71.     End Function
    72. 
    73.     ' Cleans a raw attribute value and turns it into an absolute URL.
    74.     '   sInput  - value captured from an href/src attribute.
    75.     '   sURL    - address of the page the value came from.
    76.     '   Returns - sInput resolved against sURL; an already absolute sInput is
    77.     '             returned as an absolute URL without using sURL.
    78.     Private Function ProcessURL(ByVal sInput As String, ByVal sURL As String) As String
    79.         If sInput Is Nothing Then sInput = ""
    80.         If sURL Is Nothing Then sURL = ""
    81.         ' Drop everything from the end of the tag onwards, whether it appears as a
    82.         ' literal ">" or as the entity "&gt;" (whichever comes first).
    83.         Dim iCut As Integer = sInput.IndexOf(">"c)
    84.         Dim iEntity As Integer = sInput.IndexOf("&gt;")
    85.         If iEntity >= 0 AndAlso (iCut < 0 OrElse iEntity < iCut) Then iCut = iEntity
    86.         If iCut >= 0 Then sInput = sInput.Substring(0, iCut)
    87.         ' Filter out unnecessary characters
    88.         sInput = sInput.Replace("&lt;", "").Replace("<", "")
    89.         sInput = sInput.Replace("""", "").Replace("'", "")
    90.         sInput = sInput.Trim()
    91.         Try
    92.             ' System.Uri handles "page.htm", "/root.htm", "../up.htm", "https://..." and
    93.             ' a base with or without a trailing slash or file name.
    94.             Return New Uri(New Uri(sURL), sInput).AbsoluteUri
    95.         Catch ex As UriFormatException
    96.             ' sURL is not an absolute URL (or sInput is malformed): join the strings by hand.
    97.             Return CombineByHand(sInput, sURL)
    98.         End Try
    99.     End Function
    100. 
    101.     ' Fallback join used when System.Uri cannot resolve the pair: keeps sURL up to
    102.     ' its last "/" and appends sInput with exactly one slash between them.
    103.     Private Function CombineByHand(ByVal sInput As String, ByVal sURL As String) As String
    104.         If sInput.IndexOf("://") > 0 Then Return sInput 'already absolute
    105.         Dim iLastSlash As Integer = sURL.LastIndexOf("/"c)
    106.         If iLastSlash >= 0 Then
    107.             sURL = sURL.Substring(0, iLastSlash + 1)
    108.         Else
    109.             sURL &= "/"
    110.         End If
    111.         If sInput.StartsWith("/") Then sInput = sInput.Substring(1)
    112.         Return sURL & sInput
    113.     End Function
    114. End Class
    Last edited by si_the_geek; Jul 14th, 2006 at 02:36 PM. Reason: added vbcode tags

  2. #2
    Super Moderator si_the_geek's Avatar
    Join Date
    Jul 2002
    Location
    Bristol, UK
    Posts
    41,974

    Re: extract the urls,phone no,faxes

    Moved from CodeBank forum (which is for code examples, not questions)

  3. #3
    PowerPoster
    Join Date
    Aug 2005
    Location
    College Station, TX
    Posts
    4,521

    Re: extract the urls,phone no,faxes

    You would need to look into the world of Regex.... it allows you to match on patterns of text, and can be quite powerful when done right.

    A good tutorial: http://www.codeproject.com/dotnet/RegexTutorial.asp

    I have also posted several Regex examples in this forum if you search for Regex and my username...

  4. #4
    Fanatic Member TokersBall_CDXX's Avatar
    Join Date
    Mar 2003
    Location
    America
    Posts
    571

    Re: extract the urls,phone no,faxes

    hmm.. you may want to consider editing the post to include vbcode tags so that the code doesn't look so ugly.

    secondly, what was given to you was merely an example. you will have to work the rest yourself.

    we cannot write the entire application for you, unless you have a specific question I'm not quite sure anyone can do much more for you.
    Build your own personalized flash based chat room for your webpage for FREE! http://www.4computerheaven.com

  5. #5
    I'm about to be a PowerPoster! mendhak's Avatar
    Join Date
    Feb 2002
    Location
    Ulaan Baator GooGoo: Frog
    Posts
    38,170

    Re: extract the urls,phone no,faxes

    Ah, I didn't see this. Hiding on the 2nd page. Good example for why you shouldn't crosspost. You get half the answers, scattered about, and you piss a few of us off. Like me.

  6. #6

    Thread Starter
    Junior Member
    Join Date
    Jul 2006
    Posts
    18

    Re: extract the urls,phone no,faxes

    thanks to everybody for replying

Posting Permissions

  • You may not post new threads
  • You may not post replies
  • You may not post attachments
  • You may not edit your posts
  •  



Click Here to Expand Forum to Full Width