Imports System.Net
Imports System.Text.RegularExpressions
Public Class Form1
Private Sub Form1_Load(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles MyBase.Load
ListView1.FullRowSelect = True
ListView1.GridLines = True
ListView1.Columns.Add("LinkUrl")
ListView1.Columns.Add("LinkText")
ListView1.View = View.Details
End Sub
Private Sub Button1_Click(ByVal sender As System.Object, ByVal e As System.EventArgs) Handles Button1.Click
ExtractLinks("http://www.ebay.com/")
For i% = 0 To ListView1.Columns.Count - 1
ListView1.Columns(i).Width = -2
Next i
End Sub
Public Function ExtractLinks(ByVal url As String) As Boolean
Try
Dim wc As New WebClient
Dim html As String = wc.DownloadString(url)
Dim links As MatchCollection = Regex.Matches(html, "<a.*?href=""(.*?)"".*?>(.*?)</a>")
For Each match As Match In links
Dim matchUrl As String = match.Groups(1).Value
'Ignore all anchor links
If matchUrl.StartsWith("#") Then
Continue For
End If
'Ignore all javascript calls
If matchUrl.ToLower.StartsWith("javascript:") Then
Continue For
End If
'Ignore all email links
If matchUrl.ToLower.StartsWith("mailto:") Then
Continue For
End If
'For internal links, build the url mapped to the base address
If Not matchUrl.StartsWith("http://") And Not matchUrl.StartsWith("https://") Then
matchUrl = MapUrl(url, matchUrl)
End If
Dim _lsvItm As New ListViewItem(New String() {matchUrl, match.Groups(2).Value})
ListView1.Items.Add(_lsvItm)
Next
Return True
Catch ex As Exception
Return False
End Try
End Function
Public Function MapUrl(ByVal baseAddress As String, ByVal relativePath As String) As String
Dim u As New System.Uri(baseAddress)
If relativePath = "./" Then
relativePath = "/"
End If
If relativePath.StartsWith("/") Then
Return u.Scheme + Uri.SchemeDelimiter + u.Authority + relativePath
Else
Dim pathAndQuery As String = u.AbsolutePath
' If the baseAddress contains a file name, like ..../Something.aspx
' Trim off the file name
pathAndQuery = pathAndQuery.Split("?")(0).TrimEnd("/")
If pathAndQuery.Split("/")(pathAndQuery.Split("/").Count - 1).Contains(".") Then
pathAndQuery = pathAndQuery.Substring(0, pathAndQuery.LastIndexOf("/"))
End If
baseAddress = u.Scheme + Uri.SchemeDelimiter + u.Authority + pathAndQuery
While relativePath.StartsWith("../")
relativePath = relativePath.Substring(3)
If baseAddress.LastIndexOf("/") > baseAddress.IndexOf("//" + 2) Then
baseAddress = baseAddress.Substring(0, baseAddress.LastIndexOf("/")).TrimEnd("/")
End If
End While
Return baseAddress + "/" + relativePath
End If
End Function
End Class