'Don't forget to add a reference to the iTextSharp assembly in your project and to import the iTextSharp.text.pdf namespace.
Imports iTextSharp.text.pdf
''' <summary>
''' Extracts the URI hyperlinks stored in the link annotations of a PDF file.
''' </summary>
''' <param name="sourcePdf">The full path to the source PDF file.</param>
''' <param name="pageNumbers">An Integer array containing the 1-based page numbers from which
''' the URLs will be extracted. The default value is Nothing, in which case URLs are extracted
''' from the whole document. Page numbers that fall outside the document are skipped.</param>
''' <returns>A DataTable named "ExtractedHyperlinks" with one row per URL found: the page number
''' in the "FoundOnPage" column and the link target in the "URL" column. The table is empty
''' when no URL is found.</returns>
''' <remarks>Only link annotations whose action dictionary (/A) carries a /URI entry are
''' reported; action dictionaries and URI values stored as indirect objects are resolved.
''' If the document cannot be read, the error message is shown in a message box and the rows
''' collected so far are returned; no exception reaches the caller.</remarks>
Public Shared Function ExtractURLs(ByVal sourcePdf As String, Optional ByVal pageNumbers() As Integer = Nothing) As System.Data.DataTable
    'Build the result table first so that it can always be returned, even after a failure.
    Dim linkTable As New System.Data.DataTable("ExtractedHyperlinks")
    With linkTable.Columns
        .Add("FoundOnPage", GetType(Integer))
        .Add("URL", GetType(String))
    End With

    Dim raf As iTextSharp.text.pdf.RandomAccessFileOrArray = Nothing
    Dim reader As iTextSharp.text.pdf.PdfReader = Nothing
    Try
        'Open the pdf file in partial-read mode and get the page count.
        raf = New iTextSharp.text.pdf.RandomAccessFileOrArray(sourcePdf)
        reader = New iTextSharp.text.pdf.PdfReader(raf, Nothing)
        Dim pageCount As Integer = reader.NumberOfPages()

        'Default to every page (1..pageCount) when the caller did not pass a selection.
        If pageNumbers Is Nothing Then
            pageNumbers = New Integer(pageCount - 1) {}
            For i As Integer = 0 To pageNumbers.GetUpperBound(0)
                pageNumbers(i) = i + 1
            Next
        End If

        For Each pageNumber As Integer In pageNumbers
            'Skip page numbers that do not exist instead of aborting the whole extraction.
            If pageNumber < 1 OrElse pageNumber > pageCount Then Continue For

            Dim page As PdfDictionary = reader.GetPageNRelease(pageNumber)
            If page Is Nothing Then Continue For

            'TryCast: a page without /Annots, or with a malformed one, simply has no links.
            Dim annots As PdfArray = TryCast(PdfReader.GetPdfObject(page.Get(PdfName.ANNOTS), page), PdfArray)
            If annots Is Nothing Then Continue For

            For Each annotationObject As PdfObject In annots.ArrayList
                Dim url As String = GetLinkAnnotationUri(annotationObject)
                If url IsNot Nothing Then
                    Dim row As System.Data.DataRow = linkTable.NewRow()
                    row("FoundOnPage") = pageNumber
                    row("URL") = url
                    linkTable.Rows.Add(row)
                End If
            Next
        Next
    Catch ex As Exception
        'NOTE(review): reporting through a message box is kept for backward compatibility;
        'callers rely on this function never throwing.
        MessageBox.Show(ex.Message)
    Finally
        'Always release the file handle, including when reading failed half-way through.
        If reader IsNot Nothing Then
            reader.Close()
        ElseIf raf IsNot Nothing Then
            'The reader was never created, so the file must be closed directly.
            raf.Close()
        End If
    End Try
    Return linkTable
End Function

''' <summary>
''' Returns the URI target of a link annotation, or Nothing when the given object is not a
''' link annotation that carries a URI action.
''' </summary>
''' <param name="annotationObject">An entry of a page's annotation array; it may be a direct
''' object or an indirect reference.</param>
''' <returns>The string form of the /URI entry of the annotation's action dictionary, or
''' Nothing when there is none.</returns>
Private Shared Function GetLinkAnnotationUri(ByVal annotationObject As PdfObject) As String
    'GetPdfObject resolves indirect references and returns direct objects unchanged.
    Dim annot As PdfDictionary = TryCast(PdfReader.GetPdfObject(annotationObject), PdfDictionary)
    If annot Is Nothing Then Return Nothing

    'Only annotations of subtype Link are of interest.
    If Not PdfName.LINK.Equals(annot.Get(PdfName.SUBTYPE)) Then Return Nothing

    'The action dictionary is frequently stored as an indirect object, so resolve it first.
    Dim action As PdfDictionary = TryCast(PdfReader.GetPdfObject(annot.Get(PdfName.A)), PdfDictionary)
    If action Is Nothing Then Return Nothing

    Dim uri As PdfObject = PdfReader.GetPdfObject(action.Get(PdfName.URI))
    If uri Is Nothing Then Return Nothing

    Return uri.ToString()
End Function