Results 1 to 4 of 4

Thread: [2005] PDF with ITextSharp

  1. #1

    Thread Starter
    Learning .Net danasegarane's Avatar
    Join Date
    Aug 2004
    Location
    VBForums
    Posts
    5,853

    Thumbs up [2005] PDF with ITextSharp

    Hi all,
    How can I read the hyperlinks in a PDF using iTextSharp?

    Thanks in Advance
    Dana

  2. #2
    PowerPoster stanav's Avatar
    Join Date
    Jul 2006
    Location
    Providence, RI - USA
    Posts
    9,290

    Re: [2005] PDF with ITextSharp

    Hello Dana,

    Try this function.... It still needs more work as stated in the comments
    vb.net Code:
    'Don't forget to add a reference to iTextSharp to your project and import the pdf namespace
    Imports System.Collections
    Imports System.Data
    Imports System.Windows.Forms
    Imports iTextSharp.text.pdf

        ''' <summary>
        ''' Extracts the hyperlinks (URI actions attached to link annotations) found in a PDF file.
        ''' </summary>
        ''' <param name="sourcePdf">The full path to the source PDF file.</param>
        ''' <param name="pageNumbers">An Integer array containing the 1-based page numbers from which
        ''' the URLs will be extracted. The default value is Nothing, in which case URLs are extracted
        ''' from the whole document. Page numbers that do not exist in the document are skipped.</param>
        ''' <returns>A DataTable named "ExtractedHyperlinks" with the columns "FoundOnPage" (Integer)
        ''' and "URL" (String), one row per hyperlink found.</returns>
        ''' <remarks>
        ''' The action (/A) entry of each link annotation is resolved through PdfReader.GetPdfObject, so it
        ''' is handled the same way whether it is stored inline or behind a PRIndirectReference.
        ''' Only URI actions are reported; links that jump inside the document (destinations, GoTo actions)
        ''' carry no URL and are ignored.
        ''' If the file cannot be opened or parsed, the error message is shown in a message box and the
        ''' rows collected so far (possibly none) are returned.
        ''' </remarks>
        Public Shared Function ExtractURLs(ByVal sourcePdf As String, Optional ByVal pageNumbers() As Integer = Nothing) As System.Data.DataTable
            'Build the table used to return the extracted URLs (if any)
            Dim linkTable As New DataTable("ExtractedHyperlinks")
            linkTable.Columns.Add("FoundOnPage", GetType(Integer))
            linkTable.Columns.Add("URL", GetType(String))

            Dim raf As RandomAccessFileOrArray = Nothing
            Dim reader As PdfReader = Nothing

            Try
                'Open the pdf file and get the page count
                raf = New RandomAccessFileOrArray(sourcePdf)
                reader = New PdfReader(raf, Nothing)
                Dim pageCount As Integer = reader.NumberOfPages

                'Scan every page if the caller did not pass in a page list
                If pageNumbers Is Nothing Then
                    pageNumbers = New Integer(pageCount - 1) {}
                    For i As Integer = 0 To pageNumbers.GetUpperBound(0)
                        pageNumbers(i) = i + 1
                    Next
                End If

                For Each pageNumber As Integer In pageNumbers
                    'A page number outside the document must not abort the whole extraction
                    If pageNumber < 1 OrElse pageNumber > pageCount Then Continue For

                    Dim page As PdfDictionary = reader.GetPageNRelease(pageNumber)
                    If page Is Nothing Then Continue For

                    'TryCast: a missing or malformed /Annots entry simply means "no links on this page"
                    Dim annots As PdfArray = TryCast(PdfReader.GetPdfObject(page.Get(PdfName.ANNOTS), page), PdfArray)
                    If annots Is Nothing Then Continue For

                    For Each item As Object In annots.ArrayList
                        'Each entry is normally an indirect reference to the annotation dictionary
                        Dim annot As PdfDictionary = TryCast(PdfReader.GetPdfObject(TryCast(item, PdfObject)), PdfDictionary)
                        If annot Is Nothing Then Continue For

                        'Only link annotations can carry a hyperlink
                        If Not PdfName.LINK.Equals(PdfReader.GetPdfObject(annot.Get(PdfName.SUBTYPE))) Then Continue For

                        'GetPdfObject returns direct objects unchanged and dereferences indirect ones,
                        'so inline actions and PRIndirectReference actions are both covered here
                        Dim action As PdfDictionary = TryCast(PdfReader.GetPdfObject(annot.Get(PdfName.A)), PdfDictionary)
                        If action Is Nothing Then Continue For

                        'Actions without a /URI entry (e.g. GoTo) have no URL to report
                        Dim uri As PdfObject = PdfReader.GetPdfObject(action.Get(PdfName.URI))
                        If uri Is Nothing Then Continue For

                        Dim row As DataRow = linkTable.NewRow()
                        row("FoundOnPage") = pageNumber
                        row("URL") = uri.ToString()
                        linkTable.Rows.Add(row)
                    Next
                Next
            Catch ex As Exception
                'Best effort, as before: report the problem and return whatever was collected
                MessageBox.Show(ex.Message)
            Finally
                'Always release the file handle, even when reading failed part-way through
                If reader IsNot Nothing Then
                    reader.Close()
                ElseIf raf IsNot Nothing Then
                    'The reader was never created, so the underlying file must be closed directly
                    raf.Close()
                End If
            End Try

            Return linkTable
        End Function
    Let us have faith that right makes might, and in that faith, let us, to the end, dare to do our duty as we understand it.
    - Abraham Lincoln -

  3. #3
    New Member
    Join Date
    Apr 2010
    Posts
    1

    Re: [2005] PDF with ITextSharp

    Stanav

    Did you figure out how to extract anchor objects as well?

    Thanks in advance,

  4. #4
    New Member
    Join Date
    Jul 2011
    Posts
    3

    Re: [2005] PDF with ITextSharp

    Hi Stanav,

    Is there any solution to find the URL and Name from a PRIndirectReference? Please help me.

    Thanks
    Ashok
    Last edited by ashok.arumugam; Jul 6th, 2011 at 07:04 AM.

Posting Permissions

  • You may not post new threads
  • You may not post replies
  • You may not post attachments
  • You may not edit your posts
  •  



Click Here to Expand Forum to Full Width