Results 1 to 8 of 8

Thread: Extract Text from Pdfs using iTextSharp (02-03/2005)

Threaded View

  1. #1

    Thread Starter
    PowerPoster stanav's Avatar
    Join Date
    Jul 2006
    Location
    Providence, RI - USA
    Posts
    9,290

    Extract Text from Pdfs using iTextSharp (02-03/2005)

    Hello all,
    I was recently working on a PDF manipulation project. One of the things I needed to do was to extract the text from PDF files and search for a specific phrase. I was using iTextSharp for manipulating PDFs. While iTextSharp includes a PdfReader class, it isn't directly capable of extracting text out of the box. I did some Googling and all I could find was this project by Zollor http://www.codeproject.com/useritems/PDFToText.asp. Unfortunately, his code can't extract text from the PDFs created by our company (while PdfBox can - but using PdfBox requires another library reference, adds another 16MB to the final footprint of my project, and it is very sloooowwww...), so I just went ahead and wrote my own function...
    And here it is. To use it, you'll have to add a reference to itextsharp.dll to your project and import iTextSharp.text.pdf
    VB Code:
    1. Imports iTextSharp.text.pdf
    2.  
    3.     ''' <summary>
    4.     ''' Extracts the text of a PDF by scanning each page's content stream for string tokens.
    5.     ''' </summary>
    6.     ''' <param name="sourcePDF">Passed unchanged to the PdfReader constructor to open the PDF.</param>
    7.     ''' <param name="fromPageNum">First page to read; 0 (the default) means page 1.</param>
    8.     ''' <param name="toPageNum">Last page to read; 0 (the default) means the last page of the document.</param>
    9.     ''' <returns>The extracted text, or String.Empty if any exception was raised (the error is
    10.     ''' shown to the user in a message box instead of being propagated to the caller).</returns>
    11.     Public Function ParsePdfText(ByVal sourcePDF As String, _
    12.                                  Optional ByVal fromPageNum As Integer = 0, _
    13.                                  Optional ByVal toPageNum As Integer = 0) As String
    14.  
    15.         Dim sb As New System.Text.StringBuilder()
    16.         'Declared outside the Try block so that the Finally block can always close it.
    17.         Dim reader As PdfReader = Nothing
    18.         Try
    19.             reader = New PdfReader(sourcePDF)
    20.             Dim pageBytes() As Byte = Nothing
    21.             Dim token As PRTokeniser = Nothing
    22.             Dim tknType As Integer = -1
    23.             Dim tknValue As String = String.Empty
    24.  
    25.             'A value of 0 means "not specified": default to the whole document.
    26.             If fromPageNum = 0 Then
    27.                 fromPageNum = 1
    28.             End If
    29.             If toPageNum = 0 Then
    30.                 toPageNum = reader.NumberOfPages
    31.             End If
    32.  
    33.             If fromPageNum > toPageNum Then
    34.                 Throw New ApplicationException("Parameter error: The value of fromPageNum can " & _
    35.                                                "not be larger than the value of toPageNum")
    36.             End If
    37.  
    38.             For i As Integer = fromPageNum To toPageNum
    39.                 pageBytes = reader.GetPageContent(i)
    40.                 'Pages for which no content bytes are returned are skipped.
    41.                 If Not IsNothing(pageBytes) Then
    42.                     token = New PRTokeniser(pageBytes)
    43.                     While token.NextToken()
    44.                         tknType = token.TokenType()
    45.                         tknValue = token.StringValue
    46.                         If tknType = PRTokeniser.TK_STRING Then
    47.                             sb.Append(tknValue)
    48.                         'The next two tests add whitespace to the output string: a "-600" number token
    49.                         '(presumably a word-sized gap in the PDFs this was written for - verify for other
    50.                         'producers) and every "TJ" operator each emit one space.
    51.                         'TK_NUMBER and TK_OTHER replace the literals 1 and 10 used previously.
    52.                         ElseIf tknType = PRTokeniser.TK_NUMBER AndAlso tknValue = "-600" Then
    53.                             sb.Append(" ")
    54.                         ElseIf tknType = PRTokeniser.TK_OTHER AndAlso tknValue = "TJ" Then
    55.                             sb.Append(" ")
    56.                         End If
    57.                     End While
    58.                 End If
    59.             Next i
    60.         Catch ex As Exception
    61.             MessageBox.Show("Exception occured. " & ex.Message)
    62.             Return String.Empty
    63.         Finally
    64.             'Close the reader on both the success and the error path so the source
    65.             'document is not left open after this function returns.
    66.             If Not IsNothing(reader) Then
    67.                 reader.Close()
    68.             End If
    69.         End Try
    70.         Return sb.ToString()
    71.     End Function
    Last edited by stanav; Jun 25th, 2007 at 01:22 PM.

Posting Permissions

  • You may not post new threads
  • You may not post replies
  • You may not post attachments
  • You may not edit your posts
  •  



Click Here to Expand Forum to Full Width