Public Class PdfManipulation '''

''' <summary>
''' Extract the text from pdf pages and return it as a string.
''' </summary>
''' <param name="sourcePDF">Full path to the source pdf file.</param>
''' <param name="fromPageNum">[Optional] The page number (inclusive) to start text extraction.</param>
''' <param name="toPageNum">[Optional] The page number (inclusive) to stop text extraction.</param>
''' <returns>
''' A string containing the text extracted from the specified pages, or
''' String.Empty when any exception is raised during extraction.
''' </returns>
''' <remarks>
''' If fromPageNum is not specified (0), text extraction will start from page 1. If
''' toPageNum is not specified (0), text extraction will end at the last page of the
''' source pdf file. Exceptions are not propagated to the caller: the message is
''' shown in a message box and an empty string is returned.
''' </remarks>
Public Shared Function ParsePdfText(ByVal sourcePDF As String, _
                                    Optional ByVal fromPageNum As Integer = 0, _
                                    Optional ByVal toPageNum As Integer = 0) As String
    Dim sb As New System.Text.StringBuilder()
    Dim reader As iTextSharp.text.pdf.PdfReader = Nothing

    Try
        reader = New iTextSharp.text.pdf.PdfReader(sourcePDF)

        Dim pageBytes() As Byte = Nothing
        Dim token As iTextSharp.text.pdf.PRTokeniser = Nothing
        Dim tknType As Integer = -1
        Dim tknValue As String = String.Empty

        '0 means "not specified": default to the whole document.
        If fromPageNum = 0 Then
            fromPageNum = 1
        End If
        If toPageNum = 0 Then
            toPageNum = reader.NumberOfPages
        End If

        If fromPageNum > toPageNum Then
            Throw New ApplicationException("Parameter error: The value of fromPageNum can " & _
                                           "not be larger than the value of toPageNum")
        End If

        For i As Integer = fromPageNum To toPageNum Step 1
            pageBytes = reader.GetPageContent(i)
            If Not IsNothing(pageBytes) Then
                token = New iTextSharp.text.pdf.PRTokeniser(pageBytes)
                While token.NextToken()
                    tknType = token.TokenType()
                    tknValue = token.StringValue
                    If tknType = iTextSharp.text.pdf.PRTokeniser.TK_STRING Then
                        sb.Append(tknValue)
                    'The two tests below only insert whitespace into the output string:
                    'a numeric token whose text is "-600" and the "TJ" operator each add one space.
                    'NOTE(review): "-600" looks like a spacing offset specific to the pdfs this
                    'was written for - confirm before relying on it for other documents.
                    ElseIf tknType = iTextSharp.text.pdf.PRTokeniser.TK_NUMBER AndAlso tknValue = "-600" Then
                        sb.Append(" ")
                    ElseIf tknType = iTextSharp.text.pdf.PRTokeniser.TK_OTHER AndAlso tknValue = "TJ" Then
                        sb.Append(" ")
                    End If
                End While
            End If
        Next i
    Catch ex As Exception
        MessageBox.Show("Exception occured. " & ex.Message)
        Return String.Empty
    Finally
        'Release the source file whether extraction succeeded or failed.
        If reader IsNot Nothing Then
            reader.Close()
        End If
    End Try

    Return sb.ToString()
End Function

''' <summary>
''' Textually compare 2 pdf files page by page and write the difference to a text file.
''' </summary>
''' <param name="pdf1">The full path to the 1st pdf file.</param>
''' <param name="pdf2">The full path to the 2nd pdf file.</param>
''' <param name="resultFile">The full path to the result file.</param>
''' <param name="fromPageNum">[Optional] Page number to start comparing.</param>
''' <param name="toPageNum">[Optional] Page number to stop comparing.</param>
''' <remarks>
''' If no values are specified for fromPageNum and toPageNum, the sub will compare
''' every page the two input pdfs have in common; a toPageNum beyond the shorter
''' document is clamped to its last page. The content tokens of both pages are walked
''' in lockstep and only positions where both tokens are strings are compared, using
''' an ordinal (character by character) comparison. A "Differences found in page"
''' header line is written for every compared page, even when no difference follows.
''' Exceptions are not propagated: the message is shown in a message box.
''' </remarks>
Public Shared Sub ComparePdfs(ByVal pdf1 As String, ByVal pdf2 As String, _
                              ByVal resultFile As String, _
                              Optional ByVal fromPageNum As Integer = 0, _
                              Optional ByVal toPageNum As Integer = 0)
    Dim reader1 As iTextSharp.text.pdf.PdfReader = Nothing
    Dim reader2 As iTextSharp.text.pdf.PdfReader = Nothing

    Try
        reader1 = New iTextSharp.text.pdf.PdfReader(pdf1)
        reader2 = New iTextSharp.text.pdf.PdfReader(pdf2)

        'Only pages that exist in both documents can be compared.
        Dim lastCommonPage As Integer = Math.Min(reader1.NumberOfPages, reader2.NumberOfPages)

        If fromPageNum = 0 Then
            fromPageNum = 1
        End If
        If toPageNum = 0 OrElse toPageNum > lastCommonPage Then
            toPageNum = lastCommonPage
        End If

        If fromPageNum > toPageNum Then
            Throw New ApplicationException("Parameter error: The value of fromPageNum can " & _
                                           "not be larger than the value of toPageNum")
        End If

        'Using guarantees the result file is flushed and closed even if a page fails to parse.
        Using writer As New System.IO.StreamWriter(resultFile)
            For i As Integer = fromPageNum To toPageNum Step 1
                writer.WriteLine("Differences found in page " & i)

                Dim pageBytes1() As Byte = reader1.GetPageContent(i)
                Dim pageBytes2() As Byte = reader2.GetPageContent(i)

                If Not IsNothing(pageBytes1) AndAlso Not IsNothing(pageBytes2) Then
                    Dim token1 As New iTextSharp.text.pdf.PRTokeniser(pageBytes1)
                    Dim token2 As New iTextSharp.text.pdf.PRTokeniser(pageBytes2)

                    'Stops as soon as either page runs out of tokens.
                    While token1.NextToken() AndAlso token2.NextToken()
                        Dim tknType1 As Integer = token1.TokenType()
                        Dim tknValue1 As String = token1.StringValue
                        Dim tknType2 As Integer = token2.TokenType()
                        Dim tknValue2 As String = token2.StringValue

                        If tknType1 = iTextSharp.text.pdf.PRTokeniser.TK_STRING AndAlso _
                           tknType2 = iTextSharp.text.pdf.PRTokeniser.TK_STRING Then
                            'Ordinal comparison: a culture-sensitive compare can treat
                            'strings that differ only in ignorable characters as equal.
                            If Not String.Equals(tknValue1, tknValue2, StringComparison.Ordinal) Then
                                writer.WriteLine("Pdf1: " & tknValue1 & " <> Pdf2: " & tknValue2)
                            End If
                        End If
                    End While
                End If
            Next i
        End Using
    Catch ex As Exception
        MessageBox.Show("Exception occured. " & ex.Message)
    Finally
        'Release both source files whether the comparison succeeded or failed.
        If reader1 IsNot Nothing Then
            reader1.Close()
        End If
        If reader2 IsNot Nothing Then
            reader2.Close()
        End If
    End Try
End Sub

''' <summary>
''' Extract a single page from source pdf to a new pdf.
''' </summary>
''' <param name="sourcePdf">The full path to source pdf file.</param>
''' <param name="pageNumberToExtract">The page number to extract.</param>
''' <param name="outPdf">The full path for the output pdf.</param>
''' <exception cref="ArgumentOutOfRangeException">
''' pageNumberToExtract is smaller than 1 or larger than the number of pages in the source pdf.
''' </exception>
''' <remarks>
''' Any exception raised while reading or writing is propagated to the caller with
''' its original stack trace. The page number is validated before the output file
''' is created, so an invalid request does not leave an output file behind.
''' </remarks>
Public Shared Sub ExtractPdfPage(ByVal sourcePdf As String, ByVal pageNumberToExtract As Integer, ByVal outPdf As String)
    Dim reader As iTextSharp.text.pdf.PdfReader = Nothing

    Try
        reader = New iTextSharp.text.pdf.PdfReader(sourcePdf)

        If pageNumberToExtract < 1 OrElse pageNumberToExtract > reader.NumberOfPages Then
            Throw New ArgumentOutOfRangeException("pageNumberToExtract", _
                "Page " & pageNumberToExtract & " does not exist in the source pdf")
        End If

        Dim doc As New iTextSharp.text.Document(reader.GetPageSizeWithRotation(1))

        'Using guarantees the output file handle is released even if the page import fails.
        Using outStream As New IO.FileStream(outPdf, IO.FileMode.Create)
            Dim pdfCpy As New iTextSharp.text.pdf.PdfCopy(doc, outStream)
            doc.Open()
            pdfCpy.AddPage(pdfCpy.GetImportedPage(reader, pageNumberToExtract))
            doc.Close()
        End Using
    Finally
        'No Catch block: exceptions bubble up unchanged instead of being re-thrown
        'with "Throw ex", which would reset their stack trace.
        If reader IsNot Nothing Then
            reader.Close()
        End If
    End Try
End Sub

''' <summary>
''' Extract selected pages from a source pdf to a new pdf.
''' </summary>
''' <param name="sourcePdf">The full path to source pdf.</param>
''' <param name="pageNumbersToExtract">The page numbers to extract (i.e {1, 3, 5, 6}).</param>
''' <param name="outPdf">The full path for the output pdf.</param>
''' <exception cref="ArgumentNullException">pageNumbersToExtract is Nothing.</exception>
''' <exception cref="ArgumentException">pageNumbersToExtract is empty.</exception>
''' <exception cref="ArgumentOutOfRangeException">
''' A requested page number is smaller than 1 or larger than the number of pages in the source pdf.
''' </exception>
''' <remarks>
''' The output pdf will contain the extracted pages in the order of the page numbers listed
''' in the pageNumbersToExtract parameter. All arguments are validated before the output file
''' is created. Any exception raised while reading or writing is propagated to the caller
''' with its original stack trace.
''' </remarks>
Public Shared Sub ExtractPdfPage(ByVal sourcePdf As String, ByVal pageNumbersToExtract As Integer(), ByVal outPdf As String)
    If pageNumbersToExtract Is Nothing Then
        Throw New ArgumentNullException("pageNumbersToExtract")
    End If
    If pageNumbersToExtract.Length = 0 Then
        Throw New ArgumentException("At least one page number is required", "pageNumbersToExtract")
    End If

    Dim reader As iTextSharp.text.pdf.PdfReader = Nothing

    Try
        reader = New iTextSharp.text.pdf.PdfReader(sourcePdf)

        'Check every requested page up front so a bad request never produces a partial output file.
        For Each pageNum As Integer In pageNumbersToExtract
            If pageNum < 1 OrElse pageNum > reader.NumberOfPages Then
                Throw New ArgumentOutOfRangeException("pageNumbersToExtract", _
                    "Page " & pageNum & " does not exist in the source pdf")
            End If
        Next

        Dim doc As New iTextSharp.text.Document(reader.GetPageSizeWithRotation(1))

        'Using guarantees the output file handle is released even if a page import fails.
        Using outStream As New IO.FileStream(outPdf, IO.FileMode.Create)
            Dim pdfCpy As New iTextSharp.text.pdf.PdfCopy(doc, outStream)
            doc.Open()
            For Each pageNum As Integer In pageNumbersToExtract
                pdfCpy.AddPage(pdfCpy.GetImportedPage(reader, pageNum))
            Next
            doc.Close()
        End Using
    Finally
        'No Catch block: exceptions bubble up unchanged instead of being re-thrown
        'with "Throw ex", which would reset their stack trace.
        If reader IsNot Nothing Then
            reader.Close()
        End If
    End Try
End Sub

''' <summary>
''' Split a single pdf file into a given number of pdfs.
''' </summary>
''' <param name="sourcePdf">The full path to the source pdf.</param>
''' <param name="parts">The number of pdfs to split the source into.</param>
''' <param name="baseNameOutPdf">
''' The base file name (full path) for the split pdfs. The actual output file names are
''' serialized by inserting "_1", "_2", ... in front of the extension.
''' </param>
''' <exception cref="ArgumentOutOfRangeException">parts is smaller than 1.</exception>
''' <exception cref="ArgumentException">
''' baseNameOutPdf is Nothing or empty, or the source pdf has fewer pages than parts.
''' </exception>
''' <remarks>
''' Every output except the last receives (page count \ parts) pages; the last output
''' receives all remaining pages, so it may be larger than the others. For example, a
''' 9 page source split into 4 parts yields files of 2, 2, 2 and 3 pages.
''' Any exception raised while reading or writing is propagated to the caller with its
''' original stack trace.
''' </remarks>
Public Shared Sub SplitPdfByParts(ByVal sourcePdf As String, ByVal parts As Integer, ByVal baseNameOutPdf As String)
    If parts < 1 Then
        Throw New ArgumentOutOfRangeException("parts", "The number of parts must be at least 1")
    End If
    If String.IsNullOrEmpty(baseNameOutPdf) Then
        Throw New ArgumentException("An output base file name is required", "baseNameOutPdf")
    End If

    Dim reader As iTextSharp.text.pdf.PdfReader = Nothing

    Try
        reader = New iTextSharp.text.pdf.PdfReader(sourcePdf)
        Dim pageCount As Integer = reader.NumberOfPages

        If pageCount < parts Then
            Throw New ArgumentException("Not enough pages in source pdf to split")
        End If

        Dim pagesPerPart As Integer = pageCount \ parts

        'Strip the extension from the end only: String.Replace would also rewrite any earlier
        'occurrence of the extension text in the path and throws when there is no extension.
        Dim ext As String = IO.Path.GetExtension(baseNameOutPdf)
        Dim stem As String = baseNameOutPdf.Substring(0, baseNameOutPdf.Length - ext.Length)

        Dim currentPage As Integer = 1
        For i As Integer = 1 To parts
            Dim outfile As String = stem & "_" & i & ext

            'The last part takes whatever is left over.
            Dim lastPage As Integer = pageCount
            If i < parts Then
                lastPage = currentPage + pagesPerPart - 1
            End If

            Dim doc As New iTextSharp.text.Document(reader.GetPageSizeWithRotation(currentPage))

            'Using guarantees each output file handle is released even if a page import fails.
            Using outStream As New IO.FileStream(outfile, IO.FileMode.Create)
                Dim pdfCpy As New iTextSharp.text.pdf.PdfCopy(doc, outStream)
                doc.Open()
                While currentPage <= lastPage
                    pdfCpy.AddPage(pdfCpy.GetImportedPage(reader, currentPage))
                    currentPage += 1
                End While
                doc.Close()
            End Using
        Next
    Finally
        'No Catch block: exceptions bubble up unchanged instead of being re-thrown
        'with "Throw ex", which would reset their stack trace.
        If reader IsNot Nothing Then
            reader.Close()
        End If
    End Try
End Sub

''' <summary>
''' Split source pdf into multiple pdfs with a specific number of pages.
''' </summary>
''' <param name="sourcePdf">The full path to source pdf.</param>
''' <param name="numOfPages">The number of pages each split pdf should contain.</param>
''' <param name="baseNameOutPdf">
''' The base file name (full path) for the split pdfs. The actual output file names are
''' serialized by inserting "_1", "_2", ... in front of the extension.
''' </param>
''' <exception cref="ArgumentOutOfRangeException">numOfPages is smaller than 1.</exception>
''' <exception cref="ArgumentException">
''' baseNameOutPdf is Nothing or empty, or the source pdf has fewer pages than numOfPages.
''' </exception>
''' <remarks>
''' The last split pdf may not have the same number of pages as the rest, depending on the
''' combination of number of pages in the source pdf and the number of target pages in each
''' split pdf. For example, if the original pdf has 9 pages and it is to be split with
''' 2 pages for each pdf, the last split pdf will have only 1 page while all others have 2 pages.
''' Any exception raised while reading or writing is propagated to the caller with its
''' original stack trace.
''' </remarks>
Public Shared Sub SplitPdfByPages(ByVal sourcePdf As String, ByVal numOfPages As Integer, ByVal baseNameOutPdf As String)
    If numOfPages < 1 Then
        Throw New ArgumentOutOfRangeException("numOfPages", "The number of pages per pdf must be at least 1")
    End If
    If String.IsNullOrEmpty(baseNameOutPdf) Then
        Throw New ArgumentException("An output base file name is required", "baseNameOutPdf")
    End If

    Dim reader As iTextSharp.text.pdf.PdfReader = Nothing

    Try
        reader = New iTextSharp.text.pdf.PdfReader(sourcePdf)
        Dim pageCount As Integer = reader.NumberOfPages

        If pageCount < numOfPages Then
            Throw New ArgumentException("Not enough pages in source pdf to split")
        End If

        'Strip the extension from the end only: String.Replace would also rewrite any earlier
        'occurrence of the extension text in the path and throws when there is no extension.
        Dim ext As String = IO.Path.GetExtension(baseNameOutPdf)
        Dim stem As String = baseNameOutPdf.Substring(0, baseNameOutPdf.Length - ext.Length)

        'Number of output files, rounded up so a trailing partial chunk gets its own file.
        Dim fileCount As Integer = (pageCount + numOfPages - 1) \ numOfPages

        Dim currentPage As Integer = 1
        For i As Integer = 1 To fileCount
            Dim outfile As String = stem & "_" & i & ext

            'Each file takes numOfPages pages; the last one stops at the end of the source.
            Dim lastPage As Integer = Math.Min(currentPage + numOfPages - 1, pageCount)

            Dim doc As New iTextSharp.text.Document(reader.GetPageSizeWithRotation(currentPage))

            'Using guarantees each output file handle is released even if a page import fails.
            Using outStream As New IO.FileStream(outfile, IO.FileMode.Create)
                Dim pdfCpy As New iTextSharp.text.pdf.PdfCopy(doc, outStream)
                doc.Open()
                While currentPage <= lastPage
                    pdfCpy.AddPage(pdfCpy.GetImportedPage(reader, currentPage))
                    currentPage += 1
                End While
                doc.Close()
            End Using
        Next
    Finally
        'No Catch block: exceptions bubble up unchanged instead of being re-thrown
        'with "Throw ex", which would reset their stack trace.
        If reader IsNot Nothing Then
            reader.Close()
        End If
    End Try
End Sub

''' <summary>
''' Extract pages from multiple pdf files and merge them into a single pdf.
''' </summary>
''' <param name="sourceTable">
''' The datatable containing source pdf paths and the pages to extract from each of them.
''' This datatable should have 2 datacolumns of type String. The 1st column (column 0)
''' is for the file (full) path while the 2nd column (column 1) is for the list of pages
''' to extract from the source pdf named in column 0. This list is a string of integer
''' values separated by commas (ex: "1, 3, 2, 5 , 8, 7, 9").
''' </param>
''' <param name="outPdf">The path to save the output pdf.</param>
''' <exception cref="ArgumentNullException">sourceTable is Nothing.</exception>
''' <remarks>
''' The pdf pages are extracted and merged in the order listed in the source datatable.
''' That is, source pdf files are merged from the top row down, and the pages of each file
''' are merged in the order listed in its csv string. Nothing is written when the table
''' has no rows. Any exception raised while reading or writing is propagated to the caller
''' with its original stack trace.
''' </remarks>
Public Shared Sub ExtractAndMergePdfPages(ByVal sourceTable As DataTable, ByVal outPdf As String)
    If sourceTable Is Nothing Then
        Throw New ArgumentNullException("sourceTable")
    End If
    If sourceTable.Rows.Count = 0 Then
        'Nothing to extract and merge
        Exit Sub
    End If

    Dim doc As iTextSharp.text.Document = Nothing
    Dim pdfCpy As iTextSharp.text.pdf.PdfCopy = Nothing
    Dim outStream As IO.FileStream = Nothing

    Try
        For Each row As DataRow In sourceTable.Rows
            Dim sourcePdf As String = CStr(row.Item(0))
            Dim pageNumbersToExtract() As Integer = ConvertToIntegerArray(CStr(row.Item(1)))

            Dim reader As New iTextSharp.text.pdf.PdfReader(sourcePdf)
            Try
                If doc Is Nothing Then
                    'First source: create the output document once its reader opened successfully.
                    doc = New iTextSharp.text.Document(reader.GetPageSizeWithRotation(1))
                    outStream = New IO.FileStream(outPdf, IO.FileMode.Create)
                    pdfCpy = New iTextSharp.text.pdf.PdfCopy(doc, outStream)
                    doc.Open()
                Else
                    doc.SetPageSize(reader.GetPageSizeWithRotation(1))
                End If

                For Each pageNum As Integer In pageNumbersToExtract
                    pdfCpy.AddPage(pdfCpy.GetImportedPage(reader, pageNum))
                Next
            Finally
                'Each source is closed as soon as its pages have been added, before the
                'output document is closed (same ordering as the previous implementation).
                reader.Close()
            End Try
        Next

        doc.Close()
    Finally
        'No Catch block: exceptions bubble up unchanged instead of being re-thrown
        'with "Throw ex". Disposing the stream releases the output file handle on failure.
        If outStream IsNot Nothing Then
            outStream.Dispose()
        End If
    End Try
End Sub

''' <summary>
''' Helper function to convert a csv integer string to an integer array.
''' </summary>
''' <param name="csvNumbers">The integer string in csv format (ex: "1, 5, 7, 4").</param>
''' <returns>Integer array converted from the csv string (ex: {1, 5, 7, 4}).</returns>
''' <remarks>
''' No error checking/handling. Empty entries between commas are skipped, but if the
''' input string contains a non-numeric value Integer.Parse throws and the exception
''' reaches the caller. It's up to you to handle this error.
''' </remarks>
Private Shared Function ConvertToIntegerArray(ByVal csvNumbers As String) As Integer()
    Dim separators() As Char = ",".ToCharArray
    Dim fields() As String = csvNumbers.Split(separators, System.StringSplitOptions.RemoveEmptyEntries)

    'Sized to the number of fields; an input without any field yields an empty array.
    Dim result(fields.Length - 1) As Integer
    Dim position As Integer = 0

    For Each field As String In fields
        result(position) = Integer.Parse(field)
        position += 1
    Next

    Return result
End Function

End Class