Search code examples
c#parsingpdfitextpdf-parsing

Extract value from PDF file to variable


I am trying to extract the "Invoice Number" (in this case INV-3337) from a PDF file and would like to store it in a variable for later use in the code.

Currently I am working on an example and using this PDF for test purposes: https://slicedinvoices.com/pdf/wordpress-pdf-invoice-plugin-sample.pdf

With my current code I am able to dump the whole content of the PDF to a .txt file. Can somebody guide me on how to get only the value I need and store it in a variable? Can it be done directly with iTextSharp? Or do I first need to write everything to a .txt file, parse that file, store the value in a variable, delete the .txt file, and then proceed?

Note: there will be a lot of PDF files to parse in the real setup.

Here is my current code:

using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
using System;
using System.IO;
using System.Text;


namespace PDF_parser
{
    /// <summary>
    /// Console utility that extracts the text of the first pages of a PDF with
    /// iTextSharp and appends it, line by line, to a text file.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Reads up to <c>pagesToScan</c> pages from the hard-coded PDF path and
        /// appends every extracted line to the hard-coded output file.
        /// Any exception is printed to the console instead of being rethrown.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            string filePath = @"C:\temp\parser\Invoice_Template.pdf";
            string outPath = @"C:\temp\parser\Invoice_Template.txt";
            int pagesToScan = 2;

            PdfReader reader = null;
            try
            {
                reader = new PdfReader(filePath);

                // Never request more pages than the document actually has;
                // asking for a non-existent page makes the extractor throw.
                int lastPage = Math.Min(pagesToScan, reader.NumberOfPages);

                // Open the output file once (append mode) instead of once per line.
                using (StreamWriter file = new StreamWriter(outPath, true))
                {
                    for (int page = 1; page <= lastPage; page++)
                    {
                        // A fresh strategy per page so text from earlier pages is not accumulated.
                        ITextExtractionStrategy its = new LocationTextExtractionStrategy();
                        string strText = PdfTextExtractor.GetTextFromPage(reader, page, its);

                        // The extracted text is already a .NET (UTF-16) string. Round-tripping it
                        // through Encoding.Default only replaces non-ANSI characters with '?',
                        // so the text is written as-is (StreamWriter emits UTF-8 by default).
                        foreach (string line in strText.Split('\n'))
                        {
                            file.WriteLine(line);
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                Console.Write(ex);
            }
            finally
            {
                // Release the PDF file handle even when extraction failed.
                if (reader != null)
                {
                    reader.Close();
                }
            }
        }
    }
}

EDIT:

Did I understand it right?

using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
using System;
using System.IO;
using System.Text;


namespace PDF_parser
{
    /// <summary>
    /// Console utility that extracts the invoice number from a PDF with iTextSharp
    /// and keeps it in a variable, without writing an intermediate text file.
    /// </summary>
    class Program
    {
        /// <summary>Label that precedes the invoice number in the extracted text.</summary>
        const string InvoiceLabel = "Invoice Number";

        /// <summary>
        /// Scans up to <c>pagesToScan</c> pages of the hard-coded PDF and stores the first
        /// invoice number found in <c>invoiceNumber</c>. Any exception is printed to the console.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            string filePath = @"C:\temp\parser\Invoice_Template.pdf";
            int pagesToScan = 2;

            // Declared outside the loops so the value is still available after parsing.
            string invoiceNumber = null;

            PdfReader reader = null;
            try
            {
                reader = new PdfReader(filePath);

                // Never request more pages than the document actually has.
                int lastPage = Math.Min(pagesToScan, reader.NumberOfPages);

                for (int page = 1; page <= lastPage && invoiceNumber == null; page++)
                {
                    ITextExtractionStrategy its = new LocationTextExtractionStrategy();
                    string strText = PdfTextExtractor.GetTextFromPage(reader, page, its);

                    // No StreamWriter is needed: the text is inspected in memory, so no
                    // temporary .txt file is created.
                    foreach (string line in strText.Split('\n'))
                    {
                        invoiceNumber = ExtractInvoiceNumber(line);
                        if (invoiceNumber != null)
                        {
                            break; // first match wins
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                Console.Write(ex);
            }
            finally
            {
                // Release the PDF file handle even when extraction failed.
                if (reader != null)
                {
                    reader.Close();
                }
            }

            // invoiceNumber now holds the value (e.g. "INV-3337"), or null when the label
            // was not found; use it here for further processing.
        }

        /// <summary>
        /// Returns the text that follows the last "Invoice Number" label in a line.
        /// </summary>
        /// <param name="line">One line of text extracted from the PDF.</param>
        /// <returns>
        /// The trimmed text after the label, or <c>null</c> when the line is null or empty,
        /// does not contain the label, or has nothing after it.
        /// </returns>
        static string ExtractInvoiceNumber(string line)
        {
            if (string.IsNullOrEmpty(line))
            {
                return null;
            }

            // LastIndexOf returns -1 when the label is absent; index 0 is a valid match.
            int labelIndex = line.LastIndexOf(InvoiceLabel, StringComparison.Ordinal);
            if (labelIndex < 0)
            {
                return null;
            }

            // Skip the label itself so only the value remains; Trim also drops a trailing '\r'.
            string value = line.Substring(labelIndex + InvoiceLabel.Length).Trim();
            return value.Length > 0 ? value : null;
        }
    }
}

Solution

  • One option is to search each line of text for "Invoice Number" using LastIndexOf. If it is found, use Substring to get the rest of that line, which contains the invoice number.

    Something like:

    const string label = "Invoice Number";
    // LastIndexOf returns -1 when the label is absent; 0 is a valid match at the start of the line.
    int indexOccurrance = line.LastIndexOf(label);
    if(indexOccurrance >= 0)
    {
      // Skip the label itself so that only the value (e.g. "INV-3337") remains.
      var invoiceNumber = line.Substring(indexOccurrance + label.Length).Trim();
    }