Search code examples
c#imagepdfitext

Extract image from PDF using itextsharp


I am trying to extract all the images from a pdf using itextsharp but can't seem to overcome this one hurdle.

The error occurs on the line System.Drawing.Image ImgPDF = System.Drawing.Image.FromStream(MS); which throws an exception with the message "Parameter is not valid".

I think it works when the image is a bitmap, but not when it is in any other format.

I have the following code (sorry for the length):

    /// <summary>
    /// Loads "reader.pdf", walks every entry of its cross-reference table and
    /// decodes each image XObject it finds into a <see cref="System.Drawing.Image"/>.
    /// </summary>
    /// <param name="sender">Event source (unused).</param>
    /// <param name="e">Event data (unused).</param>
    private void Form1_Load(object sender, EventArgs e)
    {
        // File.ReadAllBytes reads the complete file and releases the handle.
        // A single Stream.Read call is not guaranteed to fill the buffer, and
        // the original FileStream was never disposed.
        byte[] data = File.ReadAllBytes(@"reader.pdf");

        // Collected images; the caller owns them and must dispose them when done.
        List<System.Drawing.Image> ImgList = new List<System.Drawing.Image>();

        iTextSharp.text.pdf.PdfReader PDFReaderObj = null;

        try
        {
            iTextSharp.text.pdf.RandomAccessFileOrArray RAFObj = new iTextSharp.text.pdf.RandomAccessFileOrArray(data);
            PDFReaderObj = new iTextSharp.text.pdf.PdfReader(RAFObj, null);

            for (int i = 0; i < PDFReaderObj.XrefSize; i++)
            {
                // Streams handed out by a PdfReader are PRStream instances;
                // anything else (null, dictionaries, numbers, ...) is skipped.
                iTextSharp.text.pdf.PRStream imageStream = PDFReaderObj.GetPdfObject(i) as iTextSharp.text.pdf.PRStream;
                if (imageStream == null)
                {
                    continue;
                }

                // PdfName.Equals is null-safe, so a missing /Subtype simply fails the test.
                if (!iTextSharp.text.pdf.PdfName.IMAGE.Equals(imageStream.Get(iTextSharp.text.pdf.PdfName.SUBTYPE)))
                {
                    continue;
                }

                try
                {
                    // GetStreamBytesRaw returns the bytes exactly as stored in the PDF:
                    // still filter-encoded (e.g. FlateDecode) and without any image file
                    // header. Image.FromStream can only parse those when they happen to
                    // be a complete JPEG, otherwise it throws "Parameter is not valid".
                    // PdfImageObject decodes the stream and rebuilds a real image file.
                    iTextSharp.text.pdf.parser.PdfImageObject imageObject = new iTextSharp.text.pdf.parser.PdfImageObject(imageStream);
                    System.Drawing.Image ImgPDF = imageObject.GetDrawingImage();

                    if (ImgPDF != null)
                    {
                        ImgList.Add(ImgPDF);
                    }
                }
                catch (Exception ex)
                {
                    // Best effort: one undecodable image (unsupported filter or colour
                    // space) must not abort the whole extraction, but it should not
                    // vanish without a trace either.
                    System.Diagnostics.Debug.WriteLine("Skipping image object " + i + ": " + ex.Message);
                }
            }
        }
        finally
        {
            // Close the reader on both the success and the failure path. Exceptions
            // propagate unchanged, so their type and stack trace are preserved
            // (the original re-wrapped them in a bare Exception).
            if (PDFReaderObj != null)
            {
                PDFReaderObj.Close();
            }
        }
    } //Form1_Load

Solution

  • I have used this library in the past without any problems.

    http://www.winnovative-software.com/PdfImgExtractor.aspx

    /// <summary>
    /// Extracts the images of the selected PDF page range, saving each one through
    /// the ImageExtractedEvent handler, then opens the output folder.
    /// </summary>
    /// <param name="sender">Event source (unused).</param>
    /// <param name="e">Event data (unused).</param>
    private void btnExtractImages_Click(object sender, EventArgs e)
    {
        // the source pdf file
        string pdfFileName = pdfFileTextBox.Text.Trim();
        if (pdfFileName.Length == 0)
        {
            MessageBox.Show("Please choose a source PDF file", "Choose PDF file", MessageBoxButtons.OK);
            return;
        }

        // start page number; TryParse so that a typo in the text box produces a
        // message instead of an unhandled FormatException
        int startPageNumber;
        if (!int.TryParse(textBoxStartPage.Text.Trim(), out startPageNumber))
        {
            MessageBox.Show("Please enter a valid start page number", "Invalid page number", MessageBoxButtons.OK);
            return;
        }

        // end page number
        // when it is 0 the extraction will continue up to the end of document
        int endPageNumber = 0;
        string endPageText = textBoxEndPage.Text.Trim();
        if (endPageText.Length != 0 && !int.TryParse(endPageText, out endPageNumber))
        {
            MessageBox.Show("Please enter a valid end page number", "Invalid page number", MessageBoxButtons.OK);
            return;
        }

        // create the PDF images extractor object
        PdfImagesExtractor pdfImagesExtractor = new PdfImagesExtractor();

        // NOTE(review): license key embedded in source; a real application should
        // load it from configuration rather than committing it.
        pdfImagesExtractor.LicenseKey = "31FAUEJHUEBQRl5AUENBXkFCXklJSUlQQA==";

        // the demo output directory
        string outputDirectory = Path.Combine(Application.StartupPath, @"DemoFiles\Output");

        Cursor = Cursors.WaitCursor;

        // set the handler to be called when an image was extracted
        pdfImagesExtractor.ImageExtractedEvent += pdfImagesExtractor_ImageExtractedEvent;

        try
        {
            // the handler saves files into this directory, so make sure it exists
            // (CreateDirectory is a no-op when it is already there)
            Directory.CreateDirectory(outputDirectory);

            // start images counting
            imageIndex = 0;

            // call the images extractor to raise the ImageExtractedEvent event when an image is extracted from a PDF page
            // the pdfImagesExtractor_ImageExtractedEvent handler below will be executed for each extracted image
            pdfImagesExtractor.ExtractImagesInEvent(pdfFileName, startPageNumber, endPageNumber);

            // Alternatively, ExtractImages() returns the images in memory as an array of
            // ExtractedImage objects, and ExtractImagesToFile() writes them directly as
            // image files into a directory.
        }
        catch (Exception ex)
        {
            // The extraction failed
            MessageBox.Show(String.Format("An error occurred. {0}", ex.Message), "Error");
            return;
        }
        finally
        {
            // uninstall the event handler
            pdfImagesExtractor.ImageExtractedEvent -= pdfImagesExtractor_ImageExtractedEvent;

            // restore the default cursor instead of forcing the arrow
            Cursor = Cursors.Default;
        }

        try
        {
            System.Diagnostics.Process.Start(outputDirectory);
        }
        catch (Exception ex)
        {
            MessageBox.Show(string.Format("Cannot open output folder. {0}", ex.Message));
            return;
        }
    }
    
    /// <summary>
    /// The ImageExtractedEvent event handler called after an image was extracted from a PDF page.
    /// The event is raised when the ExtractImagesInEvent() method is used.
    /// Saves the image as a PNG file under "DemoFiles\Output" in the application
    /// start-up directory and then disposes the extracted image.
    /// </summary>
    /// <param name="args">The handler argument containing the extracted image and the PDF page number</param>
    void pdfImagesExtractor_ImageExtractedEvent(ImageExtractedEventArgs args)
    {
        try
        {
            // get the image object and page number from the event handler argument
            Image pdfPageImageObj = args.ExtractedImage.ImageObject;
            int pageNumber = args.ExtractedImage.PageNumber;

            // save the extracted image to a PNG file; imageIndex keeps file names
            // unique when a page contains more than one image
            string outputPageImage = Path.Combine(Application.StartupPath, @"DemoFiles\Output",
                "pdfimage_" + pageNumber.ToString() + "_" + imageIndex++ + ".png");
            pdfPageImageObj.Save(outputPageImage, ImageFormat.Png);
        }
        finally
        {
            // release the extracted image even when saving fails, so a failed
            // write does not leak the underlying image resources
            args.ExtractedImage.Dispose();
        }
    }