Search code examples
c#pdfitext7

How to Find PDF Text with a Given Color and Replace it


I want to find text with a given color and then replace it with a new color. I know Acrobat can do this, but it breaks the tag tree when low-level edits are made. I want to use iText to automate this task.

My application primarily works in the tag tree for accessibility-related operations. However, I don't think the color information is stored there: I've looped through a bunch of tags without seeing anything about colors, and neither PAC nor iText RUPS shows color info in the tag tree.

I'm assuming I have to go to a "lower level" of the PDF, but I'm not sure how to do that.

I downloaded a copy of the PDF standard and found that the fill color is set by the "rg" operator, whose three operands (the red, green and blue components) are written immediately before it. iText RUPS shows the following in the content stream:


EMC
/Standard <</MCID 0 >> BDC
q
0 0 0 rg --------------------- Sets color to black for this word
BT
56.8 724.1 Td
/F1 12 Tf
<0102030405> Tj
ET
Q
EMC

How would I access this stream in iText? Alternatively, is there a way to do this without going to this level? I'd prefer to work with the PdfStructs or something in the tag tree if I can do so.

Edit: After looking at the linked comment, I came up with this:


    /// <summary>
    /// Console entry point: opens <c>input.pdf</c> for editing and runs <see cref="MyProcessor"/>
    /// over every page so that each text chunk and its fill color are printed.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Processes every page of the input document and writes the result to <c>output.pdf</c>.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            string inputPdfPath = "input.pdf";
            string outputPdfPath = "output.pdf";

            // set up PDF
            PdfReader reader = new(inputPdfPath);
            PdfWriter writer = new(outputPdfPath);

            // The using declaration closes the document (and with it the reader and writer) when
            // Main exits, including when page processing throws; the explicit Close() call that
            // only ran on the success path is therefore no longer needed.
            using PdfDocument pdfDocument = new PdfDocument(reader, writer);
            pdfDocument.SetTagged();

            // Search each page with the processor
            MyProcessor editor = new MyProcessor(new Listener());
            int pageCount = pdfDocument.GetNumberOfPages();
            for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++)
            {
                editor.EditPage(pdfDocument, pageNumber);
            }
        }
    }
    /// <summary>
    /// Canvas processor that prints every rendered text chunk together with the components of
    /// its fill color, then forwards the event to the base processor.
    /// </summary>
    public class MyProcessor : PdfCanvasProcessor
    {
        /// <summary>
        /// Creates the processor.
        /// </summary>
        /// <param name="eventListener">Listener handed to the base processor.</param>
        public MyProcessor(IEventListener eventListener) : base(eventListener) {}

        /// <summary>
        /// Logs the text and fill color of text-rendering events; every event, logged or not,
        /// is passed on to the base implementation.
        /// </summary>
        /// <param name="data">Event payload; may be null.</param>
        /// <param name="type">Kind of event being reported.</param>
        protected override void EventOccurred(IEventData data, EventType type)
        {
            // Only inspect text. The type pattern also rejects a null payload, so no separate
            // null check or cast is needed.
            if (type is EventType.RENDER_TEXT && data is TextRenderInfo txt)
            {
                Console.WriteLine(txt.GetText());
                foreach (var color in txt.GetFillColor().GetColorValue())
                {
                    Console.Write(color + " ");
                }
                Console.WriteLine("");
            }

            base.EventOccurred(data, type);
        }

        /// <summary>
        /// Runs the processor over the content of one page.
        /// </summary>
        /// <param name="pdfDocument">Document that owns the page.</param>
        /// <param name="pageNumber">Page number as accepted by <c>PdfDocument.GetPage</c>.</param>
        public void EditPage(PdfDocument pdfDocument, int pageNumber)
        {
            PdfPage page = pdfDocument.GetPage(pageNumber);

            // This processor only reads the page, so no output canvas is created here.
            ProcessContent(page.GetContentBytes(), page.GetResources());
        }
    }
    
    /// <summary>
    /// Event listener that subscribes to text, image and path events but takes no action on them.
    /// </summary>
    public class Listener : IEventListener
    {
        /// <summary>
        /// Intentionally a no-op: incoming events are ignored.
        /// </summary>
        public void EventOccurred(IEventData data, EventType type)
        {
        }

        /// <summary>
        /// Lists the event types this listener subscribes to.
        /// </summary>
        /// <returns>A new collection holding the six supported event types.</returns>
        public ICollection<EventType> GetSupportedEvents()
        {
            Collection<EventType> supported = new Collection<EventType>();

            // Text events
            supported.Add(EventType.BEGIN_TEXT);
            supported.Add(EventType.RENDER_TEXT);
            supported.Add(EventType.END_TEXT);

            // Image and path events
            supported.Add(EventType.RENDER_IMAGE);
            supported.Add(EventType.RENDER_PATH);
            supported.Add(EventType.CLIP_PATH_CHANGED);

            return supported;
        }
    }

I can now find a specific color. I still need to figure out how to modify the existing color. It looks like I'll have to rewrite the page content instead of modifying an existing document though.


Solution

  • After working with the linked Java example, I managed to get it working. The code below takes a target color and a replacement color, searches the document, and replaces any instances of the target color with the replacement.

    This should work for the device color spaces (RGB, CMYK and gray). I used RGB as my search condition, but you can change it to DeviceCmyk or DeviceGray if needed.

    using System.Collections.ObjectModel;
    using iText.Kernel.Colors;
    using iText.Kernel.Exceptions;
    using iText.Kernel.Pdf;
    using iText.Kernel.Pdf.Canvas;
    using iText.Kernel.Pdf.Canvas.Parser;
    using iText.Kernel.Pdf.Canvas.Parser.Data;
    using iText.Kernel.Pdf.Canvas.Parser.Listener;
    
    /// <summary>
    /// Console entry point: recolors content in <c>input.pdf</c> with <see cref="PdfCanvasEditor"/>
    /// and saves the result as <c>output.pdf</c>.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Runs the color replacement over every page of the input document.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Replace the paths with your own file
            string inputPdfPath = "input.pdf";
            string outputPdfPath = "output.pdf";

            // set up PDF
            PdfReader reader = new(inputPdfPath);
            PdfWriter writer = new(outputPdfPath);

            // The using declaration closes the document when Main exits, which saves the changes
            // to the output file and also releases both files if an edit throws. The explicit
            // Close() call that only ran on the success path is therefore no longer needed.
            using PdfDocument pdfDocument = new PdfDocument(reader, writer);
            pdfDocument.SetTagged();

            // Use your own values here!
            Color find = new DeviceRgb(229, 18, 18);
            Color replacement = new DeviceRgb(0, 0, 0);
            // alternatively: new DeviceCmyk, new DeviceGray

            // Create a new editor
            PdfCanvasEditor editor = new(find, replacement);

            // Replace every page
            int pageCount = pdfDocument.GetNumberOfPages();
            for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++)
            {
                editor.EditPage(pdfDocument, pageNumber);
            }
        }
    }
    
    /// <summary>
    /// Canvas processor that copies every operation of a page's content stream into a new
    /// stream and, for text drawn with a given fill color, emits a replacement fill color
    /// in front of the text-showing operation.
    /// </summary>
    public class PdfCanvasEditor : PdfCanvasProcessor
    {
        /// <summary>
        /// Operators that show text; the fill color is only inspected when one of these is seen.
        /// </summary>
        private static readonly HashSet<string> TextShowingOperators = new() { "Tj", "'", "\"", "TJ" };

        /// <summary>
        /// Color specified by the user to be replaced
        /// </summary>
        private readonly Color colorToFind;

        /// <summary>
        /// Color written in place of <see cref="colorToFind"/>
        /// </summary>
        private readonly Color replacementColor;

        /// <summary>
        /// The original fill color of the text run currently being recolored, or null when no
        /// replacement is active. It is written back once the run ends.
        /// </summary>
        private Color currentColor;

        /// <summary>
        /// Holds output canvas and related resources; only set while content is being edited.
        /// </summary>
        protected PdfCanvas canvas = null;

        /// <summary>
        /// Creates an editor that replaces one fill color with another.
        /// </summary>
        /// <param name="find">Fill color to look for.</param>
        /// <param name="replace">Fill color to write instead.</param>
        /// <exception cref="ArgumentNullException">Either color is null.</exception>
        public PdfCanvasEditor(iText.Kernel.Colors.Color find, iText.Kernel.Colors.Color replace) : base(new ContentListener())
        {
            // Fail at construction instead of with a NullReferenceException in the middle of a page.
            this.colorToFind = find ?? throw new ArgumentNullException(nameof(find));
            this.replacementColor = replace ?? throw new ArgumentNullException(nameof(replace));
        }

        /// <summary>
        /// Edits a page by a given number
        /// </summary>
        /// <param name="pdfDocument">Document opened with both a reader and a writer.</param>
        /// <param name="pageNumber">Page number as accepted by <c>PdfDocument.GetPage</c>.</param>
        /// <exception cref="PdfException">The document has no reader or no writer.</exception>
        public void EditPage(PdfDocument pdfDocument, int pageNumber)
        {
            if (pdfDocument.GetReader() == null || pdfDocument.GetWriter() == null)
            {
                throw new PdfException("PdfDocument must be editable");
            }

            // Get the current page and resources
            PdfPage page = pdfDocument.GetPage(pageNumber);
            PdfResources pdfResources = page.GetResources();

            // Start every page from a fresh parser state; otherwise a fill color set at the top
            // level of an earlier page would still be reported for this one and could produce
            // false matches.
            Reset();

            // Create a new canvas to make changes on
            PdfCanvas pdfCanvas = new PdfCanvas(new PdfStream(), pdfResources, pdfDocument);

            // Make the changes
            EditContent(page.GetContentBytes(), pdfResources, pdfCanvas);

            // Overwrites the old page with the new data
            page.Put(PdfName.Contents, pdfCanvas.GetContentStream());
        }

        /// <summary>
        /// Processes raw content-stream bytes and writes the edited operations to the given canvas.
        /// </summary>
        /// <param name="contentBytes">Content stream to parse.</param>
        /// <param name="resources">Resources the content stream refers to.</param>
        /// <param name="canvas">Canvas whose content stream receives the output.</param>
        public void EditContent(byte[] contentBytes, PdfResources resources, PdfCanvas canvas)
        {
            this.canvas = canvas;
            // A replacement left pending by earlier content must not carry over.
            currentColor = null;
            try
            {
                ProcessContent(contentBytes, resources);
            }
            finally
            {
                // Drop per-run state even when parsing throws.
                this.canvas = null;
                currentColor = null;
            }
        }

        /// <summary>
        /// Writes one operation to the output content stream: the operands separated by spaces
        /// and terminated by a newline.
        /// </summary>
        /// <param name="processor">Processor that issued the operation (not used).</param>
        /// <param name="pdfOperator">Operator of the operation (not used; only <paramref name="operands"/> is written).</param>
        /// <param name="operands">
        /// Objects to write. The color operations built by this class carry their operator
        /// literal as the last element; operations coming from the parser are assumed to do
        /// the same.
        /// </param>
        /// <exception cref="InvalidOperationException">No output canvas is active.</exception>
        public void Write(PdfCanvasProcessor processor, PdfLiteral pdfOperator, IList<PdfObject> operands)
        {
            if (canvas == null)
            {
                throw new InvalidOperationException("No output canvas is active; content must be processed through EditContent.");
            }

            PdfOutputStream pdfOutputStream = canvas.GetContentStream().GetOutputStream();
            int index = 0;

            // Copy each object to the new page
            foreach (PdfObject obj in operands)
            {
                pdfOutputStream.Write(obj);
                if (operands.Count > ++index)
                {
                    pdfOutputStream.WriteSpace();
                }
                else
                {
                    pdfOutputStream.WriteNewLine();
                }
            }
        }

        /// <summary>
        /// Writes an operation to the output, surrounding text drawn in the searched color with
        /// a switch to the replacement color and a switch back to the original one.
        /// </summary>
        /// <param name="processor">Processor that issued the operation.</param>
        /// <param name="pdfLiteral">Operator of the operation.</param>
        /// <param name="operands">Objects making up the operation.</param>
        /// <param name="isColor">Not used.</param>
        public void Write(PdfCanvasProcessor processor, PdfLiteral pdfLiteral, IList<PdfObject> operands, bool isColor)
        {
            Console.WriteLine("Processing PDF data");
            string operatorString = pdfLiteral.ToString();

            // Only check a color if there is a related operator
            if (TextShowingOperators.Contains(operatorString))
            {
                if (currentColor == null)
                {
                    // Gets the color of the content
                    Color currentFillColor = GetGraphicsState().GetFillColor();

                    // If the color matches, start the replacement process.
                    // NOTE(review): matching is delegated to Color.Equals, which appears to
                    // compare components exactly; a color written with rounded decimals may
                    // not match. Confirm against your documents.
                    if (colorToFind.Equals(currentFillColor))
                    {
                        // Remember the original color so it can be restored after the text
                        currentColor = currentFillColor;

                        // Replace it
                        WriteFillColor(processor, replacementColor);
                    }
                }
            }
            else if (currentColor != null)
            {
                // The text run is over: put the ORIGINAL fill color back so that whatever is
                // painted next is not recolored as a side effect. Writing the replacement color
                // here again would leave it active for all following non-text content.
                WriteFillColor(processor, currentColor);

                // Reset. Allows for more colors to be replaced
                currentColor = null;
            }

            Write(processor, pdfLiteral, operands);
        }

        /// <summary>
        /// Wraps every registered operator so that each invocation is also forwarded to this editor.
        /// </summary>
        /// <param name="operatorString">Operator the handler is registered for.</param>
        /// <param name="pdfOperator">Handler to wrap.</param>
        /// <returns>
        /// The handler previously registered for the operator, unwrapped if it was one of this
        /// editor's wrappers.
        /// </returns>
        public override IContentOperator RegisterContentOperator(string operatorString, IContentOperator pdfOperator)
        {
            ContentOperatorWrapper wrapper = new ContentOperatorWrapper(this)
            {
                OriginalOperator = pdfOperator
            };

            IContentOperator formerOperator = base.RegisterContentOperator(operatorString, wrapper);
            return formerOperator is ContentOperatorWrapper formerWrapper
                ? formerWrapper.OriginalOperator
                : formerOperator;
        }

        /// <summary>
        /// Writes the operation that selects the given color as fill color.
        /// </summary>
        private void WriteFillColor(PdfCanvasProcessor processor, Color color)
        {
            List<PdfObject> operation = GetColorList(color);

            // GetColorList always puts the operator literal last.
            Write(processor, (PdfLiteral)operation[operation.Count - 1], operation);
        }

        /// <summary>
        /// Builds the fill-color operation for a color: its component values followed by the
        /// operator literal ("k" for DeviceCmyk, "g" for DeviceGray, "rg" for anything else).
        /// </summary>
        private List<PdfObject> GetColorList(Color color)
        {
            float[] values = color.GetColorValue();
            (int componentCount, string fillOperator) = color switch
            {
                DeviceCmyk => (4, "k"),
                DeviceGray => (1, "g"),
                _ => (3, "rg"),
            };

            List<PdfObject> list = new(componentCount + 1);
            for (int i = 0; i < componentCount; i++)
            {
                list.Add(new PdfNumber(values[i]));
            }
            list.Add(new PdfLiteral(fillOperator));

            return list;
        }

        /// <summary>
        /// A content operator class to wrap all content operators to forward the invocation to the editor
        /// </summary>
        private sealed class ContentOperatorWrapper : IContentOperator
        {
            private readonly PdfCanvasEditor editor;

            public ContentOperatorWrapper(PdfCanvasEditor editor)
            {
                this.editor = editor;
            }

            /// <summary>
            /// The wrapped handler; may be null.
            /// </summary>
            public IContentOperator OriginalOperator { get; set; }

            /// <summary>
            /// Runs the wrapped handler (except for "Do") and then hands the operation to the
            /// editor so it is written to the output stream.
            /// </summary>
            public void Invoke(PdfCanvasProcessor processor, PdfLiteral pdfLiteral, IList<PdfObject> operands)
            {
                // "Do" is copied to the output but its wrapped handler is skipped.
                // NOTE(review): presumably so that the referenced XObject's own content is not
                // parsed into this page's stream - confirm.
                if (OriginalOperator != null && pdfLiteral.ToString() != "Do")
                {
                    OriginalOperator.Invoke(processor, pdfLiteral, operands);
                }

                editor.Write(processor, pdfLiteral, operands, true);
            }
        }

        /// <summary>
        /// Listener passed to the base processor; it subscribes to text, image and path events
        /// and ignores all of them.
        /// </summary>
        private sealed class ContentListener : IEventListener
        {
            public void EventOccurred(IEventData data, EventType type) { }

            ICollection<EventType> IEventListener.GetSupportedEvents()
            {
                return new Collection<EventType>() {
                    EventType.BEGIN_TEXT, EventType.RENDER_TEXT, EventType.END_TEXT, EventType.RENDER_IMAGE,
                    EventType.RENDER_PATH, EventType.CLIP_PATH_CHANGED
                };
            }
        }
    }