I want to find text with a given color and then replace it with a new color. I know Acrobat can do this, but it breaks the tag tree when low-level edits are made. I want to use iText to automate this task.
My application primarily works in the tag tree for accessibility-related operations. However, I don't think that the color information is located there. I've looped through a bunch of tags but I haven't seen anything for colors. Additionally, neither PAC nor iText RUPS shows color info in the tag tree.
I'm assuming I have to go to a "lower level" of the PDF, but I'm not sure how to do that.
I downloaded a copy of the PDF standard and found that the fill color is set by the "rg" operator, which is preceded by its three RGB operands. iText RUPS shows the following in the content stream:
EMC
/Standard <</MCID 0 >> BDC
q
0 0 0 rg --------------------- Sets color to black for this word
BT
56.8 724.1 Td
/F1 12 Tf
<0102030405> Tj
ET
Q
EMC
How would I access this stream in iText? Alternatively, is there a way to do this without going to this level? I'd prefer to work with the PdfStructs or something in the tag tree if I can do so.
Edit: After looking at the linked comment, I came up with this:
/// <summary>
/// Console entry point that runs <see cref="MyProcessor"/> over every page of a PDF
/// so the text and fill color of each text chunk are printed.
/// </summary>
class Program
{
    /// <summary>
    /// Opens "input.pdf" together with a writer for "output.pdf", processes each page
    /// and closes the document.
    /// </summary>
    /// <param name="args">Command-line arguments; not used.</param>
    static void Main(string[] args)
    {
        string inputPdfPath = "input.pdf";
        string outputPdfPath = "output.pdf";

        // set up PDF
        PdfReader reader = new(inputPdfPath);
        PdfWriter writer = new(outputPdfPath);
        PdfDocument pdfDocument = new PdfDocument(reader, writer);
        try
        {
            pdfDocument.SetTagged();

            // Search each page with the processor
            MyProcessor editor = new MyProcessor(new Listener());
            int pageCount = pdfDocument.GetNumberOfPages();
            for (int i = 1; i <= pageCount; i++)
            {
                editor.EditPage(pdfDocument, i);
            }
        }
        finally
        {
            // Close even when a page fails to parse so the underlying files are released.
            pdfDocument.Close();
        }
    }
}
/// <summary>
/// Canvas processor that prints the text and the fill color components of every
/// text-render event it sees while parsing a page.
/// </summary>
public class MyProcessor : PdfCanvasProcessor
{
    /// <summary>
    /// Creates the processor on top of the given listener.
    /// </summary>
    /// <param name="eventListener">Listener handed to the base processor.</param>
    public MyProcessor(IEventListener eventListener) : base(eventListener) { }

    /// <summary>
    /// Writes the text and its fill color values to the console for RENDER_TEXT events,
    /// then forwards the event to the base implementation.
    /// </summary>
    /// <param name="data">Event payload; may be null.</param>
    /// <param name="type">Kind of event being reported.</param>
    protected override void EventOccurred(IEventData data, EventType type)
    {
        // Only inspect text; the pattern match also rejects null payloads.
        if (type == EventType.RENDER_TEXT && data is TextRenderInfo txt)
        {
            Console.WriteLine(txt.GetText());

            // Defensive: skip the color values if no fill color is available for this chunk.
            float[] components = txt.GetFillColor()?.GetColorValue();
            if (components != null)
            {
                foreach (float component in components)
                {
                    Console.Write(component + " ");
                }
            }
            Console.WriteLine("");
        }
        base.EventOccurred(data, type);
    }

    /// <summary>
    /// Parses the content of one page so that <see cref="EventOccurred"/> is raised for it.
    /// The page itself is not modified.
    /// </summary>
    /// <param name="pdfDocument">Document that owns the page.</param>
    /// <param name="pageNumber">Page number passed to <c>GetPage</c> (the caller counts from 1).</param>
    public void EditPage(PdfDocument pdfDocument, int pageNumber)
    {
        PdfPage page = pdfDocument.GetPage(pageNumber);
        PdfResources pdfResources = page.GetResources();
        // No output canvas is needed here: the page is only read, never rewritten.
        ProcessContent(page.GetContentBytes(), pdfResources);
    }
}
/// <summary>
/// Event listener that subscribes to text, image, path and clip-path events
/// but takes no action when they arrive.
/// </summary>
public class Listener : IEventListener
{
    /// <summary>
    /// Intentionally empty: received events are ignored.
    /// </summary>
    public void EventOccurred(IEventData data, EventType type)
    {
    }

    /// <summary>
    /// Lists the event types this listener asks to receive.
    /// </summary>
    /// <returns>A new collection holding the six supported event types.</returns>
    public ICollection<EventType> GetSupportedEvents()
    {
        Collection<EventType> supported = new Collection<EventType>();
        supported.Add(EventType.BEGIN_TEXT);
        supported.Add(EventType.RENDER_TEXT);
        supported.Add(EventType.END_TEXT);
        supported.Add(EventType.RENDER_IMAGE);
        supported.Add(EventType.RENDER_PATH);
        supported.Add(EventType.CLIP_PATH_CHANGED);
        return supported;
    }
}
I can now find a specific color. I still need to figure out how to modify the existing color. It looks like I'll have to rewrite the page content instead of modifying an existing document though.
After working with the linked Java example, I managed to get it working. The code below takes a target color and a replacement color. It searches the document and replaces any instances of the target color with the replacement.
This should work for the device color spaces (DeviceRGB, DeviceCMYK and DeviceGray). I used RGB as my search condition, but you can change it to DeviceCmyk or DeviceGray if needed.
using System.Collections.ObjectModel;
using iText.Kernel.Colors;
using iText.Kernel.Exceptions;
using iText.Kernel.Pdf;
using iText.Kernel.Pdf.Canvas;
using iText.Kernel.Pdf.Canvas.Parser;
using iText.Kernel.Pdf.Canvas.Parser.Data;
using iText.Kernel.Pdf.Canvas.Parser.Listener;
/// <summary>
/// Console entry point that recolors text in a PDF using <see cref="PdfCanvasEditor"/>.
/// </summary>
class Program
{
    /// <summary>
    /// Opens "input.pdf", replaces the configured color on every page and saves the
    /// result to "output.pdf" when the document is closed.
    /// </summary>
    /// <param name="args">Command-line arguments; not used.</param>
    static void Main(string[] args)
    {
        // Replace the paths with your own file
        string inputPdfPath = "input.pdf";
        string outputPdfPath = "output.pdf";

        // Use your own values here!
        Color find = new DeviceRgb(229, 18, 18);
        Color replacement = new DeviceRgb(0, 0, 0);
        // alternatively: new DeviceCmyk, new DeviceGray

        // set up PDF
        PdfReader reader = new(inputPdfPath);
        PdfWriter writer = new(outputPdfPath);
        PdfDocument pdfDocument = new PdfDocument(reader, writer);
        try
        {
            pdfDocument.SetTagged();

            // Create a new editor
            PdfCanvasEditor editor = new(find, replacement);

            // Replace every page
            int pageCount = pdfDocument.GetNumberOfPages();
            for (int i = 1; i <= pageCount; i++)
            {
                editor.EditPage(pdfDocument, i);
            }
        }
        finally
        {
            // Save changes to output file; done in finally so the files are
            // released even if editing a page throws.
            pdfDocument.Close();
        }
    }
}
/// <summary>
/// A <see cref="PdfCanvasProcessor"/> that copies a page's content stream, operator by
/// operator, into a new stream and overrides the fill color of text that is drawn in a
/// given color. Handles DeviceRgb, DeviceCmyk and DeviceGray colors.
/// </summary>
public class PdfCanvasEditor : PdfCanvasProcessor
{
    /// <summary>
    /// Operators that show text. The fill color is checked (and possibly overridden)
    /// immediately before one of these is written.
    /// </summary>
    private static readonly HashSet<string> TextShowingOperators = new() { "Tj", "'", "\"", "TJ" };

    /// <summary>
    /// Color specified by the user to be replaced
    /// </summary>
    private readonly Color colorToFind;

    /// <summary>
    /// Color written in place of <see cref="colorToFind"/>
    /// </summary>
    private readonly Color replacementColor;

    /// <summary>
    /// The original fill color while a replacement is in effect in the output stream;
    /// null when the output stream's fill color has not been overridden.
    /// </summary>
    private Color currentColor;

    /// <summary>
    /// Holds output canvas and related resources
    /// </summary>
    protected PdfCanvas canvas = null;

    /// <summary>
    /// Creates an editor that replaces one fill color with another.
    /// </summary>
    /// <param name="find">Color to look for; compared with <c>Equals</c> against the current fill color.</param>
    /// <param name="replace">Color written instead of <paramref name="find"/>.</param>
    /// <exception cref="ArgumentNullException">Either color is null.</exception>
    public PdfCanvasEditor(iText.Kernel.Colors.Color find, iText.Kernel.Colors.Color replace) : base(new ContentListener())
    {
        this.colorToFind = find ?? throw new ArgumentNullException(nameof(find));
        this.replacementColor = replace ?? throw new ArgumentNullException(nameof(replace));
    }

    /// <summary>
    /// Edits a page by a given number
    /// </summary>
    /// <param name="pdfDocument">Document opened with both a reader and a writer.</param>
    /// <param name="pageNumber">Number of the page to rewrite, as accepted by <c>GetPage</c>.</param>
    /// <exception cref="PdfException">The document has no reader or no writer.</exception>
    public void EditPage(PdfDocument pdfDocument, int pageNumber)
    {
        if ((pdfDocument.GetReader() == null) || (pdfDocument.GetWriter() == null))
        {
            throw new PdfException("PdfDocument must be editable");
        }

        // Get the current page and resources
        PdfPage page = pdfDocument.GetPage(pageNumber);
        PdfResources pdfResources = page.GetResources();

        // Create a new canvas to make changes on
        PdfCanvas pdfCanvas = new PdfCanvas(new PdfStream(), pdfResources, pdfDocument);

        // Make the changes
        EditContent(page.GetContentBytes(), pdfResources, pdfCanvas);

        // Overwrites the old page with the new data
        page.Put(PdfName.Contents, pdfCanvas.GetContentStream());
    }

    /// <summary>
    /// Edits content for a page with the binary data
    /// </summary>
    /// <param name="contentBytes">Raw content stream bytes to parse.</param>
    /// <param name="resources">Resources used while parsing the content.</param>
    /// <param name="canvas">Canvas whose content stream receives the rewritten operators.</param>
    public void EditContent(byte[] contentBytes, PdfResources resources, PdfCanvas canvas)
    {
        this.canvas = canvas;
        // Each content stream starts without a pending color override.
        currentColor = null;
        try
        {
            ProcessContent(contentBytes, resources);
        }
        finally
        {
            // Do not keep a stale canvas or pending color if parsing throws.
            this.canvas = null;
            currentColor = null;
        }
    }

    /// <summary>
    /// Writes one instruction to the output content stream.
    /// </summary>
    /// <param name="processor">Processor that parsed the instruction; not used here.</param>
    /// <param name="pdfOperator">Operator of the instruction; not used here because every entry of <paramref name="operands"/> is written as-is.</param>
    /// <param name="operands">Objects to write, separated by spaces and terminated by a new line.</param>
    public void Write(PdfCanvasProcessor processor, PdfLiteral pdfOperator, IList<PdfObject> operands)
    {
        PdfOutputStream pdfOutputStream = canvas.GetContentStream().GetOutputStream();
        int lastIndex = operands.Count - 1;

        // Copy each object to the new page
        for (int i = 0; i <= lastIndex; i++)
        {
            pdfOutputStream.Write(operands[i]);
            if (i < lastIndex)
            {
                pdfOutputStream.WriteSpace();
            }
            else
            {
                pdfOutputStream.WriteNewLine();
            }
        }
    }

    /// <summary>
    /// Writes an instruction to the output, overriding the fill color around
    /// text-showing operators when the current fill color equals the color to find.
    /// </summary>
    /// <param name="processor">Processor that parsed the instruction.</param>
    /// <param name="pdfLiteral">Operator of the instruction.</param>
    /// <param name="operands">Operands of the instruction as received from the processor.</param>
    /// <param name="isColor">Not used; kept so existing callers keep compiling.</param>
    public void Write(PdfCanvasProcessor processor, PdfLiteral pdfLiteral, IList<PdfObject> operands, bool isColor)
    {
        string operatorString = pdfLiteral.ToString();

        if (TextShowingOperators.Contains(operatorString))
        {
            // Only override once per run of text-showing operators.
            if (currentColor == null)
            {
                // Gets the color of the content
                Color currentFillColor = GetGraphicsState().GetFillColor();

                // If the color matches, emit the replacement just before the text
                if (colorToFind.Equals(currentFillColor))
                {
                    currentColor = currentFillColor;
                    WriteFillColor(processor, replacementColor);
                }
            }
        }
        else if (currentColor != null)
        {
            // The text is done: put the ORIGINAL fill color back so that later non-text
            // content keeps its color and the output stream agrees with the graphics
            // state tracked by the processor.
            WriteFillColor(processor, currentColor);

            // Reset. Allows for more colors to be replaced
            currentColor = null;
        }

        Write(processor, pdfLiteral, operands);
    }

    /// <summary>
    /// Overrides the base registration so that every content operator is wrapped and
    /// its invocations are forwarded to this editor.
    /// </summary>
    /// <param name="operatorString">Operator name being registered.</param>
    /// <param name="pdfOperator">Operator implementation to wrap.</param>
    /// <returns>The previously registered operator, unwrapped if it was one of this editor's wrappers.</returns>
    public override IContentOperator RegisterContentOperator(string operatorString, IContentOperator pdfOperator)
    {
        ContentOperatorWrapper wrapper = new ContentOperatorWrapper(this);
        wrapper.SetOriginalOperator(pdfOperator);

        IContentOperator formerOperator = base.RegisterContentOperator(operatorString, wrapper);
        return formerOperator is ContentOperatorWrapper formerWrapper
            ? formerWrapper.GetOriginalOperator()
            : formerOperator;
    }

    /// <summary>
    /// Writes the instruction that sets the fill color to the given color.
    /// </summary>
    private void WriteFillColor(PdfCanvasProcessor processor, Color color)
    {
        List<PdfObject> colorInstruction = GetColorList(color);
        // GetColorList puts the operator literal last, after the component values.
        PdfLiteral colorOperator = (PdfLiteral)colorInstruction[colorInstruction.Count - 1];
        Write(processor, colorOperator, colorInstruction);
    }

    /// <summary>
    /// Gets a color list based on the color space: the component values followed by the
    /// matching operator literal ("k" for DeviceCmyk, "g" for DeviceGray, otherwise "rg").
    /// </summary>
    private List<PdfObject> GetColorList(Color color)
    {
        float[] values = color.GetColorValue();
        int componentCount;
        string colorOperator;

        if (color is DeviceCmyk)
        {
            componentCount = 4;
            colorOperator = "k";
        }
        else if (color is DeviceGray)
        {
            componentCount = 1;
            colorOperator = "g";
        }
        else
        {
            // Any other color is written as RGB, as before.
            componentCount = 3;
            colorOperator = "rg";
        }

        List<PdfObject> list = new(componentCount + 1);
        for (int i = 0; i < componentCount; i++)
        {
            list.Add(new PdfNumber(values[i]));
        }
        list.Add(new PdfLiteral(colorOperator));
        return list;
    }

    /// <summary>
    /// A content operator class to wrap all content operators to forward the invocation to the editor
    /// </summary>
    class ContentOperatorWrapper : IContentOperator
    {
        private readonly PdfCanvasEditor editor;
        private IContentOperator originalOperator = null;

        public ContentOperatorWrapper(PdfCanvasEditor editor)
        {
            this.editor = editor;
        }

        /// <summary>Returns the wrapped operator, or null if none was set.</summary>
        public IContentOperator GetOriginalOperator()
        {
            return originalOperator;
        }

        /// <summary>Sets the operator that this wrapper delegates to.</summary>
        public void SetOriginalOperator(IContentOperator originalOperator)
        {
            this.originalOperator = originalOperator;
        }

        /// <summary>
        /// Runs the wrapped operator (except for "Do") and then lets the editor write
        /// the instruction to the output stream.
        /// </summary>
        public void Invoke(PdfCanvasProcessor processor, PdfLiteral pdfLiteral, IList<PdfObject> operands)
        {
            // NOTE(review): "Do" is not executed by the processor, presumably so that
            // referenced XObject content is not expanded into the page stream; the
            // "Do" instruction itself is still written below.
            if (originalOperator != null && !"Do".Equals(pdfLiteral.ToString()))
            {
                originalOperator.Invoke(processor, pdfLiteral, operands);
            }
            editor.Write(processor, pdfLiteral, operands, true);
        }
    }

    /// <summary>
    /// Listener passed to the base processor. It declares the events it supports but
    /// ignores them; the editing itself happens in the operator wrappers.
    /// </summary>
    class ContentListener : IEventListener
    {
        public void EventOccurred(IEventData data, EventType type) { }

        ICollection<EventType> IEventListener.GetSupportedEvents()
        {
            return new Collection<EventType>() {
                EventType.BEGIN_TEXT, EventType.RENDER_TEXT, EventType.END_TEXT, EventType.RENDER_IMAGE,
                EventType.RENDER_PATH, EventType.CLIP_PATH_CHANGED
            };
        }
    }
}