Can anyone help me differentiate between the background color and the text color of a PDF document when the background color and the text color are the same?
Actually, I need to set a static color on invisible text using PDFBox so that text which is currently invisible becomes visible.
TextObjectInfo contains all of the text-object information, collected using PDFStreamEngine.
/**
 * Re-emits the text objects of a PDF document from previously extracted
 * {@code TextObjectInfo} records.
 *
 * <p>The work is done in two passes over the pages of the document:
 * <ol>
 *   <li>For every recorded text object, an empty text block is appended to the
 *       page. It carries the recorded colors, rendering mode, font and text
 *       matrix, plus an empty marked-content section tagged
 *       {@code TextObjectIndex-<n>} that serves as a placeholder.</li>
 *   <li>The page content is re-parsed, every placeholder is replaced by the
 *       recorded string followed by a {@code Tj} operator, and the resulting
 *       token list is written back as the page's new content stream.</li>
 * </ol>
 */
public class SimplePdfRegeneretor {

    /** Prefix of the marked-content tag that marks a placeholder; the text-object index follows it. */
    private static final String PLACEHOLDER_PREFIX = "TextObjectIndex-";

    private PDDocument _document;
    private PDResources _pageResource;
    private PDFTextObjectInfoExtraction _PDFTextObjectInfoExtraction;

    /** Extracted text objects: outer index is the page, inner index is the text object on that page. */
    private List<List<TextObjectInfo>> _documentTextObjectInfo;

    /**
     * Runs both passes over every page of the document.
     *
     * @throws IOException if writing to or parsing a content stream fails
     */
    private void RecreatePDF() throws IOException {
        // Pass 1: append one placeholder text block per recorded text object.
        int pageNo = 0;
        for (PDPage page : _document.getPages()) {
            List<TextObjectInfo> pageTextObjects = _documentTextObjectInfo.get(pageNo);
            // Arguments after the page: append to the existing content, do not
            // compress, reset the graphics context (PDFBox 2.0 constructor order).
            try (PDPageContentStream contentStream = new PDPageContentStream(_document,
                    page, AppendMode.APPEND, false, true)) {
                int textObjectIndex = 0;
                for (TextObjectInfo textObject : pageTextObjects) {
                    Float xMin = textObject.get_xyminmax().get(0);
                    Float yMin = textObject.get_xyminmax().get(1);
                    putTextOnDocument(contentStream, textObject, textObject.TextFontObject,
                            xMin, yMin, textObjectIndex);
                    textObjectIndex++;
                }
            }
            pageNo++;
        }

        // Pass 2: replace the placeholders with the recorded strings and
        // install the rewritten token list as the page content.
        pageNo = 0;
        for (PDPage page : _document.getPages()) {
            List<Object> newTokens = addTjStringtoContenStream(page, pageNo);
            PDStream newContents = new PDStream(_document);
            writeTokensToStream(newContents, newTokens);
            page.setContents(newContents);
            System.out.println("Page TextObject Writting Completed.."+pageNo);
            pageNo++;
        }
    }

    /**
     * Appends an empty text block that carries the graphics and text state of
     * one text object together with its placeholder marked-content section.
     *
     * @param contentStream   stream the block is appended to
     * @param _textObjInfo    recorded text object supplying matrix, font size and graphics state
     * @param font            font to select for the block
     * @param horizontalPixel currently unused; positioning comes from the text matrix
     * @param verticalPixel   currently unused; positioning comes from the text matrix
     * @param TextObjectIndex index of the text object on its page, encoded in the placeholder tag
     * @throws IOException if writing to the content stream fails
     */
    private void putTextOnDocument(PDPageContentStream contentStream, TextObjectInfo _textObjInfo, PDFont font, Float horizontalPixel,
            Float verticalPixel, int TextObjectIndex) throws IOException {
        String placeholderTag = PLACEHOLDER_PREFIX + TextObjectIndex;
        // The last recorded text matrix is the one used for the re-emitted text.
        Matrix textMatrix = _textObjInfo.textMatrixs.get(_textObjInfo.textMatrixs.size() - 1);
        // Keep the fractional part of the font size: truncating to int turns
        // 10.5 into 10 and any size below 1 into 0, which makes the text vanish.
        float fontSize = _textObjInfo.TextFontSize.floatValue();
        PDGraphicsState graphicsState = _textObjInfo.getGraphicsState();
        PDTextState textState = graphicsState.getTextState();

        contentStream.beginText();
        contentStream.setNonStrokingColor(graphicsState.getNonStrokingColor());
        contentStream.setStrokingColor(graphicsState.getStrokingColor());
        contentStream.setRenderingMode(textState.getRenderingMode());
        contentStream.setFont(font, fontSize);
        contentStream.setTextMatrix(textMatrix);
        // Empty marked-content section: replaced by "<string> Tj" in the second pass.
        contentStream.beginMarkedContent(COSName.getPDFName(placeholderTag));
        contentStream.endMarkedContent();
        contentStream.endText();
    }

    /**
     * Parses the page content and replaces every placeholder section with the
     * recorded string of the matching text object followed by {@code Tj}.
     *
     * <p>Only {@code BMC} operators whose tag is a placeholder written by
     * {@link #putTextOnDocument} are replaced, and only the {@code EMC} that
     * closes such a placeholder is dropped. Marked-content sections that were
     * already present on the page are copied through untouched, so they stay
     * balanced and cannot trigger a parse or cast failure.
     *
     * @param contentStream page content to parse
     * @param _pgInx        index of the page in the extracted text-object list
     * @return the rewritten token list
     * @throws IOException if parsing the content stream fails
     */
    private List<Object> addTjStringtoContenStream(PDContentStream contentStream, int _pgInx) throws IOException {
        PDFStreamParser parser = new PDFStreamParser(contentStream);
        List<Object> newTokens = new ArrayList<>();
        List<TextObjectInfo> pageTextObjects = _documentTextObjectInfo.get(_pgInx);
        System.out.println("Len of _pageTextObjInfo: "+pageTextObjects.size());

        // True between a replaced placeholder BMC and the EMC that closes it.
        boolean dropNextEndMarker = false;
        for (Object token = parser.parseNextToken(); token != null; token = parser.parseNextToken()) {
            if (token instanceof Operator) {
                String opName = ((Operator) token).getName();
                if (OperatorName.BEGIN_MARKED_CONTENT.equals(opName)) {
                    int placeholderIndex = placeholderIndexOf(newTokens);
                    if (placeholderIndex >= 0) {
                        // Swap the tag operand for the recorded string and BMC for Tj.
                        newTokens.remove(newTokens.size() - 1);
                        newTokens.add(pageTextObjects.get(placeholderIndex).TjString);
                        newTokens.add(Operator.getOperator("Tj"));
                        dropNextEndMarker = true;
                        continue;
                    }
                } else if (OperatorName.END_MARKED_CONTENT.equals(opName) && dropNextEndMarker) {
                    dropNextEndMarker = false;
                    continue;
                }
            }
            newTokens.add(token);
        }
        return newTokens;
    }

    /**
     * Reads the text-object index from the operand that precedes a
     * {@code BMC} operator.
     *
     * @param tokens tokens collected so far; the last one is the candidate tag operand
     * @return the index encoded in a placeholder tag, or {@code -1} if the
     *         last token is not a placeholder tag
     */
    private static int placeholderIndexOf(List<Object> tokens) {
        if (tokens.isEmpty()) {
            return -1;
        }
        Object operand = tokens.get(tokens.size() - 1);
        if (!(operand instanceof COSName)) {
            return -1;
        }
        String tag = ((COSName) operand).getName();
        if (!tag.startsWith(PLACEHOLDER_PREFIX)) {
            return -1;
        }
        try {
            return Integer.parseInt(tag.substring(PLACEHOLDER_PREFIX.length()));
        } catch (NumberFormatException ignored) {
            // A tag that merely shares the prefix is not one of our placeholders.
            return -1;
        }
    }

    /**
     * Serializes the tokens into the given stream using the Flate filter.
     *
     * @param newContents stream that receives the tokens
     * @param newTokens   content-stream tokens to write
     * @throws IOException if writing fails
     */
    private static void writeTokensToStream(PDStream newContents, List<Object> newTokens) throws IOException {
        try (OutputStream out = newContents.createOutputStream(COSName.FLATE_DECODE)) {
            ContentStreamWriter writer = new ContentStreamWriter(out);
            writer.writeTokens(newTokens);
        }
    }
}
2. This is text in black color without a background image.
I am using PDFBox version 2.0+, so I added the following operators in the constructor of my PDFStreamEngine subclass:
// Register the color operators so the engine's graphics state tracks the
// stroking and non-stroking colors while the content stream is processed.

// Color-space selection.
addOperator(new SetNonStrokingColorSpace());
addOperator(new SetStrokingColorSpace());

// Device color models: gray, RGB, CMYK.
addOperator(new SetNonStrokingDeviceGrayColor());
addOperator(new SetStrokingDeviceGrayColor());
addOperator(new SetNonStrokingDeviceRGBColor());
addOperator(new SetStrokingDeviceRGBColor());
addOperator(new SetNonStrokingDeviceCMYKColor());
addOperator(new SetStrokingDeviceCMYKColor());

// Colors expressed in the currently selected color space.
addOperator(new SetNonStrokingColor());
addOperator(new SetNonStrokingColorN());
addOperator(new SetStrokingColor());
addOperator(new SetStrokingColorN());
Then I extracted the required information from getGraphicsState(). Please also look at https://pdfbox.apache.org/2.0/migration.html, especially the Text Extraction part.