I need to highlight a set of words inside an existing PDF given specific coordinates that i have already extracted. I am working with pdfbox by Apache (last version 2.0.8). There is an example file I can use to such a purpose (AddAnnotations.java inside the pdfbox website) but I think this example was compiled with an older Java version as the following import does not work:
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationHighlight;
Can anyone help me with that? Which is the simplest way to highlight words by using this library?
Here is the code to highlight ALL the words inside a PDF document. Highlighting only a specific set of words can be easily performed modifying this script. Please note this is only a test and further checks are needed for words that terminates in a new line as well as words placed in negative landscape/portrait PDF pages. Optimizing this script is also possible.
This script was built using Apache PDFBox 2.0.8.
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.List;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.graphics.color.PDColor;
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationTextMarkup;
public class TestAnnotatePDF extends PDFTextStripper
{
static List<double[]> coordinates;
static ArrayList tokenStream;
public TestAnnotatePDF() throws IOException
{
//data structed containing coordinates information for each token
coordinates = new ArrayList<>();
//List of words extracted from text (considering a whitespace-based tokenization)
tokenStream = new ArrayList();
}
public static void main(String [] args) throws IOException
{
try
{
//Loading an existing document
File file = new File("MyDocument");
PDDocument document = PDDocument.load(file);
//extended PDFTextStripper class
PDFTextStripper stripper = new TestAnnotatePDF();
//Get number of pages
int number_of_pages = document.getDocumentCatalog().getPages().getCount();
//The method writeText will invoke an override version of writeString
Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());
stripper.writeText(document, dummy);
//Print collected information
System.out.println(tokenStream);
System.out.println(tokenStream.size());
System.out.println(coordinates.size());
double page_height;
double page_width;
double width, height, minx, maxx, miny, maxy;
int rotation;
//scan each page and highlitht all the words inside them
for (int page_index = 0; page_index < number_of_pages; page_index++)
{
//get current page
PDPage page = document.getPage(page_index);
//Get annotations for the selected page
List<PDAnnotation> annotations = page.getAnnotations();
//Define a color to use for highlighting text
PDColor red = new PDColor(new float[] { 1, 0, 0 }, PDDeviceRGB.INSTANCE);
//Page height and width
page_height = page.getMediaBox().getHeight();
page_width = page.getMediaBox().getWidth();
//Scan collected coordinates
for (int i=0; i<coordinates.size(); i++)
{
//if the current coordinates are not related to the current
//page, ignore them
if ((int) coordinates.get(i)[4] != (page_index+1))
continue;
else
{
//get rotation of the page...portrait..landscape..
rotation = (int) coordinates.get(i)[7];
//page rotated of 90degrees
if (rotation == 90)
{
height = coordinates.get(i)[5];
width = coordinates.get(i)[6];
width = (page_height * width)/page_width;
//define coordinates of a rectangle
maxx = coordinates.get(i)[1];
minx = coordinates.get(i)[1] - height;
miny = coordinates.get(i)[0];
maxy = coordinates.get(i)[0] + width;
}
else //i should add here the cases -90/-180 degrees
{
height = coordinates.get(i)[5];
minx = coordinates.get(i)[0];
maxx = coordinates.get(i)[2];
miny = page_height - coordinates.get(i)[1];
maxy = page_height - coordinates.get(i)[3] + height;
}
//Add an annotation for each scanned word
PDAnnotationTextMarkup txtMark = new PDAnnotationTextMarkup(PDAnnotationTextMarkup.SUB_TYPE_HIGHLIGHT);
txtMark.setColor(red);
txtMark.setConstantOpacity((float)0.3); // 30% transparent
PDRectangle position = new PDRectangle();
position.setLowerLeftX((float) minx);
position.setLowerLeftY((float) miny);
position.setUpperRightX((float) maxx);
position.setUpperRightY((float) ((float) maxy+height));
txtMark.setRectangle(position);
float[] quads = new float[8];
quads[0] = position.getLowerLeftX(); // x1
quads[1] = position.getUpperRightY()-2; // y1
quads[2] = position.getUpperRightX(); // x2
quads[3] = quads[1]; // y2
quads[4] = quads[0]; // x3
quads[5] = position.getLowerLeftY()-2; // y3
quads[6] = quads[2]; // x4
quads[7] = quads[5]; // y5
txtMark.setQuadPoints(quads);
txtMark.setContents(tokenStream.get(i).toString());
annotations.add(txtMark);
}
}
}
//Saving the document in a new file
File highlighted_doc = new File("MyDocument_final.pdf");
document.save(highlighted_doc);
document.close();
}
catch(IOException e)
{
System.out.println(e);
}
}
@Override
protected void writeString(String string, List<TextPosition> textPositions) throws IOException
{
String token = "";
int token_length = textPositions.size();
int counter = 1;
double minx = 0,maxx = 0,miny = 0,maxy =0;
double height = 0;
double width = 0;
int rotation = 0;
for (TextPosition text : textPositions)
{
rotation = text.getRotation();
if (text.getHeight() > height)
height = text.getHeight();
if (text.getWidth() > width)
width = text.getWidth();
//if it is the first char of the current word
if (counter == 1)
{
minx = text.getX();
miny = text.getY();
}
//if it is the last char of the current word
if (counter == token_length)
{
maxx = text.getEndX();
maxy = text.getY();
}
token += text;
counter += 1;
}
tokenStream.add(token);
double word_coordinates [] = {minx,miny,maxx,maxy,this.getCurrentPageNo(), height, width, rotation};
coordinates.add(word_coordinates);
}}