Search code examples
javapdfpdfbox

How to print the positions of each TJ operator and of the characters inside TJ/Tj in a PDF using PDFBox?


I need control over the positions of the printed text, and I need to print it one TJ/Tj operator at a time. I already have the COSString object of each TJ or Tj operator. How can I get the x, y position of each character in a PDF? enter image description here

Content stream looks like this

enter image description here

file

C position is (72, 633.8289)

h position is (88.7903125, 633.8289)

a position is (101.7059375, 633.8289) ....

How can we get these positions using the PDFBox classes? I tried overriding some methods, such as

writeString(String, List<TextPosition>) or processTextPosition(TextPosition)

I am able to get the text lines but not the positions. Please help: is there any example code that gets the position of each character of each TJ operator?


Solution

  • We need to override all the position-related classes. Thanks to @Tilman Hausherr and @mkl. Please correct my answer if required. Thanks once again.

    import java.awt.geom.GeneralPath;
    import java.io.ByteArrayInputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.util.ArrayList;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;
    import java.util.Stack;
    import org.apache.pdfbox.contentstream.PDContentStream;
    import org.apache.pdfbox.contentstream.PDFStreamEngine;
    import org.apache.pdfbox.contentstream.operator.Operator;
    import org.apache.pdfbox.contentstream.operator.OperatorProcessor;
    import org.apache.pdfbox.cos.COSArray;
    import org.apache.pdfbox.cos.COSBase;
    import org.apache.pdfbox.cos.COSNumber;
    import org.apache.pdfbox.cos.COSObject;
    import org.apache.pdfbox.cos.COSString;
    import org.apache.pdfbox.pdfparser.PDFStreamParser;
    import org.apache.pdfbox.pdmodel.PDPage;
    import org.apache.pdfbox.pdmodel.PDResources;
    import org.apache.pdfbox.pdmodel.common.PDRectangle;
    import org.apache.pdfbox.pdmodel.font.PDFont;
    import org.apache.pdfbox.pdmodel.font.PDFontFactory;
    import org.apache.pdfbox.pdmodel.graphics.blend.BlendMode;
    import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
    import org.apache.pdfbox.pdmodel.graphics.form.PDTransparencyGroup;
    import org.apache.pdfbox.pdmodel.graphics.state.PDGraphicsState;
    import org.apache.pdfbox.pdmodel.graphics.state.PDTextState;
    import org.apache.pdfbox.util.Matrix;
    import org.apache.pdfbox.util.Vector;
    public class PDStreamengine extends PDFStreamEngine {
    // Operator name -> processor registry. Filled by addOperators and consulted by
    // processOperator. Static and mutable, so it is shared by every instance of this engine.
    public static Map<String, OperatorProcessor> operators = new HashMap<String, OperatorProcessor>(80);
    
    // Current text matrix and text line matrix; both are reset to null by initPage.
    private Matrix textMatrix;
    private Matrix textLineMatrix;
    
    // This engine's own graphics state stack; getGraphicsState() returns its top element.
    private Stack<PDGraphicsState> graphicsStack = new Stack<PDGraphicsState>();
    
    // Resources in effect for the stream currently being processed (see pushResources/popResources).
    private PDResources resources;
    // Page handed to the most recent initPage call.
    private PDPage currentPage;
    // Initial matrix of the stream being processed; set from the page matrix in initPage and
    // swapped while a nested stream is processed.
    private Matrix initialMatrix;
    // Capture buffers, replaced by fresh lists at the start of showTextStrings and appended to by
    // showText, one element per decoded glyph:
    //   chars       - four doubles per glyph: text matrix entries (2,0),(2,1) before and after the advance
    //   charmatrixs - the text rendering matrix used for the glyph
    //   tjchars     - the Unicode string returned by the font for the glyph's code
    // NOTE(review): static and non-final, so they are shared across engine instances and threads.
    public static ArrayList<ArrayList<Double>> chars;
    public static ArrayList<Matrix> charmatrixs ;
    public static ArrayList<String> tjchars;
    
    
    /**
     * Resets the engine for the given page and, if the page has contents, processes its
     * content stream.
     *
     * @param page the page to process
     * @throws IOException if processing the content stream fails
     */
    @Override
    public void processPage(PDPage page) throws IOException
    {
        initPage(page);
        if (!page.hasContents())
        {
            // nothing to interpret on an empty page
            return;
        }
        processStream(page);
    }
    
    
    /**
     * Prepares the engine state for a new page: remembers the page, rebuilds the graphics
     * stack with a single state created from the page's crop box, clears the text matrices and
     * the current resources, and takes the page matrix as the initial matrix.
     *
     * @param page the page about to be processed
     * @throws IllegalArgumentException if {@code page} is null
     */
    private void initPage(PDPage page)
    {
        if (null == page)
        {
            throw new IllegalArgumentException("Page cannot be null");
        }
        this.currentPage = page;

        // start over with exactly one graphics state for this page
        PDGraphicsState pageState = new PDGraphicsState(page.getCropBox());
        this.graphicsStack.clear();
        this.graphicsStack.push(pageState);

        this.textMatrix = null;
        this.textLineMatrix = null;
        this.resources = null;
        this.initialMatrix = page.getMatrix();
    }
    
    /**
     * Processes a content stream with its own resources, graphics stack and initial matrix,
     * then puts the previous ones back.
     *
     * @param contentStream the stream whose operators are executed
     * @throws IOException if an operator fails
     */
    public void processStream(PDContentStream contentStream) throws IOException
    {
        PDResources outerResources = pushResources(contentStream);
        Stack<PDGraphicsState> outerStack = saveGraphicsStacks();
        Matrix outerInitialMatrix = initialMatrix;

        // apply the stream's matrix to the CTM; the result (parent CTM included) becomes the
        // stream's initial matrix, which is what allows e.g. a scaled form
        Matrix ctm = getGraphicsState().getCurrentTransformationMatrix();
        ctm.concatenate(contentStream.getMatrix());
        initialMatrix = getGraphicsState().getCurrentTransformationMatrix().clone();

        // restrict drawing to the stream's bounding box
        clipToRect(contentStream.getBBox());

        processStreamOperators(contentStream);

        // put the caller's state back
        initialMatrix = outerInitialMatrix;
        restoreGraphicsStacks(outerStack);
        popResources(outerResources);
    }
    
    
    /**
     * Selects the resources to use for the given stream and returns the ones that were active
     * before, so the caller can hand them back to {@link #popResources(PDResources)}.
     *
     * Lookup order: the stream's own resources, then the resources already in effect, then the
     * current page's resources, and finally an empty {@link PDResources}.
     *
     * @param contentStream the stream about to be processed
     * @return the resources that were in effect before this call (may be null)
     */
    private PDResources pushResources(PDContentStream contentStream)
    {
        final PDResources previous = resources;
        final PDResources own = contentStream.getResources();

        if (own != null)
        {
            resources = own;
        }
        else if (resources == null)
        {
            resources = currentPage.getResources();
        }
        // otherwise keep inheriting from the parent stream; this is not in the PDF spec, but the
        // file from PDFBOX-1359 does this and works in Acrobat

        if (resources == null)
        {
            // resources are required in PDF, so never leave them unset
            resources = new PDResources();
        }
        return previous;
    }
    
    /**
     * Intersects the current clipping path with the given rectangle, transformed by the
     * current transformation matrix. Does nothing for a null rectangle.
     *
     * @param rectangle the rectangle to clip to, may be null
     */
    private void clipToRect(PDRectangle rectangle)
    {
        if (rectangle == null)
        {
            return;
        }
        PDGraphicsState state = getGraphicsState();
        GeneralPath transformed = rectangle.transform(state.getCurrentTransformationMatrix());
        getGraphicsState().intersectClippingPath(transformed);
    }
    
    
    /**
     * Parses the content stream token by token. Operands are collected until an operator is
     * read, at which point the operator is dispatched with the operands gathered so far.
     *
     * @param contentStream the stream to parse
     * @throws IOException if parsing fails or an operator fails
     */
    private void processStreamOperators(PDContentStream contentStream) throws IOException
    {
        List<COSBase> operands = new ArrayList<COSBase>();
        PDFStreamParser parser = new PDFStreamParser(contentStream);
        // instance is discarded, so this is constructed for its side effects only
        // NOTE(review): presumably registers the operator processors - confirm in ProcessClasses
        new ProcessClasses();

        for (Object token = parser.parseNextToken(); token != null; token = parser.parseNextToken())
        {
            if (token instanceof COSObject)
            {
                // indirect object: the operand is the object it refers to
                operands.add(((COSObject) token).getObject());
            }
            else if (token instanceof Operator)
            {
                processOperator((Operator) token, operands);
                // start a fresh list rather than clearing the one just handed out
                operands = new ArrayList<COSBase>();
            }
            else
            {
                operands.add((COSBase) token);
            }
        }
    }
    
    /**
     * Makes the given resources current again; counterpart of pushResources.
     *
     * @param parentResources the resources returned by the matching pushResources call
     */
    private void popResources(PDResources parentResources)
    {
        this.resources = parentResources;
    }
    
    
     /**
      * Dispatches one operator to the processor registered under its name. The operator and
      * its operands are printed to standard output before the processor runs. Operators
      * without a registered processor are reported through unsupportedOperator.
      *
      * @param operator the operator read from the content stream
      * @param operands the operands collected for the operator
      * @throws IOException if the unsupported-operator or error handler throws
      */
     protected void processOperator(Operator operator, List<COSBase> operands) throws IOException
     {
         OperatorProcessor handler = operators.get(operator.getName());
         if (handler == null)
         {
             unsupportedOperator(operator, operands);
             return;
         }

         handler.setContext(this);
         try
         {
             System.out.println(operator);
             System.out.println(operands);
             handler.process(operator, operands);
         }
         catch (IOException failure)
         {
             operatorException(operator, operands, failure);
         }
     }
     
     
     /**
      * Replaces this engine's graphics stack with a new one that holds a clone of the current
      * top state.
      *
      * @return the stack that was active before the call, to be passed to restoreGraphicsStacks
      */
     protected final Stack<PDGraphicsState> saveGraphicsStacks()
     {
         final Stack<PDGraphicsState> previous = graphicsStack;
         graphicsStack = new Stack<PDGraphicsState>();
         PDGraphicsState topCopy = previous.peek().clone();
         graphicsStack.push(topCopy);
         return previous;
     }
     
     /**
      * @return the graphics state on top of this engine's own graphics stack
      */
     @Override
     public PDGraphicsState getGraphicsState()
     {
         return this.graphicsStack.peek();
     }
     
     
     /**
      * Binds the processor to this engine and registers it in the static operator map under
      * the processor's own name, replacing any processor previously stored for that name.
      *
      * @param op the operator processor to register
      */
     public void addOperators(OperatorProcessor op)
     {
         op.setContext(this);
         String operatorName = op.getName();
         operators.put(operatorName, op);
     }
     
     /**
      * Makes the given stack this engine's graphics stack again; counterpart of
      * saveGraphicsStacks.
      *
      * @param snapshot the stack returned by the matching saveGraphicsStacks call
      */
     protected final void restoreGraphicsStacks(Stack<PDGraphicsState> snapshot)
     {
         this.graphicsStack = snapshot;
     }
        
        /**
         * Reports how many graphics states are currently on this engine's stack.
         *
         * @return the number of entries in the graphics stack
         */
        public int getGraphicsStackSize()
        {
            return this.graphicsStack.size();
        }
    
    
        /**
         * Gives access to the current text line matrix.
         *
         * @return the text line matrix, or null if none has been set since the page was initialised
         */
        public Matrix getTextLineMatrix()
        {
            return this.textLineMatrix;
        }
    
        /**
         * Replaces the current text line matrix.
         *
         * @param value the new text line matrix
         */
        public void setTextLineMatrix(Matrix value)
        {
            this.textLineMatrix = value;
        }
    
        /**
         * Gives access to the current text matrix.
         *
         * @return the text matrix, or null if none has been set since the page was initialised
         */
        public Matrix getTextMatrix()
        {
            return this.textMatrix;
        }
    
        /**
         * Replaces the current text matrix.
         *
         * @param value the new text matrix
         */
        public void setTextMatrix(Matrix value)
        {
            this.textMatrix = value;
        }
        
        /**
         * @return the resources in effect for the stream currently being processed
         */
        public PDResources getResources()
        {
            return this.resources;
        }
        
        
        /**
         * Duplicates the top graphics state and pushes the copy, so that later changes can be
         * undone with {@link #restoreGraphicsState()}.
         */
        public void saveGraphicsState()
        {
            PDGraphicsState copyOfTop = graphicsStack.peek().clone();
            graphicsStack.push(copyOfTop);
        }
    
        /**
         * Discards the top graphics state, making the one below it current again.
         */
        public void restoreGraphicsState()
        {
            this.graphicsStack.pop();
        }
        
        /**
         * Shifts the text matrix by the given offsets; called for the numeric entries of a TJ array.
         *
         * @param tx horizontal offset to apply
         * @param ty vertical offset to apply
         * @throws IOException declared for overriding implementations; not thrown here
         */
        protected void applyTextAdjustment(float tx, float ty) throws IOException
        {
            Matrix shift = Matrix.getTranslateInstance(tx, ty);
            textMatrix.concatenate(shift);
        }
        
        
        /**
         * Processes the content of a form XObject through processStream, unless the form's
         * stream is empty.
         *
         * @param form the form XObject to show
         * @throws IllegalStateException if no page is currently being processed
         * @throws IOException if processing the form's content fails
         */
        public void showForm(PDFormXObject form) throws IOException
        {
            if (currentPage == null)
            {
                throw new IllegalStateException("No current page, call " +
                        "#processChildStream(PDContentStream, PDPage) instead");
            }
            boolean hasContent = form.getCOSObject().getLength() > 0;
            if (!hasContent)
            {
                return;
            }
            processStream(form);
        }
        
        
        
        /**
         * Called when a single string of text is to be shown (as opposed to a TJ array, which
         * goes through showTextStrings).
         *
         * The capture lists are replaced with fresh ones before the glyphs are decoded, and the
         * result is stored in Horizontalparsing as its own entry, exactly as showTextStrings
         * does. Without this, showText would fail with a NullPointerException when no TJ array
         * had been shown before, or would append these glyphs to the lists already stored for the
         * previous TJ array.
         *
         * @param string the encoded text
         * @throws IOException if there was an error showing the text
         */
        public void showTextString(byte[] string) throws IOException
        {
            // fresh buffers so this string's glyphs are not mixed into an earlier entry
            chars = new ArrayList<ArrayList<Double>>();
            charmatrixs = new ArrayList<Matrix>();
            tjchars = new ArrayList<String>();

            showText(string);

            // record only when at least one glyph was captured, same rule as showTextStrings
            if (!chars.isEmpty() && !charmatrixs.isEmpty())
            {
                Horizontalparsing.poscharobj.put(Horizontalparsing.tj_ycount, chars);
                Horizontalparsing.txtposmatrix.put(Horizontalparsing.tj_ycount, charmatrixs);
                Horizontalparsing.wordobj.put(Horizontalparsing.tj_ycount, tjchars);
                Horizontalparsing.tj_ycount += 1;
            }
        }
        
        /**
         * Shows a transparency group by handing it to processTransparencyGroup.
         *
         * @param form the transparency group to show
         * @throws IOException if processing the group fails
         */
        @Override
        public void showTransparencyGroup(PDTransparencyGroup form) throws IOException
        {
            this.processTransparencyGroup(form);
        }
        
        /**
         * Processes the content stream of a transparency group with its own resources, graphics
         * stack and initial matrix, then puts the previous ones back.
         *
         * The graphics stack is saved with saveGraphicsStacks and must therefore be restored
         * with restoreGraphicsStacks: both act on this class's own stack, which is the one
         * getGraphicsState() reads.
         *
         * @param group the transparency group to process
         * @throws IllegalStateException if no page is currently being processed
         * @throws IOException if an operator of the group fails
         */
        @Override
        protected void processTransparencyGroup(PDTransparencyGroup group) throws IOException
        {
            if (currentPage == null)
            {
                throw new IllegalStateException("No current page, call " +
                        "#processChildStream(PDContentStream, PDPage) instead");
            }

            PDResources parent = pushResources(group);
            Stack<PDGraphicsState> savedStack = saveGraphicsStacks();

            Matrix parentMatrix = initialMatrix;

            // the stream's initial matrix includes the parent CTM, e.g. this allows a scaled form
            initialMatrix = getGraphicsState().getCurrentTransformationMatrix().clone();

            // transform the CTM using the stream's matrix
            getGraphicsState().getCurrentTransformationMatrix().concatenate(group.getMatrix());

            // Before execution of the transparency group XObject's content stream,
            // the current blend mode in the graphics state shall be initialized to Normal,
            // the current stroking and nonstroking alpha constants to 1.0, and the current soft mask to None.
            getGraphicsState().setBlendMode(BlendMode.NORMAL);
            getGraphicsState().setAlphaConstant(1);
            getGraphicsState().setNonStrokeAlphaConstant(1);
            getGraphicsState().setSoftMask(null);

            // clip to bounding box
            clipToRect(group.getBBox());

            processStreamOperators(group);

            initialMatrix = parentMatrix;

            // restore this class's own stack (the one saved above), not the superclass's
            restoreGraphicsStacks(savedStack);
            popResources(parent);
        }
        
        
        /**
         * Handles a TJ array: strings are shown glyph by glyph through showText, numbers move
         * the text matrix through applyTextAdjustment. The capture lists are replaced with fresh
         * ones first; if any glyph was captured, the three lists are stored in Horizontalparsing
         * under the current tj_ycount, which is then incremented.
         *
         * @param array the operand array of the TJ operator
         * @throws IOException if the array holds something other than strings and numbers, or
         *                     if showing the text fails
         */
        @Override
        public void showTextStrings(COSArray array) throws IOException
        {
            final PDTextState textState = getGraphicsState().getTextState();
            final float fontSize = textState.getFontSize();
            final float horizontalScaling = textState.getHorizontalScaling() / 100f;
            final PDFont font = textState.getFont();

            // new buffers for this TJ operator
            chars = new ArrayList<ArrayList<Double>>();
            charmatrixs = new ArrayList<Matrix>();
            tjchars = new ArrayList<String>();

            final boolean vertical = font != null && font.isVertical();

            for (COSBase element : array)
            {
                if (element instanceof COSString)
                {
                    showText(((COSString) element).getBytes());
                    continue;
                }
                if (!(element instanceof COSNumber))
                {
                    throw new IOException("Unknown type in array for TJ operation:" + element);
                }

                // a number is an adjustment in thousandths of text space units, applied along
                // the writing direction
                float tj = ((COSNumber) element).floatValue();
                float tx = vertical ? 0f : -tj / 1000 * fontSize * horizontalScaling;
                float ty = vertical ? -tj / 1000 * fontSize : 0f;
                applyTextAdjustment(tx, ty);
            }

            if (chars.isEmpty() || charmatrixs.isEmpty())
            {
                // nothing captured, nothing to record
                return;
            }
            Horizontalparsing.poscharobj.put(Horizontalparsing.tj_ycount, chars);
            Horizontalparsing.txtposmatrix.put(Horizontalparsing.tj_ycount, charmatrixs);
            Horizontalparsing.wordobj.put(Horizontalparsing.tj_ycount, tjchars);
            Horizontalparsing.tj_ycount += 1;
        }
        
        /**
         * Decodes the given encoded text one character code at a time. For every code it builds
         * the text rendering matrix, calls showGlyph, advances the text matrix by the glyph's
         * displacement plus character/word spacing, and appends one element to each of the
         * static capture lists chars, charmatrixs and tjchars.
         *
         * NOTE(review): the capture lists are only created in showTextStrings; if this method
         * is reached while they are still null, the add calls below throw a NullPointerException.
         *
         * @param string the encoded text
         * @throws IOException if reading a code or showing a glyph fails
         */
        @Override
         protected void showText(byte[] string) throws IOException
            {
                PDGraphicsState state = getGraphicsState();
                PDTextState textState = state.getTextState();
    
                // get the current font
                PDFont font = textState.getFont();
                if (font == null)
                {
                   // LOG.warn("No current font, will use default");
                    font = PDFontFactory.createDefaultFont();
                }
    
                float fontSize = textState.getFontSize();
                float horizontalScaling = textState.getHorizontalScaling() / 100f;
                float charSpacing = textState.getCharacterSpacing();
    
                // put the text state parameters into matrix form
                Matrix parameters = new Matrix(
                        fontSize * horizontalScaling, 0, // 0
                        0, fontSize,                     // 0
                        0, textState.getRise());         // 1
    
                // read the stream until it is empty
                InputStream in = new ByteArrayInputStream(string);
                while (in.available() > 0)
                {
                    // decode a character; the number of bytes consumed is the code length
                    int before = in.available();
                    int code = font.readCode(in);
                    int codeLength = before - in.available();
                    String unicode = font.toUnicode(code);
                    // To record char positions: filled with text matrix entries (2,0),(2,1)
                    // before the advance and again after it (four values in total)
                    ArrayList<Double> pstnchar = new ArrayList<Double>();
    
                    // Word spacing shall be applied to every occurrence of the single-byte character code
                    // 32 in a string when using a simple font or a composite font that defines code 32 as
                    // a single-byte code.
                    float wordSpacing = 0;
                    if (codeLength == 1 && code == 32)
                    {
                        wordSpacing += textState.getWordSpacing();
                    }
    
                    // text rendering matrix (text space -> device space)
                    Matrix ctm = state.getCurrentTransformationMatrix();
                    Matrix textRenderingMatrix = parameters.multiply(textMatrix).multiply(ctm);
    
                    // get glyph's position vector if this is vertical text
                    // changes to vertical text should be tested with PDFBOX-2294 and PDFBOX-1422
                    if (font.isVertical())
                    {
                        // position vector, in text space
                        Vector v = font.getPositionVector(code);
    
                        // apply the position vector to the horizontal origin to get the vertical origin
                        textRenderingMatrix.translate(v);
                    }
    
                    // get glyph's horizontal and vertical displacements, in text space
                    Vector w = font.getDisplacement(code);
    
                    // process the decoded glyph; the text matrices are put back afterwards in
                    // case showGlyph replaced them
                    saveGraphicsState();
                    Matrix textMatrixOld = textMatrix;
                    Matrix textLineMatrixOld = textLineMatrix;
                    showGlyph(textRenderingMatrix, font, code, unicode, w);
                    textMatrix = textMatrixOld;
                    textLineMatrix = textLineMatrixOld;
                    // position before the advance (text matrix translation, not device space)
                    pstnchar.add((double) textMatrix.getValue(2, 0));
                    pstnchar.add((double) textMatrix.getValue(2, 1));
                    // the rendering matrix is a new object per glyph, so it is safe to keep
                    charmatrixs.add(textRenderingMatrix);
                    restoreGraphicsState();
    
                    // calculate the combined displacements
                    float tx, ty;
                    if (font.isVertical())
                    {
                        tx = 0;
                        ty = w.getY() * fontSize + charSpacing + wordSpacing;
                    }
                    else
                    {
                        tx = (w.getX() * fontSize + charSpacing + wordSpacing) * horizontalScaling;
                        ty = 0;
                    }
    
                    // update the text matrix
                    textMatrix.concatenate(Matrix.getTranslateInstance(tx, ty));
                    // position after the advance, i.e. where the next glyph starts
                    pstnchar.add((double) textMatrix.getValue(2, 0));
                    pstnchar.add((double) textMatrix.getValue(2, 1));
                    tjchars.add(unicode);
                    chars.add(pstnchar);
                }
            }
    

    }