Search code examples
javapdfpdfboxtex

Why is my class constructor called repeatedly?


Recently I have been working on a program that can convert TeX-generated PDFs to a certain form of text that retains some semantically meaningful style information such as subscripts and superscripts.

When debugging it seems that there might be something very unusual going on with the PDFTextStripper class.

Here is my TeXUtil class that does most of the work.

import com.google.common.base.CharMatcher;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.json.JSONException;
import org.json.JSONObject;

import java.io.IOException;
import java.util.Hashtable;
import java.util.Stack;

public class TeXUtil {
    private Stack<SSStatus> ssstatus;
    private boolean accentMode;
    private String fs;
    private boolean mathMode;
    private SymbolDB db;
    private Hashtable<String, String> maccDict;
    float endY;//Positions
    float endX;
    float Y;
    int height;//Height
    //boolean test;
    public TeXUtil() throws IOException {
        ssstatus = new Stack<SSStatus>();
        fs = "rm";
        accentMode = false;//as in the state of being right after an accent
        mathMode = false;
        db = new SymbolDB();
        maccDict = new Hashtable<String, String>();
        maccDict.put("\\vec","\\vec");
        maccDict.put("\\widehat","\\widehat");
        maccDict.put("\\widetilde","\\widetilde");
        maccDict.put("\\^","\\hat");
        maccDict.put("\\v","\\check");
        maccDict.put("\\u","\\breve");
        maccDict.put("\\`","\\grave");
        maccDict.put("\\~","\\tilde");
        maccDict.put("\\=","\\bar");
        maccDict.put("\\.","\\dot");
        maccDict.put("\\","\\ddot");
        maccDict.put("\\'","\\acute");
        endY = 0;
        endX = 0;
        Y = 0;
        height = 0;
        //test = false;
        System.out.println("TeXUtil initialized!");
    }
    private static String fontShortName(PDFont font) {
        String[] segments = font.getName().split("\\+");
        return segments[segments.length - 1];
    }
    private static int fontHeight(PDFont font) {
        CharMatcher matcher = CharMatcher.inRange('0', '9');
        return Integer.parseInt(matcher.retainFrom(fontShortName(font)));
    }
    private static String fontClass(PDFont font) {
        CharMatcher matcher = CharMatcher.inRange('A', 'Z');
        return (matcher.retainFrom(fontShortName(font))).toLowerCase();
    }
    private String textToTeX(String shortFontName, int code) throws JSONException {
        JSONObject info = db.getInfo(shortFontName, code);
        return info.getString("value");
    }
    public String fullTextToTeX(PDFont font, int code, float newEndX, float newY, float newEndY){
        String shortFontName = fontClass(font);
        try {
            JSONObject info = db.getInfo(shortFontName, code);
            String teXCode = info.getString("value");
            StringBuilder preamble1 = new StringBuilder("");
            StringBuilder preamble2 = new StringBuilder("");
            StringBuilder postamble = new StringBuilder("");
            boolean text = info.getBoolean("text");
            boolean math = info.getBoolean("math");
            boolean tacc = info.getBoolean("tacc");
            boolean macc = info.getBoolean("macc");
            String newFont = info.getString("font");
            int newHeight = fontHeight(font);
            //Font change, rm is seen as having no font
            if (!newFont.equals(fs)) {
                if (!fs.equals("rm"))
                    preamble1.insert(0, '}');
                if (!newFont.equals("rm")) {
                    preamble2.append('\\');
                    preamble2.append(newFont);
                    preamble2.append('{');
                }
                preamble1.insert(0,  " fs = " + fs + " nFs = " + newFont + "\n");
                fs = newFont;
            }
            if (height == 0) {
                //preamble2.append(" Meow! am = " + accentMode + " fs = " + fs + " mm = " + mathMode + "\n");
            }
            //Subscripts/Superscripts
            if (height > newHeight && newEndX > endX) {//New subscript/superscript
                if (newEndY < endY) {//New superscript
                    //ssstatus.push(SSStatus.SUP);
                    preamble2.insert(0, "^{");
                }
                else if (newY > Y) {//New subscript
                    //ssstatus.push(SSStatus.SUB);
                    preamble2.insert(0, "_{");
                }
                //else {
                  //  System.out.println("Please investigate the situation: texcode = " + teXCode + "endY = " + endY + " Y=" + Y + " endX=" + endX + " newEndY=" + newEndY + " newY=" + newY + " newEndX= " + newEndX);
                //}
            }
            else if (height < newHeight && height != 0) {
                //ssstatus.pop();
                preamble1.append('}');
            }
            height = newHeight;
            endX = newEndX;
            endY = newEndY;
            Y = newY;
            //Enter or leave math mode
            if (mathMode && !math && !macc) {
                mathMode = false;
                preamble1.append('$');
            }
            else if (!mathMode && !text && !tacc) {
                mathMode = true;
                preamble2.insert(0,'$');
            }
            //Accents
            if (accentMode) {//If accent mode is ever entered we need to leave it at once
                postamble.append('}');
                accentMode = false;
            }
            if ((mathMode && macc) || (!mathMode && tacc)) {//Right now assume that anything that can be an accent is an accent
                postamble.append('{');
                if (mathMode)
                    teXCode = maccDict.get(teXCode);
                accentMode = true;
            }
            if (teXCode.charAt(0) == '\\')
                return preamble1.toString() + preamble2.toString() + teXCode + ' ' + postamble.toString();
            else
                return preamble1.toString() + preamble2.toString() + teXCode + postamble.toString();
        }
        catch(JSONException e) {
            return "\\" + shortFontName + "{" + code + "}";
        }
    }
}

Here is the main class.

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import com.google.common.base.CharMatcher;
import org.json.*;
import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.Stack;

public class TEX2TXT {

    public static void main(String args[]) throws IOException {
        TeXUtil util = new TeXUtil();
        //Loading an existing document
        File file = new File("/Users/CatLover/Documents/Tex/Examples/c4.pdf");
        PDDocument document = PDDocument.load(file);
        //Instantiate PDFTextStripper class
        PDFTextStripper pdfStripper = new PDFTextStripper() {
            protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
                TeXUtil util = new TeXUtil();
                StringBuilder builder = new StringBuilder();
                for(TextPosition position: textPositions) {
                    float Y = position.getY();
                    float endY = position.getEndY();
                    float endX = position.getEndX();
                    PDFont font = position.getFont();
                    int[] codes = position.getCharacterCodes();
                    for(int code: codes) {
                        builder.append(util.fullTextToTeX(font, code, endX, Y, endY));
                    }

                }
                writeString(builder.toString());
            }
        };
        //Retrieving text from PDF document
        String text = pdfStripper.getText(document);
        System.out.println(text);
        //Closing the document
        document.close();
    }

What's really weird is that TeXUtil is constructed every time any white space between words appear while TeXUtil() should be called only once. I'm not sure why this is the case. Since the PDFs are produced by LaTeX and LaTeX does not put white space characters in PDFs but instead leave space between characters to implicitly represent white spaces this may affect how PDFBox works.


Solution

  • You're constructing a new TeXUtil in the first line of your PDFTextStripper subclass's writeString method. If you just remove that line, it should be able to still reference the util defined in your main method (though depending on the version of java you're using, you may have to make it final).