Search code examples
java, itext, pdf-generation, pdfclown

Using PDF Clown, a few search keywords are not highlighted in Chinese/Japanese documents


I am facing an issue where some search keywords are not highlighted in Chinese documents. Due to confidentiality concerns I am not providing the actual PDF. The search keywords are 1) 亿元或 and 2) 收入亿来源. Please find the PDF document I tested at the pdfpath link, and the actual result at the ActualResult link. I have already posted about a related issue in the following link, but some of the keywords are still not highlighted properly in a few Chinese documents. Kindly provide your input on how to highlight the search keywords I mentioned.

    import java.awt.Color;
    import java.awt.Desktop;
    import java.awt.geom.Rectangle2D;
    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.io.UnsupportedEncodingException;
    import java.net.URL;
    import java.nio.charset.Charset;
    import java.util.ArrayList;
    import java.util.Collection;
    import java.util.Date;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;
    import java.util.concurrent.TimeUnit;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;
    import java.io.BufferedInputStream;
    import java.io.File;
    import org.pdfclown.documents.Page;
    import org.pdfclown.documents.contents.ITextString;
    import org.pdfclown.documents.contents.TextChar;
    import org.pdfclown.documents.contents.colorSpaces.DeviceRGBColor;
    import org.pdfclown.documents.interaction.annotations.TextMarkup;
    import org.pdfclown.documents.interaction.annotations.TextMarkup.MarkupTypeEnum;

    import org.pdfclown.files.SerializationModeEnum;
    import org.pdfclown.util.math.Interval;
    import org.pdfclown.util.math.geom.Quad;
    import org.pdfclown.tools.TextExtractor;

    public class pdfclown2 {
        private static int count;

        public static void main(String[] args) throws IOException {

            highlight("ebook.pdf","C:\\Users\\Downloads\\6.pdf");
            System.out.println("OK");
        }
        private static void highlight(String inputPath, String outputPath) throws IOException {

            URL url = new URL(inputPath);
            InputStream in = new BufferedInputStream(url.openStream());
            org.pdfclown.files.File file = null;

            try {
                file = new org.pdfclown.files.File("C:\\Users\\Desktop\\pdf\\test123.pdf");

            Map<String, String> m = new HashMap<String, String>();
                m.put("亿元或","hi");
                m.put("收入亿来","hi");



            System.out.println("map size"+m.size());
             long startTime = System.currentTimeMillis();




                // 2. Iterating through the document pages...
                TextExtractor textExtractor = new TextExtractor(true, true);
                for (final Page page : file.getDocument().getPages()) {
                    Map<Rectangle2D, List<ITextString>> textStrings = textExtractor.extract(page);
                    for (Map.Entry<String, String> entry : m.entrySet()) {

                        Pattern pattern;
                        String serachKey =  entry.getKey();
                        final String translationKeyword = entry.getValue();
                    /*
                            if ((serachKey.contains(")") && serachKey.contains("("))
                                    || (serachKey.contains("(") && !serachKey.contains(")"))
                                    || (serachKey.contains(")") && !serachKey.contains("(")) || serachKey.contains("?")
                                    || serachKey.contains("*") || serachKey.contains("+")) {s
                                pattern = Pattern.compile(Pattern.quote(serachKey), Pattern.CASE_INSENSITIVE);
                            }
                            else*/
                                 pattern = Pattern.compile(serachKey, Pattern.CASE_INSENSITIVE);
                    // 2.1. Extract the page text!

                //System.out.println(textStrings.toString().indexOf(entry.getKey()));

                    // 2.2. Find the text pattern matches!
                    final Matcher matcher = pattern.matcher(TextExtractor.toString(textStrings));
                    // 2.3. Highlight the text pattern matches!
                    textExtractor.filter(textStrings, new TextExtractor.IIntervalFilter() {
                        public boolean hasNext() {
                            // System.out.println(matcher.find());
                            // if(key.getMatchCriteria() == 1){
                            if (matcher.find()) {
                                return true;
                            }
                            /*
                             * } else if(key.getMatchCriteria() == 2) { if
                             * (matcher.hitEnd()) { count++; return true; } }
                             */
                            return false;

                        }

                        public Interval<Integer> next() {
                            return new Interval<Integer>(matcher.start(), matcher.end());
                        }

                        public void process(Interval<Integer> interval, ITextString match) {
                            // Defining the highlight box of the text pattern
                            // match...
                            System.out.println(match);
                        /*  List<Quad> highlightQuads = new ArrayList<Quad>();
                            {
                                Rectangle2D textBox = null;
                                for (TextChar textChar : match.getTextChars()) {
                                    Rectangle2D textCharBox = textChar.getBox();
                                    if (textBox == null) {
                                        textBox = (Rectangle2D) textCharBox.clone();
                                    } else {
                                        if (textCharBox.getY() > textBox.getMaxY()) {
                                            highlightQuads.add(Quad.get(textBox));
                                            textBox = (Rectangle2D) textCharBox.clone();
                                        } else {
                                            textBox.add(textCharBox);
                                        }
                                    }
                                }
                                textBox.setRect(textBox.getX(), textBox.getY(), textBox.getWidth(), textBox.getHeight());
                                highlightQuads.add(Quad.get(textBox));
                            }*/
                            List<Quad> highlightQuads = new ArrayList<Quad>();
                            List<TextChar> textChars = match.getTextChars();
                            Rectangle2D firstRect = textChars.get(0).getBox();
                            Rectangle2D lastRect = textChars.get(textChars.size()-1).getBox();
                            Rectangle2D rect = firstRect.createUnion(lastRect);
                            highlightQuads.add(Quad.get(rect).get(rect));
                            // subtype can be Highlight, Underline, StrikeOut, Squiggly


                            new TextMarkup(page, highlightQuads, translationKeyword, MarkupTypeEnum.Highlight);

                        }

                        public void remove() {
                            throw new UnsupportedOperationException();
                        }

                    });
                }

            }

            SerializationModeEnum serializationMode = SerializationModeEnum.Standard;

                file.save(new java.io.File(outputPath), serializationMode);

                System.out.println("file created");
                long endTime = System.currentTimeMillis();

                 System.out.println("seconds take for execution is:"+(endTime-startTime)/1000);

            } catch (Exception e) {
                   e.printStackTrace();
            }
            finally{
                in.close();
            }


        }
    }

Solution

  • Indeed, when searching for "亿元或" the result highlight is somewhat wrong:

    screenshot of bug

    The cause is a PDF Clown bug. When it parses a composite font (aka Type 0 font), it expects the DW (default width) entry in the Type 0 font base dictionary while it is specified to be in the CIDFont subdictionary!

    In case of the document at hand the widths of most characters, in particular of the Chinese characters, are not given explicitly and, therefore, default to that DW value. As this value cannot be determined properly due to the bug mentioned above, an average over the explicitly given widths is used, and this average happens to be merely ¾ of the correct value. Thus, the highlighted area is too short.

    You can fix this bug in the CompositeFont class (package org.pdfclown.documents.contents.fonts) at the end of the method onLoad. Simply replace

    PdfInteger defaultWidthObject = (PdfInteger)getBaseDataObject().get(PdfName.DW);
    

    by

    PdfInteger defaultWidthObject = (PdfInteger)getCIDFontDictionary().get(PdfName.DW);
    

    The highlighting now results in

    screenshot fix