Search code examples
c# .net-5 itext7

Extract fontname, size, style from pdf with iText


I am trying to extract text from various pdf files depending on font, fontsize and fontstyle, using iText 7.1.14.

/// <summary>
/// Text extraction strategy that prefixes the text of every rendered chunk with a
/// <c>#Data|fontName|fontSize|fillColor|Data#</c> marker before handing the event
/// to <see cref="SimpleTextExtractionStrategy"/>.
/// </summary>
/// <remarks>
/// The size written into the marker is the unmodified result of
/// <c>TextRenderInfo.GetFontSize()</c>; no matrix transformation is applied to it here.
/// </remarks>
public class FontSizeSimpleTextExtractionStrategy : SimpleTextExtractionStrategy
{
    // TextRenderInfo exposes no public setter for its text, so the non-public "text"
    // field is looked up via reflection. The lookup is done once and shared.
    private static readonly FieldInfo _textField =
        typeof(TextRenderInfo).GetField("text", BindingFlags.NonPublic | BindingFlags.Instance);

    /// <summary>
    /// Decorates the text of <c>RENDER_TEXT</c> events with the font marker and then
    /// forwards every event, decorated or not, to the base strategy.
    /// </summary>
    /// <param name="data">Event payload; cast to <c>TextRenderInfo</c> for <c>RENDER_TEXT</c> events.</param>
    /// <param name="type">Kind of event being reported.</param>
    public override void EventOccurred(IEventData data, EventType type)
    {
        if (type.Equals(EventType.RENDER_TEXT))
        {
            TextRenderInfo renderInfo = (TextRenderInfo)data;
            string fontName = renderInfo.GetFont()?.GetFontProgram()?.GetFontNames()?.GetFontName();
            iText.Kernel.Colors.Color color = renderInfo.GetFillColor();
            float size = renderInfo.GetFontSize();

            // Chunks whose font name cannot be resolved are passed through untouched.
            if (fontName != null)
            {
                // The marker is machine-readable, so the size is formatted with the
                // invariant culture: the decimal separator must not depend on the locale.
                string sizeText = size.ToString(System.Globalization.CultureInfo.InvariantCulture);

                // ColorToString is a helper defined outside this snippet.
                string marker = "#Data|" + fontName + "|" + sizeText + "|" + ColorToString(color) + "|Data#";
                _textField.SetValue(renderInfo, marker + renderInfo.GetText());
            }
        }

        base.EventOccurred(data, type);
    }
}

On some files the value of "size" is always "1", although Adobe Acrobat displays the correct font size, which is 25 and 11 in this example file.

Is there any chance to get the correct size with iText?


Solution

  • The cause of this issue is that the transformation of the drawn text by the current transformation matrix and the text matrix is ignored.

    The font size returned from the TextRenderInfo is the font size value from the current graphics state at the time the text is drawn. This value does not yet include the transformation of the drawn text by the current text and transformation matrices. Thus, one has to take an upright vector whose length equals the font size value, transform it by these matrices, and determine the effective size from the length of the result.

    The TextRenderInfo.GetTextMatrix() value actually contains the product of the text matrix and the current transformation matrix, so we only need to use that value.

    /// <summary>
    /// Text extraction strategy that prefixes the text of every rendered chunk with a
    /// <c>#Data|fontName|effectiveFontSize|fillColor|Data#</c> marker before handing the
    /// event to <see cref="SimpleTextExtractionStrategy"/>.
    /// </summary>
    /// <remarks>
    /// The effective size is obtained by transforming an upright vector of length
    /// <c>GetFontSize()</c> with the matrix from <c>GetTextMatrix()</c> and taking the
    /// length of the result.
    /// </remarks>
    class FontSizeSimpleTextExtractionStrategyImproved : SimpleTextExtractionStrategy
    {
        // TextRenderInfo exposes no public setter for its text, so the non-public "text"
        // field is looked up via reflection. The lookup is done once and shared.
        private static readonly FieldInfo _textField =
            typeof(TextRenderInfo).GetField("text", BindingFlags.NonPublic | BindingFlags.Instance);

        /// <summary>
        /// Decorates the text of <c>RENDER_TEXT</c> events with the font marker and then
        /// forwards every event, decorated or not, to the base strategy.
        /// </summary>
        /// <param name="data">Event payload; cast to <c>TextRenderInfo</c> for <c>RENDER_TEXT</c> events.</param>
        /// <param name="type">Kind of event being reported.</param>
        public override void EventOccurred(IEventData data, EventType type)
        {
            if (type.Equals(EventType.RENDER_TEXT))
            {
                TextRenderInfo renderInfo = (TextRenderInfo)data;
                string fontName = renderInfo.GetFont()?.GetFontProgram()?.GetFontNames()?.GetFontName();
                Color color = renderInfo.GetFillColor();

                // Scale an upright vector of the nominal font size by the text matrix;
                // its third component is 0, so only the linear part of the matrix
                // (scaling/rotation), not the translation, contributes to the length.
                float size = renderInfo.GetFontSize();
                Vector sizeHighVector = new Vector(0, size, 0);
                Matrix matrix = renderInfo.GetTextMatrix();
                float sizeAdjusted = sizeHighVector.Cross(matrix).Length();

                // Chunks whose font name cannot be resolved are passed through untouched.
                if (fontName != null)
                {
                    // The marker is machine-readable, so the size is formatted with the
                    // invariant culture: the decimal separator must not depend on the locale.
                    string sizeText = sizeAdjusted.ToString(System.Globalization.CultureInfo.InvariantCulture);

                    // ColorToString is a helper defined outside this snippet.
                    string marker = "#Data|" + fontName + "|" + sizeText + "|" + ColorToString(color) + "|Data#";
                    _textField.SetValue(renderInfo, marker + renderInfo.GetText());
                }
            }

            base.EventOccurred(data, type);
        }
    }
    

    (ExtractWithFontSize helper class)