This is a follow-up to an earlier question.
We figured out how to extract text marked for redaction using the code below.
However, the extracted text includes extra leading and trailing characters that lie outside the text marked for redaction.
For example, if a sentence is marked for redaction in the source PDF document, the code below also captures the last few characters of the preceding sentence and the first few characters of the next sentence.
Is anybody able to see a problem in the code below?
// For every page, find the redaction annotations and print the text that a
// region filter reports for each of their QuadPoints quadrilaterals.
// NOTE(review): rdr, sb, strategy, llx, lly, urx and ury are declared outside
// this snippet; their declarations are assumed unchanged.
for (int i = 1; i <= rdr.NumberOfPages; i++)
{
    // Get the page and its annotations. GetAsArray returns null when the
    // page has no /Annots entry, so such pages are skipped.
    PdfDictionary dict = rdr.GetPageN(i);
    PdfArray annots = dict.GetAsArray(PdfName.ANNOTS);
    if (annots == null)
    {
        continue;
    }

    foreach (var annItem in annots.ArrayList)
    {
        PdfDictionary d = PdfReader.GetPdfObject(annItem) as PdfDictionary;
        if (d == null)
        {
            continue;
        }

        PdfName typ = d.GetAsName(PdfName.SUBTYPE);
        if (typ == null || !typ.ToString().StartsWith("/Redact", StringComparison.Ordinal))
        {
            continue;
        }

        // GetAsArray also resolves an indirectly referenced QuadPoints array.
        PdfArray arr2 = d.GetAsArray(PdfName.QUADPOINTS);
        if (arr2 == null)
        {
            continue;
        }

        sb = new StringBuilder();

        // Each quadrilateral is 8 numbers: four (x, y) vertex pairs.
        int numLines = arr2.ArrayList.Count / 8;
        for (int k = 0; k < numLines; k++)
        {
            int offset = k * 8;

            // Build the bounding box from all four vertices instead of
            // trusting a particular vertex order inside the quad.
            llx = float.MaxValue;
            lly = float.MaxValue;
            urx = float.MinValue;
            ury = float.MinValue;
            bool quadIsNumeric = true;
            for (int p = 0; p < 4; p++)
            {
                // Read the values as PDF numbers; this avoids a
                // culture-dependent float.Parse on their string form.
                PdfNumber x = arr2.GetAsNumber(offset + 2 * p);
                PdfNumber y = arr2.GetAsNumber(offset + 2 * p + 1);
                if (x == null || y == null)
                {
                    quadIsNumeric = false;
                    break;
                }

                llx = Math.Min(llx, x.FloatValue);
                lly = Math.Min(lly, y.FloatValue);
                urx = Math.Max(urx, x.FloatValue);
                ury = Math.Max(ury, y.FloatValue);
            }

            if (!quadIsNumeric)
            {
                continue;
            }

            Rectangle rect = new Rectangle(llx, lly, urx, ury);
            strategy = new FilteredTextRenderListener(
                new LocationTextExtractionStrategy(),
                new RegionTextRenderFilter(rect));

            // NOTE(review): the extra leading/trailing characters are presumably
            // caused by the region filter accepting or rejecting each text chunk
            // as a whole, so a chunk that only partly overlaps rect is returned
            // in full. Filtering per character (e.g. via
            // TextRenderInfo.GetCharacterRenderInfos() in a custom listener)
            // would be needed to trim them - confirm against the iText docs.
            sb.Append(PdfTextExtractor.GetTextFromPage(rdr, i, strategy));
        }

        Console.WriteLine("Page: " + i.ToString());
        Console.WriteLine(sb.ToString() + Environment.NewLine);
        sb.Clear();
    }
}
For the sake of completeness, this question was answered on the iText mailing list: http://thread.gmane.org/gmane.comp.java.lib.itext.general/62918