I'm using iText library in C# / Net5 (5.0.1). The library is installed using NuGet and is version 7.1.13. I want to read PDF documents and search inside the text.
My problem comes from the API GetPage(n)
. I assumed it reads the page n, but the fact is that is returning all the pages from 1 to n.
This is my code for getting the PDF content
public PdfDocument? GetPdfContent() {
PdfDocument? document = null;
HttpWebRequest? request = null;
HttpWebResponse? response = null;
Stream? responseStream = null;
MemoryStream? memoryStream = null;
try {
request = WebRequest.CreateHttp(_contentUrl);
} catch (ArgumentNullException e) {
Log.Logger.LogError(e, "Null address/URL in WebContent.GetPdfContent");
throw new ContentSearchException(ContentSearchErrorCode.ContentNotAccessible, "Null address/URL", e);
} catch (UriFormatException e) {
Log.Logger.LogError(e, "Invalid address/URL in WebContent.GetPdfContent " + _contentUrl);
throw new ContentSearchException(ContentSearchErrorCode.ContentNotAccessible, "Invalid address/URL", e);
} catch (NotSupportedException e) {
Log.Logger.LogError(e, "Invalid protocol URL in WebContent.GetPdfContent. Only http and https are supported. " + _contentUrl);
throw new ContentSearchException(ContentSearchErrorCode.ContentNotAccessible, "Invalid protocol URL", e);
} catch (SecurityException e) {
Log.Logger.LogError(e, "Cannot contect to uri. Invalid user/password provided. " + _contentUrl);
throw new ContentSearchException(ContentSearchErrorCode.ContentNotAccessible, "Invalid user/password", e);
}
if (request != null) {
// Configure request
request.Method = "GET";
// Automatic redirection enabled
request.AllowAutoRedirect = true;
// acept-encoding: deflate, gzip
request.AutomaticDecompression = DecompressionMethods.Deflate | DecompressionMethods.GZip;
if (_accept != null) {
request.Accept = _accept;
}
request.Headers.Add(HttpRequestHeader.UserAgent, "sample/0.0.0");
if (_authorization != null) {
request.Headers.Add(HttpRequestHeader.Authorization, _authorization);
}
try {
using (response = (HttpWebResponse)request.GetResponse()) {
if (response.StatusCode != HttpStatusCode.OK) {
if (response.StatusCode == HttpStatusCode.NotFound) {
throw new ContentSearchException(ContentSearchErrorCode.ContentNotFound, $"Error topic not found: {response.StatusCode} {response.StatusDescription}.");
} else {
throw new ContentSearchException(ContentSearchErrorCode.ContentNotAccessible, $"Error returned by server: {response.StatusCode} {response.StatusDescription}.");
}
} else if (String.IsNullOrEmpty(response.ContentType) || response.ContentType.Split(";")[0] != "application/pdf") {
throw new ContentSearchException(ContentSearchErrorCode.InvalidContentType, $"Error invalid content type {response.ContentType}.");
} else {
try {
using (responseStream = response.GetResponseStream()) {
memoryStream = new MemoryStream();
responseStream.CopyTo(memoryStream);
// memoryStream remains open!
memoryStream.Position = 0;
document = new PdfDocument(new PdfReader(memoryStream));
responseStream.Close();
memoryStream.Close();
}
} catch (Exception e) {
// Error in GetResponseStream
throw new ContentSearchException(ContentSearchErrorCode.ContentNotAccessible, $"Error reading response: {e.Message}", e);
}
}
response.Close();
}
} catch (Exception e) {
// Error in GetResponse
throw new ContentSearchException(ContentSearchErrorCode.ContentNotAccessible, $"Error getting response: {e.Message}", e);
}
}
return document;
}
And this is the failing code with GetPage
private List<string> GetStringPdfContent() {
List<string> ret = null;
// iText
PdfDocument pdfContent;
PdfPage page;
ITextExtractionStrategy strategy;
string strPage;
pdfContent = (PdfDocument)GetContent();
if (pdfContent != null) {
ret = new List<string>();
// Code for iText
strategy = new SimpleTextExtractionStrategy();
for (int i = 1; i <= pdfContent.GetNumberOfPages(); i++) {
page = pdfContent.GetPage(i);
strPage = PdfTextExtractor.GetTextFromPage(page, strategy);
Log.Logger.LogDebug($"[GetStringPdfContent] Extracted page {i} with length {strPage.Length}.");
ret.Add(strPage);
}
}
return ret;
}
This a sample output. As you see, we get pages 1, 1-2, 1-3, and so on...
dbug: Spider.Program[0]
[23/12/2020 17:47:59.793]: [GetStringPdfContent] Extracted page 1 with length 615.
dbug: Spider.Program[0]
[23/12/2020 17:48:10.207]: [GetStringPdfContent] Extracted page 2 with length 2659.
dbug: Spider.Program[0]
[23/12/2020 17:48:12.112]: [GetStringPdfContent] Extracted page 3 with length 4609.
dbug: Spider.Program[0]
[23/12/2020 17:48:13.255]: [GetStringPdfContent] Extracted page 4 with length 7273.
dbug: Spider.Program[0]
[23/12/2020 17:48:16.155]: [GetStringPdfContent] Extracted page 5 with length 9245.
My problem comes from the API GetPage(n). I assumed it reads the page n, but the fact is that is returning all the pages from 1 to n.
This cannot be true, GetPage(n)
after all returns a PdfPage
object which represents a single page.
The error is your code is that you re-use the same SimpleTextExtractionStrategy
object on all pages. A SimpleTextExtractionStrategy
collects all the text it is given, so if you first use it for page 1 and then for page 2, it contains the text of both pages.
Thus, instantiate a separate text extraction strategy object per page:
for (int i = 1; i <= pdfContent.GetNumberOfPages(); i++) {
strategy = new SimpleTextExtractionStrategy();
page = pdfContent.GetPage(i);
strPage = PdfTextExtractor.GetTextFromPage(page, strategy);
Log.Logger.LogDebug($"[GetStringPdfContent] Extracted page {i} with length {strPage.Length}.");
ret.Add(strPage);
}