I'm trying to extract some text from an image using Tesseract, and I've noticed that if I divide the image into 9 smaller pieces the system is more accurate. What I'm trying to accomplish is to process all 9 pieces at once (in parallel), and this is the way I wanted to do it:
/// <summary>
/// First attempt: OCRs every piece returned by CutUpImage(src) inside a
/// Parallel.ForEach, sharing a single TesseractEngine across all loop bodies,
/// and returns how many word matches were recorded.
/// </summary>
/// <param name="src">The image that is cut into pieces and OCR'd.</param>
/// <returns>
/// The number of entries added to the match list. A word is added once for
/// every piece whose text contains it, so the same word can be counted more
/// than once.
/// </returns>
private static int GetImageText(Image src)
{
// "words-to-check" is a placeholder for the actual list of words to look for.
string[] words = { words-to-check };
List<string> found = new();
// The current directory is handed to the engine as its first argument
// (presumably the location of the "eng" language data - confirm).
string path = Environment.CurrentDirectory;
try
{
// One engine is created here and then captured by the parallel loop body below.
using (var engine = new TesseractEngine(path, "eng", EngineMode.LstmOnly))
{
// NOTE(review): every parallel loop body calls engine.Process on the same
// engine instance. This is the version that fails with "Only one image can
// be processed at once", i.e. the engine accepts only one open page at a time.
Parallel.ForEach(CutUpImage(src), (img) =>
{
using (var ms = new MemoryStream())
{
// Re-encode the piece as JPEG in memory so it can be loaded as a Pix.
img.Save(ms, System.Drawing.Imaging.ImageFormat.Jpeg);
using (var loadedImg = Pix.LoadFromMemory(ms.ToArray()))
using (var page = engine.Process(loadedImg))
{
var c = page.GetText();
// NOTE(review): found.Add runs from several loop bodies at once with no
// synchronisation - List<T> is presumably not safe for that; confirm
// against the List<T> thread-safety documentation.
foreach (var word in words) if (c.Contains(word)) found.Add(word);
}
}
});
return found.Count;
}
}
catch (Exception ex)
{
// NOTE(review): "throw ex;" resets the stack trace; a bare "throw;" would
// preserve it. As written, this catch block adds nothing else.
throw ex;
}
}
But I'm getting an error: "Only one image can be processed at once. Please make sure you dispose of the page once your finished with it."
So I had to move the new TesseractEngine into the loop, like this:
/// <summary>
/// Second attempt: OCRs every piece returned by CutUpImage(src) inside a
/// Parallel.ForEach, creating a separate TesseractEngine for each piece, and
/// returns how many word matches were recorded.
/// </summary>
/// <param name="src">The image that is cut into pieces and OCR'd.</param>
/// <returns>
/// The number of recorded matches. A word is recorded once for every piece
/// whose text contains it, so the same word can be counted more than once.
/// </returns>
private static int GetImageText(Image src)
{
    // "words-to-check" is a placeholder for the actual list of words to look for.
    string[] words = { words-to-check };
    List<string> found = new();
    // Guards "found": the loop body below runs on several threads at once and
    // List<T> does not support concurrent writers.
    object foundLock = new();
    string path = Environment.CurrentDirectory;

    // The pieces are created for this call only, so this method owns them and
    // must release them when it is done.
    Image[] pieces = CutUpImage(src);
    try
    {
        Parallel.ForEach(pieces, (img) =>
        {
            // A fresh engine per piece avoids sharing one engine between
            // threads, at the cost of initialising an engine for every piece.
            using (var engine = new TesseractEngine(path, "eng", EngineMode.LstmOnly))
            using (var ms = new MemoryStream())
            {
                // Re-encode the piece as JPEG in memory so it can be loaded as a Pix.
                img.Save(ms, System.Drawing.Imaging.ImageFormat.Jpeg);
                using (var loadedImg = Pix.LoadFromMemory(ms.ToArray()))
                using (var page = engine.Process(loadedImg))
                {
                    var c = page.GetText();
                    foreach (var word in words)
                    {
                        if (c.Contains(word))
                        {
                            lock (foundLock)
                            {
                                found.Add(word);
                            }
                        }
                    }
                }
            }
        });
    }
    finally
    {
        // Release the bitmaps even when OCR of one piece throws.
        foreach (var piece in pieces)
        {
            piece?.Dispose();
        }
    }

    return found.Count;
}
But then it takes a full minute to finish processing all the images.
So my question is: how do I make the new TesseractEngine work outside the loop and, more generally, how do I make this run faster?
OK, so the solution to my problem is simple: don't use parallel processing!
I switched the Parallel.ForEach to a traditional foreach (I don't know why I decided to try parallel processing first...) and it now takes 12 seconds to process them all. The reason it took a minute was most probably the time needed to spin up the engine for each image. This is the code:
/// <summary>
/// OCRs every piece returned by CutUpImage(src) one after another, reusing a
/// single TesseractEngine and a single MemoryStream, and returns how many word
/// matches were recorded.
/// </summary>
/// <param name="src">The image that is cut into pieces and OCR'd.</param>
/// <returns>
/// The number of recorded matches. A word is recorded once for every piece
/// whose text contains it, so the same word can be counted more than once.
/// </returns>
private static int GetImageText(Image src)
{
    // "words-to-check" is a placeholder for the actual list of words to look for.
    string[] words = { words-to-check };
    List<string> found = new();
    string path = Environment.CurrentDirectory;

    // The pieces are created for this call only, so this method owns them and
    // must release them when it is done.
    Image[] pieces = CutUpImage(src);
    try
    {
        // One engine for all pieces: each page is disposed before the next
        // Process call, so the engine never has two pages open at once.
        using (var engine = new TesseractEngine(path, "eng", EngineMode.LstmOnly))
        using (var ms = new MemoryStream())
        {
            foreach (var img in pieces)
            {
                // Re-encode the piece as JPEG in memory so it can be loaded as a Pix.
                img.Save(ms, System.Drawing.Imaging.ImageFormat.Jpeg);
                using (var loadedImg = Pix.LoadFromMemory(ms.ToArray()))
                using (var page = engine.Process(loadedImg))
                {
                    var c = page.GetText();
                    foreach (var word in words)
                    {
                        if (c.Contains(word))
                        {
                            found.Add(word);
                        }
                    }
                }

                // Empty the shared buffer so the next piece starts from a clean stream.
                ms.SetLength(0);
            }
        }
    }
    finally
    {
        // Release the bitmaps even when OCR of one piece throws.
        foreach (var piece in pieces)
        {
            piece?.Dispose();
        }
    }

    return found.Count;
}
P.S. This is the CutUpImage code, in case someone ever wants to use it:
/// <summary>
/// Cuts <paramref name="src"/> into a 3x3 grid of equally sized tiles.
/// </summary>
/// <param name="src">The image to cut up. It is only read, never modified.</param>
/// <returns>
/// Nine new bitmaps in row-major order (index = row * 3 + column). The caller
/// owns them and is responsible for disposing them.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="src"/> is null.</exception>
private static Image[] CutUpImage(Image src)
{
    if (src == null)
    {
        throw new ArgumentNullException(nameof(src));
    }

    const int GridSize = 3;

    // Round the tile size UP so that three tiles always span the whole image.
    // Rounding to nearest (as before) either left the last pixel row/column out
    // of every tile or made the last source rectangle extend past the image.
    int tileWidth = (src.Width + GridSize - 1) / GridSize;
    int tileHeight = (src.Height + GridSize - 1) / GridSize;

    var tiles = new Image[GridSize * GridSize];
    try
    {
        for (int row = 0; row < GridSize; row++)
        {
            // Clamp the origin so the last tile ends exactly on the image edge;
            // it may overlap its neighbour by a pixel or two instead of
            // reading outside the source.
            int srcY = Math.Min(row * tileHeight, src.Height - tileHeight);
            for (int col = 0; col < GridSize; col++)
            {
                int srcX = Math.Min(col * tileWidth, src.Width - tileWidth);

                var tile = new Bitmap(tileWidth, tileHeight);
                tiles[row * GridSize + col] = tile;

                using (Graphics g = Graphics.FromImage(tile))
                {
                    g.DrawImage(
                        src,
                        new Rectangle(0, 0, tileWidth, tileHeight),
                        new Rectangle(srcX, srcY, tileWidth, tileHeight),
                        GraphicsUnit.Pixel);
                }
            }
        }
    }
    catch
    {
        // Do not leak the tiles created so far if a later one fails.
        foreach (var created in tiles)
        {
            created?.Dispose();
        }

        throw;
    }

    return tiles;
}