I have a static phrase the I am searching an OCR'd image for.
string KeywordToFind = "Account Number"
string OcrPageText = "
GEORGIA
POWER
A SOUTHERN COMPANY
AecountNumber
122- 493
Pagel of2
Please Pay By
Jan 29,2014
Total Due
39.11
"
How can I find the word "AecountNumber" using my keyword "Account Number"?
I have tried using variations of the Levenshtein Distance Algorithm HERE with varied success. I've also tried regexes, but the OCR often converts the text differently, thus rendering the regex useless.
Suggestions? I can provide more code if the link doesn't give enough information. Also, Thanks!
Answered My Question with the use of sub-strings. Posting in case others run into the same type of problem. A little unorthodox, but it works great for me.
int TextLengthBuffer = (int)StaticTextLength - 1; //start looking for correct result with one less character than it should have.
int LowestLevenshteinNumber = 999999; //initialize insanely high maximum
decimal PossibleStringLength = (PossibleString.Length); //Length of string to search
decimal StaticTextLength = (StaticText.Length); //Length of text to search for
decimal NumberOfErrorsAllowed = Math.Round((StaticTextLength * (ErrorAllowance / 100)), MidpointRounding.AwayFromZero); //Find number of errors allowed with given ErrorAllowance percentage
//Look for best match with 1 less character than it should have, then the correct amount of characters.
//And last, with 1 more character. (This is because one letter can be recognized as
//two (W -> VV) and visa versa)
for (int i = 0; i < 3; i++)
{
for (int e = TextLengthBuffer; e <= (int)PossibleStringLength; e++)
{
string possibleResult = (PossibleString.Substring((e - TextLengthBuffer), TextLengthBuffer));
int lAllowance = (int)(Math.Round((possibleResult.Length - StaticTextLength) + (NumberOfErrorsAllowed), MidpointRounding.AwayFromZero));
int lNumber = LevenshteinAlgorithm(StaticText, possibleResult);
if (lNumber <= lAllowance && ((lNumber < LowestLevenshteinNumber) || (TextLengthBuffer == StaticText.Length && lNumber <= LowestLevenshteinNumber)))
{
PossibleResult = (new StaticTextResult { text = possibleResult, errors = lNumber });
LowestLevenshteinNumber = lNumber;
}
}
TextLengthBuffer++;
}
public static int LevenshteinAlgorithm(string s, string t) // Levenshtein Algorithm
{
int n = s.Length;
int m = t.Length;
int[,] d = new int[n + 1, m + 1];
if (n == 0)
{
return m;
}
if (m == 0)
{
return n;
}
for (int i = 0; i <= n; d[i, 0] = i++)
{
}
for (int j = 0; j <= m; d[0, j] = j++)
{
}
for (int i = 1; i <= n; i++)
{
for (int j = 1; j <= m; j++)
{
int cost = (t[j - 1] == s[i - 1]) ? 0 : 1;
d[i, j] = Math.Min(
Math.Min(d[i - 1, j] + 1, d[i, j - 1] + 1),
d[i - 1, j - 1] + cost);
}
}
return d[n, m];
}