Tweaking new Tesseract OCR results

git-svn-id: https://subtitleedit.googlecode.com/svn/trunk@759 99eadd0c-20b8-1223-b5c4-2a2b2df33de2
This commit is contained in:
niksedk 2011-10-28 14:06:34 +00:00
parent d8ec8464fa
commit d66ab8b336
3 changed files with 105 additions and 19 deletions

View File

@ -2055,7 +2055,7 @@ namespace Nikse.SubtitleEdit.Forms
}
private string OcrViaTessnet(Bitmap bitmap, int index)
{
{
if (_ocrFixEngine == null)
LoadOcrFixEngine();
@ -2075,23 +2075,52 @@ namespace Nikse.SubtitleEdit.Forms
if (!textWithOutFixes.Contains(Environment.NewLine) && textWithOutFixes.Length < 17)
{
string psm = Tesseract3DoOcrViaExe(bitmap, _languageId, "-psm 7"); // 7 = Treat the image as a single text line.
if (psm.Length > textWithOutFixes.Length)
textWithOutFixes = psm;
else if (psm.Length == textWithOutFixes.Length &&
(!psm.Contains("0") && textWithOutFixes.Contains("0") || // these chars are often mistaken
!psm.Contains("9") && textWithOutFixes.Contains("9") ||
!psm.Contains("1") && textWithOutFixes.Contains("1") ||
!psm.Contains("$") && textWithOutFixes.Contains("$") ||
!psm.Contains("/") && textWithOutFixes.Contains("/") ||
!psm.Contains("(") && textWithOutFixes.Contains("(") ||
!psm.Contains(")") && textWithOutFixes.Contains(")") ||
!psm.Contains("_") && textWithOutFixes.Contains("_")))
textWithOutFixes = psm;
if (textWithOutFixes != psm)
{
if (textWithOutFixes.Trim().Length == 0)
{
textWithOutFixes = psm;
}
else if (psm.Length > textWithOutFixes.Length)
{
if ((!psm.Contains("9") && textWithOutFixes.Contains("9")) ||
(!psm.Contains("6") && textWithOutFixes.Contains("6")) ||
(!psm.Contains("5") && textWithOutFixes.Contains("5")) ||
(!psm.Contains("3") && textWithOutFixes.Contains("3")) ||
(!psm.Contains("1") && textWithOutFixes.Contains("1")) ||
(!psm.Contains("$") && textWithOutFixes.Contains("$")) ||
(!psm.Contains("€") && textWithOutFixes.Contains("€")))
textWithOutFixes = psm;
}
else if (psm.Length == textWithOutFixes.Length &&
(!psm.Contains("0") && textWithOutFixes.Contains("0") || // these chars are often mistaken
!psm.Contains("9") && textWithOutFixes.Contains("9") ||
!psm.Contains("8") && textWithOutFixes.Contains("8") ||
!psm.Contains("5") && textWithOutFixes.Contains("5") ||
!psm.Contains("3") && textWithOutFixes.Contains("3") ||
!psm.Contains("1") && textWithOutFixes.Contains("1") ||
!psm.Contains("$") && textWithOutFixes.Contains("$") ||
!psm.Contains("€") && textWithOutFixes.Contains("€") ||
!psm.Contains("/") && textWithOutFixes.Contains("/") ||
!psm.Contains("(") && textWithOutFixes.Contains("(") ||
!psm.Contains(")") && textWithOutFixes.Contains(")") ||
!psm.Contains("_") && textWithOutFixes.Contains("_")))
{
textWithOutFixes = psm;
}
else if (psm.Length == textWithOutFixes.Length && psm.EndsWith(".") && !textWithOutFixes.EndsWith("."))
{
textWithOutFixes = psm;
}
}
}
if (textWithOutFixes.ToString().Trim().Length == 0)
textWithOutFixes = TesseractResizeAndRetry(bitmap);
if (textWithOutFixes.Contains("<i>") && Utilities.CountTagInText(textWithOutFixes, "<i>") > 1)
textWithOutFixes = "<i>" + textWithOutFixes.Replace("<i>", string.Empty).Replace("</i>", string.Empty) + "</i>";
int numberOfWords = textWithOutFixes.ToString().Split((" " + Environment.NewLine).ToCharArray(), StringSplitOptions.RemoveEmptyEntries).Length;
string line = textWithOutFixes.ToString().Trim();
@ -2124,7 +2153,7 @@ namespace Nikse.SubtitleEdit.Forms
}
}
if (wordsNotFound > 0 || correctWords == 0 || textWithOutFixes != null && textWithOutFixes.ToString().Replace("~", string.Empty).Trim().Length == 0)
if (wordsNotFound > 0 || correctWords == 0 || textWithOutFixes != null && textWithOutFixes.ToString().Replace("~", string.Empty).Trim().Length < 2)
{
_ocrFixEngine.AutoGuessesUsed.Clear();
_ocrFixEngine.UnknownWordsFound.Clear();
@ -2139,7 +2168,7 @@ namespace Nikse.SubtitleEdit.Forms
if (modiText.Length == 0)
modiText = CallModi(index); // retry... strange MODI
if (modiText.Length > 1 && !modiText.Contains("0") && !modiText.Contains("9") &&
if (modiText.Length > 1 && (!modiText.Contains("0") || line.Contains("0")) && (!modiText.Contains("9") || line.Contains("9")) &&
Utilities.CountTagInText(modiText,"(") < 2 && Utilities.CountTagInText(modiText,")") < 2)
{
int modiWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(modiText, out correctWords);
@ -2153,8 +2182,10 @@ namespace Nikse.SubtitleEdit.Forms
modiText = modiTextOcrFixed;
}
if (modiWordsNotFound < wordsNotFound)
if (modiWordsNotFound < wordsNotFound || (textWithOutFixes.Length == 1 && modiWordsNotFound == 0))
line = modiText; // use the modi ocr'ed text
else if (wordsNotFound == modiWordsNotFound && modiText.EndsWith("!") && (line.EndsWith("l") || line.EndsWith("fl")))
line = modiText;
}
// take the best option - before ocr fixing, which we do again to save suggestions and prompt for user input
@ -2187,7 +2218,7 @@ namespace Nikse.SubtitleEdit.Forms
subtitleListView1.SetBackgroundColor(index, Color.Red);
if (wordsNotFound == 2)
subtitleListView1.SetBackgroundColor(index, Color.Orange);
else if (wordsNotFound == 1)
else if (wordsNotFound == 1 || line.Length == 1)
subtitleListView1.SetBackgroundColor(index, Color.Yellow);
else if (line.Trim().Length == 0)
subtitleListView1.SetBackgroundColor(index, Color.Orange);

View File

@ -297,7 +297,7 @@ namespace Nikse.SubtitleEdit.Logic.OCR
string lastWord = null;
for (int i = 0; i < text.Length; i++)
{
if (" ¡¿,.!?:;()[]{}+-$£\"#&%\r\n".Contains(text[i].ToString()))
if (" ¡¿,.!?:;()[]{}+-£\"#&%\r\n".Contains(text[i].ToString())) // removed $
{
if (word.Length > 0)
{
@ -319,7 +319,7 @@ namespace Nikse.SubtitleEdit.Logic.OCR
else
{
word.Append(text[i]);
}
}
}
if (word.Length > 0) // last word
{
@ -929,6 +929,12 @@ namespace Nikse.SubtitleEdit.Logic.OCR
newText = sb.ToString().TrimEnd('\r').TrimEnd('\n').TrimEnd('\r').TrimEnd('\n');
newText = pre + newText;
string post = string.Empty;
if (newText.EndsWith("</i>"))
{
newText = newText.Remove(newText.Length - 4, 4);
post = "</i>";
}
foreach (string from in _endLineReplaceList.Keys)
{
if (newText.EndsWith(from))
@ -937,6 +943,7 @@ namespace Nikse.SubtitleEdit.Logic.OCR
newText = newText.Remove(position).Insert(position, _endLineReplaceList[from]);
}
}
newText += post;
foreach (string from in _partialLineReplaceList.Keys)
{
@ -952,6 +959,16 @@ namespace Nikse.SubtitleEdit.Logic.OCR
{
List<string> localIgnoreWords = new List<string>();
wordsNotFound = 0;
if (line.Length == 1 && !IsWordKnownOrNumber(line, line))
{
SpellcheckOcrTextResult res = SpellcheckOcrText(line, bitmap, new string[1] { line}, 0, line, localIgnoreWords);
if (res.FixedWholeLine || res.Fixed)
return res.Line;
wordsNotFound++;
return line;
}
if (_hunspell == null)
return line;
@ -1003,6 +1020,25 @@ namespace Nikse.SubtitleEdit.Logic.OCR
if (!correct && !line.Contains(word))
correct = true; // already fixed
if (!correct)
{
//look for match via dash'ed word, e.g. sci-fi
string dashedWord = GetDashedWordBefore(wordNoItalics, line, words, i);
if (!string.IsNullOrEmpty(dashedWord))
{
correct = IsWordKnownOrNumber(dashedWord, line);
if (!correct)
correct = DoSpell(dashedWord);
}
dashedWord = GetDashedWordAfter(wordNoItalics, line, words, i);
if (!string.IsNullOrEmpty(dashedWord))
{
correct = IsWordKnownOrNumber(dashedWord, line);
if (!correct)
correct = DoSpell(dashedWord);
}
}
if (!correct)
{
wordsNotFound++;
@ -1104,6 +1140,22 @@ namespace Nikse.SubtitleEdit.Logic.OCR
return line;
}
/// <summary>
/// Builds the dashed compound made of the word preceding <paramref name="word"/> and
/// <paramref name="word"/> itself (e.g. "sci" + "fi" gives "sci-fi") when that exact
/// compound occurs in <paramref name="line"/>.
/// </summary>
/// <param name="word">The current word (the part after the dash).</param>
/// <param name="line">The full text line that is searched for the dashed compound.</param>
/// <param name="words">The words of the line; <paramref name="index"/> points into this array.</param>
/// <param name="index">Position of the current word in <paramref name="words"/>.</param>
/// <returns>The dashed compound if it is present in the line; otherwise <c>null</c>.</returns>
private string GetDashedWordBefore(string word, string line, string[] words, int index)
{
    // A preceding word exists from index 1 onwards. The previous guard (index > 1)
    // skipped the second word, so a compound made of the first two words was missed.
    if (index > 0 && index <= words.Length)
    {
        string dashedWord = words[index - 1] + "-" + word;
        if (line.Contains(dashedWord))
            return dashedWord;
    }
    return null;
}
/// <summary>
/// Builds the dashed compound made of <paramref name="word"/> and the word following it
/// (e.g. "sci" + "fi" gives "sci-fi") when that exact compound occurs in <paramref name="line"/>.
/// </summary>
/// <param name="word">The current word (the part before the dash).</param>
/// <param name="line">The full text line that is searched for the dashed compound.</param>
/// <param name="words">The words of the line; <paramref name="index"/> points into this array.</param>
/// <param name="index">Position of the current word in <paramref name="words"/>.</param>
/// <returns>The dashed compound if it is present in the line; otherwise <c>null</c>.</returns>
private string GetDashedWordAfter(string word, string line, string[] words, int index)
{
    // Nothing follows the last word, so there is no compound to build.
    bool hasFollowingWord = index < words.Length - 1;
    if (!hasFollowingWord)
        return null;

    string candidate = word + "-" + words[index + 1];
    return line.Contains(candidate) ? candidate : null;
}
private string GetWordWithDominatedCasing(string word)
{
string uppercaseLetters = Utilities.GetLetters(true, false, false);

View File

@ -205,6 +205,9 @@ namespace Nikse.SubtitleEdit.Logic.VobSub
private Bitmap GenerateBitmap(Rectangle imageDisplayArea, int imageTopFieldDataAddress, int imageBottomFieldDataAddress, List<Color> fourColors)
{
if (imageDisplayArea.Width <= 0 || imageDisplayArea.Height <= 0)
return new Bitmap(1,1);
var bmp = new Bitmap(imageDisplayArea.Width + 1, imageDisplayArea.Height + 1);
if (fourColors[0] != Color.Transparent)
{