mirror of
https://github.com/SubtitleEdit/subtitleedit.git
synced 2024-11-22 19:22:53 +01:00
Tweaking new Tesseract OCR results
git-svn-id: https://subtitleedit.googlecode.com/svn/trunk@759 99eadd0c-20b8-1223-b5c4-2a2b2df33de2
This commit is contained in:
parent
d8ec8464fa
commit
d66ab8b336
@ -2055,7 +2055,7 @@ namespace Nikse.SubtitleEdit.Forms
|
||||
}
|
||||
|
||||
private string OcrViaTessnet(Bitmap bitmap, int index)
|
||||
{
|
||||
{
|
||||
if (_ocrFixEngine == null)
|
||||
LoadOcrFixEngine();
|
||||
|
||||
@ -2075,23 +2075,52 @@ namespace Nikse.SubtitleEdit.Forms
|
||||
if (!textWithOutFixes.Contains(Environment.NewLine) && textWithOutFixes.Length < 17)
|
||||
{
|
||||
string psm = Tesseract3DoOcrViaExe(bitmap, _languageId, "-psm 7"); // 7 = Treat the image as a single text line.
|
||||
if (psm.Length > textWithOutFixes.Length)
|
||||
textWithOutFixes = psm;
|
||||
else if (psm.Length == textWithOutFixes.Length &&
|
||||
(!psm.Contains("0") && textWithOutFixes.Contains("0") || // these chars are often mistaken
|
||||
!psm.Contains("9") && textWithOutFixes.Contains("9") ||
|
||||
!psm.Contains("1") && textWithOutFixes.Contains("1") ||
|
||||
!psm.Contains("$") && textWithOutFixes.Contains("$") ||
|
||||
!psm.Contains("/") && textWithOutFixes.Contains("/") ||
|
||||
!psm.Contains("(") && textWithOutFixes.Contains("(") ||
|
||||
!psm.Contains(")") && textWithOutFixes.Contains(")") ||
|
||||
!psm.Contains("_") && textWithOutFixes.Contains("_")))
|
||||
textWithOutFixes = psm;
|
||||
if (textWithOutFixes != psm)
|
||||
{
|
||||
if (textWithOutFixes.Trim().Length == 0)
|
||||
{
|
||||
textWithOutFixes = psm;
|
||||
}
|
||||
else if (psm.Length > textWithOutFixes.Length)
|
||||
{
|
||||
if ((!psm.Contains("9") && textWithOutFixes.Contains("9")) ||
|
||||
(!psm.Contains("6") && textWithOutFixes.Contains("6")) ||
|
||||
(!psm.Contains("5") && textWithOutFixes.Contains("5")) ||
|
||||
(!psm.Contains("3") && textWithOutFixes.Contains("3")) ||
|
||||
(!psm.Contains("1") && textWithOutFixes.Contains("1")) ||
|
||||
(!psm.Contains("$") && textWithOutFixes.Contains("$")) ||
|
||||
(!psm.Contains("€") && textWithOutFixes.Contains("€")))
|
||||
textWithOutFixes = psm;
|
||||
}
|
||||
else if (psm.Length == textWithOutFixes.Length &&
|
||||
(!psm.Contains("0") && textWithOutFixes.Contains("0") || // these chars are often mistaken
|
||||
!psm.Contains("9") && textWithOutFixes.Contains("9") ||
|
||||
!psm.Contains("8") && textWithOutFixes.Contains("8") ||
|
||||
!psm.Contains("5") && textWithOutFixes.Contains("5") ||
|
||||
!psm.Contains("3") && textWithOutFixes.Contains("3") ||
|
||||
!psm.Contains("1") && textWithOutFixes.Contains("1") ||
|
||||
!psm.Contains("$") && textWithOutFixes.Contains("$") ||
|
||||
!psm.Contains("€") && textWithOutFixes.Contains("€") ||
|
||||
!psm.Contains("/") && textWithOutFixes.Contains("/") ||
|
||||
!psm.Contains("(") && textWithOutFixes.Contains("(") ||
|
||||
!psm.Contains(")") && textWithOutFixes.Contains(")") ||
|
||||
!psm.Contains("_") && textWithOutFixes.Contains("_")))
|
||||
{
|
||||
textWithOutFixes = psm;
|
||||
}
|
||||
else if (psm.Length == textWithOutFixes.Length && psm.EndsWith(".") && !textWithOutFixes.EndsWith("."))
|
||||
{
|
||||
textWithOutFixes = psm;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (textWithOutFixes.ToString().Trim().Length == 0)
|
||||
textWithOutFixes = TesseractResizeAndRetry(bitmap);
|
||||
|
||||
if (textWithOutFixes.Contains("<i>") && Utilities.CountTagInText(textWithOutFixes, "<i>") > 1)
|
||||
textWithOutFixes = "<i>" + textWithOutFixes.Replace("<i>", string.Empty).Replace("</i>", string.Empty) + "</i>";
|
||||
|
||||
int numberOfWords = textWithOutFixes.ToString().Split((" " + Environment.NewLine).ToCharArray(), StringSplitOptions.RemoveEmptyEntries).Length;
|
||||
|
||||
string line = textWithOutFixes.ToString().Trim();
|
||||
@ -2124,7 +2153,7 @@ namespace Nikse.SubtitleEdit.Forms
|
||||
}
|
||||
}
|
||||
|
||||
if (wordsNotFound > 0 || correctWords == 0 || textWithOutFixes != null && textWithOutFixes.ToString().Replace("~", string.Empty).Trim().Length == 0)
|
||||
if (wordsNotFound > 0 || correctWords == 0 || textWithOutFixes != null && textWithOutFixes.ToString().Replace("~", string.Empty).Trim().Length < 2)
|
||||
{
|
||||
_ocrFixEngine.AutoGuessesUsed.Clear();
|
||||
_ocrFixEngine.UnknownWordsFound.Clear();
|
||||
@ -2139,7 +2168,7 @@ namespace Nikse.SubtitleEdit.Forms
|
||||
if (modiText.Length == 0)
|
||||
modiText = CallModi(index); // retry... strange MODI
|
||||
|
||||
if (modiText.Length > 1 && !modiText.Contains("0") && !modiText.Contains("9") &&
|
||||
if (modiText.Length > 1 && (!modiText.Contains("0") || line.Contains("0")) && (!modiText.Contains("9") || line.Contains("9")) &&
|
||||
Utilities.CountTagInText(modiText,"(") < 2 && Utilities.CountTagInText(modiText,")") < 2)
|
||||
{
|
||||
int modiWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(modiText, out correctWords);
|
||||
@ -2153,8 +2182,10 @@ namespace Nikse.SubtitleEdit.Forms
|
||||
modiText = modiTextOcrFixed;
|
||||
}
|
||||
|
||||
if (modiWordsNotFound < wordsNotFound)
|
||||
if (modiWordsNotFound < wordsNotFound || (textWithOutFixes.Length == 1 && modiWordsNotFound == 0))
|
||||
line = modiText; // use the modi ocr'ed text
|
||||
else if (wordsNotFound == modiWordsNotFound && modiText.EndsWith("!") && (line.EndsWith("l") || line.EndsWith("fl")))
|
||||
line = modiText;
|
||||
}
|
||||
|
||||
// take the best option - before ocr fixing, which we do again to save suggestions and prompt for user input
|
||||
@ -2187,7 +2218,7 @@ namespace Nikse.SubtitleEdit.Forms
|
||||
subtitleListView1.SetBackgroundColor(index, Color.Red);
|
||||
if (wordsNotFound == 2)
|
||||
subtitleListView1.SetBackgroundColor(index, Color.Orange);
|
||||
else if (wordsNotFound == 1)
|
||||
else if (wordsNotFound == 1 || line.Length == 1)
|
||||
subtitleListView1.SetBackgroundColor(index, Color.Yellow);
|
||||
else if (line.Trim().Length == 0)
|
||||
subtitleListView1.SetBackgroundColor(index, Color.Orange);
|
||||
|
@ -297,7 +297,7 @@ namespace Nikse.SubtitleEdit.Logic.OCR
|
||||
string lastWord = null;
|
||||
for (int i = 0; i < text.Length; i++)
|
||||
{
|
||||
if (" ¡¿,.!?:;()[]{}+-$£\"#&%\r\n".Contains(text[i].ToString()))
|
||||
if (" ¡¿,.!?:;()[]{}+-£\"#&%\r\n".Contains(text[i].ToString())) // removed $
|
||||
{
|
||||
if (word.Length > 0)
|
||||
{
|
||||
@ -319,7 +319,7 @@ namespace Nikse.SubtitleEdit.Logic.OCR
|
||||
else
|
||||
{
|
||||
word.Append(text[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (word.Length > 0) // last word
|
||||
{
|
||||
@ -929,6 +929,12 @@ namespace Nikse.SubtitleEdit.Logic.OCR
|
||||
newText = sb.ToString().TrimEnd('\r').TrimEnd('\n').TrimEnd('\r').TrimEnd('\n');
|
||||
newText = pre + newText;
|
||||
|
||||
string post = string.Empty;
|
||||
if (newText.EndsWith("</i>"))
|
||||
{
|
||||
newText = newText.Remove(newText.Length - 4, 4);
|
||||
post = "</i>";
|
||||
}
|
||||
foreach (string from in _endLineReplaceList.Keys)
|
||||
{
|
||||
if (newText.EndsWith(from))
|
||||
@ -937,6 +943,7 @@ namespace Nikse.SubtitleEdit.Logic.OCR
|
||||
newText = newText.Remove(position).Insert(position, _endLineReplaceList[from]);
|
||||
}
|
||||
}
|
||||
newText += post;
|
||||
|
||||
foreach (string from in _partialLineReplaceList.Keys)
|
||||
{
|
||||
@ -952,6 +959,16 @@ namespace Nikse.SubtitleEdit.Logic.OCR
|
||||
{
|
||||
List<string> localIgnoreWords = new List<string>();
|
||||
wordsNotFound = 0;
|
||||
|
||||
if (line.Length == 1 && !IsWordKnownOrNumber(line, line))
|
||||
{
|
||||
SpellcheckOcrTextResult res = SpellcheckOcrText(line, bitmap, new string[1] { line}, 0, line, localIgnoreWords);
|
||||
if (res.FixedWholeLine || res.Fixed)
|
||||
return res.Line;
|
||||
wordsNotFound++;
|
||||
return line;
|
||||
}
|
||||
|
||||
if (_hunspell == null)
|
||||
return line;
|
||||
|
||||
@ -1003,6 +1020,25 @@ namespace Nikse.SubtitleEdit.Logic.OCR
|
||||
if (!correct && !line.Contains(word))
|
||||
correct = true; // already fixed
|
||||
|
||||
if (!correct)
|
||||
{
|
||||
//look for match via dash'ed word, e.g. sci-fi
|
||||
string dashedWord = GetDashedWordBefore(wordNoItalics, line, words, i);
|
||||
if (!string.IsNullOrEmpty(dashedWord))
|
||||
{
|
||||
correct = IsWordKnownOrNumber(dashedWord, line);
|
||||
if (!correct)
|
||||
correct = DoSpell(dashedWord);
|
||||
}
|
||||
dashedWord = GetDashedWordAfter(wordNoItalics, line, words, i);
|
||||
if (!string.IsNullOrEmpty(dashedWord))
|
||||
{
|
||||
correct = IsWordKnownOrNumber(dashedWord, line);
|
||||
if (!correct)
|
||||
correct = DoSpell(dashedWord);
|
||||
}
|
||||
}
|
||||
|
||||
if (!correct)
|
||||
{
|
||||
wordsNotFound++;
|
||||
@ -1104,6 +1140,22 @@ namespace Nikse.SubtitleEdit.Logic.OCR
|
||||
return line;
|
||||
}
|
||||
|
||||
/// <summary>
/// Builds the hyphenated compound made of the entry preceding <paramref name="index"/> in
/// <paramref name="words"/> and <paramref name="word"/> (e.g. "sci" + "fi" gives "sci-fi"),
/// and returns it when that compound occurs in <paramref name="line"/>.
/// </summary>
/// <param name="word">Word that forms the right-hand part of the compound.</param>
/// <param name="line">Text that is searched for the compound.</param>
/// <param name="words">Word list from which the preceding entry is taken.</param>
/// <param name="index">Index into <paramref name="words"/>; the entry at index - 1 is used as the left-hand part.</param>
/// <returns>
/// The compound "previous-word", or null when there is no preceding entry or
/// <paramref name="line"/> does not contain the compound.
/// </returns>
private string GetDashedWordBefore(string word, string line, string[] words, int index)
{
    // Index 1 already has a predecessor at words[0]. The former "index > 1" test skipped it,
    // so a compound built from the first two words was never found.
    if (index > 0 && line.Contains(words[index - 1] + "-" + word))
        return words[index - 1] + "-" + word;

    return null;
}
|
||||
|
||||
/// <summary>
/// Builds the hyphenated compound made of <paramref name="word"/> and the entry following
/// <paramref name="index"/> in <paramref name="words"/> (e.g. "sci" + "fi" gives "sci-fi"),
/// and returns it when that compound occurs in <paramref name="line"/>.
/// </summary>
/// <param name="word">Word that forms the left-hand part of the compound.</param>
/// <param name="line">Text that is searched for the compound.</param>
/// <param name="words">Word list from which the following entry is taken.</param>
/// <param name="index">Index into <paramref name="words"/>; the entry at index + 1 is used as the right-hand part.</param>
/// <returns>
/// The compound "word-next", or null when there is no following entry or
/// <paramref name="line"/> does not contain the compound.
/// </returns>
private string GetDashedWordAfter(string word, string line, string[] words, int index)
{
    // Nothing follows the last entry, so there is no compound to build.
    if (index >= words.Length - 1)
    {
        return null;
    }

    string dashedCandidate = word + "-" + words[index + 1];
    return line.Contains(dashedCandidate) ? dashedCandidate : null;
}
|
||||
|
||||
private string GetWordWithDominatedCasing(string word)
|
||||
{
|
||||
string uppercaseLetters = Utilities.GetLetters(true, false, false);
|
||||
|
@ -205,6 +205,9 @@ namespace Nikse.SubtitleEdit.Logic.VobSub
|
||||
|
||||
private Bitmap GenerateBitmap(Rectangle imageDisplayArea, int imageTopFieldDataAddress, int imageBottomFieldDataAddress, List<Color> fourColors)
|
||||
{
|
||||
if (imageDisplayArea.Width <= 0 || imageDisplayArea.Height <= 0)
|
||||
return new Bitmap(1,1);
|
||||
|
||||
var bmp = new Bitmap(imageDisplayArea.Width + 1, imageDisplayArea.Height + 1);
|
||||
if (fourColors[0] != Color.Transparent)
|
||||
{
|
||||
|
Loading…
Reference in New Issue
Block a user