Bugfix in OCR, plus a few minor improvements

git-svn-id: https://subtitleedit.googlecode.com/svn/trunk@119 99eadd0c-20b8-1223-b5c4-2a2b2df33de2
This commit is contained in:
niksedk 2010-10-31 21:23:24 +00:00
parent 81bac71766
commit 0e4e91281f
3 changed files with 27 additions and 23 deletions

View File

@ -255,6 +255,8 @@ namespace Nikse.SubtitleEdit.Forms
// checkBoxUseModiInTesseractForUnknownWords // checkBoxUseModiInTesseractForUnknownWords
// //
this.checkBoxUseModiInTesseractForUnknownWords.AutoSize = true; this.checkBoxUseModiInTesseractForUnknownWords.AutoSize = true;
this.checkBoxUseModiInTesseractForUnknownWords.Checked = true;
this.checkBoxUseModiInTesseractForUnknownWords.CheckState = System.Windows.Forms.CheckState.Checked;
this.checkBoxUseModiInTesseractForUnknownWords.Enabled = false; this.checkBoxUseModiInTesseractForUnknownWords.Enabled = false;
this.checkBoxUseModiInTesseractForUnknownWords.Location = new System.Drawing.Point(22, 74); this.checkBoxUseModiInTesseractForUnknownWords.Location = new System.Drawing.Point(22, 74);
this.checkBoxUseModiInTesseractForUnknownWords.Name = "checkBoxUseModiInTesseractForUnknownWords"; this.checkBoxUseModiInTesseractForUnknownWords.Name = "checkBoxUseModiInTesseractForUnknownWords";

View File

@ -999,43 +999,38 @@ namespace Nikse.SubtitleEdit.Forms
comboBoxModiLanguage.SelectedIndex = -1; comboBoxModiLanguage.SelectedIndex = -1;
} }
var sb = new StringBuilder();
int badWords = 0; int badWords = 0;
var textWithOutFixes = new StringBuilder(); string textWithOutFixes = Tesseract3DoOcrViaExe(bitmap, _languageId);
textWithOutFixes.Append(Tesseract3DoOcrViaExe(bitmap, _languageId));
if (textWithOutFixes.ToString().Trim().Length == 0) if (textWithOutFixes.ToString().Trim().Length == 0)
{ {
textWithOutFixes = new StringBuilder(); textWithOutFixes = TesseractResizeAndRetry(bitmap);
textWithOutFixes.Append(TesseractResizeAndRetry(bitmap));
} }
sb.Append(textWithOutFixes.ToString()); int numberOfWords = textWithOutFixes.ToString().Split((" " + Environment.NewLine).ToCharArray(), StringSplitOptions.RemoveEmptyEntries).Length;
int numberOfWords = sb.ToString().Split((" " + Environment.NewLine).ToCharArray(), StringSplitOptions.RemoveEmptyEntries).Length;
string line = sb.ToString().Trim(); string line = textWithOutFixes.ToString().Trim();
if (_ocrFixEngine.IsDictionaryLoaded) if (_ocrFixEngine.IsDictionaryLoaded)
{ {
if (checkBoxAutoFixCommonErrors.Checked) if (checkBoxAutoFixCommonErrors.Checked)
line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, checkBoxGuessUnknownWords.Checked); line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, checkBoxGuessUnknownWords.Checked);
int wordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(line); int correctWords;
int wordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(line, out correctWords);
if (wordsNotFound > 0) if (wordsNotFound > 0 || correctWords == 0)
{ {
string newText = TesseractResizeAndRetry(bitmap); string newUnfixedText = TesseractResizeAndRetry(bitmap);
newText = _ocrFixEngine.FixOcrErrors(newText, index, _lastLine, true, checkBoxGuessUnknownWords.Checked); string newText = _ocrFixEngine.FixOcrErrors(newUnfixedText, index, _lastLine, true, checkBoxGuessUnknownWords.Checked);
int newWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(newText); int newWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(newText, out correctWords);
if (newWordsNotFound < wordsNotFound) if (newWordsNotFound < wordsNotFound)
{ {
wordsNotFound = newWordsNotFound; wordsNotFound = newWordsNotFound;
textWithOutFixes = new StringBuilder(); textWithOutFixes = newUnfixedText;
textWithOutFixes.Append(newText); line = newText;
sb = new StringBuilder();
sb.Append(newText);
} }
} }
if (wordsNotFound > 0 || sb.ToString().Replace("~", string.Empty).Trim().Length == 0) if (wordsNotFound > 0 || correctWords == 0 || textWithOutFixes.ToString().Replace("~", string.Empty).Trim().Length == 0)
{ {
_ocrFixEngine.AutoGuessesUsed.Clear(); _ocrFixEngine.AutoGuessesUsed.Clear();
_ocrFixEngine.UnknownWordsFound.Clear(); _ocrFixEngine.UnknownWordsFound.Clear();
@ -1051,13 +1046,13 @@ namespace Nikse.SubtitleEdit.Forms
if (modiText.Length > 1) if (modiText.Length > 1)
{ {
int modiWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(modiText); int modiWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(modiText, out correctWords);
if (modiWordsNotFound > 0) if (modiWordsNotFound > 0)
{ {
string modiTextOcrFixed = modiText; string modiTextOcrFixed = modiText;
if (checkBoxAutoFixCommonErrors.Checked) if (checkBoxAutoFixCommonErrors.Checked)
modiTextOcrFixed = _ocrFixEngine.FixOcrErrors(modiText, index, _lastLine, false, checkBoxGuessUnknownWords.Checked); modiTextOcrFixed = _ocrFixEngine.FixOcrErrors(modiText, index, _lastLine, false, checkBoxGuessUnknownWords.Checked);
int modiOcrCorrectedWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(modiTextOcrFixed); int modiOcrCorrectedWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(modiTextOcrFixed, out correctWords);
if (modiOcrCorrectedWordsNotFound < modiWordsNotFound) if (modiOcrCorrectedWordsNotFound < modiWordsNotFound)
modiText = modiTextOcrFixed; modiText = modiTextOcrFixed;
} }

View File

@ -583,12 +583,15 @@ namespace Nikse.SubtitleEdit.Logic.OCR
} }
else else
{ {
if (i==0)
guesses.Add(word.Replace(@"\/", "V"));
else
guesses.Add(word.Replace(@"\/", "v"));
guesses.Add(word.Replace("fi", "fi")); guesses.Add(word.Replace("fi", "fi"));
guesses.Add(word.Replace("fi", "fj")); guesses.Add(word.Replace("fi", "fj"));
guesses.Add(word.Replace("fl", "fl")); guesses.Add(word.Replace("fl", "fl"));
if (!word.EndsWith("€") && !word.StartsWith("€")) if (!word.EndsWith("€") && !word.StartsWith("€"))
guesses.Add(word.Replace("€", "e")); guesses.Add(word.Replace("€", "e"));
guesses.Add(word.Replace("fi", "fj"));
} }
foreach (string guess in guesses) foreach (string guess in guesses)
{ {
@ -884,8 +887,9 @@ namespace Nikse.SubtitleEdit.Logic.OCR
return false; return false;
} }
public int CountUnknownWordsViaDictionary(string line) public int CountUnknownWordsViaDictionary(string line, out int numberOfCorrectWords)
{ {
numberOfCorrectWords = 0;
if (_hunspell == null) if (_hunspell == null)
return 0; return 0;
@ -900,8 +904,11 @@ namespace Nikse.SubtitleEdit.Logic.OCR
if (!correct) if (!correct)
correct = _hunspell.Spell(word.Trim('\'')); correct = _hunspell.Spell(word.Trim('\''));
if (!correct) if (correct)
numberOfCorrectWords++;
else
wordsNotFound++; wordsNotFound++;
} }
} }
return wordsNotFound; return wordsNotFound;