From 69877e2fc7bbef9313c30e8526f754a5c3e799d4 Mon Sep 17 00:00:00 2001 From: niksedk Date: Fri, 13 Jun 2014 20:55:05 +0200 Subject: [PATCH] Fix common errors uses cautious auto-guess-unknown-words - thx XhmikosR :) --- src/Forms/FixCommonErrors.cs | 6 ++--- src/Forms/VobSubOcr.cs | 47 +++++++++++++++++++++------------ src/Logic/OCR/OcrFixEngine.cs | 21 ++++++++++----- src/Test/FixCommonErrorsTest.cs | 19 +++++++------ 4 files changed, 57 insertions(+), 36 deletions(-) diff --git a/src/Forms/FixCommonErrors.cs b/src/Forms/FixCommonErrors.cs index b3c7331b0..26c22de9f 100644 --- a/src/Forms/FixCommonErrors.cs +++ b/src/Forms/FixCommonErrors.cs @@ -2737,14 +2737,14 @@ namespace Nikse.SubtitleEdit.Forms public void FixOcrErrorsViaReplaceList(string threeLetterISOLanguageName) { - OcrFixEngine ocrFixEngine = new OcrFixEngine(threeLetterISOLanguageName, null, this); + var ocrFixEngine = new OcrFixEngine(threeLetterISOLanguageName, null, this); string fixAction = _language.FixCommonOcrErrors; int noOfFixes = 0; string lastLine = string.Empty; for (int i = 0; i < _subtitle.Paragraphs.Count; i++) { - Paragraph p = _subtitle.Paragraphs[i]; - string text = ocrFixEngine.FixOcrErrors(p.Text, i, lastLine, false, false); + var p = _subtitle.Paragraphs[i]; + string text = ocrFixEngine.FixOcrErrors(p.Text, i, lastLine, false, OcrFixEngine.AutoGuessLevel.Cautious); lastLine = text; if (p.Text != text) { diff --git a/src/Forms/VobSubOcr.cs b/src/Forms/VobSubOcr.cs index bce3e726d..49a6ce63e 100644 --- a/src/Forms/VobSubOcr.cs +++ b/src/Forms/VobSubOcr.cs @@ -3915,7 +3915,8 @@ namespace Nikse.SubtitleEdit.Forms if (_ocrFixEngine != null && _ocrFixEngine.IsDictionaryLoaded) { if (checkBoxAutoFixCommonErrors.Checked) - line = _ocrFixEngine.FixOcrErrors(line, listViewIndex, _lastLine, true, checkBoxGuessUnknownWords.Checked); + line = _ocrFixEngine.FixOcrErrors(line, listViewIndex, _lastLine, true, GetAutoGuessLevel()); + int correctWords; int wordsNotFound = 
_ocrFixEngine.CountUnknownWordsViaDictionary(line, out correctWords); @@ -3923,7 +3924,7 @@ namespace Nikse.SubtitleEdit.Forms { _ocrFixEngine.AutoGuessesUsed.Clear(); _ocrFixEngine.UnknownWordsFound.Clear(); - line = _ocrFixEngine.FixUnknownWordsViaGuessOrPrompt(out wordsNotFound, line, listViewIndex, bitmap, checkBoxAutoFixCommonErrors.Checked, checkBoxPromptForUnknownWords.Checked, true, checkBoxGuessUnknownWords.Checked); + line = _ocrFixEngine.FixUnknownWordsViaGuessOrPrompt(out wordsNotFound, line, listViewIndex, bitmap, checkBoxAutoFixCommonErrors.Checked, checkBoxPromptForUnknownWords.Checked, true, GetAutoGuessLevel()); } if (_ocrFixEngine.Abort) @@ -4120,8 +4121,12 @@ namespace Nikse.SubtitleEdit.Forms string textWithOutFixes = line; if (_ocrFixEngine.IsDictionaryLoaded) { + var autoGuessLevel = OcrFixEngine.AutoGuessLevel.None; + if (checkBoxGuessUnknownWords.Checked) + autoGuessLevel = OcrFixEngine.AutoGuessLevel.Aggressive; + if (checkBoxAutoFixCommonErrors.Checked) - line = _ocrFixEngine.FixOcrErrors(line, listViewIndex, _lastLine, true, checkBoxGuessUnknownWords.Checked); + line = _ocrFixEngine.FixOcrErrors(line, listViewIndex, _lastLine, true, autoGuessLevel); int correctWords; int wordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(line, out correctWords); @@ -4129,7 +4134,7 @@ namespace Nikse.SubtitleEdit.Forms { _ocrFixEngine.AutoGuessesUsed.Clear(); _ocrFixEngine.UnknownWordsFound.Clear(); - line = _ocrFixEngine.FixUnknownWordsViaGuessOrPrompt(out wordsNotFound, line, listViewIndex, bitmap, checkBoxAutoFixCommonErrors.Checked, checkBoxPromptForUnknownWords.Checked, true, checkBoxGuessUnknownWords.Checked); + line = _ocrFixEngine.FixUnknownWordsViaGuessOrPrompt(out wordsNotFound, line, listViewIndex, bitmap, checkBoxAutoFixCommonErrors.Checked, checkBoxPromptForUnknownWords.Checked, true, autoGuessLevel); } if (_ocrFixEngine.Abort) @@ -4427,7 +4432,7 @@ namespace Nikse.SubtitleEdit.Forms if (_ocrFixEngine.IsDictionaryLoaded) { if 
(checkBoxAutoFixCommonErrors.Checked) - line = _ocrFixEngine.FixOcrErrors(line, listViewIndex, _lastLine, true, checkBoxGuessUnknownWords.Checked); + line = _ocrFixEngine.FixOcrErrors(line, listViewIndex, _lastLine, true, GetAutoGuessLevel()); int correctWords; int wordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(line, out correctWords); @@ -4435,7 +4440,7 @@ namespace Nikse.SubtitleEdit.Forms { _ocrFixEngine.AutoGuessesUsed.Clear(); _ocrFixEngine.UnknownWordsFound.Clear(); - line = _ocrFixEngine.FixUnknownWordsViaGuessOrPrompt(out wordsNotFound, line, listViewIndex, bitmap, checkBoxAutoFixCommonErrors.Checked, checkBoxPromptForUnknownWords.Checked, true, checkBoxGuessUnknownWords.Checked); + line = _ocrFixEngine.FixUnknownWordsViaGuessOrPrompt(out wordsNotFound, line, listViewIndex, bitmap, checkBoxAutoFixCommonErrors.Checked, checkBoxPromptForUnknownWords.Checked, true, GetAutoGuessLevel()); } if (_ocrFixEngine.Abort) @@ -5930,7 +5935,7 @@ namespace Nikse.SubtitleEdit.Forms if (_ocrFixEngine.IsDictionaryLoaded) { if (checkBoxAutoFixCommonErrors.Checked) - line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, checkBoxGuessUnknownWords.Checked); + line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, GetAutoGuessLevel()); int correctWords; int wordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(line, out correctWords); int oldCorrectWords = correctWords; @@ -5942,7 +5947,7 @@ namespace Nikse.SubtitleEdit.Forms _ocrFixEngine.UnknownWordsFound.Clear(); string newUnfixedText = TesseractResizeAndRetry(bitmap); - string newText = _ocrFixEngine.FixOcrErrors(newUnfixedText, index, _lastLine, true, checkBoxGuessUnknownWords.Checked); + string newText = _ocrFixEngine.FixOcrErrors(newUnfixedText, index, _lastLine, true, GetAutoGuessLevel()); int newWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(newText, out correctWords); if (wordsNotFound == 1 && newWordsNotFound == 1 && newUnfixedText.EndsWith("!!") && 
textWithOutFixes.EndsWith("u") && newText.Length > 1) @@ -6015,7 +6020,7 @@ namespace Nikse.SubtitleEdit.Forms int modiWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(oneColorText, out modiCorrectWords); string modiTextOcrFixed = oneColorText; if (checkBoxAutoFixCommonErrors.Checked) - modiTextOcrFixed = _ocrFixEngine.FixOcrErrors(oneColorText, index, _lastLine, false, checkBoxGuessUnknownWords.Checked); + modiTextOcrFixed = _ocrFixEngine.FixOcrErrors(oneColorText, index, _lastLine, false, GetAutoGuessLevel()); int modiOcrCorrectedCorrectWords; int modiOcrCorrectedWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(modiTextOcrFixed, out modiOcrCorrectedCorrectWords); if (modiOcrCorrectedWordsNotFound <= modiWordsNotFound) @@ -6031,7 +6036,7 @@ namespace Nikse.SubtitleEdit.Forms wordsNotFound = modiWordsNotFound; correctWords = modiCorrectWords; if (checkBoxAutoFixCommonErrors.Checked) - line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, checkBoxGuessUnknownWords.Checked); + line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, GetAutoGuessLevel()); } else if (wordsNotFound == modiWordsNotFound && oneColorText.EndsWith("!") && (line.EndsWith("l") || line.EndsWith("fl"))) { @@ -6039,7 +6044,7 @@ namespace Nikse.SubtitleEdit.Forms wordsNotFound = modiWordsNotFound; correctWords = modiCorrectWords; if (checkBoxAutoFixCommonErrors.Checked) - line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, checkBoxGuessUnknownWords.Checked); + line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, GetAutoGuessLevel()); } } } @@ -6063,7 +6068,7 @@ namespace Nikse.SubtitleEdit.Forms int modiWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(unItalicText, out modiCorrectWords); string modiTextOcrFixed = unItalicText; if (checkBoxAutoFixCommonErrors.Checked) - modiTextOcrFixed = _ocrFixEngine.FixOcrErrors(unItalicText, index, _lastLine, false, checkBoxGuessUnknownWords.Checked); + modiTextOcrFixed = 
_ocrFixEngine.FixOcrErrors(unItalicText, index, _lastLine, false, GetAutoGuessLevel()); int modiOcrCorrectedCorrectWords; int modiOcrCorrectedWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(modiTextOcrFixed, out modiOcrCorrectedCorrectWords); if (modiOcrCorrectedWordsNotFound <= modiWordsNotFound) @@ -6295,7 +6300,7 @@ namespace Nikse.SubtitleEdit.Forms { line = line.Replace("'.", ":"); } - line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, checkBoxGuessUnknownWords.Checked); + line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, GetAutoGuessLevel()); } line = "" + line + ""; } @@ -6392,7 +6397,7 @@ namespace Nikse.SubtitleEdit.Forms { string modiTextOcrFixed = modiText; if (checkBoxAutoFixCommonErrors.Checked) - modiTextOcrFixed = _ocrFixEngine.FixOcrErrors(modiText, index, _lastLine, false, checkBoxGuessUnknownWords.Checked); + modiTextOcrFixed = _ocrFixEngine.FixOcrErrors(modiText, index, _lastLine, false, GetAutoGuessLevel()); int modiOcrCorrectedWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(modiTextOcrFixed, out correctWords); if (modiOcrCorrectedWordsNotFound <= modiWordsNotFound) modiText = modiTextOcrFixed; @@ -6405,11 +6410,11 @@ namespace Nikse.SubtitleEdit.Forms } // take the best option - before ocr fixing, which we do again to save suggestions and prompt for user input - line = _ocrFixEngine.FixUnknownWordsViaGuessOrPrompt(out wordsNotFound, line, index, bitmap, checkBoxAutoFixCommonErrors.Checked, checkBoxPromptForUnknownWords.Checked, true, checkBoxGuessUnknownWords.Checked); + line = _ocrFixEngine.FixUnknownWordsViaGuessOrPrompt(out wordsNotFound, line, index, bitmap, checkBoxAutoFixCommonErrors.Checked, checkBoxPromptForUnknownWords.Checked, true, GetAutoGuessLevel()); } else { // fix some error manually (modi not available) - line = _ocrFixEngine.FixUnknownWordsViaGuessOrPrompt(out wordsNotFound, line, index, bitmap, checkBoxAutoFixCommonErrors.Checked, checkBoxPromptForUnknownWords.Checked, 
true, checkBoxGuessUnknownWords.Checked); + line = _ocrFixEngine.FixUnknownWordsViaGuessOrPrompt(out wordsNotFound, line, index, bitmap, checkBoxAutoFixCommonErrors.Checked, checkBoxPromptForUnknownWords.Checked, true, GetAutoGuessLevel()); } } @@ -6475,7 +6480,7 @@ namespace Nikse.SubtitleEdit.Forms else { // no dictionary :( if (checkBoxAutoFixCommonErrors.Checked) - line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, checkBoxGuessUnknownWords.Checked); + line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, GetAutoGuessLevel()); if (badWords >= numberOfWords) subtitleListView1.SetBackgroundColor(index, Color.Red); @@ -8417,5 +8422,13 @@ namespace Nikse.SubtitleEdit.Forms form.Show(this); } + private OcrFixEngine.AutoGuessLevel GetAutoGuessLevel() + { + var autoGuessLevel = OcrFixEngine.AutoGuessLevel.None; + if (checkBoxGuessUnknownWords.Checked) + autoGuessLevel = OcrFixEngine.AutoGuessLevel.Aggressive; + return autoGuessLevel; + } + } } diff --git a/src/Logic/OCR/OcrFixEngine.cs b/src/Logic/OCR/OcrFixEngine.cs index 91b6b70fa..efd60b168 100644 --- a/src/Logic/OCR/OcrFixEngine.cs +++ b/src/Logic/OCR/OcrFixEngine.cs @@ -14,6 +14,13 @@ namespace Nikse.SubtitleEdit.Logic.OCR { public class OcrFixEngine { + public enum AutoGuessLevel + { + None, + Cautious, + Aggressive + } + // Dictionaries/spellchecking/fixing Dictionary _wordReplaceList; Dictionary _partialLineWordBoundaryReplaceList; @@ -386,7 +393,7 @@ namespace Nikse.SubtitleEdit.Logic.OCR return list; } - public string FixOcrErrors(string text, int index, string lastLine, bool logSuggestions, bool useAutoGuess) + public string FixOcrErrors(string text, int index, string lastLine, bool logSuggestions, AutoGuessLevel autoGuess) { var sb = new StringBuilder(); var word = new StringBuilder(); @@ -452,7 +459,7 @@ namespace Nikse.SubtitleEdit.Logic.OCR text = FixCommenOcrLineErrors(sb.ToString(), lastLine); int wordsNotFound; - text = FixUnknownWordsViaGuessOrPrompt(out wordsNotFound, text, 
index, null, true, false, logSuggestions, useAutoGuess); + text = FixUnknownWordsViaGuessOrPrompt(out wordsNotFound, text, index, null, true, false, logSuggestions, autoGuess); if (Configuration.Settings.Tools.OcrFixUseHardcodedRules) { text = FixLowercaseIToUppercaseI(text, lastLine); @@ -1424,7 +1431,7 @@ namespace Nikse.SubtitleEdit.Logic.OCR return newText; } - public string FixUnknownWordsViaGuessOrPrompt(out int wordsNotFound, string line, int index, Bitmap bitmap, bool autoFix, bool promptForFixingErrors, bool log, bool useAutoGuess) + public string FixUnknownWordsViaGuessOrPrompt(out int wordsNotFound, string line, int index, Bitmap bitmap, bool autoFix, bool promptForFixingErrors, bool log, AutoGuessLevel autoGuess) { var localIgnoreWords = new List(); wordsNotFound = 0; @@ -1549,10 +1556,10 @@ namespace Nikse.SubtitleEdit.Logic.OCR UnknownWordsFound.Add(string.Format("#{0}: {1}", index + 1, nf)); } - if (autoFix && useAutoGuess) + if (autoFix && autoGuess != AutoGuessLevel.None) { var guesses = new List(); - if (word.Length > 5) + if (word.Length > 5 && autoGuess == AutoGuessLevel.Aggressive) { guesses = (List)CreateGuessesFromLetters(word); @@ -1566,7 +1573,7 @@ namespace Nikse.SubtitleEdit.Logic.OCR if (DoSpell(word.ToLower())) guesses.Insert(0, wordWithCasingChanged); } - else + else if (Configuration.Settings.Tools.OcrFixUseHardcodedRules) { if (word[0] == 'L') guesses.Add("I" + word.Substring(1)); @@ -1585,6 +1592,8 @@ namespace Nikse.SubtitleEdit.Logic.OCR guesses.Add(word.Replace("$", "s")); if (!word.EndsWith("€") && !word.StartsWith("€")) guesses.Add(word.Replace("€", "e")); + guesses.Add(word.Replace("/", "l")); + guesses.Add(word.Replace(")/", "y")); } foreach (string guess in guesses) { diff --git a/src/Test/FixCommonErrorsTest.cs b/src/Test/FixCommonErrorsTest.cs index 748ac8295..5c6f86015 100644 --- a/src/Test/FixCommonErrorsTest.cs +++ b/src/Test/FixCommonErrorsTest.cs @@ -368,16 +368,15 @@ namespace Test 
Assert.AreEqual(target._subtitle.Paragraphs[0].Text, "(laughing/clapping)"); } - //Auto-guess unknown words in "Fix common errors" is now disabled - //[TestMethod] - //[DeploymentItem("SubtitleEdit.exe")] - //public void FixCommonOcrErrorsSlashIsL() - //{ - // var target = GetFixCommonErrorsLib(); - // InitializeFixCommonErrorsLine(target, "The font is ita/ic!"); - // target.FixOcrErrorsViaReplaceList("eng"); - // Assert.AreEqual(target._subtitle.Paragraphs[0].Text, "The font is italic!"); // will fail if English dictionary is not found - //} + [TestMethod] + [DeploymentItem("SubtitleEdit.exe")] + public void FixCommonOcrErrorsSlashIsL() // '/' read instead of 'l'; requires hardcoded rules enabled (Tools.OcrFixUseHardcodedRules) + { + var target = GetFixCommonErrorsLib(); + InitializeFixCommonErrorsLine(target, "The font is ita/ic!"); + target.FixOcrErrorsViaReplaceList("eng"); + Assert.AreEqual("The font is italic!", target._subtitle.Paragraphs[0].Text); // expected value first (MSTest argument order); will fail if English dictionary is not found + } [TestMethod] [DeploymentItem("SubtitleEdit.exe")]