"Fix common errors" now uses cautious auto-guessing of unknown words - thx XhmikosR :)

This commit is contained in:
niksedk 2014-06-13 20:55:05 +02:00
parent 34609e5bf9
commit 69877e2fc7
4 changed files with 57 additions and 36 deletions

View File

@ -2737,14 +2737,14 @@ namespace Nikse.SubtitleEdit.Forms
public void FixOcrErrorsViaReplaceList(string threeLetterISOLanguageName)
{
OcrFixEngine ocrFixEngine = new OcrFixEngine(threeLetterISOLanguageName, null, this);
var ocrFixEngine = new OcrFixEngine(threeLetterISOLanguageName, null, this);
string fixAction = _language.FixCommonOcrErrors;
int noOfFixes = 0;
string lastLine = string.Empty;
for (int i = 0; i < _subtitle.Paragraphs.Count; i++)
{
Paragraph p = _subtitle.Paragraphs[i];
string text = ocrFixEngine.FixOcrErrors(p.Text, i, lastLine, false, false);
var p = _subtitle.Paragraphs[i];
string text = ocrFixEngine.FixOcrErrors(p.Text, i, lastLine, false, OcrFixEngine.AutoGuessLevel.Cautious);
lastLine = text;
if (p.Text != text)
{

View File

@ -3915,7 +3915,8 @@ namespace Nikse.SubtitleEdit.Forms
if (_ocrFixEngine != null && _ocrFixEngine.IsDictionaryLoaded)
{
if (checkBoxAutoFixCommonErrors.Checked)
line = _ocrFixEngine.FixOcrErrors(line, listViewIndex, _lastLine, true, checkBoxGuessUnknownWords.Checked);
line = _ocrFixEngine.FixOcrErrors(line, listViewIndex, _lastLine, true, GetAutoGuessLevel());
int correctWords;
int wordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(line, out correctWords);
@ -3923,7 +3924,7 @@ namespace Nikse.SubtitleEdit.Forms
{
_ocrFixEngine.AutoGuessesUsed.Clear();
_ocrFixEngine.UnknownWordsFound.Clear();
line = _ocrFixEngine.FixUnknownWordsViaGuessOrPrompt(out wordsNotFound, line, listViewIndex, bitmap, checkBoxAutoFixCommonErrors.Checked, checkBoxPromptForUnknownWords.Checked, true, checkBoxGuessUnknownWords.Checked);
line = _ocrFixEngine.FixUnknownWordsViaGuessOrPrompt(out wordsNotFound, line, listViewIndex, bitmap, checkBoxAutoFixCommonErrors.Checked, checkBoxPromptForUnknownWords.Checked, true, GetAutoGuessLevel());
}
if (_ocrFixEngine.Abort)
@ -4120,8 +4121,12 @@ namespace Nikse.SubtitleEdit.Forms
string textWithOutFixes = line;
if (_ocrFixEngine.IsDictionaryLoaded)
{
var autoGuessLevel = OcrFixEngine.AutoGuessLevel.None;
if (checkBoxGuessUnknownWords.Checked)
autoGuessLevel = OcrFixEngine.AutoGuessLevel.Aggressive;
if (checkBoxAutoFixCommonErrors.Checked)
line = _ocrFixEngine.FixOcrErrors(line, listViewIndex, _lastLine, true, checkBoxGuessUnknownWords.Checked);
line = _ocrFixEngine.FixOcrErrors(line, listViewIndex, _lastLine, true, autoGuessLevel);
int correctWords;
int wordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(line, out correctWords);
@ -4129,7 +4134,7 @@ namespace Nikse.SubtitleEdit.Forms
{
_ocrFixEngine.AutoGuessesUsed.Clear();
_ocrFixEngine.UnknownWordsFound.Clear();
line = _ocrFixEngine.FixUnknownWordsViaGuessOrPrompt(out wordsNotFound, line, listViewIndex, bitmap, checkBoxAutoFixCommonErrors.Checked, checkBoxPromptForUnknownWords.Checked, true, checkBoxGuessUnknownWords.Checked);
line = _ocrFixEngine.FixUnknownWordsViaGuessOrPrompt(out wordsNotFound, line, listViewIndex, bitmap, checkBoxAutoFixCommonErrors.Checked, checkBoxPromptForUnknownWords.Checked, true, autoGuessLevel);
}
if (_ocrFixEngine.Abort)
@ -4427,7 +4432,7 @@ namespace Nikse.SubtitleEdit.Forms
if (_ocrFixEngine.IsDictionaryLoaded)
{
if (checkBoxAutoFixCommonErrors.Checked)
line = _ocrFixEngine.FixOcrErrors(line, listViewIndex, _lastLine, true, checkBoxGuessUnknownWords.Checked);
line = _ocrFixEngine.FixOcrErrors(line, listViewIndex, _lastLine, true, GetAutoGuessLevel());
int correctWords;
int wordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(line, out correctWords);
@ -4435,7 +4440,7 @@ namespace Nikse.SubtitleEdit.Forms
{
_ocrFixEngine.AutoGuessesUsed.Clear();
_ocrFixEngine.UnknownWordsFound.Clear();
line = _ocrFixEngine.FixUnknownWordsViaGuessOrPrompt(out wordsNotFound, line, listViewIndex, bitmap, checkBoxAutoFixCommonErrors.Checked, checkBoxPromptForUnknownWords.Checked, true, checkBoxGuessUnknownWords.Checked);
line = _ocrFixEngine.FixUnknownWordsViaGuessOrPrompt(out wordsNotFound, line, listViewIndex, bitmap, checkBoxAutoFixCommonErrors.Checked, checkBoxPromptForUnknownWords.Checked, true, GetAutoGuessLevel());
}
if (_ocrFixEngine.Abort)
@ -5930,7 +5935,7 @@ namespace Nikse.SubtitleEdit.Forms
if (_ocrFixEngine.IsDictionaryLoaded)
{
if (checkBoxAutoFixCommonErrors.Checked)
line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, checkBoxGuessUnknownWords.Checked);
line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, GetAutoGuessLevel());
int correctWords;
int wordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(line, out correctWords);
int oldCorrectWords = correctWords;
@ -5942,7 +5947,7 @@ namespace Nikse.SubtitleEdit.Forms
_ocrFixEngine.UnknownWordsFound.Clear();
string newUnfixedText = TesseractResizeAndRetry(bitmap);
string newText = _ocrFixEngine.FixOcrErrors(newUnfixedText, index, _lastLine, true, checkBoxGuessUnknownWords.Checked);
string newText = _ocrFixEngine.FixOcrErrors(newUnfixedText, index, _lastLine, true, GetAutoGuessLevel());
int newWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(newText, out correctWords);
if (wordsNotFound == 1 && newWordsNotFound == 1 && newUnfixedText.EndsWith("!!") && textWithOutFixes.EndsWith("u") && newText.Length > 1)
@ -6015,7 +6020,7 @@ namespace Nikse.SubtitleEdit.Forms
int modiWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(oneColorText, out modiCorrectWords);
string modiTextOcrFixed = oneColorText;
if (checkBoxAutoFixCommonErrors.Checked)
modiTextOcrFixed = _ocrFixEngine.FixOcrErrors(oneColorText, index, _lastLine, false, checkBoxGuessUnknownWords.Checked);
modiTextOcrFixed = _ocrFixEngine.FixOcrErrors(oneColorText, index, _lastLine, false, GetAutoGuessLevel());
int modiOcrCorrectedCorrectWords;
int modiOcrCorrectedWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(modiTextOcrFixed, out modiOcrCorrectedCorrectWords);
if (modiOcrCorrectedWordsNotFound <= modiWordsNotFound)
@ -6031,7 +6036,7 @@ namespace Nikse.SubtitleEdit.Forms
wordsNotFound = modiWordsNotFound;
correctWords = modiCorrectWords;
if (checkBoxAutoFixCommonErrors.Checked)
line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, checkBoxGuessUnknownWords.Checked);
line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, GetAutoGuessLevel());
}
else if (wordsNotFound == modiWordsNotFound && oneColorText.EndsWith("!") && (line.EndsWith("l") || line.EndsWith("fl")))
{
@ -6039,7 +6044,7 @@ namespace Nikse.SubtitleEdit.Forms
wordsNotFound = modiWordsNotFound;
correctWords = modiCorrectWords;
if (checkBoxAutoFixCommonErrors.Checked)
line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, checkBoxGuessUnknownWords.Checked);
line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, GetAutoGuessLevel());
}
}
}
@ -6063,7 +6068,7 @@ namespace Nikse.SubtitleEdit.Forms
int modiWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(unItalicText, out modiCorrectWords);
string modiTextOcrFixed = unItalicText;
if (checkBoxAutoFixCommonErrors.Checked)
modiTextOcrFixed = _ocrFixEngine.FixOcrErrors(unItalicText, index, _lastLine, false, checkBoxGuessUnknownWords.Checked);
modiTextOcrFixed = _ocrFixEngine.FixOcrErrors(unItalicText, index, _lastLine, false, GetAutoGuessLevel());
int modiOcrCorrectedCorrectWords;
int modiOcrCorrectedWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(modiTextOcrFixed, out modiOcrCorrectedCorrectWords);
if (modiOcrCorrectedWordsNotFound <= modiWordsNotFound)
@ -6295,7 +6300,7 @@ namespace Nikse.SubtitleEdit.Forms
{
line = line.Replace("'.", ":");
}
line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, checkBoxGuessUnknownWords.Checked);
line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, GetAutoGuessLevel());
}
line = "<i>" + line + "</i>";
}
@ -6392,7 +6397,7 @@ namespace Nikse.SubtitleEdit.Forms
{
string modiTextOcrFixed = modiText;
if (checkBoxAutoFixCommonErrors.Checked)
modiTextOcrFixed = _ocrFixEngine.FixOcrErrors(modiText, index, _lastLine, false, checkBoxGuessUnknownWords.Checked);
modiTextOcrFixed = _ocrFixEngine.FixOcrErrors(modiText, index, _lastLine, false, GetAutoGuessLevel());
int modiOcrCorrectedWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(modiTextOcrFixed, out correctWords);
if (modiOcrCorrectedWordsNotFound <= modiWordsNotFound)
modiText = modiTextOcrFixed;
@ -6405,11 +6410,11 @@ namespace Nikse.SubtitleEdit.Forms
}
// take the best option - before ocr fixing, which we do again to save suggestions and prompt for user input
line = _ocrFixEngine.FixUnknownWordsViaGuessOrPrompt(out wordsNotFound, line, index, bitmap, checkBoxAutoFixCommonErrors.Checked, checkBoxPromptForUnknownWords.Checked, true, checkBoxGuessUnknownWords.Checked);
line = _ocrFixEngine.FixUnknownWordsViaGuessOrPrompt(out wordsNotFound, line, index, bitmap, checkBoxAutoFixCommonErrors.Checked, checkBoxPromptForUnknownWords.Checked, true, GetAutoGuessLevel());
}
else
{ // fix some error manually (modi not available)
line = _ocrFixEngine.FixUnknownWordsViaGuessOrPrompt(out wordsNotFound, line, index, bitmap, checkBoxAutoFixCommonErrors.Checked, checkBoxPromptForUnknownWords.Checked, true, checkBoxGuessUnknownWords.Checked);
line = _ocrFixEngine.FixUnknownWordsViaGuessOrPrompt(out wordsNotFound, line, index, bitmap, checkBoxAutoFixCommonErrors.Checked, checkBoxPromptForUnknownWords.Checked, true, GetAutoGuessLevel());
}
}
@ -6475,7 +6480,7 @@ namespace Nikse.SubtitleEdit.Forms
else
{ // no dictionary :(
if (checkBoxAutoFixCommonErrors.Checked)
line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, checkBoxGuessUnknownWords.Checked);
line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, GetAutoGuessLevel());
if (badWords >= numberOfWords)
subtitleListView1.SetBackgroundColor(index, Color.Red);
@ -8417,5 +8422,13 @@ namespace Nikse.SubtitleEdit.Forms
form.Show(this);
}
/// <summary>
/// Translates the state of the "guess unknown words" checkbox into an auto-guess level.
/// </summary>
/// <returns>
/// <c>Aggressive</c> when the checkbox is checked; otherwise <c>None</c>.
/// </returns>
private OcrFixEngine.AutoGuessLevel GetAutoGuessLevel()
{
    return checkBoxGuessUnknownWords.Checked
        ? OcrFixEngine.AutoGuessLevel.Aggressive
        : OcrFixEngine.AutoGuessLevel.None;
}
}
}

View File

@ -14,6 +14,13 @@ namespace Nikse.SubtitleEdit.Logic.OCR
{
public class OcrFixEngine
{
/// <summary>
/// Controls how far the engine may go when replacing unknown words with guesses.
/// </summary>
public enum AutoGuessLevel
{
    /// <summary>Unknown words are never replaced by a guess.</summary>
    None = 0,

    /// <summary>
    /// Guessing is enabled, but guesses built from the letters of words longer
    /// than five characters are not generated.
    /// </summary>
    Cautious = 1,

    /// <summary>
    /// Guessing is enabled, including guesses built from the letters of words
    /// longer than five characters.
    /// </summary>
    Aggressive = 2
}
// Dictionaries/spellchecking/fixing
Dictionary<string, string> _wordReplaceList;
Dictionary<string, string> _partialLineWordBoundaryReplaceList;
@ -386,7 +393,7 @@ namespace Nikse.SubtitleEdit.Logic.OCR
return list;
}
public string FixOcrErrors(string text, int index, string lastLine, bool logSuggestions, bool useAutoGuess)
public string FixOcrErrors(string text, int index, string lastLine, bool logSuggestions, AutoGuessLevel autoGuess)
{
var sb = new StringBuilder();
var word = new StringBuilder();
@ -452,7 +459,7 @@ namespace Nikse.SubtitleEdit.Logic.OCR
text = FixCommenOcrLineErrors(sb.ToString(), lastLine);
int wordsNotFound;
text = FixUnknownWordsViaGuessOrPrompt(out wordsNotFound, text, index, null, true, false, logSuggestions, useAutoGuess);
text = FixUnknownWordsViaGuessOrPrompt(out wordsNotFound, text, index, null, true, false, logSuggestions, autoGuess);
if (Configuration.Settings.Tools.OcrFixUseHardcodedRules)
{
text = FixLowercaseIToUppercaseI(text, lastLine);
@ -1424,7 +1431,7 @@ namespace Nikse.SubtitleEdit.Logic.OCR
return newText;
}
public string FixUnknownWordsViaGuessOrPrompt(out int wordsNotFound, string line, int index, Bitmap bitmap, bool autoFix, bool promptForFixingErrors, bool log, bool useAutoGuess)
public string FixUnknownWordsViaGuessOrPrompt(out int wordsNotFound, string line, int index, Bitmap bitmap, bool autoFix, bool promptForFixingErrors, bool log, AutoGuessLevel autoGuess)
{
var localIgnoreWords = new List<string>();
wordsNotFound = 0;
@ -1549,10 +1556,10 @@ namespace Nikse.SubtitleEdit.Logic.OCR
UnknownWordsFound.Add(string.Format("#{0}: {1}", index + 1, nf));
}
if (autoFix && useAutoGuess)
if (autoFix && autoGuess != AutoGuessLevel.None)
{
var guesses = new List<string>();
if (word.Length > 5)
if (word.Length > 5 && autoGuess == AutoGuessLevel.Aggressive)
{
guesses = (List<string>)CreateGuessesFromLetters(word);
@ -1566,7 +1573,7 @@ namespace Nikse.SubtitleEdit.Logic.OCR
if (DoSpell(word.ToLower()))
guesses.Insert(0, wordWithCasingChanged);
}
else
else if (Configuration.Settings.Tools.OcrFixUseHardcodedRules)
{
if (word[0] == 'L')
guesses.Add("I" + word.Substring(1));
@ -1585,6 +1592,8 @@ namespace Nikse.SubtitleEdit.Logic.OCR
guesses.Add(word.Replace("$", "s"));
if (!word.EndsWith("€") && !word.StartsWith("€"))
guesses.Add(word.Replace("€", "e"));
guesses.Add(word.Replace("/", "l"));
guesses.Add(word.Replace(")/", "y"));
}
foreach (string guess in guesses)
{

View File

@ -368,16 +368,15 @@ namespace Test
Assert.AreEqual(target._subtitle.Paragraphs[0].Text, "(laughing/clapping)");
}
//Auto-guess unknown words in "Fix common errors" is now disabled
//[TestMethod]
//[DeploymentItem("SubtitleEdit.exe")]
//public void FixCommonOcrErrorsSlashIsL()
//{
// var target = GetFixCommonErrorsLib();
// InitializeFixCommonErrorsLine(target, "The font is ita/ic!");
// target.FixOcrErrorsViaReplaceList("eng");
// Assert.AreEqual(target._subtitle.Paragraphs[0].Text, "The font is italic!"); // will fail if English dictionary is not found
//}
/// <summary>
/// Checks that the OCR fix pass run by <c>FixOcrErrorsViaReplaceList</c> turns a '/'
/// misread inside a word into 'l', so "ita/ic" becomes "italic".
/// </summary>
/// <remarks>
/// Requires the hardcoded OCR fix rules to be enabled, and fails when the English
/// dictionary cannot be found.
/// </remarks>
[TestMethod]
[DeploymentItem("SubtitleEdit.exe")]
public void FixCommonOcrErrorsSlashIsL() // requires hardcoded rules enabled
{
    var target = GetFixCommonErrorsLib();
    InitializeFixCommonErrorsLine(target, "The font is ita/ic!");

    target.FixOcrErrorsViaReplaceList("eng");

    // MSTest's contract is (expected, actual); this order keeps failure messages accurate.
    Assert.AreEqual("The font is italic!", target._subtitle.Paragraphs[0].Text); // will fail if English dictionary is not found
}
[TestMethod]
[DeploymentItem("SubtitleEdit.exe")]