mirror of
https://github.com/SubtitleEdit/subtitleedit.git
synced 2024-11-25 20:52:44 +01:00
"Fix common errors" now uses the cautious auto-guess level for unknown words - thx XhmikosR :)
This commit is contained in:
parent
34609e5bf9
commit
69877e2fc7
@ -2737,14 +2737,14 @@ namespace Nikse.SubtitleEdit.Forms
|
||||
|
||||
public void FixOcrErrorsViaReplaceList(string threeLetterISOLanguageName)
|
||||
{
|
||||
OcrFixEngine ocrFixEngine = new OcrFixEngine(threeLetterISOLanguageName, null, this);
|
||||
var ocrFixEngine = new OcrFixEngine(threeLetterISOLanguageName, null, this);
|
||||
string fixAction = _language.FixCommonOcrErrors;
|
||||
int noOfFixes = 0;
|
||||
string lastLine = string.Empty;
|
||||
for (int i = 0; i < _subtitle.Paragraphs.Count; i++)
|
||||
{
|
||||
Paragraph p = _subtitle.Paragraphs[i];
|
||||
string text = ocrFixEngine.FixOcrErrors(p.Text, i, lastLine, false, false);
|
||||
var p = _subtitle.Paragraphs[i];
|
||||
string text = ocrFixEngine.FixOcrErrors(p.Text, i, lastLine, false, OcrFixEngine.AutoGuessLevel.Cautious);
|
||||
lastLine = text;
|
||||
if (p.Text != text)
|
||||
{
|
||||
|
@ -3915,7 +3915,8 @@ namespace Nikse.SubtitleEdit.Forms
|
||||
if (_ocrFixEngine != null && _ocrFixEngine.IsDictionaryLoaded)
|
||||
{
|
||||
if (checkBoxAutoFixCommonErrors.Checked)
|
||||
line = _ocrFixEngine.FixOcrErrors(line, listViewIndex, _lastLine, true, checkBoxGuessUnknownWords.Checked);
|
||||
line = _ocrFixEngine.FixOcrErrors(line, listViewIndex, _lastLine, true, GetAutoGuessLevel());
|
||||
|
||||
int correctWords;
|
||||
int wordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(line, out correctWords);
|
||||
|
||||
@ -3923,7 +3924,7 @@ namespace Nikse.SubtitleEdit.Forms
|
||||
{
|
||||
_ocrFixEngine.AutoGuessesUsed.Clear();
|
||||
_ocrFixEngine.UnknownWordsFound.Clear();
|
||||
line = _ocrFixEngine.FixUnknownWordsViaGuessOrPrompt(out wordsNotFound, line, listViewIndex, bitmap, checkBoxAutoFixCommonErrors.Checked, checkBoxPromptForUnknownWords.Checked, true, checkBoxGuessUnknownWords.Checked);
|
||||
line = _ocrFixEngine.FixUnknownWordsViaGuessOrPrompt(out wordsNotFound, line, listViewIndex, bitmap, checkBoxAutoFixCommonErrors.Checked, checkBoxPromptForUnknownWords.Checked, true, GetAutoGuessLevel());
|
||||
}
|
||||
|
||||
if (_ocrFixEngine.Abort)
|
||||
@ -4120,8 +4121,12 @@ namespace Nikse.SubtitleEdit.Forms
|
||||
string textWithOutFixes = line;
|
||||
if (_ocrFixEngine.IsDictionaryLoaded)
|
||||
{
|
||||
var autoGuessLevel = OcrFixEngine.AutoGuessLevel.None;
|
||||
if (checkBoxGuessUnknownWords.Checked)
|
||||
autoGuessLevel = OcrFixEngine.AutoGuessLevel.Aggressive;
|
||||
|
||||
if (checkBoxAutoFixCommonErrors.Checked)
|
||||
line = _ocrFixEngine.FixOcrErrors(line, listViewIndex, _lastLine, true, checkBoxGuessUnknownWords.Checked);
|
||||
line = _ocrFixEngine.FixOcrErrors(line, listViewIndex, _lastLine, true, autoGuessLevel);
|
||||
int correctWords;
|
||||
int wordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(line, out correctWords);
|
||||
|
||||
@ -4129,7 +4134,7 @@ namespace Nikse.SubtitleEdit.Forms
|
||||
{
|
||||
_ocrFixEngine.AutoGuessesUsed.Clear();
|
||||
_ocrFixEngine.UnknownWordsFound.Clear();
|
||||
line = _ocrFixEngine.FixUnknownWordsViaGuessOrPrompt(out wordsNotFound, line, listViewIndex, bitmap, checkBoxAutoFixCommonErrors.Checked, checkBoxPromptForUnknownWords.Checked, true, checkBoxGuessUnknownWords.Checked);
|
||||
line = _ocrFixEngine.FixUnknownWordsViaGuessOrPrompt(out wordsNotFound, line, listViewIndex, bitmap, checkBoxAutoFixCommonErrors.Checked, checkBoxPromptForUnknownWords.Checked, true, autoGuessLevel);
|
||||
}
|
||||
|
||||
if (_ocrFixEngine.Abort)
|
||||
@ -4427,7 +4432,7 @@ namespace Nikse.SubtitleEdit.Forms
|
||||
if (_ocrFixEngine.IsDictionaryLoaded)
|
||||
{
|
||||
if (checkBoxAutoFixCommonErrors.Checked)
|
||||
line = _ocrFixEngine.FixOcrErrors(line, listViewIndex, _lastLine, true, checkBoxGuessUnknownWords.Checked);
|
||||
line = _ocrFixEngine.FixOcrErrors(line, listViewIndex, _lastLine, true, GetAutoGuessLevel());
|
||||
int correctWords;
|
||||
int wordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(line, out correctWords);
|
||||
|
||||
@ -4435,7 +4440,7 @@ namespace Nikse.SubtitleEdit.Forms
|
||||
{
|
||||
_ocrFixEngine.AutoGuessesUsed.Clear();
|
||||
_ocrFixEngine.UnknownWordsFound.Clear();
|
||||
line = _ocrFixEngine.FixUnknownWordsViaGuessOrPrompt(out wordsNotFound, line, listViewIndex, bitmap, checkBoxAutoFixCommonErrors.Checked, checkBoxPromptForUnknownWords.Checked, true, checkBoxGuessUnknownWords.Checked);
|
||||
line = _ocrFixEngine.FixUnknownWordsViaGuessOrPrompt(out wordsNotFound, line, listViewIndex, bitmap, checkBoxAutoFixCommonErrors.Checked, checkBoxPromptForUnknownWords.Checked, true, GetAutoGuessLevel());
|
||||
}
|
||||
|
||||
if (_ocrFixEngine.Abort)
|
||||
@ -5930,7 +5935,7 @@ namespace Nikse.SubtitleEdit.Forms
|
||||
if (_ocrFixEngine.IsDictionaryLoaded)
|
||||
{
|
||||
if (checkBoxAutoFixCommonErrors.Checked)
|
||||
line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, checkBoxGuessUnknownWords.Checked);
|
||||
line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, GetAutoGuessLevel());
|
||||
int correctWords;
|
||||
int wordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(line, out correctWords);
|
||||
int oldCorrectWords = correctWords;
|
||||
@ -5942,7 +5947,7 @@ namespace Nikse.SubtitleEdit.Forms
|
||||
_ocrFixEngine.UnknownWordsFound.Clear();
|
||||
|
||||
string newUnfixedText = TesseractResizeAndRetry(bitmap);
|
||||
string newText = _ocrFixEngine.FixOcrErrors(newUnfixedText, index, _lastLine, true, checkBoxGuessUnknownWords.Checked);
|
||||
string newText = _ocrFixEngine.FixOcrErrors(newUnfixedText, index, _lastLine, true, GetAutoGuessLevel());
|
||||
int newWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(newText, out correctWords);
|
||||
|
||||
if (wordsNotFound == 1 && newWordsNotFound == 1 && newUnfixedText.EndsWith("!!") && textWithOutFixes.EndsWith("u") && newText.Length > 1)
|
||||
@ -6015,7 +6020,7 @@ namespace Nikse.SubtitleEdit.Forms
|
||||
int modiWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(oneColorText, out modiCorrectWords);
|
||||
string modiTextOcrFixed = oneColorText;
|
||||
if (checkBoxAutoFixCommonErrors.Checked)
|
||||
modiTextOcrFixed = _ocrFixEngine.FixOcrErrors(oneColorText, index, _lastLine, false, checkBoxGuessUnknownWords.Checked);
|
||||
modiTextOcrFixed = _ocrFixEngine.FixOcrErrors(oneColorText, index, _lastLine, false, GetAutoGuessLevel());
|
||||
int modiOcrCorrectedCorrectWords;
|
||||
int modiOcrCorrectedWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(modiTextOcrFixed, out modiOcrCorrectedCorrectWords);
|
||||
if (modiOcrCorrectedWordsNotFound <= modiWordsNotFound)
|
||||
@ -6031,7 +6036,7 @@ namespace Nikse.SubtitleEdit.Forms
|
||||
wordsNotFound = modiWordsNotFound;
|
||||
correctWords = modiCorrectWords;
|
||||
if (checkBoxAutoFixCommonErrors.Checked)
|
||||
line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, checkBoxGuessUnknownWords.Checked);
|
||||
line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, GetAutoGuessLevel());
|
||||
}
|
||||
else if (wordsNotFound == modiWordsNotFound && oneColorText.EndsWith("!") && (line.EndsWith("l") || line.EndsWith("fl")))
|
||||
{
|
||||
@ -6039,7 +6044,7 @@ namespace Nikse.SubtitleEdit.Forms
|
||||
wordsNotFound = modiWordsNotFound;
|
||||
correctWords = modiCorrectWords;
|
||||
if (checkBoxAutoFixCommonErrors.Checked)
|
||||
line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, checkBoxGuessUnknownWords.Checked);
|
||||
line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, GetAutoGuessLevel());
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -6063,7 +6068,7 @@ namespace Nikse.SubtitleEdit.Forms
|
||||
int modiWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(unItalicText, out modiCorrectWords);
|
||||
string modiTextOcrFixed = unItalicText;
|
||||
if (checkBoxAutoFixCommonErrors.Checked)
|
||||
modiTextOcrFixed = _ocrFixEngine.FixOcrErrors(unItalicText, index, _lastLine, false, checkBoxGuessUnknownWords.Checked);
|
||||
modiTextOcrFixed = _ocrFixEngine.FixOcrErrors(unItalicText, index, _lastLine, false, GetAutoGuessLevel());
|
||||
int modiOcrCorrectedCorrectWords;
|
||||
int modiOcrCorrectedWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(modiTextOcrFixed, out modiOcrCorrectedCorrectWords);
|
||||
if (modiOcrCorrectedWordsNotFound <= modiWordsNotFound)
|
||||
@ -6295,7 +6300,7 @@ namespace Nikse.SubtitleEdit.Forms
|
||||
{
|
||||
line = line.Replace("'.", ":");
|
||||
}
|
||||
line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, checkBoxGuessUnknownWords.Checked);
|
||||
line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, GetAutoGuessLevel());
|
||||
}
|
||||
line = "<i>" + line + "</i>";
|
||||
}
|
||||
@ -6392,7 +6397,7 @@ namespace Nikse.SubtitleEdit.Forms
|
||||
{
|
||||
string modiTextOcrFixed = modiText;
|
||||
if (checkBoxAutoFixCommonErrors.Checked)
|
||||
modiTextOcrFixed = _ocrFixEngine.FixOcrErrors(modiText, index, _lastLine, false, checkBoxGuessUnknownWords.Checked);
|
||||
modiTextOcrFixed = _ocrFixEngine.FixOcrErrors(modiText, index, _lastLine, false, GetAutoGuessLevel());
|
||||
int modiOcrCorrectedWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(modiTextOcrFixed, out correctWords);
|
||||
if (modiOcrCorrectedWordsNotFound <= modiWordsNotFound)
|
||||
modiText = modiTextOcrFixed;
|
||||
@ -6405,11 +6410,11 @@ namespace Nikse.SubtitleEdit.Forms
|
||||
}
|
||||
|
||||
// take the best option - before ocr fixing, which we do again to save suggestions and prompt for user input
|
||||
line = _ocrFixEngine.FixUnknownWordsViaGuessOrPrompt(out wordsNotFound, line, index, bitmap, checkBoxAutoFixCommonErrors.Checked, checkBoxPromptForUnknownWords.Checked, true, checkBoxGuessUnknownWords.Checked);
|
||||
line = _ocrFixEngine.FixUnknownWordsViaGuessOrPrompt(out wordsNotFound, line, index, bitmap, checkBoxAutoFixCommonErrors.Checked, checkBoxPromptForUnknownWords.Checked, true, GetAutoGuessLevel());
|
||||
}
|
||||
else
|
||||
{ // fix some error manually (modi not available)
|
||||
line = _ocrFixEngine.FixUnknownWordsViaGuessOrPrompt(out wordsNotFound, line, index, bitmap, checkBoxAutoFixCommonErrors.Checked, checkBoxPromptForUnknownWords.Checked, true, checkBoxGuessUnknownWords.Checked);
|
||||
line = _ocrFixEngine.FixUnknownWordsViaGuessOrPrompt(out wordsNotFound, line, index, bitmap, checkBoxAutoFixCommonErrors.Checked, checkBoxPromptForUnknownWords.Checked, true, GetAutoGuessLevel());
|
||||
}
|
||||
}
|
||||
|
||||
@ -6475,7 +6480,7 @@ namespace Nikse.SubtitleEdit.Forms
|
||||
else
|
||||
{ // no dictionary :(
|
||||
if (checkBoxAutoFixCommonErrors.Checked)
|
||||
line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, checkBoxGuessUnknownWords.Checked);
|
||||
line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, GetAutoGuessLevel());
|
||||
|
||||
if (badWords >= numberOfWords)
|
||||
subtitleListView1.SetBackgroundColor(index, Color.Red);
|
||||
@ -8417,5 +8422,13 @@ namespace Nikse.SubtitleEdit.Forms
|
||||
form.Show(this);
|
||||
}
|
||||
|
||||
/// <summary>
/// Translates the state of the "guess unknown words" checkbox into an
/// <see cref="OcrFixEngine.AutoGuessLevel"/> for the OCR fix engine calls on this form.
/// </summary>
/// <returns>
/// <see cref="OcrFixEngine.AutoGuessLevel.Aggressive"/> when the checkbox is checked;
/// otherwise <see cref="OcrFixEngine.AutoGuessLevel.None"/>. This helper never returns
/// <see cref="OcrFixEngine.AutoGuessLevel.Cautious"/>.
/// </returns>
private OcrFixEngine.AutoGuessLevel GetAutoGuessLevel()
{
    return checkBoxGuessUnknownWords.Checked
        ? OcrFixEngine.AutoGuessLevel.Aggressive
        : OcrFixEngine.AutoGuessLevel.None;
}
|
||||
|
||||
}
|
||||
}
|
||||
|
@ -14,6 +14,13 @@ namespace Nikse.SubtitleEdit.Logic.OCR
|
||||
{
|
||||
public class OcrFixEngine
|
||||
{
|
||||
/// <summary>
/// Selects how far the engine may go when automatically guessing a replacement
/// for a word it could not find.
/// </summary>
public enum AutoGuessLevel
{
    /// <summary>
    /// No automatic guessing: FixUnknownWordsViaGuessOrPrompt only enters its
    /// guessing branch when the level is something other than this value.
    /// </summary>
    None = 0,

    /// <summary>
    /// Guessing is enabled, but letter-based guess generation
    /// (CreateGuessesFromLetters) is not used; that path is taken only for
    /// <see cref="Aggressive"/>.
    /// </summary>
    Cautious = 1,

    /// <summary>
    /// Guessing is enabled, and words longer than five characters additionally get
    /// letter-based guesses from CreateGuessesFromLetters.
    /// </summary>
    Aggressive = 2
}
|
||||
|
||||
// Dictionaries/spellchecking/fixing
|
||||
Dictionary<string, string> _wordReplaceList;
|
||||
Dictionary<string, string> _partialLineWordBoundaryReplaceList;
|
||||
@ -386,7 +393,7 @@ namespace Nikse.SubtitleEdit.Logic.OCR
|
||||
return list;
|
||||
}
|
||||
|
||||
public string FixOcrErrors(string text, int index, string lastLine, bool logSuggestions, bool useAutoGuess)
|
||||
public string FixOcrErrors(string text, int index, string lastLine, bool logSuggestions, AutoGuessLevel autoGuess)
|
||||
{
|
||||
var sb = new StringBuilder();
|
||||
var word = new StringBuilder();
|
||||
@ -452,7 +459,7 @@ namespace Nikse.SubtitleEdit.Logic.OCR
|
||||
|
||||
text = FixCommenOcrLineErrors(sb.ToString(), lastLine);
|
||||
int wordsNotFound;
|
||||
text = FixUnknownWordsViaGuessOrPrompt(out wordsNotFound, text, index, null, true, false, logSuggestions, useAutoGuess);
|
||||
text = FixUnknownWordsViaGuessOrPrompt(out wordsNotFound, text, index, null, true, false, logSuggestions, autoGuess);
|
||||
if (Configuration.Settings.Tools.OcrFixUseHardcodedRules)
|
||||
{
|
||||
text = FixLowercaseIToUppercaseI(text, lastLine);
|
||||
@ -1424,7 +1431,7 @@ namespace Nikse.SubtitleEdit.Logic.OCR
|
||||
return newText;
|
||||
}
|
||||
|
||||
public string FixUnknownWordsViaGuessOrPrompt(out int wordsNotFound, string line, int index, Bitmap bitmap, bool autoFix, bool promptForFixingErrors, bool log, bool useAutoGuess)
|
||||
public string FixUnknownWordsViaGuessOrPrompt(out int wordsNotFound, string line, int index, Bitmap bitmap, bool autoFix, bool promptForFixingErrors, bool log, AutoGuessLevel autoGuess)
|
||||
{
|
||||
var localIgnoreWords = new List<string>();
|
||||
wordsNotFound = 0;
|
||||
@ -1549,10 +1556,10 @@ namespace Nikse.SubtitleEdit.Logic.OCR
|
||||
UnknownWordsFound.Add(string.Format("#{0}: {1}", index + 1, nf));
|
||||
}
|
||||
|
||||
if (autoFix && useAutoGuess)
|
||||
if (autoFix && autoGuess != AutoGuessLevel.None)
|
||||
{
|
||||
var guesses = new List<string>();
|
||||
if (word.Length > 5)
|
||||
if (word.Length > 5 && autoGuess == AutoGuessLevel.Aggressive)
|
||||
{
|
||||
guesses = (List<string>)CreateGuessesFromLetters(word);
|
||||
|
||||
@ -1566,7 +1573,7 @@ namespace Nikse.SubtitleEdit.Logic.OCR
|
||||
if (DoSpell(word.ToLower()))
|
||||
guesses.Insert(0, wordWithCasingChanged);
|
||||
}
|
||||
else
|
||||
else if (Configuration.Settings.Tools.OcrFixUseHardcodedRules)
|
||||
{
|
||||
if (word[0] == 'L')
|
||||
guesses.Add("I" + word.Substring(1));
|
||||
@ -1585,6 +1592,8 @@ namespace Nikse.SubtitleEdit.Logic.OCR
|
||||
guesses.Add(word.Replace("$", "s"));
|
||||
if (!word.EndsWith("€") && !word.StartsWith("€"))
|
||||
guesses.Add(word.Replace("€", "e"));
|
||||
guesses.Add(word.Replace("/", "l"));
|
||||
guesses.Add(word.Replace(")/", "y"));
|
||||
}
|
||||
foreach (string guess in guesses)
|
||||
{
|
||||
|
@ -368,16 +368,15 @@ namespace Test
|
||||
Assert.AreEqual(target._subtitle.Paragraphs[0].Text, "(laughing/clapping)");
|
||||
}
|
||||
|
||||
//Auto-guess unknown words in "Fix common errors" is now disabled
|
||||
//[TestMethod]
|
||||
//[DeploymentItem("SubtitleEdit.exe")]
|
||||
//public void FixCommonOcrErrorsSlashIsL()
|
||||
//{
|
||||
// var target = GetFixCommonErrorsLib();
|
||||
// InitializeFixCommonErrorsLine(target, "The font is ita/ic!");
|
||||
// target.FixOcrErrorsViaReplaceList("eng");
|
||||
// Assert.AreEqual(target._subtitle.Paragraphs[0].Text, "The font is italic!"); // will fail if English dictionary is not found
|
||||
//}
|
||||
/// <summary>
/// Verifies that running the OCR replace-list fix over a line containing a slash
/// in place of a lowercase 'l' ("ita/ic") yields the corrected word ("italic").
/// </summary>
/// <remarks>
/// Depends on the environment: the hardcoded OCR rules must be enabled and the
/// English dictionary must be available, otherwise the assertion fails.
/// </remarks>
[TestMethod]
[DeploymentItem("SubtitleEdit.exe")]
public void FixCommonOcrErrorsSlashIsL() // requires hardcoded rules enabled
{
    var target = GetFixCommonErrorsLib();
    InitializeFixCommonErrorsLine(target, "The font is ita/ic!");

    target.FixOcrErrorsViaReplaceList("eng");

    // Assert.AreEqual takes (expected, actual); passing them in that order keeps
    // the failure message ("Expected:<...>. Actual:<...>") truthful.
    Assert.AreEqual("The font is italic!", target._subtitle.Paragraphs[0].Text); // will fail if English dictionary is not found
}
|
||||
|
||||
[TestMethod]
|
||||
[DeploymentItem("SubtitleEdit.exe")]
|
||||
|
Loading…
Reference in New Issue
Block a user