Minor fix for OCR engine

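Add a check before applying an end-of-line replacement that adds a period (the replacement is skipped when the next subtitle follows closely and starts with a lowercase letter)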
This commit is contained in:
Nikolaj Olsson 2023-12-14 18:08:02 +01:00
parent 563633fc0f
commit 58bac8c6ed
6 changed files with 68 additions and 30 deletions

View File

@ -1917,4 +1917,6 @@
<name>Zoie</name>
<name>Zuri</name>
<name>Åland Islands</name>
<name>LGBTQ</name>
<name>TSA</name>
</names>

View File

@ -509,7 +509,9 @@ namespace Test.FixCommonErrors
Configuration.Settings.Tools.OcrFixUseHardcodedRules = true;
const string input = "i.e., your killer.";
var ofe = new Nikse.SubtitleEdit.Logic.Ocr.OcrFixEngine("eng", "not there", form);
var res = ofe.FixOcrErrors(input, 1, "Ends with comma,", null, false, Nikse.SubtitleEdit.Logic.Ocr.OcrFixEngine.AutoGuessLevel.Cautious);
var subtitle = new Subtitle();
subtitle.Paragraphs.Add(new Paragraph(input, 0, 3000));
var res = ofe.FixOcrErrors(input, subtitle, 0, "Ends with comma,", null, false, Nikse.SubtitleEdit.Logic.Ocr.OcrFixEngine.AutoGuessLevel.Cautious);
Assert.AreEqual(res, "i.e., your killer.");
}
}

View File

@ -243,7 +243,7 @@ namespace Nikse.SubtitleEdit.Core.Dictionaries
return false;
}
public string FixOcrErrorViaLineReplaceList(string input)
public string FixOcrErrorViaLineReplaceList(string input, Subtitle subtitle, int index)
{
// Whole fromLine
foreach (var from in _wholeLineReplaceList.Keys)
@ -311,8 +311,13 @@ namespace Nikse.SubtitleEdit.Core.Dictionaries
{
if (newText.EndsWith(from, StringComparison.Ordinal))
{
int position = (newText.Length - from.Length);
newText = newText.Remove(position).Insert(position, _endLineReplaceList[from]);
var position = (newText.Length - from.Length);
var toText = _endLineReplaceList[from];
if (!SkipAddLineEnding(subtitle, from, toText, index))
{
newText = newText.Remove(position).Insert(position, toText);
}
}
}
newText += post;
@ -357,6 +362,40 @@ namespace Nikse.SubtitleEdit.Core.Dictionaries
return newText;
}
/// <summary>
/// Decides whether an end-of-line replacement that would introduce a trailing period
/// should be skipped because the sentence appears to continue in the next paragraph.
/// </summary>
/// <param name="subtitle">Subtitle holding the paragraph being fixed; when null nothing is skipped.</param>
/// <param name="from">The end-of-line text that matched in the replace list.</param>
/// <param name="toText">The replacement text for <paramref name="from"/>.</param>
/// <param name="index">Index of the current paragraph in <paramref name="subtitle"/>.</param>
/// <returns>
/// <c>true</c> when the replacement should NOT be applied (the next paragraph starts within
/// 600 ms of the current one ending and begins with a lowercase letter); otherwise <c>false</c>.
/// </returns>
private bool SkipAddLineEnding(Subtitle subtitle, string from, string toText, int index)
{
    // Largest gap (in milliseconds) between two paragraphs that is still treated as "same sentence".
    const double maxGapMilliseconds = 600;

    // Only replacements that ADD a period are of interest: the target must end with '.'
    // while the matched source text must not.
    if (!toText.EndsWith('.') || from.EndsWith('.'))
    {
        return false;
    }

    // Without a subtitle there is no following paragraph to inspect.
    if (subtitle == null)
    {
        return false;
    }

    // NOTE(review): GetParagraphOrDefault presumably returns null for an out-of-range index - confirm.
    var p = subtitle.GetParagraphOrDefault(index);
    var next = subtitle.GetParagraphOrDefault(index + 1);
    if (p == null || next == null)
    {
        return false;
    }

    // A long pause between the paragraphs suggests the sentence really ended.
    if (next.StartTime.TotalMilliseconds - p.EndTime.TotalMilliseconds > maxGapMilliseconds)
    {
        return false;
    }

    var nextText = HtmlUtil.RemoveHtmlTags(next.Text, true);
    if (string.IsNullOrEmpty(nextText))
    {
        return false;
    }

    // char.IsLower is true only for real lowercase letters. Comparing against ToLowerInvariant
    // would also accept caseless letters (e.g. CJK, Hebrew, Arabic), which do not indicate
    // that the sentence continues.
    return char.IsLower(nextText[0]);
}
private List<Regex> _replaceRegExes;
private static void AddToGuessList(List<string> list, string guess)

View File

@ -768,7 +768,7 @@ namespace Nikse.SubtitleEdit.Forms
lastLastLine = lastLastP.Text;
}
var text = _ocrFixEngine.FixOcrErrors(p.Text, i, lastLine, lastLastLine, false, OcrFixEngine.AutoGuessLevel.Cautious);
var text = _ocrFixEngine.FixOcrErrors(p.Text, Subtitle, i, lastLine, lastLastLine, false, OcrFixEngine.AutoGuessLevel.Cautious);
lastLine = text;
if (AllowFix(p, fixAction) && p.Text != text)
{

View File

@ -3573,7 +3573,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
if (checkBoxAutoFixCommonErrors.Checked)
{
var lastLastLine = GetLastLastText(listViewIndex);
line = _ocrFixEngine.FixOcrErrors(line, listViewIndex, _lastLine, lastLastLine, true, autoGuessLevel);
line = _ocrFixEngine.FixOcrErrors(line, _subtitle, listViewIndex, _lastLine, lastLastLine, true, autoGuessLevel);
}
int wordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(line, out var correctWords);
@ -3610,7 +3610,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
if (checkBoxAutoFixCommonErrors.Checked)
{
var lastLastLine = GetLastLastText(listViewIndex);
tempLine = _ocrFixEngine.FixOcrErrors(tempLine, listViewIndex, _lastLine, lastLastLine, true, autoGuessLevel);
tempLine = _ocrFixEngine.FixOcrErrors(tempLine, _subtitle, listViewIndex, _lastLine, lastLastLine, true, autoGuessLevel);
}
int tempWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(tempLine, out var tempCorrectWords);
@ -3981,7 +3981,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
{
if (fixCommonErrors)
{
line = _ocrFixEngine.FixOcrErrors(line, listViewIndex, _lastLine, lastLastLine, true, GetAutoGuessLevel());
line = _ocrFixEngine.FixOcrErrors(line, _subtitle, listViewIndex, _lastLine, lastLastLine, true, GetAutoGuessLevel());
}
int wordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(line, out var correctWords);
@ -4016,7 +4016,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
_ocrFixEngine.UnknownWordsFound.Clear();
if (fixCommonErrors)
{
tempLine = _ocrFixEngine.FixOcrErrors(tempLine, listViewIndex, _lastLine, lastLastLine, true, GetAutoGuessLevel());
tempLine = _ocrFixEngine.FixOcrErrors(tempLine, _subtitle, listViewIndex, _lastLine, lastLastLine, true, GetAutoGuessLevel());
}
int tempWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(tempLine, out var tempCorrectWords);
@ -5486,7 +5486,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
if (fixCommonErrors)
{
var lastLastLine = GetLastLastText(index);
line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, lastLastLine, true, GetAutoGuessLevel());
line = _ocrFixEngine.FixOcrErrors(line, _subtitle, index, _lastLine, lastLastLine, true, GetAutoGuessLevel());
}
int wordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(line, out int correctWords);
@ -5503,7 +5503,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
var newText = newUnfixedText;
if (fixCommonErrors)
{
_ocrFixEngine.FixOcrErrors(newUnfixedText, index, _lastLine, lastLastLine, true, GetAutoGuessLevel());
_ocrFixEngine.FixOcrErrors(newUnfixedText, _subtitle, index, _lastLine, lastLastLine, true, GetAutoGuessLevel());
}
int newWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(newText, out correctWords);
@ -5514,7 +5514,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
var oldOcrMethodIndex = _ocrMethodIndex;
_ocrMethodIndex = _ocrMethodIndex == _ocrMethodTesseract5 ? _ocrMethodTesseract302 : _ocrMethodTesseract5;
newUnfixedText = Tesseract3DoOcrViaExe(bitmap, _languageId, "6", _tesseractEngineMode); // 6 = Assume a single uniform block of text.
newText = _ocrFixEngine.FixOcrErrors(newUnfixedText, index, _lastLine, lastLastLine, true, GetAutoGuessLevel());
newText = _ocrFixEngine.FixOcrErrors(newUnfixedText, _subtitle, index, _lastLine, lastLastLine, true, GetAutoGuessLevel());
newWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(newText, out correctWords);
_ocrMethodIndex = oldOcrMethodIndex;
}
@ -5595,7 +5595,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
string modiTextOcrFixed = oneColorText;
if (fixCommonErrors)
{
modiTextOcrFixed = _ocrFixEngine.FixOcrErrors(oneColorText, index, _lastLine, lastLastLine, false, GetAutoGuessLevel());
modiTextOcrFixed = _ocrFixEngine.FixOcrErrors(oneColorText, _subtitle, index, _lastLine, lastLastLine, false, GetAutoGuessLevel());
}
int modiOcrCorrectedWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(modiTextOcrFixed, out var modiOcrCorrectedCorrectWords);
@ -5613,7 +5613,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
correctWords = modiCorrectWords;
if (fixCommonErrors)
{
line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, lastLastLine, true, GetAutoGuessLevel());
line = _ocrFixEngine.FixOcrErrors(line, _subtitle, index, _lastLine, lastLastLine, true, GetAutoGuessLevel());
}
}
else if (wordsNotFound == modiWordsNotFound && oneColorText.EndsWith('!') && (line.EndsWith('l') || line.EndsWith('fl')))
@ -5623,7 +5623,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
correctWords = modiCorrectWords;
if (fixCommonErrors)
{
line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, lastLastLine, true, GetAutoGuessLevel());
line = _ocrFixEngine.FixOcrErrors(line, _subtitle, index, _lastLine, lastLastLine, true, GetAutoGuessLevel());
}
}
}
@ -5650,7 +5650,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
if (fixCommonErrors)
{
var lastLastLine = GetLastLastText(index);
modiTextOcrFixed = _ocrFixEngine.FixOcrErrors(unItalicText, index, _lastLine, lastLastLine, false, GetAutoGuessLevel());
modiTextOcrFixed = _ocrFixEngine.FixOcrErrors(unItalicText, _subtitle, index, _lastLine, lastLastLine, false, GetAutoGuessLevel());
}
int modiOcrCorrectedCorrectWords;
@ -5968,7 +5968,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
}
var lastLastLine = GetLastLastText(index);
line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, lastLastLine, true, GetAutoGuessLevel());
line = _ocrFixEngine.FixOcrErrors(line, _subtitle, index, _lastLine, lastLastLine, true, GetAutoGuessLevel());
}
line = "<i>" + line + "</i>";
@ -6084,7 +6084,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
if (fixCommonErrors)
{
var lastLastLine = GetLastLastText(index);
line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, lastLastLine, true, GetAutoGuessLevel());
line = _ocrFixEngine.FixOcrErrors(line, _subtitle, index, _lastLine, lastLastLine, true, GetAutoGuessLevel());
}
ColorLineByNumberOfUnknownWords(index, badWords, line);
@ -6373,7 +6373,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
if (checkBoxAutoFixCommonErrors.Checked)
{
var lastLastLine = GetLastLastText(listViewIndex);
line = _ocrFixEngine.FixOcrErrors(line, listViewIndex, _lastLine, lastLastLine, true, autoGuessLevel);
line = _ocrFixEngine.FixOcrErrors(line, _subtitle, listViewIndex, _lastLine, lastLastLine, true, autoGuessLevel);
}
int wordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(line, out var correctWords);

View File

@ -433,7 +433,7 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
}
}
public string FixOcrErrors(string input, int index, string lastLine, string lastLastLine, bool logSuggestions, AutoGuessLevel autoGuess)
public string FixOcrErrors(string input, Subtitle subtitle, int index, string lastLine, string lastLastLine, bool logSuggestions, AutoGuessLevel autoGuess)
{
var text = input;
while (text.Contains(Environment.NewLine + " ", StringComparison.Ordinal))
@ -519,7 +519,7 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
text = ReplaceWordsBeforeLineFixes(text);
text = FixCommonOcrLineErrors(text, lastLine, lastLastLine);
text = FixCommonOcrLineErrors(text, subtitle, index, lastLine, lastLastLine);
// check words split by only space and new line (as other split chars might be a part of from-replace-string, like "\/\/e're" contains slash)
sb = new StringBuilder();
@ -574,7 +574,7 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
FixOcrErrorsWord(lastWord, word, sb);
}
text = FixCommonOcrLineErrors(sb.ToString(), lastLine, lastLastLine);
text = FixCommonOcrLineErrors(sb.ToString(), subtitle, index, lastLine, lastLastLine);
if (Configuration.Settings.Tools.OcrFixUseHardcodedRules)
{
text = FixLowercaseIToUppercaseI(text, lastLine);
@ -865,12 +865,12 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
return sb.ToString();
}
private string FixCommonOcrLineErrors(string input, string lastLine, string lastLastLine)
private string FixCommonOcrLineErrors(string input, Subtitle subtitle, int index, string lastLine, string lastLastLine)
{
var text = input;
text = FixOcrErrorViaLineReplaceList(text);
text = _ocrFixReplaceList.FixOcrErrorViaLineReplaceList(input, subtitle, index);
text = FixOcrErrorsViaHardcodedRules(text, lastLine, lastLastLine, _abbreviationList);
text = FixOcrErrorViaLineReplaceList(text);
text = _ocrFixReplaceList.FixOcrErrorViaLineReplaceList(input, subtitle, index);
if (Configuration.Settings.Tools.OcrFixUseHardcodedRules)
{
@ -1310,11 +1310,6 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
return text;
}
/// <summary>
/// Runs the line-level OCR replace list on <paramref name="input"/>.
/// </summary>
/// <param name="input">The text line to fix.</param>
/// <returns>Whatever <c>_ocrFixReplaceList.FixOcrErrorViaLineReplaceList</c> returns for the input.</returns>
public string FixOcrErrorViaLineReplaceList(string input) =>
    _ocrFixReplaceList.FixOcrErrorViaLineReplaceList(input);
public string FixUnknownWordsViaGuessOrPrompt(out int wordsNotFound, string line, int index, Bitmap bitmap, bool autoFix, bool promptForFixingErrors, bool log, AutoGuessLevel autoGuess)
{
var localIgnoreWords = new List<string>();