mirror of
https://github.com/SubtitleEdit/subtitleedit.git
synced 2024-11-25 20:52:44 +01:00
Minor fix for OCR engine
Add check when applying a new line ending period
This commit is contained in:
parent
563633fc0f
commit
58bac8c6ed
@ -1917,4 +1917,6 @@
|
||||
<name>Zoie</name>
|
||||
<name>Zuri</name>
|
||||
<name>Åland Islands</name>
|
||||
<name>LGBTQ</name>
|
||||
<name>TSA</name>
|
||||
</names>
|
@ -509,7 +509,9 @@ namespace Test.FixCommonErrors
|
||||
Configuration.Settings.Tools.OcrFixUseHardcodedRules = true;
|
||||
const string input = "i.e., your killer.";
|
||||
var ofe = new Nikse.SubtitleEdit.Logic.Ocr.OcrFixEngine("eng", "not there", form);
|
||||
var res = ofe.FixOcrErrors(input, 1, "Ends with comma,", null, false, Nikse.SubtitleEdit.Logic.Ocr.OcrFixEngine.AutoGuessLevel.Cautious);
|
||||
var subtitle = new Subtitle();
|
||||
subtitle.Paragraphs.Add(new Paragraph(input, 0, 3000));
|
||||
var res = ofe.FixOcrErrors(input, subtitle, 0, "Ends with comma,", null, false, Nikse.SubtitleEdit.Logic.Ocr.OcrFixEngine.AutoGuessLevel.Cautious);
|
||||
Assert.AreEqual(res, "i.e., your killer.");
|
||||
}
|
||||
}
|
||||
|
@ -243,7 +243,7 @@ namespace Nikse.SubtitleEdit.Core.Dictionaries
|
||||
return false;
|
||||
}
|
||||
|
||||
public string FixOcrErrorViaLineReplaceList(string input)
|
||||
public string FixOcrErrorViaLineReplaceList(string input, Subtitle subtitle, int index)
|
||||
{
|
||||
// Whole fromLine
|
||||
foreach (var from in _wholeLineReplaceList.Keys)
|
||||
@ -311,8 +311,13 @@ namespace Nikse.SubtitleEdit.Core.Dictionaries
|
||||
{
|
||||
if (newText.EndsWith(from, StringComparison.Ordinal))
|
||||
{
|
||||
int position = (newText.Length - from.Length);
|
||||
newText = newText.Remove(position).Insert(position, _endLineReplaceList[from]);
|
||||
var position = (newText.Length - from.Length);
|
||||
var toText = _endLineReplaceList[from];
|
||||
|
||||
if (!SkipAddLineEnding(subtitle, from, toText, index))
|
||||
{
|
||||
newText = newText.Remove(position).Insert(position, toText);
|
||||
}
|
||||
}
|
||||
}
|
||||
newText += post;
|
||||
@ -357,6 +362,40 @@ namespace Nikse.SubtitleEdit.Core.Dictionaries
|
||||
return newText;
|
||||
}
|
||||
|
||||
/// <summary>
/// Decides whether an end-of-line replacement that would introduce a new trailing period
/// should be skipped because the following paragraph looks like a continuation of the
/// current sentence (it follows closely in time and starts with a lowercase letter).
/// </summary>
/// <param name="subtitle">Subtitle holding the paragraph being fixed. When null, no context check is possible and the replacement is allowed.</param>
/// <param name="from">End-of-line text that is about to be replaced.</param>
/// <param name="toText">Text that would replace <paramref name="from"/>.</param>
/// <param name="index">Index of the current paragraph within <paramref name="subtitle"/>.</param>
/// <returns>True when the replacement should be skipped; otherwise false.</returns>
private bool SkipAddLineEnding(Subtitle subtitle, string from, string toText, int index)
{
    // Largest gap (in milliseconds) between the current paragraph's end and the next
    // paragraph's start for which the next paragraph is still inspected.
    const double maxGapMilliseconds = 600;

    // Without a subtitle there is no neighbouring paragraph to inspect; guard against
    // a null reference instead of throwing.
    if (subtitle == null)
    {
        return false;
    }

    // Only replacements that add a period where there was none are candidates for skipping.
    if (!toText.EndsWith('.') || from.EndsWith('.'))
    {
        return false;
    }

    // NOTE(review): GetParagraphOrDefault presumably returns null for an out-of-range index - confirm.
    var p = subtitle.GetParagraphOrDefault(index);
    var next = subtitle.GetParagraphOrDefault(index + 1);
    if (p == null || next == null)
    {
        return false;
    }

    // A gap larger than the threshold means the next paragraph is not consulted.
    if (next.StartTime.TotalMilliseconds - p.EndTime.TotalMilliseconds > maxGapMilliseconds)
    {
        return false;
    }

    // Look at the next paragraph's text with HTML tags removed.
    var nextText = HtmlUtil.RemoveHtmlTags(next.Text, true);
    if (string.IsNullOrEmpty(nextText))
    {
        return false;
    }

    // Skip the replacement when the next text starts with a letter that equals its own
    // invariant-lowercase form.
    var firstLetter = nextText[0];
    return char.IsLetter(firstLetter) && firstLetter == char.ToLowerInvariant(firstLetter);
}
|
||||
|
||||
private List<Regex> _replaceRegExes;
|
||||
|
||||
private static void AddToGuessList(List<string> list, string guess)
|
||||
|
@ -768,7 +768,7 @@ namespace Nikse.SubtitleEdit.Forms
|
||||
lastLastLine = lastLastP.Text;
|
||||
}
|
||||
|
||||
var text = _ocrFixEngine.FixOcrErrors(p.Text, i, lastLine, lastLastLine, false, OcrFixEngine.AutoGuessLevel.Cautious);
|
||||
var text = _ocrFixEngine.FixOcrErrors(p.Text, Subtitle, i, lastLine, lastLastLine, false, OcrFixEngine.AutoGuessLevel.Cautious);
|
||||
lastLine = text;
|
||||
if (AllowFix(p, fixAction) && p.Text != text)
|
||||
{
|
||||
|
@ -3573,7 +3573,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
|
||||
if (checkBoxAutoFixCommonErrors.Checked)
|
||||
{
|
||||
var lastLastLine = GetLastLastText(listViewIndex);
|
||||
line = _ocrFixEngine.FixOcrErrors(line, listViewIndex, _lastLine, lastLastLine, true, autoGuessLevel);
|
||||
line = _ocrFixEngine.FixOcrErrors(line, _subtitle, listViewIndex, _lastLine, lastLastLine, true, autoGuessLevel);
|
||||
}
|
||||
|
||||
int wordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(line, out var correctWords);
|
||||
@ -3610,7 +3610,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
|
||||
if (checkBoxAutoFixCommonErrors.Checked)
|
||||
{
|
||||
var lastLastLine = GetLastLastText(listViewIndex);
|
||||
tempLine = _ocrFixEngine.FixOcrErrors(tempLine, listViewIndex, _lastLine, lastLastLine, true, autoGuessLevel);
|
||||
tempLine = _ocrFixEngine.FixOcrErrors(tempLine, _subtitle, listViewIndex, _lastLine, lastLastLine, true, autoGuessLevel);
|
||||
}
|
||||
|
||||
int tempWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(tempLine, out var tempCorrectWords);
|
||||
@ -3981,7 +3981,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
|
||||
{
|
||||
if (fixCommonErrors)
|
||||
{
|
||||
line = _ocrFixEngine.FixOcrErrors(line, listViewIndex, _lastLine, lastLastLine, true, GetAutoGuessLevel());
|
||||
line = _ocrFixEngine.FixOcrErrors(line, _subtitle, listViewIndex, _lastLine, lastLastLine, true, GetAutoGuessLevel());
|
||||
}
|
||||
|
||||
int wordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(line, out var correctWords);
|
||||
@ -4016,7 +4016,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
|
||||
_ocrFixEngine.UnknownWordsFound.Clear();
|
||||
if (fixCommonErrors)
|
||||
{
|
||||
tempLine = _ocrFixEngine.FixOcrErrors(tempLine, listViewIndex, _lastLine, lastLastLine, true, GetAutoGuessLevel());
|
||||
tempLine = _ocrFixEngine.FixOcrErrors(tempLine, _subtitle, listViewIndex, _lastLine, lastLastLine, true, GetAutoGuessLevel());
|
||||
}
|
||||
|
||||
int tempWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(tempLine, out var tempCorrectWords);
|
||||
@ -5486,7 +5486,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
|
||||
if (fixCommonErrors)
|
||||
{
|
||||
var lastLastLine = GetLastLastText(index);
|
||||
line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, lastLastLine, true, GetAutoGuessLevel());
|
||||
line = _ocrFixEngine.FixOcrErrors(line, _subtitle, index, _lastLine, lastLastLine, true, GetAutoGuessLevel());
|
||||
}
|
||||
|
||||
int wordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(line, out int correctWords);
|
||||
@ -5503,7 +5503,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
|
||||
var newText = newUnfixedText;
|
||||
if (fixCommonErrors)
|
||||
{
|
||||
_ocrFixEngine.FixOcrErrors(newUnfixedText, index, _lastLine, lastLastLine, true, GetAutoGuessLevel());
|
||||
_ocrFixEngine.FixOcrErrors(newUnfixedText, _subtitle, index, _lastLine, lastLastLine, true, GetAutoGuessLevel());
|
||||
}
|
||||
|
||||
int newWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(newText, out correctWords);
|
||||
@ -5514,7 +5514,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
|
||||
var oldOcrMethodIndex = _ocrMethodIndex;
|
||||
_ocrMethodIndex = _ocrMethodIndex == _ocrMethodTesseract5 ? _ocrMethodTesseract302 : _ocrMethodTesseract5;
|
||||
newUnfixedText = Tesseract3DoOcrViaExe(bitmap, _languageId, "6", _tesseractEngineMode); // 6 = Assume a single uniform block of text.
|
||||
newText = _ocrFixEngine.FixOcrErrors(newUnfixedText, index, _lastLine, lastLastLine, true, GetAutoGuessLevel());
|
||||
newText = _ocrFixEngine.FixOcrErrors(newUnfixedText, _subtitle, index, _lastLine, lastLastLine, true, GetAutoGuessLevel());
|
||||
newWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(newText, out correctWords);
|
||||
_ocrMethodIndex = oldOcrMethodIndex;
|
||||
}
|
||||
@ -5595,7 +5595,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
|
||||
string modiTextOcrFixed = oneColorText;
|
||||
if (fixCommonErrors)
|
||||
{
|
||||
modiTextOcrFixed = _ocrFixEngine.FixOcrErrors(oneColorText, index, _lastLine, lastLastLine, false, GetAutoGuessLevel());
|
||||
modiTextOcrFixed = _ocrFixEngine.FixOcrErrors(oneColorText, _subtitle, index, _lastLine, lastLastLine, false, GetAutoGuessLevel());
|
||||
}
|
||||
|
||||
int modiOcrCorrectedWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(modiTextOcrFixed, out var modiOcrCorrectedCorrectWords);
|
||||
@ -5613,7 +5613,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
|
||||
correctWords = modiCorrectWords;
|
||||
if (fixCommonErrors)
|
||||
{
|
||||
line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, lastLastLine, true, GetAutoGuessLevel());
|
||||
line = _ocrFixEngine.FixOcrErrors(line, _subtitle, index, _lastLine, lastLastLine, true, GetAutoGuessLevel());
|
||||
}
|
||||
}
|
||||
else if (wordsNotFound == modiWordsNotFound && oneColorText.EndsWith('!') && (line.EndsWith('l') || line.EndsWith('fl')))
|
||||
@ -5623,7 +5623,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
|
||||
correctWords = modiCorrectWords;
|
||||
if (fixCommonErrors)
|
||||
{
|
||||
line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, lastLastLine, true, GetAutoGuessLevel());
|
||||
line = _ocrFixEngine.FixOcrErrors(line, _subtitle, index, _lastLine, lastLastLine, true, GetAutoGuessLevel());
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -5650,7 +5650,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
|
||||
if (fixCommonErrors)
|
||||
{
|
||||
var lastLastLine = GetLastLastText(index);
|
||||
modiTextOcrFixed = _ocrFixEngine.FixOcrErrors(unItalicText, index, _lastLine, lastLastLine, false, GetAutoGuessLevel());
|
||||
modiTextOcrFixed = _ocrFixEngine.FixOcrErrors(unItalicText, _subtitle, index, _lastLine, lastLastLine, false, GetAutoGuessLevel());
|
||||
}
|
||||
|
||||
int modiOcrCorrectedCorrectWords;
|
||||
@ -5968,7 +5968,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
|
||||
}
|
||||
|
||||
var lastLastLine = GetLastLastText(index);
|
||||
line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, lastLastLine, true, GetAutoGuessLevel());
|
||||
line = _ocrFixEngine.FixOcrErrors(line, _subtitle, index, _lastLine, lastLastLine, true, GetAutoGuessLevel());
|
||||
}
|
||||
|
||||
line = "<i>" + line + "</i>";
|
||||
@ -6084,7 +6084,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
|
||||
if (fixCommonErrors)
|
||||
{
|
||||
var lastLastLine = GetLastLastText(index);
|
||||
line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, lastLastLine, true, GetAutoGuessLevel());
|
||||
line = _ocrFixEngine.FixOcrErrors(line, _subtitle, index, _lastLine, lastLastLine, true, GetAutoGuessLevel());
|
||||
}
|
||||
|
||||
ColorLineByNumberOfUnknownWords(index, badWords, line);
|
||||
@ -6373,7 +6373,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
|
||||
if (checkBoxAutoFixCommonErrors.Checked)
|
||||
{
|
||||
var lastLastLine = GetLastLastText(listViewIndex);
|
||||
line = _ocrFixEngine.FixOcrErrors(line, listViewIndex, _lastLine, lastLastLine, true, autoGuessLevel);
|
||||
line = _ocrFixEngine.FixOcrErrors(line, _subtitle, listViewIndex, _lastLine, lastLastLine, true, autoGuessLevel);
|
||||
}
|
||||
|
||||
int wordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(line, out var correctWords);
|
||||
|
@ -433,7 +433,7 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
|
||||
}
|
||||
}
|
||||
|
||||
public string FixOcrErrors(string input, int index, string lastLine, string lastLastLine, bool logSuggestions, AutoGuessLevel autoGuess)
|
||||
public string FixOcrErrors(string input, Subtitle subtitle, int index, string lastLine, string lastLastLine, bool logSuggestions, AutoGuessLevel autoGuess)
|
||||
{
|
||||
var text = input;
|
||||
while (text.Contains(Environment.NewLine + " ", StringComparison.Ordinal))
|
||||
@ -519,7 +519,7 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
|
||||
|
||||
text = ReplaceWordsBeforeLineFixes(text);
|
||||
|
||||
text = FixCommonOcrLineErrors(text, lastLine, lastLastLine);
|
||||
text = FixCommonOcrLineErrors(text, subtitle, index, lastLine, lastLastLine);
|
||||
|
||||
// check words split by only space and new line (as other split chars might by a part of from-replace-string, like "\/\/e're" contains slash)
|
||||
sb = new StringBuilder();
|
||||
@ -574,7 +574,7 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
|
||||
FixOcrErrorsWord(lastWord, word, sb);
|
||||
}
|
||||
|
||||
text = FixCommonOcrLineErrors(sb.ToString(), lastLine, lastLastLine);
|
||||
text = FixCommonOcrLineErrors(sb.ToString(), subtitle, index, lastLine, lastLastLine);
|
||||
if (Configuration.Settings.Tools.OcrFixUseHardcodedRules)
|
||||
{
|
||||
text = FixLowercaseIToUppercaseI(text, lastLine);
|
||||
@ -865,12 +865,12 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
|
||||
return sb.ToString();
|
||||
}
|
||||
|
||||
private string FixCommonOcrLineErrors(string input, string lastLine, string lastLastLine)
|
||||
private string FixCommonOcrLineErrors(string input, Subtitle subtitle, int index, string lastLine, string lastLastLine)
|
||||
{
|
||||
var text = input;
|
||||
text = FixOcrErrorViaLineReplaceList(text);
|
||||
text = _ocrFixReplaceList.FixOcrErrorViaLineReplaceList(input, subtitle, index);
|
||||
text = FixOcrErrorsViaHardcodedRules(text, lastLine, lastLastLine, _abbreviationList);
|
||||
text = FixOcrErrorViaLineReplaceList(text);
|
||||
text = _ocrFixReplaceList.FixOcrErrorViaLineReplaceList(input, subtitle, index);
|
||||
|
||||
if (Configuration.Settings.Tools.OcrFixUseHardcodedRules)
|
||||
{
|
||||
@ -1310,11 +1310,6 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
|
||||
return text;
|
||||
}
|
||||
|
||||
public string FixOcrErrorViaLineReplaceList(string input)
|
||||
{
|
||||
return _ocrFixReplaceList.FixOcrErrorViaLineReplaceList(input);
|
||||
}
|
||||
|
||||
public string FixUnknownWordsViaGuessOrPrompt(out int wordsNotFound, string line, int index, Bitmap bitmap, bool autoFix, bool promptForFixingErrors, bool log, AutoGuessLevel autoGuess)
|
||||
{
|
||||
var localIgnoreWords = new List<string>();
|
||||
|
Loading…
Reference in New Issue
Block a user