Refactor OCR text fixing for readability and efficiency

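This commit renames variables in the OCR text-correction code for clarity. In the form code, "lastLine" becomes "previousLine", "lastLastLine" becomes "prePreviousLine", and "lastLastP" becomes "prePrevParagraph". In OcrFixEngine, the "lastLine" parameter becomes "prevLine" (the "lastLastLine" parameter keeps its name), and the "tag" parameter of FixFrenchLApostrophe becomes "affix". It also streamlines the handling of French apostrophes by looping over an array of affixes instead of repeating six near-identical FixFrenchLApostrophe calls.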

Signed-off-by: Ivandro Jao <ivandrofly@gmail.com>
This commit is contained in:
Ivandro Jao 2024-04-02 10:47:17 +01:00
parent 31ed11f60a
commit 7cc045c9de
2 changed files with 38 additions and 38 deletions

View File

@ -756,20 +756,22 @@ namespace Nikse.SubtitleEdit.Forms
var fixAction = _language.FixCommonOcrErrors; var fixAction = _language.FixCommonOcrErrors;
var noOfFixes = 0; var noOfFixes = 0;
var lastLine = string.Empty; var previousLine = string.Empty;
for (var i = 0; i < Subtitle.Paragraphs.Count; i++) for (var i = 0; i < Subtitle.Paragraphs.Count; i++)
{ {
var p = Subtitle.Paragraphs[i]; var p = Subtitle.Paragraphs[i];
var lastLastP = Subtitle.GetParagraphOrDefault(i - 2); var prePrevParagraph = Subtitle.GetParagraphOrDefault(i - 2);
string lastLastLine = null; string prePreviousLine = null;
if (lastLastP != null && !string.IsNullOrEmpty(lastLastP.Text)) if (prePrevParagraph != null && !string.IsNullOrEmpty(prePrevParagraph.Text))
{ {
lastLastLine = lastLastP.Text; prePreviousLine = prePrevParagraph.Text;
} }
var text = _ocrFixEngine.FixOcrErrors(p.Text, Subtitle, i, lastLine, lastLastLine, false, OcrFixEngine.AutoGuessLevel.Cautious); var text = _ocrFixEngine.FixOcrErrors(p.Text, Subtitle, i, previousLine, prePreviousLine, false,
lastLine = text; OcrFixEngine.AutoGuessLevel.Cautious);
previousLine = text;
if (AllowFix(p, fixAction) && p.Text != text) if (AllowFix(p, fixAction) && p.Text != text)
{ {
var oldText = p.Text; var oldText = p.Text;

View File

@ -433,7 +433,7 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
} }
} }
public string FixOcrErrors(string input, Subtitle subtitle, int index, string lastLine, string lastLastLine, bool logSuggestions, AutoGuessLevel autoGuess) public string FixOcrErrors(string input, Subtitle subtitle, int index, string prevLine, string lastLastLine, bool logSuggestions, AutoGuessLevel autoGuess)
{ {
var text = input; var text = input;
while (text.Contains(Environment.NewLine + " ", StringComparison.Ordinal)) while (text.Contains(Environment.NewLine + " ", StringComparison.Ordinal))
@ -447,15 +447,13 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
} }
text = text.RemoveRecursiveLineBreaks().Trim(); text = text.RemoveRecursiveLineBreaks().Trim();
var textNoAssa = Utilities.RemoveSsaTags(text, true); var textNoAssa = Utilities.RemoveSsaTags(text, true);
if (textNoAssa.Length == 0) if (textNoAssa.Length == 0)
{ {
return text; return text;
} }
// Try to prevent resizing when fixing Ocr-hardcoded. // Try to prevent resizing when fixing Ocr-hardcoded.
var sb = new StringBuilder(text.Length + 2); var sb = new StringBuilder(text.Length + 2);
@ -519,7 +517,7 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
text = ReplaceWordsBeforeLineFixes(text); text = ReplaceWordsBeforeLineFixes(text);
text = FixCommonOcrLineErrors(text, subtitle, index, lastLine, lastLastLine); text = FixCommonOcrLineErrors(text, subtitle, index, prevLine, lastLastLine);
// check words split by only space and new line (as other split chars might by a part of from-replace-string, like "\/\/e're" contains slash) // check words split by only space and new line (as other split chars might by a part of from-replace-string, like "\/\/e're" contains slash)
sb = new StringBuilder(); sb = new StringBuilder();
@ -574,10 +572,10 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
FixOcrErrorsWord(lastWord, word, sb); FixOcrErrorsWord(lastWord, word, sb);
} }
text = FixCommonOcrLineErrors(sb.ToString(), subtitle, index, lastLine, lastLastLine); text = FixCommonOcrLineErrors(sb.ToString(), subtitle, index, prevLine, lastLastLine);
if (Configuration.Settings.Tools.OcrFixUseHardcodedRules) if (Configuration.Settings.Tools.OcrFixUseHardcodedRules)
{ {
text = FixLowercaseIToUppercaseI(text, lastLine); text = FixLowercaseIToUppercaseI(text, prevLine);
if (SpellCheckDictionaryName.StartsWith("en_", StringComparison.Ordinal) || _threeLetterIsoLanguageName == "eng") if (SpellCheckDictionaryName.StartsWith("en_", StringComparison.Ordinal) || _threeLetterIsoLanguageName == "eng")
{ {
var oldText = text; var oldText = text;
@ -586,12 +584,12 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
} }
else if (_threeLetterIsoLanguageName == "fra") else if (_threeLetterIsoLanguageName == "fra")
{ {
text = FixFrenchLApostrophe(text, " I'", lastLine); // the item can be prefix, infix and suffix
text = FixFrenchLApostrophe(text, " L'", lastLine); var affixes = new[] { " I'", " L'", " l'", " I", " L", " l" };
text = FixFrenchLApostrophe(text, " l'", lastLine); foreach (var affix in affixes)
text = FixFrenchLApostrophe(text, " I", lastLine); {
text = FixFrenchLApostrophe(text, " L", lastLine); text = FixFrenchLApostrophe(text, affix, prevLine);
text = FixFrenchLApostrophe(text, " l", lastLine); }
} }
text = Utilities.RemoveSpaceBetweenNumbers(text); text = Utilities.RemoveSpaceBetweenNumbers(text);
@ -677,12 +675,12 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
return false; return false;
} }
public static string FixFrenchLApostrophe(string input, string tag, string lastLine) public static string FixFrenchLApostrophe(string input, string affix, string prevLine)
{ {
var text = input; var text = input;
var isPreviousLineClose = lastLine.HasSentenceEnding(); var isPreviousLineClose = prevLine.HasSentenceEnding();
if (text.StartsWith(tag.TrimStart(), StringComparison.Ordinal) && text.Length > 3) if (text.StartsWith(affix.TrimStart(), StringComparison.Ordinal) && text.Length > 3)
{ {
if (isPreviousLineClose || char.IsUpper(text[2])) if (isPreviousLineClose || char.IsUpper(text[2]))
{ {
@ -693,7 +691,7 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
text = @"l" + text.Substring(1); text = @"l" + text.Substring(1);
} }
} }
else if (text.StartsWith("<i>" + tag.TrimStart(), StringComparison.Ordinal) && text.Length > 6) else if (text.StartsWith("<i>" + affix.TrimStart(), StringComparison.Ordinal) && text.Length > 6)
{ {
if (isPreviousLineClose || char.IsUpper(text[5])) if (isPreviousLineClose || char.IsUpper(text[5]))
{ {
@ -705,11 +703,11 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
} }
} }
var start = text.IndexOf(tag, StringComparison.Ordinal); var start = text.IndexOf(affix, StringComparison.Ordinal);
while (start > 0) while (start > 0)
{ {
lastLine = HtmlUtil.RemoveHtmlTags(text.Substring(0, start)).TrimEnd().TrimEnd('-').TrimEnd(); prevLine = HtmlUtil.RemoveHtmlTags(text.Substring(0, start)).TrimEnd().TrimEnd('-').TrimEnd();
isPreviousLineClose = string.IsNullOrEmpty(lastLine) || lastLine.EndsWith('.') || lastLine.EndsWith('!') || lastLine.EndsWith('?'); isPreviousLineClose = string.IsNullOrEmpty(prevLine) || prevLine.EndsWith('.') || prevLine.EndsWith('!') || prevLine.EndsWith('?');
if (start < text.Length - 4) if (start < text.Length - 4)
{ {
if (start == 1 && text.StartsWith('-')) if (start == 1 && text.StartsWith('-'))
@ -732,15 +730,15 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
text = text.Remove(start + 1, 1).Insert(start + 1, "l"); text = text.Remove(start + 1, 1).Insert(start + 1, "l");
} }
} }
start = text.IndexOf(tag, start + 1, StringComparison.Ordinal); start = text.IndexOf(affix, start + 1, StringComparison.Ordinal);
} }
tag = Environment.NewLine + tag.Trim(); affix = Environment.NewLine + affix.Trim();
start = text.IndexOf(tag, StringComparison.Ordinal); start = text.IndexOf(affix, StringComparison.Ordinal);
while (start > 0) while (start > 0)
{ {
lastLine = HtmlUtil.RemoveHtmlTags(text.Substring(0, start)).TrimEnd().TrimEnd('-').TrimEnd(); prevLine = HtmlUtil.RemoveHtmlTags(text.Substring(0, start)).TrimEnd().TrimEnd('-').TrimEnd();
isPreviousLineClose = string.IsNullOrEmpty(lastLine) || lastLine.EndsWith('.') || lastLine.EndsWith('!') || lastLine.EndsWith('?') || lastLine.EndsWith(".</i>", StringComparison.Ordinal); isPreviousLineClose = string.IsNullOrEmpty(prevLine) || prevLine.EndsWith('.') || prevLine.EndsWith('!') || prevLine.EndsWith('?') || prevLine.EndsWith(".</i>", StringComparison.Ordinal);
if (start < text.Length - 5) if (start < text.Length - 5)
{ {
if (start > 1) if (start > 1)
@ -758,15 +756,15 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
text = text.Remove(start + Environment.NewLine.Length, 1).Insert(start + Environment.NewLine.Length, "l"); text = text.Remove(start + Environment.NewLine.Length, 1).Insert(start + Environment.NewLine.Length, "l");
} }
} }
start = text.IndexOf(tag, start + 1, StringComparison.Ordinal); start = text.IndexOf(affix, start + 1, StringComparison.Ordinal);
} }
tag = Environment.NewLine + "<i>" + tag.Trim(); affix = Environment.NewLine + "<i>" + affix.Trim();
start = text.IndexOf(tag, StringComparison.Ordinal); start = text.IndexOf(affix, StringComparison.Ordinal);
while (start > 0) while (start > 0)
{ {
lastLine = HtmlUtil.RemoveHtmlTags(text.Substring(0, start)).TrimEnd().TrimEnd('-').TrimEnd(); prevLine = HtmlUtil.RemoveHtmlTags(text.Substring(0, start)).TrimEnd().TrimEnd('-').TrimEnd();
isPreviousLineClose = string.IsNullOrEmpty(lastLine) || lastLine.EndsWith('.') || lastLine.EndsWith('!') || lastLine.EndsWith('?') || lastLine.EndsWith(".</i>", StringComparison.Ordinal); isPreviousLineClose = string.IsNullOrEmpty(prevLine) || prevLine.EndsWith('.') || prevLine.EndsWith('!') || prevLine.EndsWith('?') || prevLine.EndsWith(".</i>", StringComparison.Ordinal);
if (start < text.Length - 8) if (start < text.Length - 8)
{ {
if (isPreviousLineClose || char.IsUpper(text[start + 5 + Environment.NewLine.Length])) if (isPreviousLineClose || char.IsUpper(text[start + 5 + Environment.NewLine.Length]))
@ -778,7 +776,7 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
text = text.Remove(start + Environment.NewLine.Length + 3, 1).Insert(start + Environment.NewLine.Length + 3, "l"); text = text.Remove(start + Environment.NewLine.Length + 3, 1).Insert(start + Environment.NewLine.Length + 3, "l");
} }
} }
start = text.IndexOf(tag, start + 1, StringComparison.Ordinal); start = text.IndexOf(affix, start + 1, StringComparison.Ordinal);
} }
return text; return text;
} }