mirror of
https://github.com/SubtitleEdit/subtitleedit.git
synced 2024-11-22 03:02:35 +01:00
Refactor OCR text fixing for readability and efficiency
This commit renames variables in the OCR text-correction code for better clarity, changing "lastLine" to "previousLine" and "lastLastLine" to "prePreviousLine". It also streamlines the handling of French apostrophes by looping over an array of affixes instead of repeating six near-identical FixFrenchLApostrophe calls. Signed-off-by: Ivandro Jao <ivandrofly@gmail.com>
This commit is contained in:
parent
31ed11f60a
commit
7cc045c9de
@ -756,20 +756,22 @@ namespace Nikse.SubtitleEdit.Forms
|
|||||||
|
|
||||||
var fixAction = _language.FixCommonOcrErrors;
|
var fixAction = _language.FixCommonOcrErrors;
|
||||||
var noOfFixes = 0;
|
var noOfFixes = 0;
|
||||||
var lastLine = string.Empty;
|
var previousLine = string.Empty;
|
||||||
for (var i = 0; i < Subtitle.Paragraphs.Count; i++)
|
for (var i = 0; i < Subtitle.Paragraphs.Count; i++)
|
||||||
{
|
{
|
||||||
var p = Subtitle.Paragraphs[i];
|
var p = Subtitle.Paragraphs[i];
|
||||||
|
|
||||||
var lastLastP = Subtitle.GetParagraphOrDefault(i - 2);
|
var prePrevParagraph = Subtitle.GetParagraphOrDefault(i - 2);
|
||||||
string lastLastLine = null;
|
string prePreviousLine = null;
|
||||||
if (lastLastP != null && !string.IsNullOrEmpty(lastLastP.Text))
|
if (prePrevParagraph != null && !string.IsNullOrEmpty(prePrevParagraph.Text))
|
||||||
{
|
{
|
||||||
lastLastLine = lastLastP.Text;
|
prePreviousLine = prePrevParagraph.Text;
|
||||||
}
|
}
|
||||||
|
|
||||||
var text = _ocrFixEngine.FixOcrErrors(p.Text, Subtitle, i, lastLine, lastLastLine, false, OcrFixEngine.AutoGuessLevel.Cautious);
|
var text = _ocrFixEngine.FixOcrErrors(p.Text, Subtitle, i, previousLine, prePreviousLine, false,
|
||||||
lastLine = text;
|
OcrFixEngine.AutoGuessLevel.Cautious);
|
||||||
|
|
||||||
|
previousLine = text;
|
||||||
if (AllowFix(p, fixAction) && p.Text != text)
|
if (AllowFix(p, fixAction) && p.Text != text)
|
||||||
{
|
{
|
||||||
var oldText = p.Text;
|
var oldText = p.Text;
|
||||||
|
@ -433,7 +433,7 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public string FixOcrErrors(string input, Subtitle subtitle, int index, string lastLine, string lastLastLine, bool logSuggestions, AutoGuessLevel autoGuess)
|
public string FixOcrErrors(string input, Subtitle subtitle, int index, string prevLine, string lastLastLine, bool logSuggestions, AutoGuessLevel autoGuess)
|
||||||
{
|
{
|
||||||
var text = input;
|
var text = input;
|
||||||
while (text.Contains(Environment.NewLine + " ", StringComparison.Ordinal))
|
while (text.Contains(Environment.NewLine + " ", StringComparison.Ordinal))
|
||||||
@ -447,15 +447,13 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
|
|||||||
}
|
}
|
||||||
|
|
||||||
text = text.RemoveRecursiveLineBreaks().Trim();
|
text = text.RemoveRecursiveLineBreaks().Trim();
|
||||||
|
|
||||||
var textNoAssa = Utilities.RemoveSsaTags(text, true);
|
var textNoAssa = Utilities.RemoveSsaTags(text, true);
|
||||||
if (textNoAssa.Length == 0)
|
if (textNoAssa.Length == 0)
|
||||||
{
|
{
|
||||||
return text;
|
return text;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// Try to prevent resizing when fixing Ocr-hardcoded.
|
// Try to prevent resizing when fixing Ocr-hardcoded.
|
||||||
var sb = new StringBuilder(text.Length + 2);
|
var sb = new StringBuilder(text.Length + 2);
|
||||||
|
|
||||||
@ -519,7 +517,7 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
|
|||||||
|
|
||||||
text = ReplaceWordsBeforeLineFixes(text);
|
text = ReplaceWordsBeforeLineFixes(text);
|
||||||
|
|
||||||
text = FixCommonOcrLineErrors(text, subtitle, index, lastLine, lastLastLine);
|
text = FixCommonOcrLineErrors(text, subtitle, index, prevLine, lastLastLine);
|
||||||
|
|
||||||
// check words split by only space and new line (as other split chars might be a part of from-replace-string, like "\/\/e're" contains slash)
|
// check words split by only space and new line (as other split chars might be a part of from-replace-string, like "\/\/e're" contains slash)
|
||||||
sb = new StringBuilder();
|
sb = new StringBuilder();
|
||||||
@ -574,10 +572,10 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
|
|||||||
FixOcrErrorsWord(lastWord, word, sb);
|
FixOcrErrorsWord(lastWord, word, sb);
|
||||||
}
|
}
|
||||||
|
|
||||||
text = FixCommonOcrLineErrors(sb.ToString(), subtitle, index, lastLine, lastLastLine);
|
text = FixCommonOcrLineErrors(sb.ToString(), subtitle, index, prevLine, lastLastLine);
|
||||||
if (Configuration.Settings.Tools.OcrFixUseHardcodedRules)
|
if (Configuration.Settings.Tools.OcrFixUseHardcodedRules)
|
||||||
{
|
{
|
||||||
text = FixLowercaseIToUppercaseI(text, lastLine);
|
text = FixLowercaseIToUppercaseI(text, prevLine);
|
||||||
if (SpellCheckDictionaryName.StartsWith("en_", StringComparison.Ordinal) || _threeLetterIsoLanguageName == "eng")
|
if (SpellCheckDictionaryName.StartsWith("en_", StringComparison.Ordinal) || _threeLetterIsoLanguageName == "eng")
|
||||||
{
|
{
|
||||||
var oldText = text;
|
var oldText = text;
|
||||||
@ -586,12 +584,12 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
|
|||||||
}
|
}
|
||||||
else if (_threeLetterIsoLanguageName == "fra")
|
else if (_threeLetterIsoLanguageName == "fra")
|
||||||
{
|
{
|
||||||
text = FixFrenchLApostrophe(text, " I'", lastLine);
|
// the item can be prefix, infix and suffix
|
||||||
text = FixFrenchLApostrophe(text, " L'", lastLine);
|
var affixes = new[] { " I'", " L'", " l'", " I’", " L’", " l’" };
|
||||||
text = FixFrenchLApostrophe(text, " l'", lastLine);
|
foreach (var affix in affixes)
|
||||||
text = FixFrenchLApostrophe(text, " I’", lastLine);
|
{
|
||||||
text = FixFrenchLApostrophe(text, " L’", lastLine);
|
text = FixFrenchLApostrophe(text, affix, prevLine);
|
||||||
text = FixFrenchLApostrophe(text, " l’", lastLine);
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
text = Utilities.RemoveSpaceBetweenNumbers(text);
|
text = Utilities.RemoveSpaceBetweenNumbers(text);
|
||||||
@ -677,12 +675,12 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static string FixFrenchLApostrophe(string input, string tag, string lastLine)
|
public static string FixFrenchLApostrophe(string input, string affix, string prevLine)
|
||||||
{
|
{
|
||||||
var text = input;
|
var text = input;
|
||||||
var isPreviousLineClose = lastLine.HasSentenceEnding();
|
var isPreviousLineClose = prevLine.HasSentenceEnding();
|
||||||
|
|
||||||
if (text.StartsWith(tag.TrimStart(), StringComparison.Ordinal) && text.Length > 3)
|
if (text.StartsWith(affix.TrimStart(), StringComparison.Ordinal) && text.Length > 3)
|
||||||
{
|
{
|
||||||
if (isPreviousLineClose || char.IsUpper(text[2]))
|
if (isPreviousLineClose || char.IsUpper(text[2]))
|
||||||
{
|
{
|
||||||
@ -693,7 +691,7 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
|
|||||||
text = @"l" + text.Substring(1);
|
text = @"l" + text.Substring(1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if (text.StartsWith("<i>" + tag.TrimStart(), StringComparison.Ordinal) && text.Length > 6)
|
else if (text.StartsWith("<i>" + affix.TrimStart(), StringComparison.Ordinal) && text.Length > 6)
|
||||||
{
|
{
|
||||||
if (isPreviousLineClose || char.IsUpper(text[5]))
|
if (isPreviousLineClose || char.IsUpper(text[5]))
|
||||||
{
|
{
|
||||||
@ -705,11 +703,11 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
var start = text.IndexOf(tag, StringComparison.Ordinal);
|
var start = text.IndexOf(affix, StringComparison.Ordinal);
|
||||||
while (start > 0)
|
while (start > 0)
|
||||||
{
|
{
|
||||||
lastLine = HtmlUtil.RemoveHtmlTags(text.Substring(0, start)).TrimEnd().TrimEnd('-').TrimEnd();
|
prevLine = HtmlUtil.RemoveHtmlTags(text.Substring(0, start)).TrimEnd().TrimEnd('-').TrimEnd();
|
||||||
isPreviousLineClose = string.IsNullOrEmpty(lastLine) || lastLine.EndsWith('.') || lastLine.EndsWith('!') || lastLine.EndsWith('?');
|
isPreviousLineClose = string.IsNullOrEmpty(prevLine) || prevLine.EndsWith('.') || prevLine.EndsWith('!') || prevLine.EndsWith('?');
|
||||||
if (start < text.Length - 4)
|
if (start < text.Length - 4)
|
||||||
{
|
{
|
||||||
if (start == 1 && text.StartsWith('-'))
|
if (start == 1 && text.StartsWith('-'))
|
||||||
@ -732,15 +730,15 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
|
|||||||
text = text.Remove(start + 1, 1).Insert(start + 1, "l");
|
text = text.Remove(start + 1, 1).Insert(start + 1, "l");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
start = text.IndexOf(tag, start + 1, StringComparison.Ordinal);
|
start = text.IndexOf(affix, start + 1, StringComparison.Ordinal);
|
||||||
}
|
}
|
||||||
|
|
||||||
tag = Environment.NewLine + tag.Trim();
|
affix = Environment.NewLine + affix.Trim();
|
||||||
start = text.IndexOf(tag, StringComparison.Ordinal);
|
start = text.IndexOf(affix, StringComparison.Ordinal);
|
||||||
while (start > 0)
|
while (start > 0)
|
||||||
{
|
{
|
||||||
lastLine = HtmlUtil.RemoveHtmlTags(text.Substring(0, start)).TrimEnd().TrimEnd('-').TrimEnd();
|
prevLine = HtmlUtil.RemoveHtmlTags(text.Substring(0, start)).TrimEnd().TrimEnd('-').TrimEnd();
|
||||||
isPreviousLineClose = string.IsNullOrEmpty(lastLine) || lastLine.EndsWith('.') || lastLine.EndsWith('!') || lastLine.EndsWith('?') || lastLine.EndsWith(".</i>", StringComparison.Ordinal);
|
isPreviousLineClose = string.IsNullOrEmpty(prevLine) || prevLine.EndsWith('.') || prevLine.EndsWith('!') || prevLine.EndsWith('?') || prevLine.EndsWith(".</i>", StringComparison.Ordinal);
|
||||||
if (start < text.Length - 5)
|
if (start < text.Length - 5)
|
||||||
{
|
{
|
||||||
if (start > 1)
|
if (start > 1)
|
||||||
@ -758,15 +756,15 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
|
|||||||
text = text.Remove(start + Environment.NewLine.Length, 1).Insert(start + Environment.NewLine.Length, "l");
|
text = text.Remove(start + Environment.NewLine.Length, 1).Insert(start + Environment.NewLine.Length, "l");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
start = text.IndexOf(tag, start + 1, StringComparison.Ordinal);
|
start = text.IndexOf(affix, start + 1, StringComparison.Ordinal);
|
||||||
}
|
}
|
||||||
|
|
||||||
tag = Environment.NewLine + "<i>" + tag.Trim();
|
affix = Environment.NewLine + "<i>" + affix.Trim();
|
||||||
start = text.IndexOf(tag, StringComparison.Ordinal);
|
start = text.IndexOf(affix, StringComparison.Ordinal);
|
||||||
while (start > 0)
|
while (start > 0)
|
||||||
{
|
{
|
||||||
lastLine = HtmlUtil.RemoveHtmlTags(text.Substring(0, start)).TrimEnd().TrimEnd('-').TrimEnd();
|
prevLine = HtmlUtil.RemoveHtmlTags(text.Substring(0, start)).TrimEnd().TrimEnd('-').TrimEnd();
|
||||||
isPreviousLineClose = string.IsNullOrEmpty(lastLine) || lastLine.EndsWith('.') || lastLine.EndsWith('!') || lastLine.EndsWith('?') || lastLine.EndsWith(".</i>", StringComparison.Ordinal);
|
isPreviousLineClose = string.IsNullOrEmpty(prevLine) || prevLine.EndsWith('.') || prevLine.EndsWith('!') || prevLine.EndsWith('?') || prevLine.EndsWith(".</i>", StringComparison.Ordinal);
|
||||||
if (start < text.Length - 8)
|
if (start < text.Length - 8)
|
||||||
{
|
{
|
||||||
if (isPreviousLineClose || char.IsUpper(text[start + 5 + Environment.NewLine.Length]))
|
if (isPreviousLineClose || char.IsUpper(text[start + 5 + Environment.NewLine.Length]))
|
||||||
@ -778,7 +776,7 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
|
|||||||
text = text.Remove(start + Environment.NewLine.Length + 3, 1).Insert(start + Environment.NewLine.Length + 3, "l");
|
text = text.Remove(start + Environment.NewLine.Length + 3, 1).Insert(start + Environment.NewLine.Length + 3, "l");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
start = text.IndexOf(tag, start + 1, StringComparison.Ordinal);
|
start = text.IndexOf(affix, start + 1, StringComparison.Ordinal);
|
||||||
}
|
}
|
||||||
return text;
|
return text;
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user