From 3ad0515c6becd1ca00428019fc27e6ec0564f9fe Mon Sep 17 00:00:00 2001 From: niksedk Date: Fri, 9 Mar 2012 15:17:46 +0000 Subject: [PATCH] Optimized fix common errors a bit git-svn-id: https://subtitleedit.googlecode.com/svn/trunk@1026 99eadd0c-20b8-1223-b5c4-2a2b2df33de2 --- src/Logic/OCR/OcrFixEngine.cs | 132 +++++++++++++++++----------------- 1 file changed, 68 insertions(+), 64 deletions(-) diff --git a/src/Logic/OCR/OcrFixEngine.cs b/src/Logic/OCR/OcrFixEngine.cs index 7832bc5b0..5d65931e7 100644 --- a/src/Logic/OCR/OcrFixEngine.cs +++ b/src/Logic/OCR/OcrFixEngine.cs @@ -37,18 +37,18 @@ namespace Nikse.SubtitleEdit.Logic.OCR readonly Form _parentForm; private string _spellCheckDictionaryName; - static Regex regexAloneI = new Regex(@"\bi\b", RegexOptions.Compiled); - static Regex regexAloneIAsL = new Regex(@"\bl\b", RegexOptions.Compiled); - static Regex regexSpaceBetweenNumbers = new Regex(@"\d \d", RegexOptions.Compiled); - static Regex regExLowercaseL = new Regex("[A-ZÆØÅÄÖÉÁ]l[A-ZÆØÅÄÖÉÁ]", RegexOptions.Compiled); - static Regex regExUppercaseI = new Regex("[a-zæøåöäé]I.", RegexOptions.Compiled); - static Regex regExNumber1 = new Regex(@"\d\ 1", RegexOptions.Compiled); - static Regex regExQuestion = new Regex(@"\S\?[A-ZÆØÅÄÖÉÈÀÙÂÊÎÔÛËÏa-zæøåäöéèàùâêîôûëï]", RegexOptions.Compiled); - static Regex regExIandZero = new Regex(@"[a-zæøåäöé][I1]", RegexOptions.Compiled); - static Regex regExTime1 = new Regex(@"[a-zæøåäöé][0]", RegexOptions.Compiled); - static Regex regExTime2 = new Regex(@"0[a-zæøåäöé]", RegexOptions.Compiled); - static Regex hexNumber = new Regex(@"^#?[\dABDEFabcdef]+$", RegexOptions.Compiled); - static Regex startEndEndsWithNumber = new Regex(@"^\d+.+\d$", RegexOptions.Compiled); + static readonly Regex RegexAloneI = new Regex(@"\bi\b", RegexOptions.Compiled); + static readonly Regex RegexAloneIasL = new Regex(@"\bl\b", RegexOptions.Compiled); + static readonly Regex RegexSpaceBetweenNumbers = new Regex(@"\d \d", RegexOptions.Compiled); + static readonly Regex RegExLowercaseL = new Regex("[A-ZÆØÅÄÖÉÁ]l[A-ZÆØÅÄÖÉÁ]", RegexOptions.Compiled); + static readonly Regex RegExUppercaseI = new Regex("[a-zæøåöäé]I.", RegexOptions.Compiled); + static readonly Regex RegExNumber1 = new Regex(@"\d\ 1", RegexOptions.Compiled); + static readonly Regex RegExQuestion = new Regex(@"\S\?[A-ZÆØÅÄÖÉÈÀÙÂÊÎÔÛËÏa-zæøåäöéèàùâêîôûëï]", RegexOptions.Compiled); + static readonly Regex RegExIandZero = new Regex(@"[a-zæøåäöé][I1]", RegexOptions.Compiled); + static readonly Regex RegExTime1 = new Regex(@"[a-zæøåäöé][0]", RegexOptions.Compiled); + static readonly Regex RegExTime2 = new Regex(@"0[a-zæøåäöé]", RegexOptions.Compiled); + static readonly Regex HexNumber = new Regex(@"^#?[\dABDEFabcdef]+$", RegexOptions.Compiled); + static readonly Regex StartEndEndsWithNumber = new Regex(@"^\d+.+\d$", RegexOptions.Compiled); public bool Abort { get; set; } public List AutoGuessesUsed { get; set; } @@ -164,13 +164,7 @@ namespace Nikse.SubtitleEdit.Logic.OCR string dictionary = Utilities.DictionaryFolder + _fiveLetterWordListLanguageName; if (resetSkipList) { - _wordSkipList = new List(); - _wordSkipList.Add(Configuration.Settings.Tools.MusicSymbol); - _wordSkipList.Add("*"); - _wordSkipList.Add("%"); - _wordSkipList.Add("#"); - _wordSkipList.Add("+"); - _wordSkipList.Add("$"); + _wordSkipList = new List {Configuration.Settings.Tools.MusicSymbol, "*", "%", "#", "+", "$"}; } // Load names etc list (names/noise words) @@ -361,8 +355,8 @@ namespace Nikse.SubtitleEdit.Logic.OCR if (SpellCheckDictionaryName.StartsWith("en_")) { string oldText = text; - text = FixCommonErrors.FixAloneLowercaseIToUppercaseLine(regexAloneI, oldText, text, 'i'); - text = FixCommonErrors.FixAloneLowercaseIToUppercaseLine(regexAloneIAsL, oldText, text, 'l'); + text = FixCommonErrors.FixAloneLowercaseIToUppercaseLine(RegexAloneI, oldText, text, 'i'); + text = FixCommonErrors.FixAloneLowercaseIToUppercaseLine(RegexAloneIasL, oldText, text, 'l'); } text = RemoveSpaceBetweenNumbers(text); } @@ -406,7 +400,7 @@ namespace Nikse.SubtitleEdit.Logic.OCR private string RemoveSpaceBetweenNumbers(string text) { - Match match = regexSpaceBetweenNumbers.Match(text); + Match match = RegexSpaceBetweenNumbers.Match(text); while (match.Success) { bool doFix = true; @@ -417,11 +411,11 @@ namespace Nikse.SubtitleEdit.Logic.OCR if (doFix) { text = text.Remove(match.Index + 1, 1); - match = regexSpaceBetweenNumbers.Match(text); + match = RegexSpaceBetweenNumbers.Match(text); } else { - match = regexSpaceBetweenNumbers.Match(text, match.Index+1); + match = RegexSpaceBetweenNumbers.Match(text, match.Index+1); } } return text; @@ -550,21 +544,26 @@ namespace Nikse.SubtitleEdit.Logic.OCR if (word.Contains("?")) { - Match match = regExQuestion.Match(word); + Match match = RegExQuestion.Match(word); if (match.Success) word = word.Insert(match.Index + 2, " "); } foreach (string from in _wordReplaceList.Keys) { - if (from.Contains(word)) + if (word.Length == from.Length) { if (word == from) return pre + _wordReplaceList[from] + post; + } + else if (word.Length + post.Length == from.Length) + { if (word + post == from) return pre + _wordReplaceList[from]; - if (pre + word + post == from) - return _wordReplaceList[from]; + } + if (pre.Length + word.Length + post.Length == from.Length && pre + word + post == from) + { + return _wordReplaceList[from]; } } @@ -585,14 +584,19 @@ namespace Nikse.SubtitleEdit.Logic.OCR // Retry word replace list foreach (string from in _wordReplaceList.Keys) { - if (from.Contains(word)) + if (word.Length == from.Length) { if (word == from) return pre + _wordReplaceList[from] + post; + } + else if (word.Length + post.Length == from.Length) + { if (word + post == from) return pre + _wordReplaceList[from]; - if (pre + word + post == from) - return _wordReplaceList[from]; + } + if (pre.Length + word.Length + post.Length == from.Length && pre + word + post == from) + { + return _wordReplaceList[from]; } } @@ -689,24 +693,28 @@ namespace Nikse.SubtitleEdit.Logic.OCR foreach (string from in _wordReplaceList.Keys) { - if (from.Contains(word)) + if (word.Length == from.Length) { if (word == from) return pre + _wordReplaceList[from] + post; + } + else if (word.Length + post.Length == from.Length) + { if (word + post == from) return pre + _wordReplaceList[from]; - if (pre + word + post == from) - return _wordReplaceList[from]; + } + if (pre.Length + word.Length + post.Length == from.Length && pre + word + post == from) + { + return _wordReplaceList[from]; } } - return pre + word + post; } public static string Fix0InsideLowerCaseWord(string word) { - if (startEndEndsWithNumber.IsMatch(word)) + if (StartEndEndsWithNumber.IsMatch(word)) return word; if (word.Contains("1") || @@ -724,12 +732,12 @@ namespace Nikse.SubtitleEdit.Logic.OCR word.EndsWith("pm")) return word; - if (hexNumber.IsMatch(word)) + if (HexNumber.IsMatch(word)) return word; if (word.LastIndexOf('0') > 0) { - Match match = regExTime1.Match(word); + Match match = RegExTime1.Match(word); if (match.Success) { while (match.Success) @@ -741,11 +749,11 @@ namespace Nikse.SubtitleEdit.Logic.OCR if (match.Index + 2 < oldText.Length) word += oldText.Substring(match.Index + 2); } - match = regExTime1.Match(word); + match = RegExTime1.Match(word); } } - match = regExTime2.Match(word); + match = RegExTime2.Match(word); if (match.Success) { while (match.Success) @@ -760,7 +768,7 @@ namespace Nikse.SubtitleEdit.Logic.OCR word += oldText.Substring(match.Index + 1); } } - match = regExTime2.Match(word, match.Index + 1); + match = RegExTime2.Match(word, match.Index + 1); } } } @@ -770,7 +778,7 @@ namespace Nikse.SubtitleEdit.Logic.OCR public static string FixIor1InsideLowerCaseWord(string word) { - if (startEndEndsWithNumber.IsMatch(word)) + if (StartEndEndsWithNumber.IsMatch(word)) return word; if (word.Contains("2") || @@ -783,12 +791,12 @@ namespace Nikse.SubtitleEdit.Logic.OCR word.Contains("9")) return word; - if (hexNumber.IsMatch(word)) + if (HexNumber.IsMatch(word)) return word; if (word.LastIndexOf('I') > 0 || word.LastIndexOf('1') > 0) { - Match match = regExIandZero.Match(word); + Match match = RegExIandZero.Match(word); if (match.Success) { while (match.Success) @@ -800,7 +808,7 @@ namespace Nikse.SubtitleEdit.Logic.OCR if (match.Index + 2 < oldText.Length) word += oldText.Substring(match.Index + 2); } - match = regExIandZero.Match(word, match.Index + 1); + match = RegExIandZero.Match(word, match.Index + 1); } } } @@ -875,7 +883,7 @@ namespace Nikse.SubtitleEdit.Logic.OCR lastLine.EndsWith("!") || lastLine.EndsWith("?")) { - StripableText st = new StripableText(l); + var st = new StripableText(l); if (st.StrippedText.StartsWith("i") && !st.Pre.EndsWith("[") && !st.Pre.EndsWith("(")) { if (string.IsNullOrEmpty(lastLine) || (!lastLine.EndsWith("...") && !EndsWithAbbreviation(lastLine, _abbreviationList))) @@ -1023,7 +1031,7 @@ namespace Nikse.SubtitleEdit.Logic.OCR lastLine.EndsWith("♪")) { lastLine = Utilities.RemoveHtmlTags(lastLine); - StripableText st = new StripableText(input); + var st = new StripableText(input); if (lastLine == null || (!lastLine.EndsWith("...") && !EndsWithAbbreviation(lastLine, abbreviationList))) { if (st.StrippedText.Length > 0 && st.StrippedText[0].ToString() != st.StrippedText[0].ToString().ToUpper() && !st.Pre.EndsWith("[") && !st.Pre.EndsWith("(") && !st.Pre.EndsWith("...")) @@ -1052,7 +1060,7 @@ namespace Nikse.SubtitleEdit.Logic.OCR // change '1' to '1' if (input.Contains("1")) { - Match match = regExNumber1.Match(input); + Match match = RegExNumber1.Match(input); while (match.Success) { bool doFix = true; @@ -1063,11 +1071,11 @@ namespace Nikse.SubtitleEdit.Logic.OCR if (doFix) { input = input.Substring(0, match.Index + 1) + input.Substring(match.Index + 2); - match = regExNumber1.Match(input); + match = RegExNumber1.Match(input); } else { - match = regExNumber1.Match(input, match.Index + 1); + match = RegExNumber1.Match(input, match.Index + 1); } } } @@ -1078,22 +1086,22 @@ namespace Nikse.SubtitleEdit.Logic.OCR // change 'sequeI of' to 'sequel of' if (input.Contains("I")) { - var match = regExUppercaseI.Match(input); + var match = RegExUppercaseI.Match(input); while (match.Success) { input = input.Substring(0, match.Index + 1) + "l" + input.Substring(match.Index + 2); - match = regExUppercaseI.Match(input); + match = RegExUppercaseI.Match(input); } } // change 'NlCE' to 'NICE' if (input.Contains("l")) { - var match = regExLowercaseL.Match(input); + var match = RegExLowercaseL.Match(input); while (match.Success) { input = input.Substring(0, match.Index + 1) + "I" + input.Substring(match.Index + 2); - match = regExLowercaseL.Match(input); + match = RegExLowercaseL.Match(input); } } @@ -1129,7 +1137,7 @@ namespace Nikse.SubtitleEdit.Logic.OCR // begin line string[] lines = newText.Split(Environment.NewLine.ToCharArray(), StringSplitOptions.RemoveEmptyEntries); - StringBuilder sb = new StringBuilder(); + var sb = new StringBuilder(); foreach (string l in lines) { string s = l; @@ -1183,7 +1191,7 @@ namespace Nikse.SubtitleEdit.Logic.OCR public string FixUnknownWordsViaGuessOrPrompt(out int wordsNotFound, string line, int index, Bitmap bitmap, bool autoFix, bool promptForFixingErrors, bool log, bool useAutoGuess) { - List localIgnoreWords = new List(); + var localIgnoreWords = new List(); wordsNotFound = 0; if (promptForFixingErrors && line.Length == 1 && !IsWordKnownOrNumber(line, line)) @@ -1276,7 +1284,7 @@ namespace Nikse.SubtitleEdit.Logic.OCR if (autoFix && useAutoGuess) { - List guesses = new List(); + var guesses = new List(); if (word.Length > 5) { guesses = (List)CreateGuessesFromLetters(word); @@ -1332,7 +1340,7 @@ namespace Nikse.SubtitleEdit.Logic.OCR } if (!correct && promptForFixingErrors) { - List suggestions = new List(); + var suggestions = new List(); if ((word == "Lt's" || word == "Lt'S") && SpellCheckDictionaryName.StartsWith("en_")) { @@ -1398,8 +1406,7 @@ namespace Nikse.SubtitleEdit.Logic.OCR } if (uppercase > lowercase) return word.ToUpper(); - else - return word.ToLower(); + return word.ToLower(); } /// @@ -1500,8 +1507,6 @@ namespace Nikse.SubtitleEdit.Logic.OCR result.Word = _spellCheck.Word; result.Fixed = true; break; - default: - break; } if (result.Fixed) { @@ -1512,7 +1517,7 @@ namespace Nikse.SubtitleEdit.Logic.OCR private string ReplaceWord(string text, string word, string newWord) { - StringBuilder sb = new StringBuilder(); + var sb = new StringBuilder(); if (word != null && text != null && text.Contains(word)) { int appendFrom = 0; @@ -1780,7 +1785,6 @@ namespace Nikse.SubtitleEdit.Logic.OCR numberOfCorrectWords++; else wordsNotFound++; - } } return wordsNotFound;