diff --git a/src/Forms/FixCommonErrors.cs b/src/Forms/FixCommonErrors.cs index 597284a94..e4de89cc8 100644 --- a/src/Forms/FixCommonErrors.cs +++ b/src/Forms/FixCommonErrors.cs @@ -774,6 +774,10 @@ namespace Nikse.SubtitleEdit.Forms public void FixUnneededSpaces() { + const string zeroWhiteSpace = "\u200B"; + const string zeroWidthNoBreakSpace = "\uFEFF"; + + string fixAction = _language.UnneededSpace; int doubleSpaces = 0; for (int i = 0; i < _subtitle.Paragraphs.Count; i++) @@ -782,6 +786,10 @@ namespace Nikse.SubtitleEdit.Forms string oldText = p.Text; p.Text = p.Text.Trim(); + + p.Text = p.Text.Replace(zeroWhiteSpace, string.Empty); + p.Text = p.Text.Replace(zeroWidthNoBreakSpace, string.Empty); + p.Text = p.Text.Replace("", string.Empty); // some kind of hidden space!!! while (p.Text.Contains(" ")) { p.Text = p.Text.Replace(" ", " "); @@ -1319,11 +1327,18 @@ namespace Nikse.SubtitleEdit.Forms { if ((Utilities.GetLetters(true, true, true) + ",").Contains(st.StrippedText[match.Index - (Environment.NewLine.Length + 1)].ToString())) { - st.StrippedText = st.StrippedText.Remove(match.Index, 1).Insert(match.Index, "l"); - p.Text = st.MergedString; - uppercaseIsInsideLowercaseWords++; - _totalFixes++; - AddFixToListView(p, i + 1, fixAction, oldText, p.Text); + string next = string.Empty; + if (match.Length >= 2) + next = match.Value.Substring(1, 1); + + if (Utilities.LowerCaseVowels.Contains(next)) + { + st.StrippedText = st.StrippedText.Remove(match.Index, 1).Insert(match.Index, "l"); + p.Text = st.MergedString; + uppercaseIsInsideLowercaseWords++; + _totalFixes++; + AddFixToListView(p, i + 1, fixAction, oldText, p.Text); + } } } else if (match.Index > 1 && ((st.StrippedText[match.Index - 1] == '\"') || (st.StrippedText[match.Index - 1] == '>') || (st.StrippedText[match.Index - 1] == '-'))) @@ -1348,11 +1363,17 @@ namespace Nikse.SubtitleEdit.Forms } else { - st.StrippedText = st.StrippedText.Remove(match.Index, 1).Insert(match.Index, "l"); - p.Text = st.MergedString; - uppercaseIsInsideLowercaseWords++; - _totalFixes++; - AddFixToListView(p, i + 1, fixAction, oldText, p.Text); + if (before == " " && !Utilities.LowerCaseVowels.Contains(after.ToLower())) + { + } + else + { + st.StrippedText = st.StrippedText.Remove(match.Index, 1).Insert(match.Index, "l"); + p.Text = st.MergedString; + uppercaseIsInsideLowercaseWords++; + _totalFixes++; + AddFixToListView(p, i + 1, fixAction, oldText, p.Text); + } } } } @@ -1699,7 +1720,7 @@ namespace Nikse.SubtitleEdit.Forms { string text = p.Text.Substring(indexOfNewLine + 2); StripableText st = new StripableText(text); - if (st.StrippedText.Length > 0 && st.StrippedText[0].ToString() != st.StrippedText[0].ToString().ToUpper()) + if (st.StrippedText.Length > 0 && st.StrippedText[0].ToString() != st.StrippedText[0].ToString().ToUpper() && !st.Pre.EndsWith("[") && !st.Pre.Contains("...")) { text = st.Pre + st.StrippedText.Remove(0, 1).Insert(0, st.StrippedText[0].ToString().ToUpper()) + st.Post; @@ -1858,7 +1879,12 @@ namespace Nikse.SubtitleEdit.Forms prev = s[match.Index - 1].ToString(); if (match.Index + 1 < s.Length) next = s[match.Index + 1].ToString(); - if (prev != ">" && next != ">" && next != "}") + + string wholePrev = string.Empty; + if (match.Index > 1) + wholePrev = s.Substring(0, match.Index - 1); + + if (prev != ">" && next != ">" && next != "}" && !wholePrev.Trim().EndsWith("...")) { string temp = s.Substring(0, match.Index) + "I"; if (match.Index + 1 < oldText.Length) diff --git a/src/Forms/VobSubOcr.cs b/src/Forms/VobSubOcr.cs index 41089e732..210d70dd2 100644 --- a/src/Forms/VobSubOcr.cs +++ b/src/Forms/VobSubOcr.cs @@ -752,7 +752,7 @@ namespace Nikse.SubtitleEdit.Forms } string line = GetStringWithItalicTags(matches); if (checkBoxAutoFixCommonErrors.Checked) - line = OcrFixEngine.FixOcrErrorsViaHardcodedRules(line, _lastLine); + line = OcrFixEngine.FixOcrErrorsViaHardcodedRules(line, _lastLine, null); // TODO: add abbreviations list return line; } diff --git a/src/Logic/OCR/OcrFixEngine.cs b/src/Logic/OCR/OcrFixEngine.cs index d354d9dc1..a1e9617c9 100644 --- a/src/Logic/OCR/OcrFixEngine.cs +++ b/src/Logic/OCR/OcrFixEngine.cs @@ -27,6 +27,7 @@ namespace Nikse.SubtitleEdit.Logic.OCR List _namesEtcList = new List(); List _namesEtcListUppercase = new List(); List _namesEtcMultiWordList = new List(); // case sensitive phrases + List _abbreviationList; List _userWordList = new List(); List _wordSkipList = new List(); Hunspell _hunspell; @@ -120,6 +121,19 @@ namespace Nikse.SubtitleEdit.Logic.OCR _userWordList = new List(); _userWordListXmlFileName = Utilities.LoadUserWordList(_userWordList, _fiveLetterWordListLanguageName); + // Find abbreviations + _abbreviationList = new List(); + foreach (string name in _namesEtcList) + { + if (name.EndsWith(".")) + _abbreviationList.Add(name); + } + foreach (string name in _userWordList) + { + if (name.EndsWith(".")) + _abbreviationList.Add(name); + } + // Load NHunspell spellchecker _hunspell = new Hunspell(dictionary + ".aff", dictionary + ".dic"); IsDictionaryLoaded = true; @@ -455,7 +469,7 @@ namespace Nikse.SubtitleEdit.Logic.OCR private string FixCommenOcrLineErrors(string input, string lastLine) { - input = FixOcrErrorsViaHardcodedRules(input, lastLine); + input = FixOcrErrorsViaHardcodedRules(input, lastLine, _abbreviationList); input = FixOcrErrorViaLineReplaceList(input); // e.g. "selectionsu." -> "selections..." @@ -481,16 +495,31 @@ namespace Nikse.SubtitleEdit.Logic.OCR return input; } - public static string FixOcrErrorsViaHardcodedRules(string input, string lastLine) + private static bool EndsWithAbbreviation(string line, List abbreviationList) + { + if (string.IsNullOrEmpty(line) || abbreviationList == null) + return false; + + abbreviationList.Add("a.m."); + abbreviationList.Add("p.m."); + abbreviationList.Add("o.r."); + foreach (string abbreviation in abbreviationList) + { + if (line.ToLower().EndsWith(" " + abbreviation.ToLower())) + return true; + } + return false; + } + + public static string FixOcrErrorsViaHardcodedRules(string input, string lastLine, List abbreviationList) { if (lastLine == null || lastLine.EndsWith(".") || lastLine.EndsWith("!") || lastLine.EndsWith("?")) { - if (lastLine == null || !lastLine.EndsWith("...")) + if (lastLine == null || (!lastLine.EndsWith("...") && !EndsWithAbbreviation(lastLine, abbreviationList))) { - if (input.Length > 0 && input[0].ToString() != input[0].ToString().ToUpper()) input = input.Remove(0, 1).Insert(0, input[0].ToString().ToUpper()); }