From 4b5ab0d75f0d32afd046e33367efccf93fe3787c Mon Sep 17 00:00:00 2001
From: niksedk
Date: Wed, 28 Sep 2022 22:29:07 +0200
Subject: [PATCH] Fix for `do use *_se.xml words in OCR`

Somewhat related to #6292
---
Review notes (this section is not part of the commit message):
* OcrFixEngine.cs, abbreviation loop: `_spellCheckWordLists?.GetSeAndUserWords()`
  evaluates to null when the field is null, and foreach over null throws
  NullReferenceException. The loop now falls back to an empty set on the
  same line, so hunk line counts and the diffstat are unchanged.
* This copy was rebuilt from a whitespace-collapsed paste: generic type
  arguments were restored and indentation / blank context lines inside the
  hunks are reconstructed. Apply with `git apply --ignore-whitespace` if the
  context does not match byte-for-byte.
* NOTE(review): the new HasUserWord/AddUserWord call sites dereference
  `_spellCheckWordLists` without a null check - confirm it is always set
  before those paths run.

 src/libse/Common/Utilities.cs               | 24 -------------
 src/libse/SpellCheck/SpellCheckWordLists.cs |  6 +++-
 src/ui/Logic/Ocr/OcrFixEngine.cs            | 38 ++++++++-------------
 3 files changed, 19 insertions(+), 49 deletions(-)

diff --git a/src/libse/Common/Utilities.cs b/src/libse/Common/Utilities.cs
index b99f2f385..e35e587c9 100644
--- a/src/libse/Common/Utilities.cs
+++ b/src/libse/Common/Utilities.cs
@@ -1084,30 +1084,6 @@ namespace Nikse.SubtitleEdit.Core.Common
             return userWordListXmlFileName;
         }
 
-        public static string LoadUserWordList(HashSet<string> userWordList, string languageName)
-        {
-            userWordList.Clear();
-            var userWordDictionary = new XmlDocument();
-            string userWordListXmlFileName = DictionaryFolder + languageName + "_user.xml";
-            if (File.Exists(userWordListXmlFileName))
-            {
-                userWordDictionary.Load(userWordListXmlFileName);
-                var nodes = userWordDictionary.DocumentElement?.SelectNodes("word");
-                if (nodes != null)
-                {
-                    foreach (XmlNode node in nodes)
-                    {
-                        string s = node.InnerText.ToLowerInvariant();
-                        if (!userWordList.Contains(s))
-                        {
-                            userWordList.Add(s);
-                        }
-                    }
-                }
-            }
-            return userWordListXmlFileName;
-        }
-
         public static readonly string UppercaseLetters = Configuration.Settings.General.UppercaseLetters.ToUpperInvariant() + "ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ";
         public static readonly string LowercaseLetters = Configuration.Settings.General.UppercaseLetters.ToLowerInvariant() + "αβγδεζηθικλμνξοπρσςτυφχψωήάόέ";
         public static readonly string LowercaseLettersWithNumbers = LowercaseLetters + "0123456789";
diff --git a/src/libse/SpellCheck/SpellCheckWordLists.cs b/src/libse/SpellCheck/SpellCheckWordLists.cs
index b315c7304..fb00f648d 100644
--- a/src/libse/SpellCheck/SpellCheckWordLists.cs
+++ b/src/libse/SpellCheck/SpellCheckWordLists.cs
@@ -31,7 +31,6 @@ namespace Nikse.SubtitleEdit.Core.SpellCheck
         private readonly HashSet<string> _namesListWithApostrophe = new HashSet<string>();
         private readonly HashSet<string> _wordsWithDashesOrPeriods = new HashSet<string>();
         private readonly HashSet<string> _userWordList = new HashSet<string>();
-        private readonly HashSet<string> _seWordList = new HashSet<string>();
         private readonly HashSet<string> _userPhraseList = new HashSet<string>();
         private readonly string _dictionaryFolder;
         private HashSet<string> _skipAllList = new HashSet<string>();
@@ -247,6 +246,11 @@ namespace Nikse.SubtitleEdit.Core.SpellCheck
             Utilities.RemoveFromUserDictionary(word, _languageName);
         }
 
+        public HashSet<string> GetSeAndUserWords()
+        {
+            return _userWordList;
+        }
+
         public void RemoveName(string word)
         {
             if (word == null || word.Length <= 1 || !_names.Contains(word))
diff --git a/src/ui/Logic/Ocr/OcrFixEngine.cs b/src/ui/Logic/Ocr/OcrFixEngine.cs
index 8532cb119..aa0bb6e45 100644
--- a/src/ui/Logic/Ocr/OcrFixEngine.cs
+++ b/src/ui/Logic/Ocr/OcrFixEngine.cs
@@ -16,7 +16,6 @@ using System.Linq;
 using System.Text;
 using System.Text.RegularExpressions;
 using System.Windows.Forms;
-using Nikse.SubtitleEdit.Core.SubtitleFormats;
 
 namespace Nikse.SubtitleEdit.Logic.Ocr
 {
@@ -62,7 +61,6 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
             Aggressive
         }
 
-        private string _userWordListXmlFileName;
         private string _fiveLetterWordListLanguageName;
 
         private readonly OcrFixReplaceList _ocrFixReplaceList;
@@ -73,7 +71,6 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
         private HashSet<string> _nameMultiWordList = new HashSet<string>(); // case sensitive phrases
         private List<string> _nameMultiWordListAndWordsWithPeriods;
         private HashSet<string> _abbreviationList;
-        private HashSet<string> _userWordList = new HashSet<string>();
         private HashSet<string> _wordSkipList = new HashSet<string>();
         private readonly HashSet<string> _wordSpellOkList = new HashSet<string>();
         private string[] _wordSplitList;
@@ -365,17 +362,6 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
                 }
             }
 
-            // Load user words
-            _userWordList = new HashSet<string>();
-            _userWordListXmlFileName = Utilities.LoadUserWordList(_userWordList, _fiveLetterWordListLanguageName);
-            foreach (var name in _userWordList)
-            {
-                if (name.EndsWith('.'))
-                {
-                    _abbreviationList.Add(name);
-                }
-            }
-
             // Load Hunspell spell checker
             try
             {
@@ -416,6 +402,14 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
                     _changeAllDictionary = _spellCheckWordLists.GetUseAlwaysList();
                 }
             }
+
+            foreach (var word in _spellCheckWordLists?.GetSeAndUserWords() ?? new HashSet<string>()) // foreach over null would throw
+            {
+                if (word.EndsWith('.'))
+                {
+                    _abbreviationList.Add(word);
+                }
+            }
         }
 
         private static string[] LoadWordSplitList(string threeLetterIsoLanguageName, NameList nameList)
@@ -1449,7 +1443,7 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
                 correct = !Configuration.Settings.Tools.CheckOneLetterWords; // hunspell allows too many single letter words
             }
 
-            if (!correct && _userWordList.Contains(word))
+            if (!correct && _spellCheckWordLists.HasUserWord(word))
             {
                 correct = true;
             }
@@ -1479,7 +1473,7 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
                 var trimmed = word.Trim('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', ',', '،', '؟', '»');
                 if (trimmed != word)
                 {
-                    if (_userWordList.Contains(trimmed))
+                    if (_spellCheckWordLists.HasUserWord(trimmed))
                     {
                         correct = true;
                     }
@@ -1830,11 +1824,7 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
                     Abort = true;
                     break;
                 case OcrSpellCheck.Action.AddToUserDictionary:
-                    if (_userWordListXmlFileName != null)
-                    {
-                        Utilities.AddToUserDictionary(_spellCheck.Word.Trim().ToLowerInvariant(), _fiveLetterWordListLanguageName);
-                        _userWordList.Add(_spellCheck.Word.Trim().ToLowerInvariant());
-                    }
+                    _spellCheckWordLists.AddUserWord(_spellCheck.Word.Trim().ToLowerInvariant());
                     result.Word = _spellCheck.Word;
                     result.Fixed = true;
                     result.Line = line;
@@ -1973,7 +1963,7 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
                     {
                         if (!DoSpell(s) &&
                             !_nameList.Contains(s) &&
-                            !_userWordList.Contains(s) &&
+                            !_spellCheckWordLists.HasUserWord(s) &&
                             !IsWordKnownOrNumber(s, word))
                         {
                             if (s.Length > 10 && s.Contains('/'))
@@ -2033,12 +2023,12 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
                 return true;
             }
 
-            if (_userWordList.Contains(word.ToLowerInvariant()))
+            if (_spellCheckWordLists.HasUserWord(word.ToLowerInvariant()))
             {
                 return true;
             }
 
-            if (_userWordList.Contains(word.Trim('\'').ToLowerInvariant()))
+            if (_spellCheckWordLists.HasUserWord(word.Trim('\'').ToLowerInvariant()))
             {
                 return true;
             }