Fix: make OCR use the words from *_se.xml

Somewhat related to #6292
This commit is contained in:
niksedk 2022-09-28 22:29:07 +02:00
parent c011edf0f1
commit 4b5ab0d75f
3 changed files with 19 additions and 49 deletions

View File

@ -1084,30 +1084,6 @@ namespace Nikse.SubtitleEdit.Core.Common
return userWordListXmlFileName;
}
/// <summary>
/// Replaces the content of <paramref name="userWordList"/> with the lower-cased text of every
/// "word" element found directly under the root of the "{languageName}_user.xml" file
/// located in <c>DictionaryFolder</c>.
/// </summary>
/// <param name="userWordList">Set to fill; it is always cleared first, even when the file is missing.</param>
/// <param name="languageName">Language name used as the file name prefix.</param>
/// <returns>The full file name that was probed, whether or not the file exists.</returns>
public static string LoadUserWordList(HashSet<string> userWordList, string languageName)
{
    userWordList.Clear();

    var fileName = DictionaryFolder + languageName + "_user.xml";
    if (!File.Exists(fileName))
    {
        return fileName;
    }

    var xml = new XmlDocument();
    xml.Load(fileName);

    var wordNodes = xml.DocumentElement?.SelectNodes("word");
    if (wordNodes == null)
    {
        return fileName;
    }

    foreach (XmlNode wordNode in wordNodes)
    {
        // HashSet<T>.Add silently ignores values that are already present,
        // so no separate Contains check is required.
        userWordList.Add(wordNode.InnerText.ToLowerInvariant());
    }

    return fileName;
}
public static readonly string UppercaseLetters = Configuration.Settings.General.UppercaseLetters.ToUpperInvariant() + "ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ";
public static readonly string LowercaseLetters = Configuration.Settings.General.UppercaseLetters.ToLowerInvariant() + "αβγδεζηθικλμνξοπρσςτυφχψωήάόέ";
public static readonly string LowercaseLettersWithNumbers = LowercaseLetters + "0123456789";

View File

@ -31,7 +31,6 @@ namespace Nikse.SubtitleEdit.Core.SpellCheck
private readonly HashSet<string> _namesListWithApostrophe = new HashSet<string>();
private readonly HashSet<string> _wordsWithDashesOrPeriods = new HashSet<string>();
private readonly HashSet<string> _userWordList = new HashSet<string>();
private readonly HashSet<string> _seWordList = new HashSet<string>();
private readonly HashSet<string> _userPhraseList = new HashSet<string>();
private readonly string _dictionaryFolder;
private HashSet<string> _skipAllList = new HashSet<string>();
@ -247,6 +246,11 @@ namespace Nikse.SubtitleEdit.Core.SpellCheck
Utilities.RemoveFromUserDictionary(word, _languageName);
}
/// <summary>
/// Returns the word set held in <c>_userWordList</c>. The live instance is handed out,
/// not a copy, so changes made by the caller affect this object.
/// </summary>
/// <remarks>
/// NOTE(review): the method name implies that words from *_se.xml are part of this set as well;
/// the code that fills the set is not visible here - confirm.
/// </remarks>
public HashSet<string> GetSeAndUserWords() => _userWordList;
public void RemoveName(string word)
{
if (word == null || word.Length <= 1 || !_names.Contains(word))

View File

@ -16,7 +16,6 @@ using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Windows.Forms;
using Nikse.SubtitleEdit.Core.SubtitleFormats;
namespace Nikse.SubtitleEdit.Logic.Ocr
{
@ -62,7 +61,6 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
Aggressive
}
private string _userWordListXmlFileName;
private string _fiveLetterWordListLanguageName;
private readonly OcrFixReplaceList _ocrFixReplaceList;
@ -73,7 +71,6 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
private HashSet<string> _nameMultiWordList = new HashSet<string>(); // case sensitive phrases
private List<string> _nameMultiWordListAndWordsWithPeriods;
private HashSet<string> _abbreviationList;
private HashSet<string> _userWordList = new HashSet<string>();
private HashSet<string> _wordSkipList = new HashSet<string>();
private readonly HashSet<string> _wordSpellOkList = new HashSet<string>();
private string[] _wordSplitList;
@ -365,17 +362,6 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
}
}
// Load user words
_userWordList = new HashSet<string>();
_userWordListXmlFileName = Utilities.LoadUserWordList(_userWordList, _fiveLetterWordListLanguageName);
foreach (var name in _userWordList)
{
if (name.EndsWith('.'))
{
_abbreviationList.Add(name);
}
}
// Load Hunspell spell checker
try
{
@ -416,6 +402,14 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
_changeAllDictionary = _spellCheckWordLists.GetUseAlwaysList();
}
}
// Register every SE/user word that ends with a period as an abbreviation.
// The previous form, "foreach (... in _spellCheckWordLists?.GetSeAndUserWords())", evaluated to
// "foreach over null" when no word lists were loaded, which throws NullReferenceException;
// skip the loop entirely in that case.
if (_spellCheckWordLists != null)
{
    foreach (var word in _spellCheckWordLists.GetSeAndUserWords())
    {
        if (word.EndsWith('.'))
        {
            _abbreviationList.Add(word);
        }
    }
}
}
private static string[] LoadWordSplitList(string threeLetterIsoLanguageName, NameList nameList)
@ -1449,7 +1443,7 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
correct = !Configuration.Settings.Tools.CheckOneLetterWords; // hunspell allows too many single letter words
}
if (!correct && _userWordList.Contains(word))
if (!correct && _spellCheckWordLists.HasUserWord(word))
{
correct = true;
}
@ -1479,7 +1473,7 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
var trimmed = word.Trim('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.', ',', '،', '؟', '»');
if (trimmed != word)
{
if (_userWordList.Contains(trimmed))
if (_spellCheckWordLists.HasUserWord(trimmed))
{
correct = true;
}
@ -1830,11 +1824,7 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
Abort = true;
break;
case OcrSpellCheck.Action.AddToUserDictionary:
if (_userWordListXmlFileName != null)
{
Utilities.AddToUserDictionary(_spellCheck.Word.Trim().ToLowerInvariant(), _fiveLetterWordListLanguageName);
_userWordList.Add(_spellCheck.Word.Trim().ToLowerInvariant());
}
_spellCheckWordLists.AddUserWord(_spellCheck.Word.Trim().ToLowerInvariant());
result.Word = _spellCheck.Word;
result.Fixed = true;
result.Line = line;
@ -1973,7 +1963,7 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
{
if (!DoSpell(s) &&
!_nameList.Contains(s) &&
!_userWordList.Contains(s) &&
!_spellCheckWordLists.HasUserWord(s) &&
!IsWordKnownOrNumber(s, word))
{
if (s.Length > 10 && s.Contains('/'))
@ -2033,12 +2023,12 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
return true;
}
if (_userWordList.Contains(word.ToLowerInvariant()))
if (_spellCheckWordLists.HasUserWord(word.ToLowerInvariant()))
{
return true;
}
if (_userWordList.Contains(word.Trim('\'').ToLowerInvariant()))
if (_spellCheckWordLists.HasUserWord(word.Trim('\'').ToLowerInvariant()))
{
return true;
}