SubtitleEdit/libse/SpellCheck/SpellCheckWordLists.cs

359 lines
14 KiB
C#
Raw Normal View History

2016-02-02 20:40:47 +01:00
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using System.Xml;
using Nikse.SubtitleEdit.Core.Dictionaries;
using Nikse.SubtitleEdit.Core.Interfaces;
namespace Nikse.SubtitleEdit.Core.SpellCheck
{
public class SpellCheckWordLists
{
2017-07-22 14:58:46 +02:00
/// <summary>
/// Characters that terminate a word when text is split into words for spell checking.
/// </summary>
public static readonly string SplitChars = " -.,?!:;\"“”()[]{}|<>/+\r\n¿¡…—♪♫„«»؛،؟";

// Characters that qualify a word for the "words with dashes or periods" list.
private static readonly char[] PeriodAndDash = { '.', '-' };

// Separators used when looking for dashed words: the same characters as SplitChars, but without '-',
// so that tokens such as "well-known" stay in one piece.
// NOTE(review): this initializer previously contained three empty character literals (''), which do not
// compile (CS1011). They look like characters lost in an encoding round-trip (one after the em dash, two
// after the closing guillemet) - confirm against the original file and restore them if so.
private static readonly char[] SplitChars2 = { ' ', '.', ',', '?', '!', ':', ';', '"', '“', '”', '(', ')', '[', ']', '{', '}', '|', '<', '>', '/', '+', '\r', '\n', '¿', '¡', '…', '—', '♪', '♫', '„', '«', '»', '؛', '،', '؟' };

// Name list for the current language; used to persist removals and for multi-word name lookups.
private readonly NameList _nameList;

// Single-word names as returned by _nameList.GetNames().
private readonly HashSet<string> _names;

// Upper-cased variants of the names.
private readonly HashSet<string> _namesListUppercase = new HashSet<string>();

// Names with an appended "'s", "'" or (for English) "s".
private readonly HashSet<string> _namesListWithApostrophe = new HashSet<string>();

// Names, user words and spell-checked words that contain '.' or '-'.
private readonly HashSet<string> _wordsWithDashesOrPeriods = new HashSet<string>();

// User dictionary entries without a space (stored trimmed and lower-cased).
private readonly HashSet<string> _userWordList = new HashSet<string>();

// User dictionary entries containing a space (stored trimmed and lower-cased).
private readonly HashSet<string> _userPhraseList = new HashSet<string>();

// Language code, e.g. a value starting with "en_" for English.
private readonly string _languageName;

// Spell checker used to validate words containing dashes.
private readonly IDoSpell _doSpell;
/// <summary>
/// Loads the name list and the user dictionary for a language and builds the derived lookup sets
/// (upper-cased names, apostrophe variants, and words containing dashes or periods).
/// </summary>
/// <param name="dictionaryFolder">
/// Folder prefix of the user dictionary. It is concatenated directly with
/// <paramref name="languageName"/> and "_user.xml", so it is expected to end with a directory separator.
/// </param>
/// <param name="languageName">Language code; codes starting with "en_" (any casing) get English possessive/plural variants.</param>
/// <param name="doSpell">Spell checker used later to validate words containing dashes.</param>
/// <exception cref="ArgumentNullException">
/// <paramref name="languageName"/> or <paramref name="doSpell"/> is null.
/// </exception>
public SpellCheckWordLists(string dictionaryFolder, string languageName, IDoSpell doSpell)
{
    // ArgumentNullException is the correct type for a rejected argument; NullReferenceException is
    // reserved for the runtime and should not be thrown by user code.
    if (languageName == null)
    {
        throw new ArgumentNullException(nameof(languageName));
    }

    if (doSpell == null)
    {
        throw new ArgumentNullException(nameof(doSpell));
    }

    _languageName = languageName;
    _doSpell = doSpell;

    _nameList = new NameList(Configuration.DictionariesDirectory, languageName, Configuration.Settings.WordLists.UseOnlineNames, Configuration.Settings.WordLists.NamesUrl);
    _names = _nameList.GetNames();
    var namesMultiWordList = _nameList.GetMultiNames();

    foreach (string namesItem in _names)
    {
        _namesListUppercase.Add(namesItem.ToUpper());
    }

    if (languageName.StartsWith("en_", StringComparison.OrdinalIgnoreCase))
    {
        foreach (string namesItem in _names)
        {
            if (!namesItem.EndsWith('s'))
            {
                _namesListWithApostrophe.Add(namesItem + "'s");
                _namesListWithApostrophe.Add(namesItem + "s");
            }
            else if (!namesItem.EndsWith('\''))
            {
                _namesListWithApostrophe.Add(namesItem + "'");
            }
        }
    }

    // The user dictionary is optional; entries with a space are phrases, the rest are single words.
    string userDictionaryFileName = dictionaryFolder + languageName + "_user.xml";
    if (File.Exists(userDictionaryFileName))
    {
        var userWordDictionary = new XmlDocument();
        userWordDictionary.Load(userDictionaryFileName);
        if (userWordDictionary.DocumentElement != null)
        {
            var xmlNodeList = userWordDictionary.DocumentElement.SelectNodes("word");
            if (xmlNodeList != null)
            {
                foreach (XmlNode node in xmlNodeList)
                {
                    string word = node.InnerText.Trim().ToLower();
                    if (word.Contains(' '))
                    {
                        _userPhraseList.Add(word);
                    }
                    else
                    {
                        _userWordList.Add(word);
                    }
                }
            }
        }
    }

    // Collect every known name / user entry that contains "." or "-".
    foreach (var word in namesMultiWordList)
    {
        if (word.Contains(PeriodAndDash))
        {
            _wordsWithDashesOrPeriods.Add(word);
        }
    }

    foreach (string name in _names)
    {
        if (name.Contains(PeriodAndDash))
        {
            _wordsWithDashesOrPeriods.Add(name);
        }
    }

    foreach (string word in _userWordList)
    {
        if (word.Contains(PeriodAndDash))
        {
            _wordsWithDashesOrPeriods.Add(word);
        }
    }

    foreach (var phrase in _userPhraseList)
    {
        if (phrase.Contains(PeriodAndDash))
        {
            _wordsWithDashesOrPeriods.Add(phrase);
        }
    }
}
/// <summary>
/// Removes a word or phrase from the in-memory user lists and from the user dictionary.
/// </summary>
/// <param name="word">Word or phrase to remove; casing and surrounding whitespace are ignored for the in-memory lists.</param>
public void RemoveUserWord(string word)
{
    if (word != null)
    {
        // Entries are stored trimmed and lower-cased, so the lookup key must be normalized the same way.
        string normalized = word.Trim().ToLower();
        _userWordList.Remove(normalized);
        _userPhraseList.Remove(normalized);
    }

    // The persisted dictionary receives the word exactly as supplied by the caller.
    Utilities.RemoveFromUserDictionary(word, _languageName);
}
/// <summary>
/// Removes a name, together with its derived upper-case, plural and apostrophe variants,
/// from the in-memory sets and from the name list.
/// </summary>
/// <param name="word">Name to remove. Null, names of one character or less, and unknown names are ignored.</param>
public void RemoveName(string word)
{
    bool isKnownName = word != null && word.Length > 1 && _names.Contains(word);
    if (!isKnownName)
    {
        return;
    }

    string upperWord = word.ToUpper();

    _names.Remove(word);
    _namesListUppercase.Remove(upperWord);

    if (!word.EndsWith('s'))
    {
        // The plain plural form is only tracked for English.
        if (_languageName.StartsWith("en_", StringComparison.Ordinal))
        {
            _names.Remove(word + "s");
            _namesListUppercase.Remove(upperWord + "S");
        }

        _namesListWithApostrophe.Remove(word + "'s");
        _namesListUppercase.Remove(upperWord + "'S");
    }

    if (!word.EndsWith('\''))
    {
        _namesListWithApostrophe.Remove(word + "'");
    }

    _nameList.Remove(word);
}
/// <summary>
/// Overwrites every whole-word occurrence of a known word containing dashes or periods with spaces,
/// keeping the length of the text (and therefore all character positions) unchanged.
/// </summary>
/// <param name="s">Text to process.</param>
/// <returns>The text with the known words blanked out.</returns>
public string ReplaceKnownWordsOrNamesWithBlanks(string s)
{
    var placeholderIds = new List<string>();
    var knownWords = new List<string>();
    GetTextWithoutUserWordsAndNames(placeholderIds, knownWords, s);

    foreach (string knownWord in knownWords)
    {
        int position = s.IndexOf(knownWord, StringComparison.Ordinal);
        while (position >= 0)
        {
            int after = position + knownWord.Length;

            // Only blank the match when it is delimited by split characters (or the text edges).
            if ((position == 0 || SplitChars.Contains(s[position - 1])) &&
                (after >= s.Length || SplitChars.Contains(s[after])))
            {
                s = s.Remove(position, knownWord.Length).Insert(position, string.Empty.PadLeft(knownWord.Length));
            }

            position = position + 1 < s.Length
                ? s.IndexOf(knownWord, position + 1, StringComparison.Ordinal)
                : -1;
        }
    }

    return s;
}
/// <summary>
/// Overwrites every "&lt;...&gt;" span with spaces so the text keeps its length and character positions.
/// A '&lt;' without a following '&gt;' is left untouched and ends the scan.
/// </summary>
/// <param name="s">Text that may contain tags.</param>
/// <returns>The text with the tag spans blanked out.</returns>
public string ReplaceHtmlTagsWithBlanks(string s)
{
    int open = s.IndexOf('<');
    if (open < 0)
    {
        return s;
    }

    // Blanking never touches characters after the current tag, so the searches can run on the
    // original string while the replacement is written into a copy.
    char[] buffer = s.ToCharArray();
    bool changed = false;
    while (open >= 0)
    {
        int close = s.IndexOf('>', open + 1);
        if (close < 0)
        {
            break;
        }

        for (int k = open; k <= close; k++)
        {
            buffer[k] = ' ';
        }
        changed = true;

        int next = close + 1;
        if (next >= s.Length)
        {
            break;
        }

        open = s.IndexOf('<', next);
    }

    return changed ? new string(buffer) : s;
}
/// <summary>
/// Overwrites every ASS override block ("{\...}") with spaces so the text keeps its length and
/// character positions. A block without a closing '}' is left untouched and ends the scan.
/// </summary>
/// <param name="s">Text that may contain ASS override blocks.</param>
/// <returns>The text with the override blocks blanked out.</returns>
public string ReplaceAssTagsWithBlanks(string s)
{
    int start = s.IndexOf("{\\", StringComparison.Ordinal);
    while (start >= 0)
    {
        // Look for the closing brace only after the opening "{\" - a stray '}' earlier in the
        // text must not prevent the tags that follow it from being blanked.
        int end = s.IndexOf('}', start + 1);
        if (end < 0)
        {
            break;
        }

        int length = end - start + 1;
        s = s.Remove(start, length).Insert(start, new string(' ', length));

        end++;
        if (end >= s.Length)
        {
            break;
        }

        start = s.IndexOf("{\\", end, StringComparison.Ordinal);
    }

    return s;
}
2016-02-02 20:40:47 +01:00
/// <summary>
/// Checks whether the word at <paramref name="index"/> forms a two-word user phrase together with
/// the word before it or the word after it.
/// </summary>
/// <param name="index">Position of the word in <paramref name="words"/>.</param>
/// <param name="words">Words of the current text.</param>
/// <returns>True when "current next" or "previous current" is a user phrase.</returns>
public bool IsWordInUserPhrases(int index, List<SpellCheckWord> words)
{
    string current = words[index].Text;

    // "-" stands in for a missing neighbour at either edge of the list.
    string prev = index > 0 ? words[index - 1].Text : "-";
    string next = index < words.Count - 1 ? words[index + 1].Text : "-";

    // User phrases are stored lower-cased, so the candidates are lower-cased as well
    // (the same normalization HasUserWord applies); a set lookup replaces the linear scan.
    return _userPhraseList.Contains((current + " " + next).ToLower()) ||
           _userPhraseList.Contains((prev + " " + current).ToLower());
}
/// <summary>
/// Finds the known words containing dashes or periods that occur in <paramref name="text"/> as whole
/// words, so the spell check can ignore the combination (correct words with dashes are not split).
/// Dashed tokens accepted by the spell checker are remembered as known words first.
/// </summary>
/// <param name="replaceIds">Receives one placeholder id ("_@n_") per match.</param>
/// <param name="replaceNames">Receives the matched word for each placeholder id.</param>
/// <param name="text">Text to scan; the placeholder substitution is applied to a local copy only.</param>
private void GetTextWithoutUserWordsAndNames(List<string> replaceIds, List<string> replaceNames, string text)
{
    // SplitChars2 has no '-', so dashed words survive the split as single tokens.
    foreach (string token in text.Split(SplitChars2, StringSplitOptions.RemoveEmptyEntries))
    {
        if (token.Contains('-') && _doSpell.DoSpell(token))
        {
            // HashSet.Add ignores duplicates, so no separate Contains check is required.
            _wordsWithDashesOrPeriods.Add(token);
        }
    }

    if (!text.Contains(PeriodAndDash))
    {
        return;
    }

    string allowedBefore = @" (['""" + "\r\n";
    string allowedAfter = @",!?:;. ])<'""";
    int counter = 0;

    foreach (string knownWord in _wordsWithDashesOrPeriods)
    {
        int searchFrom = 0;
        while (true)
        {
            int hit = text.IndexOf(knownWord, searchFrom, StringComparison.Ordinal);
            if (hit < 0)
            {
                break;
            }

            int afterHit = hit + knownWord.Length;
            bool startOk = hit == 0 || allowedBefore.Contains(text[hit - 1]);
            bool endOk = afterHit == text.Length ||
                         (afterHit < text.Length && allowedAfter.Contains(text[afterHit]));

            if (!startOk || !endOk)
            {
                // Not a whole word here - continue searching one character further on.
                searchFrom = hit + 1;
                continue;
            }

            counter++;
            string id = string.Format("_@{0}_", counter);
            replaceIds.Add(id);
            replaceNames.Add(knownWord);

            // Swapping the match for the placeholder keeps it from being found again.
            text = text.Remove(hit, knownWord.Length).Insert(hit, id);
        }
    }
}
/// <summary>
/// Adds a name, together with its derived upper-case, plural and apostrophe variants,
/// to the in-memory sets and stores it through a name list.
/// </summary>
/// <param name="word">Name to add.</param>
/// <returns>False when the name is null, empty or already known; otherwise true.</returns>
public bool AddName(string word)
{
    if (string.IsNullOrEmpty(word))
    {
        return false;
    }

    if (_names.Contains(word))
    {
        return false;
    }

    string upperWord = word.ToUpper();

    _names.Add(word);
    _namesListUppercase.Add(upperWord);

    if (!word.EndsWith('s'))
    {
        // The plain plural form is only tracked for English.
        if (_languageName.StartsWith("en_", StringComparison.Ordinal))
        {
            _names.Add(word + "s");
            _namesListUppercase.Add(upperWord + "S");
        }

        _namesListWithApostrophe.Add(word + "'s");
        _namesListUppercase.Add(upperWord + "'S");
    }

    if (!word.EndsWith('\''))
    {
        _namesListWithApostrophe.Add(word + "'");
    }

    // NOTE(review): a fresh NameList is created for the add instead of reusing _nameList - confirm this is intended.
    var freshNameList = new NameList(Configuration.DictionariesDirectory, _languageName, Configuration.Settings.WordLists.UseOnlineNames, Configuration.Settings.WordLists.NamesUrl);
    freshNameList.Add(word);
    return true;
}
/// <summary>
/// Adds a word or phrase to the in-memory user lists and to the user dictionary.
/// The entry is trimmed and lower-cased; entries containing a space are stored as phrases.
/// </summary>
/// <param name="word">Word or phrase to add.</param>
/// <returns>False when the entry is null, empty or already present; otherwise true.</returns>
public bool AddUserWord(string word)
{
    if (word == null)
    {
        return false;
    }

    word = word.Trim().ToLower();
    if (word.Length == 0)
    {
        return false;
    }

    // Check for duplicates in the list the entry belongs to, so that an existing phrase is
    // rejected as well and is not written to the user dictionary a second time.
    HashSet<string> target = word.Contains(' ') ? _userPhraseList : _userWordList;
    if (!target.Add(word))
    {
        return false;
    }

    Utilities.AddToUserDictionary(word, _languageName);
    return true;
}
/// <summary>
/// Checks whether a word is a known name, either as given or after stripping
/// leading/trailing apostrophes.
/// </summary>
/// <param name="word">Word to look up (case-sensitive).</param>
/// <returns>True when the word is in the name set.</returns>
public bool HasName(string word)
{
    if (_names.Contains(word))
    {
        return true;
    }

    bool hasOuterApostrophe = word.StartsWith('\'') || word.EndsWith('\'');
    return hasOuterApostrophe && _names.Contains(word.Trim('\''));
}
/// <summary>
/// Checks a word against the upper-cased names, the apostrophe variants and, finally,
/// the multi-word names of the name list.
/// </summary>
/// <param name="word">Word to look up.</param>
/// <param name="text">Text passed to the multi-word name lookup together with the word.</param>
/// <returns>True when any of the three lookups succeeds.</returns>
public bool HasNameExtended(string word, string text)
{
    if (_namesListUppercase.Contains(word))
    {
        return true;
    }

    if (_namesListWithApostrophe.Contains(word))
    {
        return true;
    }

    return _nameList.IsInNamesMultiWordList(text, word);
}
/// <summary>
/// Checks whether a word is in the user word list, ignoring case and, if needed,
/// leading/trailing apostrophes.
/// </summary>
/// <param name="word">Word to look up.</param>
/// <returns>True when the lower-cased word (with or without outer apostrophes) is a user word.</returns>
public bool HasUserWord(string word)
{
    string lowered = word.ToLower();
    if (_userWordList.Contains(lowered))
    {
        return true;
    }

    if (!lowered.StartsWith('\'') && !lowered.EndsWith('\''))
    {
        return false;
    }

    return _userWordList.Contains(lowered.Trim('\''));
}
/// <summary>
/// Splits text into words at the characters in <see cref="SplitChars"/>, recording for each word
/// the index of its first character in the original text.
/// </summary>
/// <param name="s">Text to split.</param>
/// <returns>The words in order of appearance; empty when the text holds only split characters.</returns>
public static List<SpellCheckWord> Split(string s)
{
    var words = new List<SpellCheckWord>();
    int wordStart = -1; // -1 means "not inside a word"

    // Running one position past the end lets the final word be flushed by the same branch.
    for (int pos = 0; pos <= s.Length; pos++)
    {
        bool atBoundary = pos == s.Length || SplitChars.Contains(s[pos]);
        if (!atBoundary)
        {
            if (wordStart < 0)
            {
                wordStart = pos;
            }
            continue;
        }

        if (wordStart >= 0)
        {
            words.Add(new SpellCheckWord { Text = s.Substring(wordStart, pos - wordStart), Index = wordStart });
            wordStart = -1;
        }
    }

    return words;
}
}
}