Work on wordlists

This commit is contained in:
niksedk 2022-11-01 19:12:38 +01:00
parent df5a89d768
commit e3bdff09c7
6 changed files with 385 additions and 360 deletions

View File

@ -191,6 +191,7 @@
<word>cyanotic</word>
<word>darmstadtium</word>
<word>darndest</word>
<word>dealmaking</word>
<word>debride</word>
<word>debridement</word>
<word>decompensating</word>
@ -418,6 +419,7 @@
<word>mycelial</word>
<word>nah</word>
<word>namaste</word>
<word>narrowminded</word>
<word>nasties</word>
<word>neighbour</word>
<word>neighbourhood</word>
@ -499,6 +501,7 @@
<word>polenta</word>
<word>polonium</word>
<word>polysulfide</word>
<word>postsurgical</word>
<word>postulator</word>
<word>potassium</word>
<word>praseodymium</word>
@ -511,6 +514,7 @@
<word>pseudoachondroplasia</word>
<word>pupillary</word>
<word>purée</word>
<word>pushback</word>
<word>radium</word>
<word>radon</word>
<word>ragdoll</word>
@ -576,6 +580,7 @@
<word>slipspace</word>
<word>smartphone</word>
<word>smartphones</word>
<word>snakelet</word>
<word>snuck</word>
<word>sociopathic</word>
<word>sodium</word>
@ -603,6 +608,7 @@
<word>stepmom</word>
<word>stereotactic</word>
<word>sternotomy</word>
<word>storyboarded</word>
<word>strontium</word>
<word>subclavian</word>
<word>subdural hematoma</word>
@ -708,6 +714,7 @@
<word>where'd</word>
<word>where're</word>
<word>which</word>
<word>whodunit</word>
<word>why'd</word>
<word>why's</word>
<word>wizarding</word>

View File

@ -1437,6 +1437,7 @@
<name>Rafferty</name>
<name>Raiden</name>
<name>Raina</name>
<name>Rajan</name>
<name>Ramiro</name>
<name>Rashad</name>
<name>Rayan</name>

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,6 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using Nikse.SubtitleEdit.Core.Common;
@ -38,7 +39,13 @@ namespace Nikse.SubtitleEdit.Core.Dictionaries
var s = input;
var check = s;
var spaces = new List<int>();
for (int i = 0; i < words.Length; i++)
if (words.Contains(input))
{
return input;
}
for (var i = 0; i < words.Length; i++)
{
var w = words[i];
if (w.Length >= input.Length)
@ -77,5 +84,32 @@ namespace Nikse.SubtitleEdit.Core.Dictionaries
return s.Trim();
}
/// <summary>
/// Loads the word split list for a language from the dictionaries directory.
/// </summary>
/// <param name="threeLetterIsoLanguageName">
/// Three-letter language code; used as the prefix of the "&lt;code&gt;_WordSplitList.txt" file name.
/// </param>
/// <param name="nameList">
/// Optional name list. When not null, its names longer than four characters are appended to the result.
/// </param>
/// <returns>
/// All collected entries ordered by descending length, or an empty array when the
/// word split list file does not exist.
/// </returns>
public static string[] LoadWordSplitList(string threeLetterIsoLanguageName, NameList nameList)
{
    var fileName = $"{Configuration.DictionariesDirectory}{threeLetterIsoLanguageName}_WordSplitList.txt";
    if (!File.Exists(fileName))
    {
        return Array.Empty<string>();
    }

    // Keep every line that is not empty or whitespace-only (entries themselves are left untouched).
    var wordList = File.ReadAllText(fileName)
        .SplitToLines()
        .Where(p => !string.IsNullOrWhiteSpace(p))
        .ToList();

    // Case-insensitive match so that "ENG"/"Eng" also receive the English extras.
    if (string.Equals(threeLetterIsoLanguageName, "eng", StringComparison.OrdinalIgnoreCase))
    {
        // Ignore list
        // NOTE(review): presumably words that must be treated as whole words by the splitter - confirm.
        wordList.AddRange(new[]
        {
            "Andor", "honour", "putain", "whoah", "eastside", "Starpath", "comlink"
        });
    }

    if (nameList != null)
    {
        wordList.AddRange(nameList.GetNames().Where(p => p.Length > 4));
    }

    // Longest entries first.
    return wordList.OrderByDescending(p => p.Length).ToArray();
}
}
}

View File

@ -340,14 +340,7 @@ namespace Nikse.SubtitleEdit.Forms
_wordSplitListLanguage = languageName;
var threeLetterIsoLanguageName = Iso639Dash2LanguageCode.GetThreeLetterCodeFromTwoLetterCode(twoLetterLanguageName);
var fileName = $"{Configuration.DictionariesDirectory}{threeLetterIsoLanguageName}_WordSplitList.txt";
if (!File.Exists(fileName))
{
return Array.Empty<string>();
}
var wordList = File.ReadAllText(fileName).SplitToLines().Where(p => p.Trim().Length > 0).ToList();
return wordList.ToArray();
return StringWithoutSpaceSplitToWords.LoadWordSplitList(threeLetterIsoLanguageName, null);
}
private void FillSpellCheckDictionaries(string languageName)

View File

@ -313,7 +313,7 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
_nameListWithApostrophe = new HashSet<string>();
var nameListWithPeriods = new List<string>();
_abbreviationList = new HashSet<string>();
_wordSplitList = LoadWordSplitList(threeLetterIsoLanguageName, _nameListObj);
_wordSplitList = StringWithoutSpaceSplitToWords.LoadWordSplitList(threeLetterIsoLanguageName, _nameListObj);
var isEnglish = threeLetterIsoLanguageName.Equals("eng", StringComparison.OrdinalIgnoreCase);
foreach (var name in _nameList)
@ -415,19 +415,6 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
}
}
/// <summary>
/// Loads the word split list for a language from the dictionaries directory and
/// appends the longer names from the supplied name list.
/// </summary>
/// <param name="threeLetterIsoLanguageName">
/// Three-letter language code; used as the prefix of the "&lt;code&gt;_WordSplitList.txt" file name.
/// </param>
/// <param name="nameList">
/// Name list whose names longer than four characters are appended; may be null, in which case no names are added.
/// </param>
/// <returns>
/// All collected entries ordered by descending length, or an empty array when the
/// word split list file does not exist.
/// </returns>
private static string[] LoadWordSplitList(string threeLetterIsoLanguageName, NameList nameList)
{
    var fileName = $"{Configuration.DictionariesDirectory}{threeLetterIsoLanguageName}_WordSplitList.txt";
    if (!File.Exists(fileName))
    {
        return Array.Empty<string>();
    }

    // Keep every line that is not empty or whitespace-only (entries themselves are left untouched).
    var wordList = File.ReadAllText(fileName)
        .SplitToLines()
        .Where(p => !string.IsNullOrWhiteSpace(p))
        .ToList();

    // Guard against a missing name list instead of failing with a NullReferenceException.
    if (nameList != null)
    {
        wordList.AddRange(nameList.GetNames().Where(p => p.Length > 4));
    }

    // Longest entries first.
    return wordList.OrderByDescending(p => p.Length).ToArray();
}
public string SpellCheckDictionaryName
{
get