More work related to word-split-list

This commit is contained in:
niksedk 2021-12-26 20:10:49 +01:00
parent 4f4f22120b
commit 5ca26ec918
3 changed files with 34 additions and 0 deletions

View File

@ -88,6 +88,7 @@
<word>colonoscopy</word>
<word>colours</word>
<word>contractualism</word>
<word>copain</word>
<word>copernicium</word>
<word>copper</word>
<word>copperish</word>
@ -96,6 +97,9 @@
<word>cottonoid</word>
<word>could've</word>
<word>craniotomy</word>
<word>crossbone</word>
<word>crosshair</word>
<word>crosshairs</word>
<word>cryo</word>
<word>cryotube</word>
<word>cudgelled</word>
@ -113,6 +117,7 @@
<word>douchey</word>
<word>dreamt</word>
<word>dubnium</word>
<word>dumbass</word>
<word>dysesthesia</word>
<word>dysprosium</word>
<word>einsteinium</word>
@ -130,8 +135,10 @@
<word>everything's</word>
<word>extradural</word>
<word>faggot</word>
<word>failsafe</word>
<word>falafel</word>
<word>fallin'</word>
<word>fanboy</word>
<word>favour</word>
<word>favoured</word>
<word>favourite</word>
@ -145,6 +152,7 @@
<word>fluorine</word>
<word>flushin'</word>
<word>flyer</word>
<word>forevermore</word>
<word>francium</word>
<word>fuckable</word>
<word>fundraiser</word>
@ -178,6 +186,7 @@
<word>hematoma</word>
<word>hiei</word>
<word>hijab</word>
<word>hitman</word>
<word>hm</word>
<word>holmium</word>
<word>hottie</word>
@ -190,6 +199,7 @@
<word>hypothermic</word>
<word>immersive</word>
<word>immunotherapy</word>
<word>inbox</word>
<word>incontinentia</word>
<word>indium</word>
<word>intercostal</word>
@ -219,6 +229,7 @@
<word>laryngopharyngeal</word>
<word>lawrencium</word>
<word>lead</word>
<word>lightsaber</word>
<word>lithium</word>
<word>lobectomy</word>
<word>lockdown</word>
@ -265,6 +276,8 @@
<word>neuroscientific</word>
<word>neurotypical</word>
<word>neurovascular</word>
<word>newsfeed</word>
<word>newsfeeds</word>
<word>nickel</word>
<word>niobium</word>
<word>nitrogen</word>
@ -330,6 +343,8 @@
<word>rubidium</word>
<word>ruthenium</word>
<word>rutherfordium</word>
<word>safehouse</word>
<word>safeword</word>
<word>saké</word>
<word>saltimbocca</word>
<word>samarium</word>
@ -346,6 +361,8 @@
<word>señorita</word>
<word>sensei</word>
<word>sharpshoot</word>
<word>shitbox</word>
<word>shithead</word>
<word>shithead's</word>
<word>shithole</word>
<word>should've</word>
@ -379,6 +396,7 @@
<word>sudoku</word>
<word>sulfur</word>
<word>sulphur</word>
<word>sunglass</word>
<word>supervolcano</word>
<word>synchronicity</word>
<word>syncopal</word>
@ -440,6 +458,7 @@
<word>vegetations</word>
<word>voicemail</word>
<word>voila</word>
<word>walkthrough</word>
<word>weirding</word>
<word>what'd</word>
<word>what're</word>

View File

@ -135,6 +135,7 @@ namespace Nikse.SubtitleEdit.Core.Common
public string OcrTrainFonts { get; set; }
public string OcrTrainMergedLetters { get; set; }
public string OcrTrainSrtFile { get; set; }
/// <summary>
/// Enables use of the word split list during OCR. When <c>false</c>, the public
/// <c>SplitWord(string[] words, string input)</c> method in
/// <c>Nikse.SubtitleEdit.Core.Dictionaries</c> returns its input unchanged.
/// Initialized to <c>true</c> in the default settings.
/// </summary>
public bool OcrUseWordSplitList { get; set; }
public string BDOpenIn { get; set; }
public string Interjections { get; set; }
public string MicrosoftBingApiId { get; set; }
@ -384,6 +385,7 @@ namespace Nikse.SubtitleEdit.Core.Common
OcrAddLetterRow2 = "♫;Á;É;Í;Ó;Ö;Ő;Ú;Ü;Ű;Ç;Ñ;Å;¡";
OcrTrainFonts = "Arial;Calibri;Corbel;Futura Std Book;Futura Bis;Helvetica Neue;Lucida Console;Tahoma;Trebuchet MS;Verdana";
OcrTrainMergedLetters = "ff ft fi fj fy fl rf rt rv rw ry rt rz ryt tt TV tw yt yw wy wf ryt xy";
OcrUseWordSplitList = true;
Interjections = "Ah;Ahem;Ahh;Ahhh;Ahhhh;Eh;Ehh;Ehhh;Hm;Hmm;Hmmm;Huh;Mm;Mmm;Mmmm;Phew;Gah;Oh;Ohh;Ohhh;Ow;Oww;Owww;Ugh;Ughh;Uh;Uhh;Uhhh;Whew";
MicrosoftTranslatorTokenEndpoint = "https://api.cognitive.microsoft.com/sts/v1.0/issueToken";
GoogleTranslateNoKeyWarningShow = true;
@ -4143,6 +4145,12 @@ $HorzAlign = Center
settings.Tools.OcrTrainSrtFile = subNode.InnerText;
}
subNode = node.SelectSingleNode("OcrUseWordSplitList");
if (subNode != null)
{
settings.Tools.OcrUseWordSplitList = Convert.ToBoolean(subNode.InnerText, CultureInfo.InvariantCulture);
}
subNode = node.SelectSingleNode("BDOpenIn");
if (subNode != null)
{
@ -9198,6 +9206,7 @@ $HorzAlign = Center
textWriter.WriteElementString("OcrTrainFonts", settings.Tools.OcrTrainFonts);
textWriter.WriteElementString("OcrTrainMergedLetters", settings.Tools.OcrTrainMergedLetters);
textWriter.WriteElementString("OcrTrainSrtFile", settings.Tools.OcrTrainSrtFile);
textWriter.WriteElementString("OcrUseWordSplitList", settings.Tools.OcrUseWordSplitList.ToString(CultureInfo.InvariantCulture));
textWriter.WriteElementString("BDOpenIn", settings.Tools.BDOpenIn);
textWriter.WriteElementString("Interjections", settings.Tools.Interjections);
textWriter.WriteElementString("MicrosoftBingApiId", settings.Tools.MicrosoftBingApiId);

View File

@ -1,6 +1,7 @@
using System;
using System.Collections.Generic;
using System.Linq;
using Nikse.SubtitleEdit.Core.Common;
namespace Nikse.SubtitleEdit.Core.Dictionaries
{
@ -8,6 +9,11 @@ namespace Nikse.SubtitleEdit.Core.Dictionaries
{
public static string SplitWord(string[] words, string input)
{
if (!Configuration.Settings.Tools.OcrUseWordSplitList)
{
return input;
}
var usedWords = new List<string>();
var result = SplitWord(words, input, string.Empty, usedWords);
if (result != input)