Improve ocr string-split-when-space-is-missing - thx Dnkhatri :)

Related to #5616
This commit is contained in:
niksedk 2021-12-18 13:49:06 +01:00
parent e352299fca
commit 91d9f69431
13 changed files with 3565 additions and 6 deletions

View File

@ -24,6 +24,7 @@
* Fix WebVTT browser preview - thx Jeremy
* Fix crash in PAC when saving with negative values - thx sandrickn
* Fix frame Rate/Multiplier/dropMode in TimedText properties - thx OmrSi
* Fix unwanted lowercase for ASSA properties window when reopen - thx Thomas
3.6.4 (3rd December 2021)

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,20 @@
using Microsoft.VisualStudio.TestTools.UnitTesting;
using Nikse.SubtitleEdit.Core.Dictionaries;
using System;
using System.Linq;
namespace Test.Dictionaries
{
/// <summary>
/// Round-trip test for <see cref="StringWithoutSpaceSplitToWords.SplitWord"/>: a sentence
/// with all spaces removed must be split back into the original sentence when the
/// word list contains every word of that sentence.
/// </summary>
[TestClass]
public class StringWithoutSpaceSplitToWordsTest
{
    /// <summary>
    /// Sentence used both as the source of the word list and as the expected result.
    /// </summary>
    private const string Sentence = "we the people of the united states in order to form a more perfect union establish justice in sure domestic tranquility provide for the common defence promote the general welfare and secure the blessings of liberty to ourselves and our posterity do ordain and establish this constitution for the united states of america";

    // NOTE(review): the method name looks copied from another test - nothing here validates XML.
    [TestMethod]
    public void DictionariesValidXml()
    {
        // Word list is handed over longest-first, the order SplitWord consumes it in.
        var separators = new char[] { ' ' };
        var dictionary = Sentence
            .Split(separators, StringSplitOptions.RemoveEmptyEntries)
            .OrderByDescending(word => word.Length)
            .ToArray();
        const string joined = "wethepeopleoftheunitedstatesinordertoformamoreperfectunionestablishjusticeinsuredomestictranquilityprovideforthecommondefencepromotethegeneralwelfareandsecuretheblessingsoflibertytoourselvesandourposteritydoordainandestablishthisconstitutionfortheunitedstatesofamerica";

        var actual = StringWithoutSpaceSplitToWords.SplitWord(dictionary, joined);

        Assert.AreEqual(Sentence, actual);
    }
}
}

View File

@ -63,6 +63,7 @@
<ItemGroup>
<Compile Include="Assa\TagHelperRemoveTagTest.cs" />
<Compile Include="Assa\ResamplerTest.cs" />
<Compile Include="Dictionaries\StringWithoutSpaceSplitToWordsTest.cs" />
<Compile Include="LanguageFiles\LanguageFileTest.cs" />
<Compile Include="Logic\SubtitleFormats\PacTest.cs" />
<Compile Include="Core\UUEncodingTest.cs" />

View File

@ -9,6 +9,7 @@ namespace Nikse.SubtitleEdit.Core.ContainerFormats.Mp4
{
/// <summary>
/// http://wiki.multimedia.cx/index.php?title=QuickTime_container
/// https://gpac.github.io/mp4box.js/test/filereader.html
/// </summary>
public class MP4Parser : Box
{

View File

@ -0,0 +1,43 @@
using System;
using System.Collections.Generic;
using System.Linq;
namespace Nikse.SubtitleEdit.Core.Dictionaries
{
/// <summary>
/// Re-inserts missing spaces into a run of text by greedily matching known words.
/// </summary>
public static class StringWithoutSpaceSplitToWords
{
    /// <summary>
    /// Placeholder written over every character that has already been claimed by a word,
    /// so later (shorter) words cannot match inside or across an earlier match.
    /// </summary>
    private const char Claimed = '¤';

    /// <summary>
    /// Tries to split <paramref name="input"/> into the words from <paramref name="words"/>.
    /// </summary>
    /// <param name="words">
    /// Candidate words, tried in array order (callers pass the longest words first).
    /// Null or empty entries are ignored; a null array is treated as empty.
    /// </param>
    /// <param name="input">Text without spaces between words.</param>
    /// <returns>
    /// The input with a single space at every word boundary (trimmed) when every character
    /// of the input was claimed by some word; otherwise the unchanged <paramref name="input"/>.
    /// Returns null when <paramref name="input"/> is null.
    /// </returns>
    public static string SplitWord(string[] words, string input)
    {
        if (input == null)
        {
            return null;
        }

        var check = input;
        var spaces = new List<int>();
        if (words != null)
        {
            foreach (var w in words)
            {
                // An empty word "matches" everywhere and would never advance the search.
                if (string.IsNullOrEmpty(w))
                {
                    continue;
                }

                var mask = new string(Claimed, w.Length);
                var idx = check.IndexOf(w, StringComparison.Ordinal);
                while (idx >= 0)
                {
                    spaces.Add(idx);
                    spaces.Add(idx + w.Length);
                    check = check.Remove(idx, w.Length).Insert(idx, mask);

                    // Ordinal keeps the match exactly w.Length chars long (a culture-sensitive
                    // match may not be), and resuming after the match guarantees progress.
                    idx = check.IndexOf(w, idx + w.Length, StringComparison.Ordinal);
                }
            }
        }

        // Anything left besides placeholders/spaces means part of the input is unknown: give up.
        if (check.Trim(Claimed, ' ').Length != 0)
        {
            return input;
        }

        // Insert from the back so earlier positions stay valid; shared boundaries get one space.
        var s = input;
        foreach (var position in spaces.Distinct().OrderByDescending(p => p))
        {
            s = s.Insert(position, " ");
        }

        return s.Trim();
    }
}
}

View File

@ -2,10 +2,7 @@
## How to load a subtitle file
```csharp
var subtitle = new Subtitle();
var subRip = new SubRip();
var lines = File.ReadAllLines(@"C:\test.srt").ToList();
subRip.LoadSubtitle(subtitle, lines, "untitled");
var subtitle = Subtitle.Parse(fileName);
var numberOfSubtitleLines = subtitle.Paragraphs.Count;
var firstText = subtitle.Paragraphs.First().Text;
var firstStartMilliseconds = subtitle.Paragraphs.First().StartTime.TotalMilliseconds;
@ -13,5 +10,5 @@ var firstStartMilliseconds = subtitle.Paragraphs.First().StartTime.TotalMillisec
## How to save a subtitle file
```csharp
File.WriteAllText(@"C:\Data\new.srt", subRip.ToText(subtitle, "untitled"));
File.WriteAllText(@"C:\Data\new.srt", new SubRip().ToText(subtitle, "untitled"));
```

View File

@ -17014,6 +17014,11 @@ namespace Nikse.SubtitleEdit.Forms
e.SuppressKeyPress = true;
}
// put new entries above tabs
if (e.Modifiers == (Keys.Alt | Keys.Shift | Keys.Control) && e.KeyCode == Keys.F12)
{
new WordSplitDictionaryGenerator().ShowDialog(this);
}
}
private void ToggleVideoControlsOnOff(bool on)

View File

@ -0,0 +1,294 @@
namespace Nikse.SubtitleEdit.Forms.Ocr
{
/// <summary>
/// Designer half of the "Word split dictionary generator" form: declares the controls
/// and builds the layout in <see cref="InitializeComponent"/>.
/// </summary>
partial class WordSplitDictionaryGenerator
{
/// <summary>
/// Required designer variable.
/// </summary>
private System.ComponentModel.IContainer components = null;
/// <summary>
/// Clean up any resources being used.
/// </summary>
/// <param name="disposing">true if managed resources should be disposed; otherwise, false.</param>
protected override void Dispose(bool disposing)
{
// components stays null unless something assigns it, hence the null check.
if (disposing && (components != null))
{
components.Dispose();
}
base.Dispose(disposing);
}
#region Windows Form Designer generated code
/// <summary>
/// Required method for Designer support - do not modify
/// the contents of this method with the code editor.
/// </summary>
private void InitializeComponent()
{
this.comboBoxDictionaries = new System.Windows.Forms.ComboBox();
this.labelDictionaryLoaded = new System.Windows.Forms.Label();
this.buttonInputBrowse = new System.Windows.Forms.Button();
this.listViewInputFiles = new System.Windows.Forms.ListView();
this.columnHeaderFName = ((System.Windows.Forms.ColumnHeader)(new System.Windows.Forms.ColumnHeader()));
this.columnHeaderSize = ((System.Windows.Forms.ColumnHeader)(new System.Windows.Forms.ColumnHeader()));
this.columnHeaderFormat = ((System.Windows.Forms.ColumnHeader)(new System.Windows.Forms.ColumnHeader()));
this.ButtonGenerate = new System.Windows.Forms.Button();
this.labelStatus = new System.Windows.Forms.Label();
this.comboBoxMinOccurrences = new System.Windows.Forms.ComboBox();
this.labelMinOccurSmall = new System.Windows.Forms.Label();
this.comboBoxMinOccurrencesLongWords = new System.Windows.Forms.ComboBox();
this.labelMinOccurLarge = new System.Windows.Forms.Label();
this.SuspendLayout();
//
// comboBoxDictionaries
//
this.comboBoxDictionaries.Anchor = ((System.Windows.Forms.AnchorStyles)((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Right)));
this.comboBoxDictionaries.DropDownStyle = System.Windows.Forms.ComboBoxStyle.DropDownList;
this.comboBoxDictionaries.FormattingEnabled = true;
this.comboBoxDictionaries.Location = new System.Drawing.Point(725, 12);
this.comboBoxDictionaries.Name = "comboBoxDictionaries";
this.comboBoxDictionaries.Size = new System.Drawing.Size(203, 21);
this.comboBoxDictionaries.TabIndex = 44;
//
// labelDictionaryLoaded
//
this.labelDictionaryLoaded.Anchor = ((System.Windows.Forms.AnchorStyles)((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Right)));
this.labelDictionaryLoaded.AutoSize = true;
this.labelDictionaryLoaded.Location = new System.Drawing.Point(589, 15);
this.labelDictionaryLoaded.Name = "labelDictionaryLoaded";
this.labelDictionaryLoaded.Size = new System.Drawing.Size(111, 13);
this.labelDictionaryLoaded.TabIndex = 43;
this.labelDictionaryLoaded.Text = "Spell check dictionary";
//
// buttonInputBrowse
//
this.buttonInputBrowse.Anchor = ((System.Windows.Forms.AnchorStyles)((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Right)));
this.buttonInputBrowse.Location = new System.Drawing.Point(940, 111);
this.buttonInputBrowse.Name = "buttonInputBrowse";
this.buttonInputBrowse.Size = new System.Drawing.Size(26, 23);
this.buttonInputBrowse.TabIndex = 47;
this.buttonInputBrowse.Text = "...";
this.buttonInputBrowse.UseVisualStyleBackColor = true;
this.buttonInputBrowse.Click += new System.EventHandler(this.buttonInputBrowse_Click);
//
// listViewInputFiles
//
this.listViewInputFiles.AllowDrop = true;
this.listViewInputFiles.Anchor = ((System.Windows.Forms.AnchorStyles)((((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Bottom)
| System.Windows.Forms.AnchorStyles.Left)
| System.Windows.Forms.AnchorStyles.Right)));
this.listViewInputFiles.Columns.AddRange(new System.Windows.Forms.ColumnHeader[] {
this.columnHeaderFName,
this.columnHeaderSize,
this.columnHeaderFormat});
this.listViewInputFiles.FullRowSelect = true;
this.listViewInputFiles.HideSelection = false;
this.listViewInputFiles.Location = new System.Drawing.Point(12, 111);
this.listViewInputFiles.Name = "listViewInputFiles";
this.listViewInputFiles.Size = new System.Drawing.Size(922, 502);
this.listViewInputFiles.TabIndex = 46;
this.listViewInputFiles.UseCompatibleStateImageBehavior = false;
this.listViewInputFiles.View = System.Windows.Forms.View.Details;
//
// columnHeaderFName
//
this.columnHeaderFName.Text = "File name";
this.columnHeaderFName.Width = 500;
//
// columnHeaderSize
//
this.columnHeaderSize.Text = "Size";
this.columnHeaderSize.Width = 75;
//
// columnHeaderFormat
//
this.columnHeaderFormat.Text = "Format";
this.columnHeaderFormat.Width = 200;
//
// ButtonGenerate
//
// Starts disabled; the click handler is named okButton_Click.
this.ButtonGenerate.Anchor = ((System.Windows.Forms.AnchorStyles)((System.Windows.Forms.AnchorStyles.Bottom | System.Windows.Forms.AnchorStyles.Right)));
this.ButtonGenerate.Enabled = false;
this.ButtonGenerate.Location = new System.Drawing.Point(734, 619);
this.ButtonGenerate.Name = "ButtonGenerate";
this.ButtonGenerate.Size = new System.Drawing.Size(200, 23);
this.ButtonGenerate.TabIndex = 48;
this.ButtonGenerate.Text = "&Generate word split list...";
this.ButtonGenerate.Click += new System.EventHandler(this.okButton_Click);
//
// labelStatus
//
this.labelStatus.Anchor = ((System.Windows.Forms.AnchorStyles)((System.Windows.Forms.AnchorStyles.Bottom | System.Windows.Forms.AnchorStyles.Left)));
this.labelStatus.AutoSize = true;
this.labelStatus.Location = new System.Drawing.Point(12, 624);
this.labelStatus.Name = "labelStatus";
this.labelStatus.Size = new System.Drawing.Size(59, 13);
this.labelStatus.TabIndex = 49;
this.labelStatus.Text = "labelStatus";
//
// comboBoxMinOccurrences
//
// Choices: 3..30 in steps of 1, 40..100 in steps of 10, then 500, 1000, 10000.
this.comboBoxMinOccurrences.Anchor = ((System.Windows.Forms.AnchorStyles)((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Right)));
this.comboBoxMinOccurrences.DropDownStyle = System.Windows.Forms.ComboBoxStyle.DropDownList;
this.comboBoxMinOccurrences.FormattingEnabled = true;
this.comboBoxMinOccurrences.Items.AddRange(new object[] {
"3",
"4",
"5",
"6",
"7",
"8",
"9",
"10",
"11",
"12",
"13",
"14",
"15",
"16",
"17",
"18",
"19",
"20",
"21",
"22",
"23",
"24",
"25",
"26",
"27",
"28",
"29",
"30",
"40",
"50",
"60",
"70",
"80",
"90",
"100",
"500",
"1000",
"10000"});
this.comboBoxMinOccurrences.Location = new System.Drawing.Point(725, 39);
this.comboBoxMinOccurrences.Name = "comboBoxMinOccurrences";
this.comboBoxMinOccurrences.Size = new System.Drawing.Size(203, 21);
this.comboBoxMinOccurrences.TabIndex = 51;
//
// labelMinOccurSmall
//
this.labelMinOccurSmall.Anchor = ((System.Windows.Forms.AnchorStyles)((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Right)));
this.labelMinOccurSmall.AutoSize = true;
this.labelMinOccurSmall.Location = new System.Drawing.Point(576, 42);
this.labelMinOccurSmall.Name = "labelMinOccurSmall";
this.labelMinOccurSmall.Size = new System.Drawing.Size(124, 13);
this.labelMinOccurSmall.TabIndex = 50;
this.labelMinOccurSmall.Text = "Min occurrences, len < 7";
//
// comboBoxMinOccurrencesLongWords
//
// Same choices as comboBoxMinOccurrences, except the list starts at 2.
this.comboBoxMinOccurrencesLongWords.Anchor = ((System.Windows.Forms.AnchorStyles)((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Right)));
this.comboBoxMinOccurrencesLongWords.DropDownStyle = System.Windows.Forms.ComboBoxStyle.DropDownList;
this.comboBoxMinOccurrencesLongWords.FormattingEnabled = true;
this.comboBoxMinOccurrencesLongWords.Items.AddRange(new object[] {
"2",
"3",
"4",
"5",
"6",
"7",
"8",
"9",
"10",
"11",
"12",
"13",
"14",
"15",
"16",
"17",
"18",
"19",
"20",
"21",
"22",
"23",
"24",
"25",
"26",
"27",
"28",
"29",
"30",
"40",
"50",
"60",
"70",
"80",
"90",
"100",
"500",
"1000",
"10000"});
this.comboBoxMinOccurrencesLongWords.Location = new System.Drawing.Point(725, 66);
this.comboBoxMinOccurrencesLongWords.Name = "comboBoxMinOccurrencesLongWords";
this.comboBoxMinOccurrencesLongWords.Size = new System.Drawing.Size(203, 21);
this.comboBoxMinOccurrencesLongWords.TabIndex = 53;
//
// labelMinOccurLarge
//
this.labelMinOccurLarge.Anchor = ((System.Windows.Forms.AnchorStyles)((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Right)));
this.labelMinOccurLarge.AutoSize = true;
this.labelMinOccurLarge.Location = new System.Drawing.Point(570, 69);
this.labelMinOccurLarge.Name = "labelMinOccurLarge";
this.labelMinOccurLarge.Size = new System.Drawing.Size(130, 13);
this.labelMinOccurLarge.TabIndex = 52;
this.labelMinOccurLarge.Text = "Min occurrences, len >= 7";
//
// WordSplitDictionaryGenerator
//
this.AutoScaleDimensions = new System.Drawing.SizeF(6F, 13F);
this.AutoScaleMode = System.Windows.Forms.AutoScaleMode.Font;
this.ClientSize = new System.Drawing.Size(978, 654);
this.Controls.Add(this.comboBoxMinOccurrencesLongWords);
this.Controls.Add(this.labelMinOccurLarge);
this.Controls.Add(this.comboBoxMinOccurrences);
this.Controls.Add(this.labelMinOccurSmall);
this.Controls.Add(this.labelStatus);
this.Controls.Add(this.ButtonGenerate);
this.Controls.Add(this.buttonInputBrowse);
this.Controls.Add(this.listViewInputFiles);
this.Controls.Add(this.comboBoxDictionaries);
this.Controls.Add(this.labelDictionaryLoaded);
// KeyPreview routes key presses to the form-level KeyDown handler wired up below.
this.KeyPreview = true;
this.Name = "WordSplitDictionaryGenerator";
this.ShowIcon = false;
this.ShowInTaskbar = false;
this.StartPosition = System.Windows.Forms.FormStartPosition.CenterParent;
this.Text = "Word split dictionary generator";
this.ResizeEnd += new System.EventHandler(this.WordSplitDictionaryGenerator_ResizeEnd);
this.KeyDown += new System.Windows.Forms.KeyEventHandler(this.WordSplitDictionaryGenerator_KeyDown);
this.ResumeLayout(false);
this.PerformLayout();
}
#endregion
// Controls created and configured in InitializeComponent.
private System.Windows.Forms.ComboBox comboBoxDictionaries;
private System.Windows.Forms.Label labelDictionaryLoaded;
private System.Windows.Forms.Button buttonInputBrowse;
private System.Windows.Forms.ListView listViewInputFiles;
private System.Windows.Forms.ColumnHeader columnHeaderFName;
private System.Windows.Forms.ColumnHeader columnHeaderSize;
private System.Windows.Forms.ColumnHeader columnHeaderFormat;
private System.Windows.Forms.Button ButtonGenerate;
private System.Windows.Forms.Label labelStatus;
private System.Windows.Forms.ComboBox comboBoxMinOccurrences;
private System.Windows.Forms.Label labelMinOccurSmall;
private System.Windows.Forms.ComboBox comboBoxMinOccurrencesLongWords;
private System.Windows.Forms.Label labelMinOccurLarge;
}
}

View File

@ -0,0 +1,262 @@
using Nikse.SubtitleEdit.Core.Common;
using Nikse.SubtitleEdit.Core.SpellCheck;
using Nikse.SubtitleEdit.Core.SubtitleFormats;
using Nikse.SubtitleEdit.Logic;
using Nikse.SubtitleEdit.Logic.SpellCheck;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Windows.Forms;
namespace Nikse.SubtitleEdit.Forms.Ocr
{
/// <summary>
/// Dialog that builds a word list for splitting OCR words that are missing spaces:
/// it loads subtitle files, counts every word the selected spell-check dictionary accepts,
/// and saves the words that occur often enough to a text file (longest words first).
/// </summary>
public partial class WordSplitDictionaryGenerator : Form
{
    /// <summary>Subtitles loaded so far; one entry per row in the input list view.</summary>
    private readonly List<Subtitle> _subtitleList = new List<Subtitle>();

    /// <summary>Spell checker for the selected dictionary; created when generation starts.</summary>
    private Hunspell _hunspell;

    /// <summary>
    /// Creates the dialog, fills the dictionary combo box and selects the default thresholds.
    /// </summary>
    public WordSplitDictionaryGenerator()
    {
        UiUtil.PreInitialize(this);
        InitializeComponent();
        UiUtil.FixFonts(this);
        FillSpellCheckDictionaries();
        comboBoxMinOccurrences.SelectedIndex = 13;
        comboBoxMinOccurrencesLongWords.SelectedIndex = 5;
        listViewInputFiles.AutoSizeLastColumn();
        labelStatus.Text = string.Empty;
    }

    /// <summary>
    /// Releases the spell checker when the dialog closes; a form shown with ShowDialog is
    /// not disposed automatically, so the checker would otherwise stay alive.
    /// </summary>
    protected override void OnFormClosed(FormClosedEventArgs e)
    {
        _hunspell?.Dispose();
        _hunspell = null;
        base.OnFormClosed(e);
    }

    /// <summary>
    /// Lists the available spell-check dictionaries and selects the first one, if any.
    /// </summary>
    private void FillSpellCheckDictionaries()
    {
        comboBoxDictionaries.Items.Clear();
        foreach (string name in Utilities.GetDictionaryLanguages())
        {
            comboBoxDictionaries.Items.Add(name);
        }

        if (comboBoxDictionaries.Items.Count > 0)
        {
            comboBoxDictionaries.SelectedIndex = 0;
        }
    }

    /// <summary>
    /// Lets the user pick subtitle files and adds each of them to the input list.
    /// </summary>
    private void buttonInputBrowse_Click(object sender, EventArgs e)
    {
        // Disabled while loading because Application.DoEvents() below would allow re-entry.
        buttonInputBrowse.Enabled = false;
        try
        {
            using (var openFileDialog1 = new OpenFileDialog
            {
                Title = LanguageSettings.Current.General.OpenSubtitle,
                FileName = string.Empty,
                Filter = UiUtil.SubtitleExtensionFilter.Value,
                Multiselect = true
            })
            {
                if (openFileDialog1.ShowDialog(this) == DialogResult.OK)
                {
                    try
                    {
                        Cursor = Cursors.WaitCursor;
                        labelStatus.Text = LanguageSettings.Current.General.PleaseWait;
                        listViewInputFiles.BeginUpdate();
                        foreach (string fileName in openFileDialog1.FileNames)
                        {
                            AddInputFile(fileName);
                            Application.DoEvents();
                        }
                    }
                    finally
                    {
                        listViewInputFiles.EndUpdate();
                        Cursor = Cursors.Default;
                        labelStatus.Text = string.Empty;
                    }
                }
            }
        }
        finally
        {
            // Restore the UI even when a file fails to load, so the dialog stays usable.
            buttonInputBrowse.Enabled = true;
            labelStatus.Text = $"{listViewInputFiles.Items.Count} input files";
            ButtonGenerate.Enabled = listViewInputFiles.Items.Count > 0;
        }
    }

    /// <summary>
    /// Loads one subtitle file and, when its format is recognized, adds it to the list view
    /// and to the loaded subtitles. Files already listed, files of 500,000 bytes or more,
    /// Blu-ray sup / VobSub / Matroska files and unrecognized files are skipped silently.
    /// </summary>
    /// <param name="fileName">Full path of the file to add.</param>
    private void AddInputFile(string fileName)
    {
        foreach (ListViewItem lvi in listViewInputFiles.Items)
        {
            if (lvi.Text.Equals(fileName, StringComparison.OrdinalIgnoreCase))
            {
                return;
            }
        }

        var fi = new FileInfo(fileName);
        var ext = fi.Extension.ToLowerInvariant();
        var item = new ListViewItem(fileName);
        item.SubItems.Add(Utilities.FormatBytesToDisplayFileSize(fi.Length));
        if (fi.Length >= 500_000)
        {
            return;
        }

        if (FileUtil.IsBluRaySup(fileName) || FileUtil.IsVobSub(fileName) ||
            ((ext == ".mkv" || ext == ".mks") && FileUtil.IsMatroskaFile(fileName)))
        {
            return;
        }

        var sub = new Subtitle();
        var format = LoadWithAnyFormat(sub, fileName);
        if (format == null)
        {
            return;
        }

        item.SubItems.Add(format.Name);
        listViewInputFiles.Items.Add(item);
        _subtitleList.Add(sub);
    }

    /// <summary>
    /// Loads <paramref name="fileName"/> into <paramref name="sub"/>, trying the default
    /// loader first, then the binary formats, then the "other" text formats.
    /// </summary>
    /// <returns>The format that accepted the file, or null when none did.</returns>
    private static SubtitleFormat LoadWithAnyFormat(Subtitle sub, string fileName)
    {
        SubtitleFormat format = sub.LoadSubtitle(fileName, out _, null);
        if (format != null)
        {
            return format;
        }

        foreach (var f in SubtitleFormat.GetBinaryFormats(true))
        {
            if (f.IsMine(null, fileName))
            {
                f.LoadSubtitle(sub, null, fileName);
                return f;
            }
        }

        var encoding = LanguageAutoDetect.GetEncodingFromFile(fileName);
        var lines = FileUtil.ReadAllTextShared(fileName, encoding).SplitToLines();
        foreach (var f in SubtitleFormat.GetTextOtherFormats())
        {
            if (f.IsMine(lines, fileName))
            {
                f.LoadSubtitle(sub, lines, fileName);
                return f;
            }
        }

        return null;
    }

    /// <summary>
    /// Counts the correctly spelled words of all loaded subtitles and saves the result.
    /// </summary>
    private void okButton_Click(object sender, EventArgs e)
    {
        if (comboBoxDictionaries.SelectedIndex < 0)
        {
            labelStatus.Text = "No spell check dictionary selected";
            return;
        }

        // Application.DoEvents() runs inside the counting loop; keep the user from starting
        // a second run or changing the subtitle list while it is being enumerated.
        ButtonGenerate.Enabled = false;
        buttonInputBrowse.Enabled = false;
        try
        {
            LoadHunspell();
            SaveFile(CountCorrectlySpelledWords());
        }
        finally
        {
            buttonInputBrowse.Enabled = true;
            ButtonGenerate.Enabled = listViewInputFiles.Items.Count > 0;
        }
    }

    /// <summary>
    /// Builds a word-to-occurrence map over all loaded subtitles, skipping numbers and
    /// words the spell checker rejects. The status label is refreshed after each subtitle.
    /// </summary>
    private Dictionary<string, int> CountCorrectlySpelledWords()
    {
        var wordDictionary = new Dictionary<string, int>();
        foreach (var subtitle in _subtitleList)
        {
            foreach (var p in subtitle.Paragraphs)
            {
                var words = SpellCheckWordLists.Split(HtmlUtil.RemoveHtmlTags(p.Text, true));
                foreach (var word in words)
                {
                    var text = word.Text;
                    if (!_hunspell.Spell(text) || Utilities.IsNumber(text))
                    {
                        continue;
                    }

                    // count is 0 for a word not seen before, so one lookup covers both cases.
                    wordDictionary.TryGetValue(text, out var count);
                    wordDictionary[text] = count + 1;
                }
            }

            labelStatus.Text = $"{wordDictionary.Count:#,###,##0} words...";
            labelStatus.Refresh();
            Application.DoEvents();
        }

        return wordDictionary;
    }

    /// <summary>
    /// Asks for a target file and writes every word that meets its occurrence threshold,
    /// one per line, longest words first.
    /// </summary>
    /// <param name="wordDictionary">Occurrence count per word.</param>
    private void SaveFile(Dictionary<string, int> wordDictionary)
    {
        int minUseCountSmall = int.Parse(comboBoxMinOccurrences.Text);
        int minUseCountLarge = int.Parse(comboBoxMinOccurrencesLongWords.Text);

        // NOTE(review): the save dialog reuses the "open subtitle" title - confirm this is intended.
        using (var saveFileDialog = new SaveFileDialog
        {
            Title = LanguageSettings.Current.General.OpenSubtitle,
            FileName = GetThreeLetterLanguageCode() + "_WordSplitList",
            Filter = "Text|*.txt",
            InitialDirectory = Configuration.DictionariesDirectory,
        })
        {
            if (saveFileDialog.ShowDialog(this) != DialogResult.OK)
            {
                return;
            }

            // Words shorter than 7 characters use the "small" threshold, the rest the "large" one.
            var list = wordDictionary
                .Where(word => word.Key.Length < 7 && word.Value >= minUseCountSmall ||
                               word.Key.Length >= 7 && word.Value >= minUseCountLarge)
                .Select(word => word.Key)
                .ToList();

            var sb = new StringBuilder();
            foreach (var word in list.OrderByDescending(prop => prop.Length))
            {
                sb.AppendLine(word);
            }

            File.WriteAllText(saveFileDialog.FileName, sb.ToString());
            var info = $"{list.Count:#,###,##0} words saved in {saveFileDialog.FileName}";
            labelStatus.Text = info;
            using (var f = new ExportPngXmlDialogOpenFolder(info + Environment.NewLine + "File is created with longest words first - do check the bottom of the file regarding valid one/two/three letter words!", Path.GetDirectoryName(saveFileDialog.FileName), saveFileDialog.FileName))
            {
                f.ShowDialog(this);
            }
        }
    }

    /// <summary>
    /// Returns the text between '[' and ']' of the selected dictionary entry, or the whole
    /// entry when it has no such bracketed part.
    /// </summary>
    private string GetLanguageCode()
    {
        var languageString = comboBoxDictionaries.Text;
        var open = languageString.IndexOf('[');
        var close = languageString.IndexOf(']');
        if (open > 0 && open < close)
        {
            return languageString.Substring(open + 1, close - open - 1);
        }

        return languageString;
    }

    /// <summary>
    /// Converts the part of the language code before the first '_' or '-' to a three-letter code.
    /// </summary>
    private string GetThreeLetterLanguageCode()
    {
        var languageString = GetLanguageCode().Split('_', '-').First();
        return Iso639Dash2LanguageCode.GetThreeLetterCodeFromTwoLetterCode(languageString);
    }

    /// <summary>
    /// Replaces the current spell checker with one for the selected dictionary.
    /// </summary>
    private void LoadHunspell()
    {
        var dictionary = Utilities.DictionaryFolder + GetLanguageCode();
        _hunspell?.Dispose();

        // Cleared first so a failing GetHunspell does not leave a disposed instance behind.
        _hunspell = null;
        _hunspell = Hunspell.GetHunspell(dictionary);
    }

    /// <summary>Closes the dialog with a Cancel result when Escape is pressed.</summary>
    private void WordSplitDictionaryGenerator_KeyDown(object sender, KeyEventArgs e)
    {
        if (e.KeyCode == Keys.Escape)
        {
            DialogResult = DialogResult.Cancel;
        }
    }

    /// <summary>Re-fits the last list view column after the form has been resized.</summary>
    private void WordSplitDictionaryGenerator_ResizeEnd(object sender, EventArgs e)
    {
        listViewInputFiles.AutoSizeLastColumn();
    }
}
}

View File

@ -0,0 +1,120 @@
<?xml version="1.0" encoding="utf-8"?>
<root>
<!--
Microsoft ResX Schema
Version 2.0
The primary goal of this format is to allow a simple XML format
that is mostly human readable. The generation and parsing of the
various data types are done through the TypeConverter classes
associated with the data types.
Example:
... ado.net/XML headers & schema ...
<resheader name="resmimetype">text/microsoft-resx</resheader>
<resheader name="version">2.0</resheader>
<resheader name="reader">System.Resources.ResXResourceReader, System.Windows.Forms, ...</resheader>
<resheader name="writer">System.Resources.ResXResourceWriter, System.Windows.Forms, ...</resheader>
<data name="Name1"><value>this is my long string</value><comment>this is a comment</comment></data>
<data name="Color1" type="System.Drawing.Color, System.Drawing">Blue</data>
<data name="Bitmap1" mimetype="application/x-microsoft.net.object.binary.base64">
<value>[base64 mime encoded serialized .NET Framework object]</value>
</data>
<data name="Icon1" type="System.Drawing.Icon, System.Drawing" mimetype="application/x-microsoft.net.object.bytearray.base64">
<value>[base64 mime encoded string representing a byte array form of the .NET Framework object]</value>
<comment>This is a comment</comment>
</data>
There are any number of "resheader" rows that contain simple
name/value pairs.
Each data row contains a name, and value. The row also contains a
type or mimetype. Type corresponds to a .NET class that supports
text/value conversion through the TypeConverter architecture.
Classes that don't support this are serialized and stored with the
mimetype set.
The mimetype is used for serialized objects, and tells the
ResXResourceReader how to depersist the object. This is currently not
extensible. For a given mimetype the value must be set accordingly:
Note - application/x-microsoft.net.object.binary.base64 is the format
that the ResXResourceWriter will generate, however the reader can
read any of the formats listed below.
mimetype: application/x-microsoft.net.object.binary.base64
value : The object must be serialized with
: System.Runtime.Serialization.Formatters.Binary.BinaryFormatter
: and then encoded with base64 encoding.
mimetype: application/x-microsoft.net.object.soap.base64
value : The object must be serialized with
: System.Runtime.Serialization.Formatters.Soap.SoapFormatter
: and then encoded with base64 encoding.
mimetype: application/x-microsoft.net.object.bytearray.base64
value : The object must be serialized into a byte array
: using a System.ComponentModel.TypeConverter
: and then encoded with base64 encoding.
-->
<xsd:schema id="root" xmlns="" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:msdata="urn:schemas-microsoft-com:xml-msdata">
<xsd:import namespace="http://www.w3.org/XML/1998/namespace" />
<xsd:element name="root" msdata:IsDataSet="true">
<xsd:complexType>
<xsd:choice maxOccurs="unbounded">
<xsd:element name="metadata">
<xsd:complexType>
<xsd:sequence>
<xsd:element name="value" type="xsd:string" minOccurs="0" />
</xsd:sequence>
<xsd:attribute name="name" use="required" type="xsd:string" />
<xsd:attribute name="type" type="xsd:string" />
<xsd:attribute name="mimetype" type="xsd:string" />
<xsd:attribute ref="xml:space" />
</xsd:complexType>
</xsd:element>
<xsd:element name="assembly">
<xsd:complexType>
<xsd:attribute name="alias" type="xsd:string" />
<xsd:attribute name="name" type="xsd:string" />
</xsd:complexType>
</xsd:element>
<xsd:element name="data">
<xsd:complexType>
<xsd:sequence>
<xsd:element name="value" type="xsd:string" minOccurs="0" msdata:Ordinal="1" />
<xsd:element name="comment" type="xsd:string" minOccurs="0" msdata:Ordinal="2" />
</xsd:sequence>
<xsd:attribute name="name" type="xsd:string" use="required" msdata:Ordinal="1" />
<xsd:attribute name="type" type="xsd:string" msdata:Ordinal="3" />
<xsd:attribute name="mimetype" type="xsd:string" msdata:Ordinal="4" />
<xsd:attribute ref="xml:space" />
</xsd:complexType>
</xsd:element>
<xsd:element name="resheader">
<xsd:complexType>
<xsd:sequence>
<xsd:element name="value" type="xsd:string" minOccurs="0" msdata:Ordinal="1" />
</xsd:sequence>
<xsd:attribute name="name" type="xsd:string" use="required" />
</xsd:complexType>
</xsd:element>
</xsd:choice>
</xsd:complexType>
</xsd:element>
</xsd:schema>
<resheader name="resmimetype">
<value>text/microsoft-resx</value>
</resheader>
<resheader name="version">
<value>2.0</value>
</resheader>
<resheader name="reader">
<value>System.Resources.ResXResourceReader, System.Windows.Forms, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089</value>
</resheader>
<resheader name="writer">
<value>System.Resources.ResXResourceWriter, System.Windows.Forms, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089</value>
</resheader>
</root>

View File

@ -15,7 +15,6 @@ using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Windows.Forms;
using System.Xml;
namespace Nikse.SubtitleEdit.Logic.Ocr
{
@ -75,6 +74,7 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
private HashSet<string> _userWordList = new HashSet<string>();
private HashSet<string> _wordSkipList = new HashSet<string>();
private readonly HashSet<string> _wordSpellOkList = new HashSet<string>();
private string[] _wordSplitList;
private Hunspell _hunspell;
private Dictionary<string, string> _changeAllDictionary;
private SpellCheckWordLists _spellCheckWordLists;
@ -302,6 +302,7 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
_nameListWithApostrophe = new HashSet<string>();
var nameListWithPeriods = new List<string>();
_abbreviationList = new HashSet<string>();
_wordSplitList = LoadWordSplitList(threeLetterIsoLanguageName);
bool isEnglish = threeLetterIsoLanguageName.Equals("eng", StringComparison.OrdinalIgnoreCase);
foreach (string name in _nameList)
@ -403,6 +404,18 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
}
}
/// <summary>
/// Loads the word split list for a language from
/// "{three-letter code}_WordSplitList.txt" in the dictionaries directory.
/// </summary>
/// <param name="threeLetterIsoLanguageName">Three-letter language code used in the file name.</param>
/// <returns>
/// The non-blank lines of the file in file order, each with surrounding whitespace removed;
/// an empty array when the file does not exist.
/// </returns>
private string[] LoadWordSplitList(string threeLetterIsoLanguageName)
{
    var fileName = $"{Configuration.DictionariesDirectory}{threeLetterIsoLanguageName}_WordSplitList.txt";
    if (!File.Exists(fileName))
    {
        return Array.Empty<string>();
    }

    // Trim before storing: an entry with stray whitespace can never match a word without spaces.
    return File.ReadAllText(fileName)
        .SplitToLines()
        .Select(line => line.Trim())
        .Where(line => line.Length > 0)
        .ToArray();
}
public string SpellCheckDictionaryName
{
get
@ -1558,6 +1571,15 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
guesses.Add(wordWithVerticalLine);
}
if (word.Length > 4)
{
var splitWords = StringWithoutSpaceSplitToWords.SplitWord(_wordSplitList, word);
if (splitWords != word)
{
guesses.Add(splitWords);
}
}
if (word.Length > 4 && autoGuess == AutoGuessLevel.Aggressive)
{
guesses.AddRange((List<string>)_ocrFixReplaceList.CreateGuessesFromLetters(word, _threeLetterIsoLanguageName));

View File

@ -222,6 +222,12 @@
<Compile Include="Forms\AddWaveformBatch.Designer.cs">
<DependentUpon>AddWaveformBatch.cs</DependentUpon>
</Compile>
<Compile Include="Forms\Ocr\WordSplitDictionaryGenerator.cs">
<SubType>Form</SubType>
</Compile>
<Compile Include="Forms\Ocr\WordSplitDictionaryGenerator.Designer.cs">
<DependentUpon>WordSplitDictionaryGenerator.cs</DependentUpon>
</Compile>
<Compile Include="Forms\SceneChangesList.cs">
<SubType>Form</SubType>
</Compile>
@ -1374,6 +1380,9 @@
<EmbeddedResource Include="Forms\AddWaveformBatch.resx">
<DependentUpon>AddWaveformBatch.cs</DependentUpon>
</EmbeddedResource>
<EmbeddedResource Include="Forms\Ocr\WordSplitDictionaryGenerator.resx">
<DependentUpon>WordSplitDictionaryGenerator.cs</DependentUpon>
</EmbeddedResource>
<EmbeddedResource Include="Forms\SceneChangesList.resx">
<DependentUpon>SceneChangesList.cs</DependentUpon>
</EmbeddedResource>