diff --git a/Tesseract/msvcp90.dll b/Tesseract302/msvcp90.dll
similarity index 100%
rename from Tesseract/msvcp90.dll
rename to Tesseract302/msvcp90.dll
diff --git a/Tesseract/msvcr90.dll b/Tesseract302/msvcr90.dll
similarity index 100%
rename from Tesseract/msvcr90.dll
rename to Tesseract302/msvcr90.dll
diff --git a/Tesseract/tessdata/configs/hocr b/Tesseract302/tessdata/configs/hocr
similarity index 100%
rename from Tesseract/tessdata/configs/hocr
rename to Tesseract302/tessdata/configs/hocr
diff --git a/Tesseract/tessdata/eng.traineddata b/Tesseract302/tessdata/eng.traineddata
similarity index 100%
rename from Tesseract/tessdata/eng.traineddata
rename to Tesseract302/tessdata/eng.traineddata
diff --git a/Tesseract/tessdata/music.traineddata b/Tesseract302/tessdata/music.traineddata
similarity index 100%
rename from Tesseract/tessdata/music.traineddata
rename to Tesseract302/tessdata/music.traineddata
diff --git a/Tesseract/tesseract.exe b/Tesseract302/tesseract.exe
similarity index 100%
rename from Tesseract/tesseract.exe
rename to Tesseract302/tesseract.exe
diff --git a/Tesseract4/tesseract.exe b/Tesseract4/tesseract.exe
index 0b4a0fc1c..0529798a6 100644
Binary files a/Tesseract4/tesseract.exe and b/Tesseract4/tesseract.exe differ
diff --git a/installer/Subtitle_Edit_installer.iss b/installer/Subtitle_Edit_installer.iss
index b2e29d649..c6a85b8dc 100644
--- a/installer/Subtitle_Edit_installer.iss
+++ b/installer/Subtitle_Edit_installer.iss
@@ -269,16 +269,12 @@ Source: ..\Changelog.txt; DestDir: {app};
Source: ..\LICENSE.txt; DestDir: {app}; Flags: ignoreversion; Components: main
Source: Icons\uninstall.ico; DestDir: {app}\Icons; Flags: ignoreversion; Components: main
-Source: ..\Tesseract4\tessdata\configs\hocr; DestDir: {app}\Tesseract4\tessdata\configs; Flags: ignoreversion; Components: main
-Source: ..\Tesseract4\tessdata\osd.traineddata; DestDir: {app}\Tesseract4\tessdata; Flags: ignoreversion; Components: main
-Source: ..\Tesseract4\tesseract.exe; DestDir: {app}\Tesseract4; Flags: ignoreversion; Components: main
-Source: ..\Tesseract4\gif.dll; DestDir: {app}\Tesseract4; Flags: ignoreversion; Components: main
-Source: ..\Tesseract4\jpeg62.dll; DestDir: {app}\Tesseract4; Flags: ignoreversion; Components: main
-Source: ..\Tesseract4\leptonica-1.74.4.dll; DestDir: {app}\Tesseract4; Flags: ignoreversion; Components: main
-Source: ..\Tesseract4\libpng16.dll; DestDir: {app}\Tesseract4; Flags: ignoreversion; Components: main
-Source: ..\Tesseract4\lzma.dll; DestDir: {app}\Tesseract4; Flags: ignoreversion; Components: main
-Source: ..\Tesseract4\tiff.dll; DestDir: {app}\Tesseract4; Flags: ignoreversion; Components: main
-Source: ..\Tesseract4\zlib1.dll; DestDir: {app}\Tesseract4; Flags: ignoreversion; Components: main
+Source: ..\Tesseract302\tessdata\configs\hocr; DestDir: {app}\Tesseract302\tessdata\configs; Flags: ignoreversion; Components: main
+Source: ..\Tesseract302\tessdata\eng.traineddata; DestDir: {app}\Tesseract302\tessdata; Flags: ignoreversion; Components: main
+Source: ..\Tesseract302\tessdata\music.traineddata; DestDir: {app}\Tesseract302\tessdata; Flags: ignoreversion; Components: main
+Source: ..\Tesseract302\tesseract.exe; DestDir: {app}\Tesseract302; Flags: ignoreversion; Components: main
+Source: ..\Tesseract302\msvcp90.dll; DestDir: {app}\Tesseract302; Flags: ignoreversion; Components: main
+Source: ..\Tesseract302\msvcr90.dll; DestDir: {app}\Tesseract302; Flags: ignoreversion; Components: main
[Icons]
@@ -325,6 +321,12 @@ Type: files; Name: {app}\TessData\eng.word-dawg; Check: IsU
Type: dirifempty; Name: {app}\TessData; Check: IsUpgrade()
Type: files; Name: {app}\Tesseract\leptonlib.dll; Check: IsUpgrade()
Type: files; Name: {app}\tessnet2_32.dll; Check: IsUpgrade()
+Type: files; Name: {app}\Tesseract302\tessdata\configs\hocr; Check: IsUpgrade()
+Type: files; Name: {app}\Tesseract302\tessdata\eng.traineddata; Check: IsUpgrade()
+Type: files; Name: {app}\Tesseract302\tessdata\music.traineddata; Check: IsUpgrade()
+Type: files; Name: {app}\Tesseract302\tesseract.exe; Check: IsUpgrade()
+Type: files; Name: {app}\Tesseract302\msvcp90.dll; Check: IsUpgrade()
+Type: files; Name: {app}\Tesseract302\msvcr90.dll; Check: IsUpgrade()
Type: files; Name: {app}\Icons\SubtitleEdit.srt.ico; Check: IsUpgrade()
Type: files; Name: {app}\DocumentIcons.dll; Check: IsUpgrade()
Type: files; Name: {app}\Settings.xml; Check: IsUpgrade()
@@ -596,6 +598,12 @@ begin
DelTree(ExpandConstant('{userappdata}\Subtitle Edit\Tesseract4\tessdata\*.traineddata'), False, True, False);
DeleteFile(ExpandConstant('{userappdata}\Subtitle Edit\Tesseract4\tessdata\configs\hocr'));
+ DeleteFile(ExpandConstant('{userappdata}\Subtitle Edit\Tesseract302\tessdata\configs\hocr'));
+ DelTree(ExpandConstant('{userappdata}\Subtitle Edit\Tesseract302\tessdata\*.traineddata'), False, True, False);
+ DeleteFile(ExpandConstant('{userappdata}\Subtitle Edit\Tesseract302\tesseract.exe'));
+ DeleteFile(ExpandConstant('{userappdata}\Subtitle Edit\Tesseract302\msvcp90.dll'));
+ DeleteFile(ExpandConstant('{userappdata}\Subtitle Edit\Tesseract302\msvcr90.dll'));
+
// Remove possible installed mpv
DeleteFile(ExpandConstant('{userappdata}\Subtitle Edit\mpv-1.dll'));
@@ -614,6 +622,9 @@ begin
RemoveDir(ExpandConstant('{userappdata}\Subtitle Edit\Tesseract4\tessdata\configs'));
RemoveDir(ExpandConstant('{userappdata}\Subtitle Edit\Tesseract4\tessdata'));
RemoveDir(ExpandConstant('{userappdata}\Subtitle Edit\Tesseract4'));
+ RemoveDir(ExpandConstant('{userappdata}\Subtitle Edit\Tesseract302\tessdata\configs'));
+ RemoveDir(ExpandConstant('{userappdata}\Subtitle Edit\Tesseract302\tessdata'));
+ RemoveDir(ExpandConstant('{userappdata}\Subtitle Edit\Tesseract302'));
RemoveDir(ExpandConstant('{userappdata}\Subtitle Edit'));
end;
diff --git a/libse/Configuration.cs b/libse/Configuration.cs
index 04c59b319..801b43458 100644
--- a/libse/Configuration.cs
+++ b/libse/Configuration.cs
@@ -17,7 +17,7 @@ namespace Nikse.SubtitleEdit.Core
public static readonly string BaseDirectory = GetBaseDirectory();
public static readonly string DataDirectory = GetDataDirectory();
- public static readonly string TesseractOriginalDirectory = BaseDirectory + "Tesseract4" + Path.DirectorySeparatorChar;
+ public static readonly string TesseractOriginalDirectory = BaseDirectory + "Tesseract302" + Path.DirectorySeparatorChar;
public static readonly string DictionariesDirectory = DataDirectory + "Dictionaries" + Path.DirectorySeparatorChar;
public static readonly string SpectrogramsDirectory = DataDirectory + "Spectrograms" + Path.DirectorySeparatorChar;
public static readonly string SceneChangesDirectory = DataDirectory + "SceneChanges" + Path.DirectorySeparatorChar;
diff --git a/src/Forms/Main.cs b/src/Forms/Main.cs
index da0542406..3e1997879 100644
--- a/src/Forms/Main.cs
+++ b/src/Forms/Main.cs
@@ -3665,7 +3665,7 @@ namespace Nikse.SubtitleEdit.Forms
bool isUnicode = currentEncoding == Encoding.Unicode || currentEncoding == Encoding.UTF32 || currentEncoding == Encoding.GetEncoding(12001) || currentEncoding == Encoding.UTF7 || currentEncoding == Encoding.UTF8;
if (!isUnicode && (allText.Contains(new[] { '♪', '♫', '♥', '—', '―', '…' }))) // ANSI & music/unicode symbols
{
- if (MessageBox.Show(string.Format(_language.UnicodeMusicSymbolsAnsiWarning), Title, MessageBoxButtons.YesNo) == DialogResult.No)
+ if (MessageBox.Show(string.Format(_language.UnicodeMusicSymbolsAnsiWarning), Title, MessageBoxButtons.YesNoCancel) != DialogResult.Yes)
return DialogResult.No;
}
@@ -3685,7 +3685,7 @@ namespace Nikse.SubtitleEdit.Forms
}
if (containsNegativeTime)
{
- if (MessageBox.Show(_language.NegativeTimeWarning, Title, MessageBoxButtons.YesNo) == DialogResult.No)
+ if (MessageBox.Show(_language.NegativeTimeWarning, Title, MessageBoxButtons.YesNoCancel) != DialogResult.Yes)
return DialogResult.No;
}
@@ -3698,7 +3698,7 @@ namespace Nikse.SubtitleEdit.Forms
if (MessageBox.Show(string.Format(_language.OverwriteModifiedFile,
_fileName, fileOnDisk.ToShortDateString(), fileOnDisk.ToString("HH:mm:ss"),
Environment.NewLine, _fileDateTime.ToShortDateString(), _fileDateTime.ToString("HH:mm:ss")),
- Title + " - " + _language.FileOnDiskModified, MessageBoxButtons.YesNo) == DialogResult.No)
+ Title + " - " + _language.FileOnDiskModified, MessageBoxButtons.YesNoCancel) != DialogResult.Yes)
return DialogResult.No;
}
if (fileInfo.IsReadOnly)
@@ -3797,7 +3797,7 @@ namespace Nikse.SubtitleEdit.Forms
}
if (containsNegativeTime)
{
- if (MessageBox.Show(_language.NegativeTimeWarning, Title, MessageBoxButtons.YesNo) == DialogResult.No)
+ if (MessageBox.Show(_language.NegativeTimeWarning, Title, MessageBoxButtons.YesNoCancel) != DialogResult.Yes)
return DialogResult.No;
}
@@ -3824,7 +3824,7 @@ namespace Nikse.SubtitleEdit.Forms
bool isUnicode = currentEncoding != null && (currentEncoding == Encoding.Unicode || currentEncoding == Encoding.UTF32 || currentEncoding == Encoding.UTF7 || currentEncoding == Encoding.UTF8);
if (!isUnicode && (allText.Contains(new[] { '♪', '♫', '♥', '—', '―', '…' }))) // ANSI & music/unicode symbols
{
- if (MessageBox.Show(string.Format(_language.UnicodeMusicSymbolsAnsiWarning), Title, MessageBoxButtons.YesNo) == DialogResult.No)
+ if (MessageBox.Show(string.Format(_language.UnicodeMusicSymbolsAnsiWarning), Title, MessageBoxButtons.YesNoCancel) != DialogResult.Yes)
return DialogResult.No;
}
@@ -4745,7 +4745,7 @@ namespace Nikse.SubtitleEdit.Forms
//if we fail to find the text, we might want to start searching from the top of the file.
if (!found && _findHelper.StartLineIndex >= 1)
{
- if (MessageBox.Show(_language.FindContinue, _language.FindContinueTitle, MessageBoxButtons.YesNo) == DialogResult.Yes)
+ if (MessageBox.Show(_language.FindContinue, _language.FindContinueTitle, MessageBoxButtons.YesNoCancel) == DialogResult.Yes)
{
found = _findHelper.Find(_subtitle, _subtitleAlternate, -1);
}
@@ -4827,7 +4827,7 @@ namespace Nikse.SubtitleEdit.Forms
{
if (_findHelper.StartLineIndex >= 1)
{
- if (MessageBox.Show(_language.FindContinue, _language.FindContinueTitle, MessageBoxButtons.YesNo) == DialogResult.Yes)
+ if (MessageBox.Show(_language.FindContinue, _language.FindContinueTitle, MessageBoxButtons.YesNoCancel) == DialogResult.Yes)
{
_findHelper.StartLineIndex = 0;
if (_findHelper.Find(_subtitle, _subtitleAlternate, 0))
@@ -5094,7 +5094,7 @@ namespace Nikse.SubtitleEdit.Forms
string msgText = _language.ReplaceContinueNotFound;
if (matches.Count > 0)
msgText = string.Format(_language.ReplaceXContinue, matches.Count);
- if (MessageBox.Show(msgText, _language.ReplaceContinueTitle, MessageBoxButtons.YesNo) == DialogResult.Yes)
+ if (MessageBox.Show(msgText, _language.ReplaceContinueTitle, MessageBoxButtons.YesNoCancel) == DialogResult.Yes)
{
s = result.Substring(0, start - 1);
var rest = result.Remove(0, start - 1);
@@ -5234,7 +5234,7 @@ namespace Nikse.SubtitleEdit.Forms
string msgText = _language.ReplaceContinueNotFound;
if (replaceCount > 0)
msgText = string.Format(_language.ReplaceXContinue, replaceCount);
- if (MessageBox.Show(msgText, _language.ReplaceContinueTitle, MessageBoxButtons.YesNo) == DialogResult.Yes)
+ if (MessageBox.Show(msgText, _language.ReplaceContinueTitle, MessageBoxButtons.YesNoCancel) == DialogResult.Yes)
{
stopAtIndex = firstIndex;
_findHelper.MatchInOriginal = false;
@@ -5277,7 +5277,7 @@ namespace Nikse.SubtitleEdit.Forms
if (_replaceStartLineIndex >= 1) // Prompt for start over
{
_replaceStartLineIndex = 0;
- if (MessageBox.Show(_language.FindContinue, _language.FindContinueTitle, MessageBoxButtons.YesNo) == DialogResult.Yes)
+ if (MessageBox.Show(_language.FindContinue, _language.FindContinueTitle, MessageBoxButtons.YesNoCancel) == DialogResult.Yes)
{
SubtitleListview1.SelectIndexAndEnsureVisible(0, true);
_findHelper.StartLineIndex = 0;
@@ -5361,7 +5361,7 @@ namespace Nikse.SubtitleEdit.Forms
if (_replaceStartLineIndex >= 1)
{
_replaceStartLineIndex = 0;
- if (MessageBox.Show(_language.FindContinue, _language.FindContinueTitle, MessageBoxButtons.YesNo) == DialogResult.Yes)
+ if (MessageBox.Show(_language.FindContinue, _language.FindContinueTitle, MessageBoxButtons.YesNoCancel) == DialogResult.Yes)
{
SubtitleListview1.SelectIndexAndEnsureVisible(0, true);
_findHelper.StartLineIndex = 0;
@@ -6008,7 +6008,7 @@ namespace Nikse.SubtitleEdit.Forms
visualSync.ShowDialog(this);
if (visualSync.OkPressed)
{
- if (MessageBox.Show(_language.AppendSynchronizedSubtitlePrompt, _language.SubtitleAppendPromptTitle, MessageBoxButtons.YesNo) == DialogResult.Yes)
+ if (MessageBox.Show(_language.AppendSynchronizedSubtitlePrompt, _language.SubtitleAppendPromptTitle, MessageBoxButtons.YesNoCancel) == DialogResult.Yes)
{
int start = _subtitle.Paragraphs.Count + 1;
var fr = CurrentFrameRate;
@@ -6185,7 +6185,7 @@ namespace Nikse.SubtitleEdit.Forms
if (!isSwedish)
promptText = _language.TranslateSwedishToDanishWarning;
- if (MessageBox.Show(promptText, Title, MessageBoxButtons.YesNo) == DialogResult.Yes)
+ if (MessageBox.Show(promptText, Title, MessageBoxButtons.YesNoCancel) == DialogResult.Yes)
{
try
{
@@ -6537,6 +6537,12 @@ namespace Nikse.SubtitleEdit.Forms
RefreshSelectedParagraph();
}
+ public void DeleteLine(int i, Paragraph p) // NOTE(review): i and p are never read in this body - confirm callers select the target line first
+ {
+ MakeHistoryForUndo(Configuration.Settings.Language.Main.OneLineDeleted); // presumably records an undo point labelled "one line deleted" - verify
+ DeleteSelectedLines(); // going by its name this acts on the current selection, not specifically on line i / paragraph p
+ }
+
public void FocusParagraph(int index)
{
if (tabControlSubtitle.SelectedIndex == TabControlSourceView)
@@ -7445,7 +7451,7 @@ namespace Nikse.SubtitleEdit.Forms
askText = _language.DeleteOneLinePrompt;
}
- if (Configuration.Settings.General.PromptDeleteLines && MessageBox.Show(askText, Title, MessageBoxButtons.YesNo) == DialogResult.No)
+ if (Configuration.Settings.General.PromptDeleteLines && MessageBox.Show(askText, Title, MessageBoxButtons.YesNoCancel) != DialogResult.Yes)
{
_cutText = string.Empty;
return;
@@ -11843,7 +11849,7 @@ namespace Nikse.SubtitleEdit.Forms
if (File.Exists(idxFileName))
return true;
- var dr = MessageBox.Show(string.Format(_language.IdxFileNotFoundWarning, idxFileName), _title, MessageBoxButtons.YesNo);
+ var dr = MessageBox.Show(string.Format(_language.IdxFileNotFoundWarning, idxFileName), _title, MessageBoxButtons.YesNoCancel);
return dr == DialogResult.Yes;
}
if (verbose)
@@ -14940,7 +14946,7 @@ namespace Nikse.SubtitleEdit.Forms
if (timeCodeSubtitle.Paragraphs.Count != _subtitle.Paragraphs.Count)
{
var text = string.Format(_language.ImportTimeCodesDifferentNumberOfLinesWarning, timeCodeSubtitle.Paragraphs.Count, _subtitle.Paragraphs.Count);
- if (MessageBox.Show(this, text, _title, MessageBoxButtons.YesNo) == DialogResult.No)
+ if (MessageBox.Show(this, text, _title, MessageBoxButtons.YesNoCancel) != DialogResult.Yes)
return;
}
@@ -22658,7 +22664,7 @@ namespace Nikse.SubtitleEdit.Forms
newP.EndTime.TotalMilliseconds <= p.EndTime.TotalMilliseconds))
{
// new subs will overlap existing subs
- if (MessageBox.Show(_language.PromptInsertSubtitleOverlap, _languageGeneral.Title, MessageBoxButtons.YesNo) == DialogResult.No)
+ if (MessageBox.Show(_language.PromptInsertSubtitleOverlap, _languageGeneral.Title, MessageBoxButtons.YesNoCancel) != DialogResult.Yes)
{
return;
}
diff --git a/src/Forms/Ocr/DownloadTesseract302.Designer.cs b/src/Forms/Ocr/DownloadTesseract302.Designer.cs
index ff4dfb8f9..492b6f3e9 100644
--- a/src/Forms/Ocr/DownloadTesseract302.Designer.cs
+++ b/src/Forms/Ocr/DownloadTesseract302.Designer.cs
@@ -37,9 +37,9 @@
this.labelDescription1.AutoSize = true;
this.labelDescription1.Location = new System.Drawing.Point(21, 27);
this.labelDescription1.Name = "labelDescription1";
- this.labelDescription1.Size = new System.Drawing.Size(220, 13);
+ this.labelDescription1.Size = new System.Drawing.Size(145, 13);
this.labelDescription1.TabIndex = 29;
- this.labelDescription1.Text = "Get Tesseract OCR dictionaries from the web";
+ this.labelDescription1.Text = "Downloading Tesseract OCR";
//
// labelPleaseWait
//
@@ -63,7 +63,7 @@
this.Name = "DownloadTesseract302";
this.ShowInTaskbar = false;
this.StartPosition = System.Windows.Forms.FormStartPosition.CenterParent;
- this.Text = "DownloadTesseract302";
+ this.Text = "Download Tesseract 3.02";
this.ResumeLayout(false);
this.PerformLayout();
diff --git a/src/Forms/Ocr/DownloadTesseract4.Designer.cs b/src/Forms/Ocr/DownloadTesseract4.Designer.cs
new file mode 100644
index 000000000..c95455fce
--- /dev/null
+++ b/src/Forms/Ocr/DownloadTesseract4.Designer.cs
@@ -0,0 +1,77 @@
+namespace Nikse.SubtitleEdit.Forms.Ocr
+{
+ partial class DownloadTesseract4
+ {
+ /// <summary>
+ /// Required designer variable.
+ /// </summary>
+ private System.ComponentModel.IContainer components = null;
+
+ /// <summary>
+ /// Clean up any resources being used.
+ /// </summary>
+ /// <param name="disposing">true if managed resources should be disposed; otherwise, false.</param>
+ protected override void Dispose(bool disposing)
+ {
+ if (disposing && (components != null))
+ {
+ components.Dispose();
+ }
+ base.Dispose(disposing);
+ }
+
+ #region Windows Form Designer generated code
+
+ /// <summary>
+ /// Required method for Designer support - do not modify
+ /// the contents of this method with the code editor.
+ /// </summary>
+ private void InitializeComponent()
+ {
+ this.labelDescription1 = new System.Windows.Forms.Label();
+ this.labelPleaseWait = new System.Windows.Forms.Label();
+ this.SuspendLayout();
+ //
+ // labelDescription1
+ //
+ this.labelDescription1.AutoSize = true;
+ this.labelDescription1.Location = new System.Drawing.Point(21, 27);
+ this.labelDescription1.Name = "labelDescription1";
+ this.labelDescription1.Size = new System.Drawing.Size(145, 13);
+ this.labelDescription1.TabIndex = 29;
+ this.labelDescription1.Text = "Downloading Tesseract OCR";
+ //
+ // labelPleaseWait
+ //
+ this.labelPleaseWait.AutoSize = true;
+ this.labelPleaseWait.Location = new System.Drawing.Point(21, 59);
+ this.labelPleaseWait.Name = "labelPleaseWait";
+ this.labelPleaseWait.Size = new System.Drawing.Size(70, 13);
+ this.labelPleaseWait.TabIndex = 28;
+ this.labelPleaseWait.Text = "Please wait...";
+ //
+ // DownloadTesseract4
+ //
+ this.AutoScaleDimensions = new System.Drawing.SizeF(6F, 13F);
+ this.AutoScaleMode = System.Windows.Forms.AutoScaleMode.Font;
+ this.ClientSize = new System.Drawing.Size(320, 93);
+ this.Controls.Add(this.labelDescription1);
+ this.Controls.Add(this.labelPleaseWait);
+ this.FormBorderStyle = System.Windows.Forms.FormBorderStyle.FixedDialog;
+ this.MaximizeBox = false;
+ this.MinimizeBox = false;
+ this.Name = "DownloadTesseract4";
+ this.ShowInTaskbar = false;
+ this.StartPosition = System.Windows.Forms.FormStartPosition.CenterParent;
+ this.Text = "Download Tesseract 4";
+ this.ResumeLayout(false);
+ this.PerformLayout();
+
+ }
+
+ #endregion
+
+ private System.Windows.Forms.Label labelDescription1;
+ private System.Windows.Forms.Label labelPleaseWait;
+ }
+}
\ No newline at end of file
diff --git a/src/Forms/Ocr/DownloadTesseract4.cs b/src/Forms/Ocr/DownloadTesseract4.cs
new file mode 100644
index 000000000..8fe8138a3
--- /dev/null
+++ b/src/Forms/Ocr/DownloadTesseract4.cs
@@ -0,0 +1,114 @@
+using System;
+using System.IO;
+using System.IO.Compression;
+using System.Net;
+using System.Windows.Forms;
+using Nikse.SubtitleEdit.Core;
+
+namespace Nikse.SubtitleEdit.Forms.Ocr
+{
+    /// <summary>
+    /// Dialog that downloads the Tesseract 4 archive and unpacks it into
+    /// <see cref="Configuration.TesseractDirectory"/>.
+    /// </summary>
+    public partial class DownloadTesseract4 : Form
+    {
+        /// <summary>
+        /// Creates the dialog and immediately starts the asynchronous download.
+        /// </summary>
+        public DownloadTesseract4()
+        {
+            InitializeComponent();
+            var wc = new WebClient { Proxy = Utilities.GetProxy() };
+
+            // Subscribe before starting the transfer so no notification can be missed.
+            wc.DownloadDataCompleted += wc_DownloadDataCompleted;
+            wc.DownloadProgressChanged += (o, args) =>
+            {
+                labelPleaseWait.Text = Configuration.Settings.Language.General.PleaseWait + " " + args.ProgressPercentage + "%";
+            };
+            wc.DownloadDataAsync(new Uri("https://github.com/SubtitleEdit/support-files/raw/master/Tesseract4.tar.gz"));
+        }
+
+        /// <summary>
+        /// Unpacks the downloaded .tar.gz into the Tesseract folder and closes the dialog
+        /// with <see cref="DialogResult.OK"/>, or with <see cref="DialogResult.Cancel"/>
+        /// when the download did not complete.
+        /// </summary>
+        private void wc_DownloadDataCompleted(object sender, DownloadDataCompletedEventArgs e)
+        {
+            // e.Result throws for a cancelled or failed transfer, so bail out first.
+            if (e.Cancelled || e.Error != null)
+            {
+                MessageBox.Show(Configuration.Settings.Language.GetTesseractDictionaries.DownloadFailed);
+                DialogResult = DialogResult.Cancel;
+                return;
+            }
+
+            string dictionaryFolder = Configuration.TesseractDirectory;
+            if (!Directory.Exists(dictionaryFolder))
+                Directory.CreateDirectory(dictionaryFolder);
+
+            // Path.GetTempFileName() creates a zero-byte placeholder on disk; remember it so it
+            // can be removed together with the ".tar" file instead of being left behind.
+            string placeholderFileName = Path.GetTempFileName();
+            string tempFileName = placeholderFileName + ".tar";
+            try
+            {
+                // Step 1: gunzip the download into the temporary .tar file.
+                using (var ms = new MemoryStream(e.Result))
+                using (var fs = new FileStream(tempFileName, FileMode.Create))
+                using (var zip = new GZipStream(ms, CompressionMode.Decompress))
+                {
+                    byte[] buffer = new byte[1024];
+                    int nRead;
+                    while ((nRead = zip.Read(buffer, 0, buffer.Length)) > 0)
+                    {
+                        fs.Write(buffer, 0, nRead);
+                    }
+                }
+
+                // Step 2: unpack every tar entry below the Tesseract folder.
+                using (var tr = new TarReader(tempFileName))
+                {
+                    foreach (var th in tr.Files)
+                    {
+                        string fn = Path.Combine(dictionaryFolder, th.FileName.Replace('/', Path.DirectorySeparatorChar));
+                        if (!IsInsideFolder(dictionaryFolder, fn))
+                            continue; // never write outside the target folder ("..", rooted names)
+
+                        if (th.IsFolder)
+                        {
+                            Directory.CreateDirectory(fn);
+                        }
+                        else if (th.FileSizeInBytes > 0)
+                        {
+                            th.WriteData(fn);
+                        }
+                    }
+                }
+            }
+            finally
+            {
+                // Clean up even when decompression or extraction throws.
+                File.Delete(tempFileName);
+                File.Delete(placeholderFileName);
+            }
+
+            Cursor = Cursors.Default;
+            DialogResult = DialogResult.OK;
+        }
+
+        /// <summary>
+        /// Returns true when <paramref name="path"/> resolves to <paramref name="rootFolder"/>
+        /// itself or to a location below it.
+        /// </summary>
+        private static bool IsInsideFolder(string rootFolder, string path)
+        {
+            char separator = Path.DirectorySeparatorChar;
+            string root = Path.GetFullPath(rootFolder).TrimEnd(separator) + separator;
+            string full = Path.GetFullPath(path).TrimEnd(separator) + separator;
+            return full.StartsWith(root, StringComparison.OrdinalIgnoreCase);
+        }
+    }
+}
diff --git a/src/Forms/Ocr/DownloadTesseract4.resx b/src/Forms/Ocr/DownloadTesseract4.resx
new file mode 100644
index 000000000..1af7de150
--- /dev/null
+++ b/src/Forms/Ocr/DownloadTesseract4.resx
@@ -0,0 +1,120 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ text/microsoft-resx
+
+
+ 2.0
+
+
+ System.Resources.ResXResourceReader, System.Windows.Forms, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089
+
+
+ System.Resources.ResXResourceWriter, System.Windows.Forms, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089
+
+
\ No newline at end of file
diff --git a/src/Forms/Ocr/VobSubOcr.cs b/src/Forms/Ocr/VobSubOcr.cs
index fe89a35ce..20e52e497 100644
--- a/src/Forms/Ocr/VobSubOcr.cs
+++ b/src/Forms/Ocr/VobSubOcr.cs
@@ -7,6 +7,7 @@ using Nikse.SubtitleEdit.Core.VobSub;
using Nikse.SubtitleEdit.Logic;
using Nikse.SubtitleEdit.Logic.Ocr;
using Nikse.SubtitleEdit.Logic.Ocr.Binary;
+using Nikse.SubtitleEdit.Logic.Ocr.Tesseract;
using System;
using System.Collections.Generic;
using System.ComponentModel;
@@ -292,18 +293,13 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
private string[] _nocrThreadResults;
public const int NocrMinColor = 300;
- private bool _icThreadsStop;
- private string[] _icThreadResults;
-
private readonly Keys _italicShortcut = UiUtil.GetKeys(Configuration.Settings.Shortcuts.MainTextBoxItalic);
private readonly Keys _mainGeneralGoToNextSubtitle = UiUtil.GetKeys(Configuration.Settings.Shortcuts.GeneralGoToNextSubtitle);
private readonly Keys _mainGeneralGoToPrevSubtitle = UiUtil.GetKeys(Configuration.Settings.Shortcuts.GeneralGoToPrevSubtitle);
private string[] _tesseractAsyncStrings;
private int _tesseractAsyncIndex;
- private BackgroundWorker _tesseractThread;
private int _tesseractEngineMode;
- private int _tesseractErrors;
private readonly DateTime _windowStartTime = DateTime.Now;
private int _linesOcred;
@@ -316,12 +312,11 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
private int _ocrMethodIndex;
private bool _autoBreakLines;
- private int _ocrMethodTesseract;
- private int _ocrMethodModi = 2;
- private int _ocrMethodBinaryImageCompare = 1;
- private int _ocrMethodNocr = 3;
- private int _ocrMethodTesseract302 = 4;
- private int _ocrMethodImageCompare = -2; //TODO: Remove
+ private readonly int _ocrMethodTesseract;
+ private readonly int _ocrMethodModi;
+ private readonly int _ocrMethodBinaryImageCompare;
+ private readonly int _ocrMethodNocr;
+ private readonly int _ocrMethodTesseract302;
public static void SetDoubleBuffered(Control c)
{
@@ -418,24 +413,23 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
FillSpellCheckDictionaries();
+ InitializeModi();
comboBoxOcrMethod.Items.Clear();
- comboBoxOcrMethod.Items.Add(language.OcrViaTesseract);
comboBoxOcrMethod.Items.Add("Binary image compare");
- comboBoxOcrMethod.Items.Add(language.OcrViaModi);
+ comboBoxOcrMethod.Items.Add("Tesseract 3.02");
+ comboBoxOcrMethod.Items.Add("Tesseract 4 Beta");
+ if (_modiEnabled)
+ comboBoxOcrMethod.Items.Add(language.OcrViaModi);
if (Configuration.Settings.General.ShowBetaStuff)
{
comboBoxOcrMethod.Items.Add(language.OcrViaNOCR);
comboBoxOcrMethod.Items.Add(language.OcrViaImageCompare);
- _ocrMethodImageCompare = 4;
- _ocrMethodTesseract302 = 5;
- }
- else
- {
- comboBoxOcrMethod.Items.Add(language.OcrViaTesseract + " (3.02)");
- _ocrMethodTesseract302 = comboBoxOcrMethod.Items.Count;
- _ocrMethodImageCompare = -1;
- _ocrMethodNocr = -1;
}
+ _ocrMethodBinaryImageCompare = comboBoxOcrMethod.Items.IndexOf("Binary image compare");
+ _ocrMethodTesseract302 = comboBoxOcrMethod.Items.IndexOf("Tesseract 3.02");
+ _ocrMethodTesseract = comboBoxOcrMethod.Items.IndexOf("Tesseract 4 Beta");
+ _ocrMethodModi = comboBoxOcrMethod.Items.IndexOf(language.OcrViaModi);
+ _ocrMethodNocr = comboBoxOcrMethod.Items.IndexOf(language.OcrViaNOCR);
checkBoxTesseractItalicsOn.Checked = Configuration.Settings.VobSubOcr.UseItalicsInTesseract;
checkBoxTesseractItalicsOn.Text = Configuration.Settings.Language.General.Italic;
@@ -551,12 +545,6 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
{
_tesseractAsyncStrings = new string[max];
_tesseractAsyncIndex = (int)numericUpDownStartNumber.Value + 5;
- _tesseractThread = new BackgroundWorker();
- _tesseractThread.DoWork += TesseractThreadDoWork;
- _tesseractThread.RunWorkerCompleted += TesseractThreadRunWorkerCompleted;
- _tesseractThread.WorkerSupportsCancellation = true;
- if (_tesseractAsyncIndex >= 0 && _tesseractAsyncIndex < max)
- _tesseractThread.RunWorkerAsync(GetSubtitleBitmap(_tesseractAsyncIndex));
}
System.Threading.Thread.Sleep(1000);
subtitleListView1.SelectedIndexChanged -= SubtitleListView1SelectedIndexChanged;
@@ -627,7 +615,6 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
numericUpDownNumberOfPixelsIsSpaceNOCR.Value = vobSubOcrSettings.XOrMorePixelsMakesSpace;
_vobSubOcrSettings = vobSubOcrSettings;
- InitializeModi();
InitializeTesseract();
LoadImageCompareCharacterDatabaseList();
@@ -655,7 +642,6 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
numericUpDownNumberOfPixelsIsSpaceNOCR.Value = vobSubOcrSettings.XOrMorePixelsMakesSpace;
_vobSubOcrSettings = vobSubOcrSettings;
- InitializeModi();
InitializeTesseract();
LoadImageCompareCharacterDatabaseList();
@@ -687,7 +673,6 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
numericUpDownNumberOfPixelsIsSpaceNOCR.Value = vobSubOcrSettings.XOrMorePixelsMakesSpace;
_vobSubOcrSettings = vobSubOcrSettings;
- InitializeModi();
InitializeTesseract();
LoadImageCompareCharacterDatabaseList();
@@ -744,6 +729,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
private void DoBatch()
{
+ _abort = false;
FormVobSubOcr_Shown(null, null);
checkBoxPromptForUnknownWords.Checked = false;
@@ -752,12 +738,6 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
{
_tesseractAsyncStrings = new string[max];
_tesseractAsyncIndex = (int)numericUpDownStartNumber.Value + 5;
- _tesseractThread = new BackgroundWorker();
- _tesseractThread.DoWork += TesseractThreadDoWork;
- _tesseractThread.RunWorkerCompleted += TesseractThreadRunWorkerCompleted;
- _tesseractThread.WorkerSupportsCancellation = true;
- if (_tesseractAsyncIndex >= 0 && _tesseractAsyncIndex < max)
- _tesseractThread.RunWorkerAsync(GetSubtitleBitmap(_tesseractAsyncIndex));
}
System.Threading.Thread.Sleep(1000);
subtitleListView1.SelectedIndexChanged -= SubtitleListView1SelectedIndexChanged;
@@ -849,7 +829,6 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
numericUpDownNumberOfPixelsIsSpaceNOCR.Value = 11;
_vobSubOcrSettings = vobSubOcrSettings;
- InitializeModi();
InitializeTesseract();
LoadImageCompareCharacterDatabaseList();
@@ -911,36 +890,6 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
}
}
}
- else if (_ocrMethodIndex == _ocrMethodImageCompare)
- {
- comboBoxCharacterDatabase.SelectedIndexChanged -= ComboBoxCharacterDatabaseSelectedIndexChanged;
- string characterDatabasePath = Configuration.VobSubCompareDirectory.TrimEnd(Path.DirectorySeparatorChar);
- if (!Directory.Exists(characterDatabasePath))
- Directory.CreateDirectory(characterDatabasePath);
-
- comboBoxCharacterDatabase.Items.Clear();
-
- foreach (string dir in Directory.GetDirectories(characterDatabasePath))
- comboBoxCharacterDatabase.Items.Add(Path.GetFileName(dir));
-
- if (comboBoxCharacterDatabase.Items.Count == 0)
- {
- Directory.CreateDirectory(characterDatabasePath + Path.DirectorySeparatorChar + _vobSubOcrSettings.LastImageCompareFolder);
- comboBoxCharacterDatabase.Items.Add(_vobSubOcrSettings.LastImageCompareFolder);
- }
-
- for (int i = 0; i < comboBoxCharacterDatabase.Items.Count; i++)
- {
- if (comboBoxCharacterDatabase.Items[i].ToString().Equals(_vobSubOcrSettings.LastImageCompareFolder, StringComparison.OrdinalIgnoreCase))
- {
- comboBoxCharacterDatabase.SelectedIndex = i;
- break;
- }
- }
- if (comboBoxCharacterDatabase.SelectedIndex < 0)
- comboBoxCharacterDatabase.SelectedIndex = 0;
- comboBoxCharacterDatabase.SelectedIndexChanged += ComboBoxCharacterDatabaseSelectedIndexChanged;
- }
}
catch (Exception ex)
{
@@ -953,89 +902,13 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
DisposeImageCompareBitmaps();
_binaryOcrDb = null;
- if (_ocrMethodIndex == _ocrMethodImageCompare)
- {
- LoadOldCompareImages();
- }
- else if (_ocrMethodIndex == _ocrMethodBinaryImageCompare)
+ if (_ocrMethodIndex == _ocrMethodBinaryImageCompare)
{
string db = Configuration.OcrDirectory + comboBoxCharacterDatabase.SelectedItem + ".db";
_binaryOcrDb = new BinaryOcrDb(db, true);
}
}
- private void LoadOldCompareImages()
- {
- _compareBitmaps = new List();
- string path = Configuration.VobSubCompareDirectory + comboBoxCharacterDatabase.SelectedItem + Path.DirectorySeparatorChar;
- if (!File.Exists(path + "CompareDescription.xml"))
- _compareDoc.LoadXml("");
- else
- _compareDoc.Load(path + "CompareDescription.xml");
-
- string databaseName = path + "Images.db";
- if (!File.Exists(databaseName))
- {
- labelStatus.Text = Configuration.Settings.Language.VobSubOcr.LoadingImageCompareDatabase;
- labelStatus.Refresh();
- using (var f = new FileStream(databaseName, FileMode.Create))
- {
- foreach (string bmpFileName in Directory.GetFiles(path, "*.bmp"))
- {
- string name = Path.GetFileNameWithoutExtension(bmpFileName);
-
- XmlNode node = _compareDoc.DocumentElement.SelectSingleNode("FileName[.='" + name + "']");
- if (node != null)
- {
- node.InnerText = f.Position.ToString(CultureInfo.InvariantCulture);
- var b = new Bitmap(bmpFileName);
- var m = new ManagedBitmap(b);
- b.Dispose();
- m.AppendToStream(f);
- }
- }
- }
- _compareDoc.Save(path + "Images.xml");
- string text = File.ReadAllText(path + "Images.xml");
- File.WriteAllText(path + "Images.xml", text.Replace("", ""));
- labelStatus.Text = string.Empty;
- }
-
- if (File.Exists(databaseName))
- {
- labelStatus.Text = Configuration.Settings.Language.VobSubOcr.LoadingImageCompareDatabase;
- labelStatus.Refresh();
- _compareDoc.Load(path + "Images.xml");
- using (var f = new FileStream(databaseName, FileMode.Open))
- {
- foreach (XmlNode node in _compareDoc.DocumentElement.SelectNodes("Item"))
- {
- try
- {
- string name = node.InnerText;
- int pos = Convert.ToInt32(name);
- bool isItalic = node.Attributes["Italic"] != null;
- string text = node.Attributes["Text"].InnerText;
- int expandCount = 0;
- if (node.Attributes["Expand"] != null)
- {
- if (!int.TryParse(node.Attributes["Expand"].InnerText, out expandCount))
- expandCount = 0;
- }
- f.Position = pos;
- var mbmp = new ManagedBitmap(f);
- _compareBitmaps.Add(new CompareItem(mbmp, name, isItalic, expandCount, text));
- }
- catch
- {
- // ignored
- }
- }
- }
- labelStatus.Text = string.Empty;
- }
- }
-
private void DisposeImageCompareBitmaps()
{
_compareBitmaps = null;
@@ -1067,9 +940,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
chooseLanguage.ShowIcon = true;
}
chooseLanguage.Initialize(_vobSubMergedPackist, _palette, vobSubParser.IdxLanguages, string.Empty);
- Form form = _main;
- if (form == null)
- form = this;
+ var form = _main ?? (Form)this;
if (batchMode)
{
chooseLanguage.SelectActive();
@@ -1100,7 +971,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
languageString = languageString.ToLower();
for (int i = 0; i < comboBoxTesseractLanguages.Items.Count; i++)
{
- var tl = (comboBoxTesseractLanguages.Items[i] as TesseractLanguage);
+ var tl = comboBoxTesseractLanguages.Items[i] as TesseractLanguage;
if (tl.Text.StartsWith("Chinese", StringComparison.OrdinalIgnoreCase) && (languageString.StartsWith("chinese", StringComparison.OrdinalIgnoreCase) || languageString.StartsWith("中文", StringComparison.OrdinalIgnoreCase)))
{
comboBoxTesseractLanguages.SelectedIndex = i;
@@ -1199,9 +1070,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
checkBoxShowOnlyForced.Checked == false)
{
_bdnXmlSubtitle.Paragraphs.Add(new Paragraph(x));
- var p = new Paragraph(x);
- p.Text = string.Empty;
- _subtitle.Paragraphs.Add(p);
+ _subtitle.Paragraphs.Add(new Paragraph(x) { Text = string.Empty });
}
}
_subtitle.Renumber();
@@ -1233,14 +1102,14 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
for (int i = 0; i < max; i++)
{
var x = _bluRaySubtitlesOriginal[i];
- if (checkBoxShowOnlyForced.Checked && x.IsForced ||
- checkBoxShowOnlyForced.Checked == false)
+ if (checkBoxShowOnlyForced.Checked && x.IsForced || checkBoxShowOnlyForced.Checked == false)
{
_bluRaySubtitles.Add(x);
- Paragraph p = new Paragraph();
- p.StartTime = new TimeCode(x.StartTime / 90.0);
- p.EndTime = new TimeCode(x.EndTime / 90.0);
- _subtitle.Paragraphs.Add(p);
+ _subtitle.Paragraphs.Add(new Paragraph
+ {
+ StartTime = new TimeCode(x.StartTime / 90.0),
+ EndTime = new TimeCode(x.EndTime / 90.0)
+ });
}
}
_subtitle.Renumber();
@@ -1727,20 +1596,19 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
{
if (_mp4List != null)
return _mp4List.Count;
- else if (_spList != null)
+ if (_spList != null)
return _spList.Count;
- else if (_bdnXmlSubtitle != null)
+ if (_bdnXmlSubtitle != null)
return _bdnXmlSubtitle.Paragraphs.Count;
- else if (_bluRaySubtitlesOriginal != null)
+ if (_bluRaySubtitlesOriginal != null)
return _bluRaySubtitles.Count;
- else if (_xSubList != null)
+ if (_xSubList != null)
return _xSubList.Count;
- else if (_dvbSubtitles != null)
+ if (_dvbSubtitles != null)
return _dvbSubtitles.Count;
- else if (_dvbPesSubtitles != null)
+ if (_dvbPesSubtitles != null)
return _dvbPesSubtitles.Count;
- else
- return _vobSubMergedPackist.Count;
+ return _vobSubMergedPackist.Count;
}
private Bitmap ShowSubtitleImage(int index)
@@ -1759,11 +1627,10 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
groupBoxSubtitleImage.Text = Configuration.Settings.Language.VobSubOcr.SubtitleImage;
bmp = new Bitmap(1, 1);
}
- Bitmap old = pictureBoxSubtitleImage.Image as Bitmap;
+ var old = pictureBoxSubtitleImage.Image as Bitmap;
pictureBoxSubtitleImage.Image = bmp.Clone() as Bitmap;
pictureBoxSubtitleImage.Invalidate();
- if (old != null)
- old.Dispose();
+ old?.Dispose();
return bmp;
}
@@ -2959,12 +2826,11 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
internal CompareMatch GetNOcrCompareMatch(ImageSplitterItem targetItem, NikseBitmap parentBitmap, NOcrDb nOcrDb, bool tryItalicScaling, bool deepSeek)
{
- bool italic;
var expandedResult = NOcrFindExpandedMatch(parentBitmap, targetItem, nOcrDb.OcrCharacters);
if (expandedResult != null)
return new CompareMatch(expandedResult.Text, expandedResult.Italic, expandedResult.ExpandCount, null, expandedResult);
- var result = NOcrFindBestMatchNew(targetItem, targetItem.Y - targetItem.ParentY, out italic, nOcrDb, tryItalicScaling, deepSeek);
+ var result = NOcrFindBestMatchNew(targetItem, targetItem.Y - targetItem.ParentY, out var italic, nOcrDb, tryItalicScaling, deepSeek);
if (result == null)
{
if (checkBoxNOcrCorrect.Checked)
@@ -3008,8 +2874,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
return new CompareMatch(expandedResult.Text, expandedResult.Italic, expandedResult.ExpandCount, null, expandedResult);
}
- bool italic;
- var result = NOcrFindBestMatchNew(targetItem, targetItem.Y - targetItem.ParentY, out italic, nOcrDb, tryItalicScaling, deepSeek);
+ var result = NOcrFindBestMatchNew(targetItem, targetItem.Y - targetItem.ParentY, out var italic, nOcrDb, tryItalicScaling, deepSeek);
if (result == null)
{
if (checkBoxNOcrCorrect.Checked)
@@ -3047,12 +2912,11 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
internal static CompareMatch GetNOcrCompareMatch(ImageSplitterItem targetItem, NikseBitmap parentBitmap, NOcrThreadParameter p)
{
- bool italic;
var expandedResult = NOcrFindExpandedMatch(parentBitmap, targetItem, p.NOcrChars);
if (expandedResult != null)
return new CompareMatch(expandedResult.Text, expandedResult.Italic, expandedResult.ExpandCount, null, expandedResult);
- var result = NOcrFindBestMatch(targetItem, targetItem.Y - targetItem.ParentY, out italic, p.NOcrChars, p.UnItalicFactor, p.AdvancedItalicDetection, true);
+ var result = NOcrFindBestMatch(targetItem, targetItem.Y - targetItem.ParentY, out var italic, p.NOcrChars, p.UnItalicFactor, p.AdvancedItalicDetection, true);
if (result == null)
return null;
@@ -3149,8 +3013,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
if (smallestDifference > 2 && target.Width > 12)
{
var cutBitmap = target.CopyRectangle(new Rectangle(1, 0, target.Width - 2, target.Height));
- int topCrop = 0;
- var cutBitmap2 = NikseBitmapImageSplitter.CropTopAndBottom(cutBitmap, out topCrop, 2);
+ var cutBitmap2 = NikseBitmapImageSplitter.CropTopAndBottom(cutBitmap, out _, 2);
if (cutBitmap2.Height != target.Height)
FindBestMatch(out index, ref smallestDifference, ref smallestIndex, cutBitmap2, _compareBitmaps);
}
@@ -3197,7 +3060,6 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
secondBestGuess = null;
int index = 0;
int smallestDifference = 10000;
- BinaryOcrBitmap hit = null;
var target = targetItem.NikseBitmap;
if (_binaryOcrDb == null)
{
@@ -3254,7 +3116,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
}
}
- FindBestMatchNew(ref index, ref smallestDifference, out hit, target, _binaryOcrDb, bob, maxDiff);
+ FindBestMatchNew(ref index, ref smallestDifference, out var hit, target, _binaryOcrDb, bob, maxDiff);
if (maxDiff > 0)
{
if (target.Width > 16 && target.Height > 16 && (hit == null || smallestDifference * 100.0 / (target.Width * target.Height) > maxDiff))
@@ -3366,8 +3228,8 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
{
return new CompareMatch("j", false, 0, null);
}
- bool italicLowercaseI;
- if (bob.IsLowercaseI(out italicLowercaseI))
+
+ if (bob.IsLowercaseI(out var italicLowercaseI))
{
return new CompareMatch("i", italicLowercaseI, 0, null);
}
@@ -3422,16 +3284,6 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
return percentage;
}
- public static Bitmap CopyBitmapSection(Bitmap srcBitmap, Rectangle section)
- {
- var bmp = new Bitmap(section.Width, section.Height);
- var g = Graphics.FromImage(bmp);
- g.DrawImage(srcBitmap, 0, 0, section, GraphicsUnit.Pixel);
- g.Dispose();
- return bmp;
- }
-
-
private static void FindBestMatchNew(ref int index, ref int smallestDifference, out BinaryOcrBitmap hit, NikseBitmap target, BinaryOcrDb binOcrDb, BinaryOcrBitmap bob, double maxDiff)
{
hit = null;
@@ -4096,66 +3948,6 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
return count;
}
- private string SaveCompareItem(NikseBitmap newTarget, string text, bool isItalic, int expandCount)
- {
- string path = Configuration.VobSubCompareDirectory + comboBoxCharacterDatabase.SelectedItem + Path.DirectorySeparatorChar;
- string databaseName = path + "Images.db";
- FileStream f;
- long pos;
- if (!File.Exists(databaseName))
- {
- if (!Directory.Exists(Configuration.OcrDirectory))
- {
- Directory.CreateDirectory(Configuration.OcrDirectory);
- }
-
- using (f = new FileStream(databaseName, FileMode.Create))
- {
- pos = f.Position;
- new ManagedBitmap(newTarget).AppendToStream(f);
- }
- }
- else
- {
- using (f = new FileStream(databaseName, FileMode.Append))
- {
- pos = f.Position;
- new ManagedBitmap(newTarget).AppendToStream(f);
- }
- }
- string name = pos.ToString(CultureInfo.InvariantCulture);
-
- if (_compareDoc == null)
- {
- _compareDoc = new XmlDocument();
- _compareDoc.LoadXml("");
- }
- if (_compareBitmaps == null)
- _compareBitmaps = new List();
- _compareBitmaps.Add(new CompareItem(new ManagedBitmap(newTarget), name, isItalic, expandCount, text));
-
- XmlElement element = _compareDoc.CreateElement("Item");
- XmlAttribute attribute = _compareDoc.CreateAttribute("Text");
- attribute.InnerText = text;
- element.Attributes.Append(attribute);
- if (expandCount > 0)
- {
- XmlAttribute expandSelection = _compareDoc.CreateAttribute("Expand");
- expandSelection.InnerText = expandCount.ToString(CultureInfo.InvariantCulture);
- element.Attributes.Append(expandSelection);
- }
- if (isItalic)
- {
- XmlAttribute italic = _compareDoc.CreateAttribute("Italic");
- italic.InnerText = "true";
- element.Attributes.Append(italic);
- }
- element.InnerText = pos.ToString(CultureInfo.InvariantCulture);
- _compareDoc.DocumentElement.AppendChild(element);
- _compareDoc.Save(path + "Images.xml");
- return name;
- }
-
private string SaveCompareItemNew(ImageSplitterItem newTarget, string text, bool isItalic, List expandList)
{
int expandCount = 0;
@@ -4186,190 +3978,6 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
}
}
- ///
- /// Ocr via image compare
- ///
- private string SplitAndOcrBitmapNormal(Bitmap bitmap, int listViewIndex)
- {
- if (_ocrFixEngine == null)
- LoadOcrFixEngine(null, LanguageString);
-
- string threadText = null;
- if (_icThreadResults != null && !string.IsNullOrEmpty(_icThreadResults[listViewIndex]))
- threadText = _icThreadResults[listViewIndex];
-
- string line;
- if (threadText == null)
- {
- var matches = new List();
- var parentBitmap = new NikseBitmap(bitmap);
- List list = NikseBitmapImageSplitter.SplitBitmapToLetters(parentBitmap, (int)numericUpDownPixelsIsSpace.Value, checkBoxRightToLeft.Checked, Configuration.Settings.VobSubOcr.TopToBottom);
- int index = 0;
- bool expandSelection = false;
- bool shrinkSelection = false;
- var expandSelectionList = new List();
- while (index < list.Count)
- {
- ImageSplitterItem item = list[index];
- if (expandSelection || shrinkSelection)
- {
- expandSelection = false;
- if (shrinkSelection && index > 0)
- {
- shrinkSelection = false;
- }
- else if (index + 1 < list.Count && list[index + 1].NikseBitmap != null) // only allow expand to EndOfLine or space
- {
- index++;
- expandSelectionList.Add(list[index]);
- }
- item = GetExpandedSelection(parentBitmap, expandSelectionList, checkBoxRightToLeft.Checked);
-
- _vobSubOcrCharacter.Initialize(bitmap, item, _manualOcrDialogPosition, _italicCheckedLast, expandSelectionList.Count > 1, null, _lastAdditions, this);
- DialogResult result = _vobSubOcrCharacter.ShowDialog(this);
- _manualOcrDialogPosition = _vobSubOcrCharacter.FormPosition;
- if (result == DialogResult.OK && _vobSubOcrCharacter.ShrinkSelection)
- {
- shrinkSelection = true;
- index--;
- if (expandSelectionList.Count > 0)
- expandSelectionList.RemoveAt(expandSelectionList.Count - 1);
- }
- else if (result == DialogResult.OK && _vobSubOcrCharacter.ExpandSelection)
- {
- expandSelection = true;
- }
- else if (result == DialogResult.OK)
- {
- string text = _vobSubOcrCharacter.ManualRecognizedCharacters;
- string name = SaveCompareItem(item.NikseBitmap, text, _vobSubOcrCharacter.IsItalic, expandSelectionList.Count);
- var addition = new ImageCompareAddition(name, text, item.NikseBitmap, _vobSubOcrCharacter.IsItalic, listViewIndex);
- _lastAdditions.Add(addition);
- matches.Add(new CompareMatch(text, _vobSubOcrCharacter.IsItalic, expandSelectionList.Count, null));
- expandSelectionList = new List();
- }
- else if (result == DialogResult.Abort)
- {
- _abort = true;
- }
- else
- {
- matches.Add(new CompareMatch("*", false, 0, null));
- }
- _italicCheckedLast = _vobSubOcrCharacter.IsItalic;
- }
- else if (item.NikseBitmap == null)
- {
- matches.Add(new CompareMatch(item.SpecialCharacter, false, 0, null));
- }
- else
- {
- CompareMatch bestGuess;
- CompareMatch match = GetCompareMatch(item, parentBitmap, out bestGuess, list, index);
- if (match == null)
- {
- _vobSubOcrCharacter.Initialize(bitmap, item, _manualOcrDialogPosition, _italicCheckedLast, false, bestGuess, _lastAdditions, this);
- DialogResult result = _vobSubOcrCharacter.ShowDialog(this);
- _manualOcrDialogPosition = _vobSubOcrCharacter.FormPosition;
- if (result == DialogResult.OK && _vobSubOcrCharacter.ExpandSelection)
- {
- expandSelectionList.Add(item);
- expandSelection = true;
- }
- else if (result == DialogResult.OK)
- {
- string text = _vobSubOcrCharacter.ManualRecognizedCharacters;
- string name = SaveCompareItem(item.NikseBitmap, text, _vobSubOcrCharacter.IsItalic, 0);
- var addition = new ImageCompareAddition(name, text, item.NikseBitmap, _vobSubOcrCharacter.IsItalic, listViewIndex);
- _lastAdditions.Add(addition);
- matches.Add(new CompareMatch(text, _vobSubOcrCharacter.IsItalic, 0, null));
- }
- else if (result == DialogResult.Abort)
- {
- _abort = true;
- }
- else
- {
- matches.Add(new CompareMatch("*", false, 0, null));
- }
- _italicCheckedLast = _vobSubOcrCharacter.IsItalic;
- }
- else // found image match
- {
- matches.Add(new CompareMatch(match.Text, match.Italic, 0, null));
- if (match.ExpandCount > 0)
- index += match.ExpandCount - 1;
- }
- }
- if (_abort)
- return string.Empty;
- if (!expandSelection && !shrinkSelection)
- index++;
- if (shrinkSelection && expandSelectionList.Count < 2)
- {
- shrinkSelection = false;
- expandSelectionList = new List();
- }
- }
-
- line = GetStringWithItalicTags(matches);
- }
- else
- {
- line = threadText;
- }
- if (checkBoxAutoFixCommonErrors.Checked && _ocrFixEngine != null)
- line = _ocrFixEngine.FixOcrErrorsViaHardcodedRules(line, _lastLine, null); // TODO: Add abbreviations list
-
- if (checkBoxRightToLeft.Checked)
- line = ReverseNumberStrings(line);
-
- //OCR fix engine
- string textWithOutFixes = line;
- //OCR fix engine not loaded, when no dictionary is selected
- if (_ocrFixEngine != null && _ocrFixEngine.IsDictionaryLoaded)
- {
- if (checkBoxAutoFixCommonErrors.Checked)
- line = _ocrFixEngine.FixOcrErrors(line, listViewIndex, _lastLine, true, GetAutoGuessLevel());
-
- int correctWords;
- int wordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(line, out correctWords);
-
- if (wordsNotFound > 0 || correctWords == 0 || textWithOutFixes != null && string.IsNullOrWhiteSpace(textWithOutFixes.Replace("~", string.Empty)))
- {
- _ocrFixEngine.AutoGuessesUsed.Clear();
- _ocrFixEngine.UnknownWordsFound.Clear();
- line = _ocrFixEngine.FixUnknownWordsViaGuessOrPrompt(out wordsNotFound, line, listViewIndex, bitmap, checkBoxAutoFixCommonErrors.Checked, checkBoxPromptForUnknownWords.Checked, true, GetAutoGuessLevel());
- }
-
- if (_ocrFixEngine.Abort)
- {
- ButtonStopClick(null, null);
- _ocrFixEngine.Abort = false;
- return string.Empty;
- }
-
- // Log used word guesses (via word replace list)
- foreach (string guess in _ocrFixEngine.AutoGuessesUsed)
- listBoxLogSuggestions.Items.Add(guess);
- _ocrFixEngine.AutoGuessesUsed.Clear();
-
- // Log unkown words guess (found via spelling dictionaries)
- LogUnknownWords();
-
- ColorLineByNumberOfUnknownWords(listViewIndex, wordsNotFound, line);
- }
-
- if (textWithOutFixes != null && textWithOutFixes.Trim() != line.Trim())
- {
- _tesseractOcrAutoFixes++;
- labelFixesMade.Text = $" - {_tesseractOcrAutoFixes}";
- LogOcrFix(listViewIndex, textWithOutFixes, line);
- }
-
- return line;
- }
-
private void ColorLineByNumberOfUnknownWords(int index, int wordsNotFound, string line)
{
if (wordsNotFound >= 3)
@@ -4492,8 +4100,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
{
_ocrCount++;
_ocrHeight += (item.NikseBitmap.Height - _ocrHeight) / _ocrCount;
- CompareMatch bestGuess;
- CompareMatch match = GetCompareMatchNew(item, out bestGuess, list, index);
+ CompareMatch match = GetCompareMatchNew(item, out var bestGuess, list, index);
if (match == null) // Try line OCR if no image compare match
{
if (_nOcrDb != null && _nOcrDb.OcrCharacters.Count > 0 && _numericUpDownMaxErrorPct < 1)
@@ -4670,55 +4277,6 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
}
}
- public static List LoadNOcrForTesseract(string xmlRessourceName)
- {
- var nocrChars = new List();
- Assembly asm = Assembly.GetExecutingAssembly();
- Stream strm = asm.GetManifestResourceStream(xmlRessourceName);
- if (strm != null)
- {
- XmlDocument doc = new XmlDocument();
- var rdr = new StreamReader(strm);
- using (var zip = new System.IO.Compression.GZipStream(rdr.BaseStream, System.IO.Compression.CompressionMode.Decompress))
- {
- byte[] data = new byte[175000];
- zip.Read(data, 0, 175000);
- doc.LoadXml(Encoding.UTF8.GetString(data));
- }
- rdr.Close();
-
- try
- {
- foreach (XmlNode node in doc.DocumentElement.SelectNodes("Char"))
- {
- var oc = new NOcrChar(node.Attributes["Text"].Value);
- oc.Width = Convert.ToInt32(node.Attributes["Width"].Value, CultureInfo.InvariantCulture);
- oc.Height = Convert.ToInt32(node.Attributes["Height"].Value, CultureInfo.InvariantCulture);
- oc.MarginTop = Convert.ToInt32(node.Attributes["MarginTop"].Value, CultureInfo.InvariantCulture);
- if (node.Attributes["Italic"] != null)
- oc.Italic = Convert.ToBoolean(node.Attributes["Italic"].Value, CultureInfo.InvariantCulture);
- if (node.Attributes["ExpandCount"] != null)
- oc.ExpandCount = Convert.ToInt32(node.Attributes["ExpandCount"].Value, CultureInfo.InvariantCulture);
- foreach (XmlNode pointNode in node.SelectNodes("Point"))
- {
- var op = new NOcrPoint(DecodePoint(pointNode.Attributes["Start"].Value), DecodePoint(pointNode.Attributes["End"].Value));
- XmlAttribute a = pointNode.Attributes["On"];
- if (a != null && Convert.ToBoolean(a.Value))
- oc.LinesForeground.Add(op);
- else
- oc.LinesBackground.Add(op);
- }
- nocrChars.Add(oc);
- }
- }
- catch (Exception exception)
- {
- MessageBox.Show(exception.Message);
- }
- }
- return nocrChars;
- }
-
private static Point DecodePoint(string text)
{
var arr = text.Split(',');
@@ -4897,8 +4455,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
{
if (checkBoxAutoFixCommonErrors.Checked)
line = _ocrFixEngine.FixOcrErrors(line, listViewIndex, _lastLine, true, GetAutoGuessLevel());
- int correctWords;
- int wordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(line, out correctWords);
+ int wordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(line, out var correctWords);
if (wordsNotFound > 0 || correctWords == 0 || textWithOutFixes != null && string.IsNullOrWhiteSpace(textWithOutFixes.Replace("~", string.Empty)))
{
@@ -5158,8 +4715,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
maximumX = item.X + item.NikseBitmap.Width;
}
nbmp.CropTransparentSidesAndBottom(0, true);
- int topCropping;
- nbmp = NikseBitmapImageSplitter.CropTopAndBottom(nbmp, out topCropping);
+ nbmp = NikseBitmapImageSplitter.CropTopAndBottom(nbmp, out _);
return new ImageSplitterItem(minimumX, minimumY, nbmp);
}
@@ -5249,8 +4805,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
{
if (!convertAllToItalic && !convertAllToNonItalic)
{
- bool italicOrNot = false;
- bool isMixedCaseWithoutDashAndAlike = IsMixedCaseWithoutDashAndAlike(matches, i, out italicOrNot);
+ bool isMixedCaseWithoutDashAndAlike = IsMixedCaseWithoutDashAndAlike(matches, i, out var italicOrNot);
if ((text == "-" || text == "—" || text == "." || text == "'") && !isMixedCaseWithoutDashAndAlike)
{
italic = italicOrNot;
@@ -5383,13 +4938,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
lettersNonItalics = 0;
}
- public Subtitle SubtitleFromOcr
- {
- get
- {
- return _subtitle;
- }
- }
+ public Subtitle SubtitleFromOcr => _subtitle;
private void FormVobSubOcr_Shown(object sender, EventArgs e)
{
@@ -5552,271 +5101,6 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
subtitleListView1.MultiSelect = true;
}
- private static void ImageCompareThreadDoWork(object sender, DoWorkEventArgs e)
- {
- var p = (ImageCompareThreadParameter)e.Argument;
- e.Result = p;
- Bitmap bitmap = p.Picture;
- var matches = new List();
- List lines = NikseBitmapImageSplitter.SplitVertical(bitmap);
- List list = NikseBitmapImageSplitter.SplitBitmapToLetters(lines, p.NumberOfPixelsIsSpace, p.RightToLeft, Configuration.Settings.VobSubOcr.TopToBottom);
-
- int outerIndex = 0;
- while (outerIndex < list.Count)
- {
- ImageSplitterItem item = list[outerIndex];
- if (item.NikseBitmap == null)
- {
- matches.Add(new CompareMatch(item.SpecialCharacter, false, 0, null));
- }
- else
- {
- var target = item.NikseBitmap;
- int numberOfForegroundColors = CalculateNumberOfForegroundColors(target);
-
- int smallestDifference = 10000;
- int smallestIndex = -1;
-
- int index;
- if (smallestDifference > 0)
- {
- index = 0;
- foreach (CompareItem compareItem in p.CompareBitmaps)
- {
- if (compareItem.Bitmap.Width == target.Width && compareItem.Bitmap.Height == target.Height)
- {
- if (compareItem.NumberOfForegroundColors < 1)
- compareItem.NumberOfForegroundColors = CalculateNumberOfForegroundColors(compareItem.Bitmap);
-
- if (Math.Abs(compareItem.NumberOfForegroundColors - numberOfForegroundColors) < 30)
- {
- int dif = NikseBitmapImageSplitter.IsBitmapsAlike(compareItem.Bitmap, target);
- if (dif < smallestDifference)
- {
- smallestDifference = dif;
- smallestIndex = index;
- if (dif < 0.2)
- break; // foreach ending
- }
- }
- }
- index++;
- }
- }
-
- if (smallestDifference > 1 && target.Width < 55 && target.Width > 5)
- {
- index = 0;
- foreach (CompareItem compareItem in p.CompareBitmaps)
- {
- if (compareItem.Bitmap.Width == target.Width && compareItem.Bitmap.Height == target.Height + 1)
- {
- if (compareItem.NumberOfForegroundColors == -1)
- compareItem.NumberOfForegroundColors = CalculateNumberOfForegroundColors(compareItem.Bitmap);
-
- if (Math.Abs(compareItem.NumberOfForegroundColors - numberOfForegroundColors) < 50)
- {
- int dif = NikseBitmapImageSplitter.IsBitmapsAlike(target, compareItem.Bitmap);
- if (dif < smallestDifference)
- {
- smallestDifference = dif;
- smallestIndex = index;
- if (dif < 0.5)
- break; // foreach ending
- }
- }
- }
- index++;
- }
- }
-
- if (smallestDifference > 1 && target.Width < 55 && target.Width > 5)
- {
- index = 0;
- foreach (CompareItem compareItem in p.CompareBitmaps)
- {
- if (compareItem.Bitmap.Width == target.Width - 1 && compareItem.Bitmap.Height == target.Height ||
- compareItem.Bitmap.Width == target.Width - 1 && compareItem.Bitmap.Height == target.Height - 1 ||
- compareItem.Bitmap.Width == target.Width && compareItem.Bitmap.Height == target.Height - 1)
- {
- if (compareItem.NumberOfForegroundColors < 1)
- compareItem.NumberOfForegroundColors = CalculateNumberOfForegroundColors(compareItem.Bitmap);
-
- if (Math.Abs(compareItem.NumberOfForegroundColors - numberOfForegroundColors) < 55)
- {
- int dif = NikseBitmapImageSplitter.IsBitmapsAlike(compareItem.Bitmap, target);
- if (dif < smallestDifference)
- {
- smallestDifference = dif;
- smallestIndex = index;
- if (dif < 0.5)
- break; // foreach ending
- }
- }
- }
- index++;
- }
- }
-
- double differencePercentage = smallestDifference * 100.0 / (item.NikseBitmap.Width * item.NikseBitmap.Height);
- double maxDiff = p.MaxErrorPercent;
- if (differencePercentage <= maxDiff && smallestIndex >= 0)
- {
- var hit = p.CompareBitmaps[smallestIndex];
- var match = new CompareMatch(hit.Text, hit.Italic, hit.ExpandCount, hit.Name);
- matches.Add(new CompareMatch(match.Text, match.Italic, 0, null));
- if (match.ExpandCount > 0)
- outerIndex += match.ExpandCount - 1;
- }
- else
- {
- p.Result = string.Empty;
- return;
- }
- }
- outerIndex++;
- }
- bitmap.Dispose();
- p.Result = GetStringWithItalicTags(matches);
- }
-
- private void ImageCompareThreadRunWorkerCompleted(object sender, RunWorkerCompletedEventArgs e)
- {
- var p = (ImageCompareThreadParameter)e.Result;
- if (!_icThreadsStop)
- {
- if (string.IsNullOrEmpty(_icThreadResults[p.Index]))
- _icThreadResults[p.Index] = p.Result;
- p.Index += p.Increment;
- while (p.Index <= _mainOcrIndex)
- p.Index += p.Increment;
- p.Picture.Dispose();
- if (p.Index < _subtitle.Paragraphs.Count)
- {
- p.Result = string.Empty;
- p.Picture = GetSubtitleBitmap(p.Index);
- p.Self.RunWorkerAsync(p);
- }
- }
- else
- {
- _mainOcrRunning = false;
- }
- }
-
- public string NocrFastCheck(Bitmap bitmap)
- {
- var nbmpInput = new NikseBitmap(bitmap);
- nbmpInput.ReplaceNonWhiteWithTransparent();
-
- var matches = new List();
- List list = NikseBitmapImageSplitter.SplitBitmapToLetters(nbmpInput, (int)numericUpDownNumberOfPixelsIsSpaceNOCR.Value, checkBoxRightToLeft.Checked, Configuration.Settings.VobSubOcr.TopToBottom);
-
- foreach (ImageSplitterItem item in list)
- {
- if (item.NikseBitmap != null)
- {
- item.NikseBitmap.ReplaceTransparentWith(Color.Black);
- }
- }
- int index = 0;
-
- while (index < list.Count)
- {
- ImageSplitterItem item = list[index];
- if (item.NikseBitmap == null)
- {
- matches.Add(new CompareMatch(item.SpecialCharacter, false, 0, null));
- }
- else
- {
- CompareMatch match = null;
-
- var nbmp = item.NikseBitmap;
- int topMargin = item.Y - item.ParentY;
- foreach (NOcrChar oc in _nOcrDb.OcrCharacters)
- {
- if (Math.Abs(oc.Width - nbmp.Width) < 3 && Math.Abs(oc.Height - nbmp.Height) < 4 && Math.Abs(oc.MarginTop - topMargin) < 4)
- { // only very accurate matches
-
- bool ok = true;
- var index2 = 0;
- while (index2 < oc.LinesForeground.Count && ok)
- {
- NOcrPoint op = oc.LinesForeground[index2];
- foreach (Point point in op.ScaledGetPoints(oc, nbmp.Width, nbmp.Height))
- {
- if (point.X >= 0 && point.Y >= 0 && point.X < nbmp.Width && point.Y < nbmp.Height)
- {
- Color c = nbmp.GetPixel(point.X, point.Y);
- if (c.A > 150 && c.R + c.G + c.B > NocrMinColor)
- {
- }
- else
- {
- Point p = new Point(point.X - 1, point.Y);
- if (p.X < 0)
- p.X = 1;
- c = nbmp.GetPixel(p.X, p.Y);
- if (nbmp.Width > 20 && c.A > 150 && c.R + c.G + c.B > NocrMinColor)
- {
- }
- else
- {
- ok = false;
- break;
- }
- }
- }
- }
- index2++;
- }
- index2 = 0;
- while (index2 < oc.LinesBackground.Count && ok)
- {
- NOcrPoint op = oc.LinesBackground[index2];
- foreach (Point point in op.ScaledGetPoints(oc, nbmp.Width, nbmp.Height))
- {
- if (point.X >= 0 && point.Y >= 0 && point.X < nbmp.Width && point.Y < nbmp.Height)
- {
- Color c = nbmp.GetPixel(point.X, point.Y);
- if (c.A > 150 && c.R + c.G + c.B > NocrMinColor)
- {
- Point p = new Point(point.X, point.Y);
- if (oc.Width > 19 && point.X > 0)
- p.X = p.X - 1;
- c = nbmp.GetPixel(p.X, p.Y);
- if (c.A > 150 && c.R + c.G + c.B > NocrMinColor)
- {
- ok = false;
- break;
- }
- }
- }
- }
- index2++;
- }
- if (ok)
- match = new CompareMatch(oc.Text, oc.Italic, 0, null);
- }
- }
-
- if (match == null)
- {
- matches.Add(new CompareMatch("*", false, 0, null));
- }
- else // found image match
- {
- matches.Add(new CompareMatch(match.Text, match.Italic, 0, null));
- if (match.ExpandCount > 0)
- index += match.ExpandCount - 1;
- }
- }
- index++;
- }
- return GetStringWithItalicTags(matches);
- }
-
private static void NOcrThreadDoWork(object sender, DoWorkEventArgs e)
{
var p = (NOcrThreadParameter)e.Argument;
@@ -5891,30 +5175,6 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
}
}
- private void TesseractThreadDoWork(object sender, DoWorkEventArgs e)
- {
- var bitmap = (Bitmap)e.Argument;
- if (bitmap != null)
- {
- if (_tesseractAsyncIndex >= 0 && _tesseractAsyncStrings != null && _tesseractAsyncIndex < _tesseractAsyncStrings.Length)
- {
- if (string.IsNullOrEmpty(_tesseractAsyncStrings[_tesseractAsyncIndex]))
- _tesseractAsyncStrings[_tesseractAsyncIndex] = Tesseract3DoOcrViaExe(bitmap, _languageId, "--psm 6", _tesseractEngineMode); // 6 = Assume a single uniform block of text.);
- }
- bitmap.Dispose();
- }
- }
-
- private void TesseractThreadRunWorkerCompleted(object sender, RunWorkerCompletedEventArgs e)
- {
- if (!e.Cancelled)
- {
- _tesseractAsyncIndex++;
- if (_tesseractAsyncIndex >= 0 && _tesseractAsyncStrings != null && _tesseractAsyncIndex < _tesseractAsyncStrings.Length)
- _tesseractThread.RunWorkerAsync(GetSubtitleBitmap(_tesseractAsyncIndex));
- }
- }
-
private bool _isLatinDb;
private void ButtonStartOcrClick(object sender, EventArgs e)
@@ -5924,6 +5184,12 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
return;
}
+ if (_ocrMethodIndex == _ocrMethodTesseract302 || _ocrMethodIndex == _ocrMethodTesseract)
+ {
+ _tesseractThreadRunner = new TesseractThreadRunner(OcrDone);
+ _tesseractRunner = new TesseractRunner();
+ }
+
if (_ocrMethodIndex == _ocrMethodTesseract && comboBoxTesseractLanguages.Items.Count == 0)
{
buttonGetTesseractDictionaries_Click(sender, e);
@@ -5931,7 +5197,6 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
}
_tesseractEngineMode = comboBoxTesseractEngineMode.SelectedIndex;
- _tesseractErrors = 0;
_isLatinDb = comboBoxCharacterDatabase.SelectedItem != null && comboBoxCharacterDatabase.SelectedItem.ToString().Equals("Latin", StringComparison.Ordinal);
Configuration.Settings.VobSubOcr.RightToLeft = checkBoxRightToLeft.Checked;
_lastLine = null;
@@ -5953,18 +5218,6 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
_nOcrDb = null;
_tesseractAsyncStrings = new string[max];
_tesseractAsyncIndex = (int)numericUpDownStartNumber.Value + 5;
- _tesseractThread = new BackgroundWorker();
- _tesseractThread.DoWork += TesseractThreadDoWork;
- _tesseractThread.RunWorkerCompleted += TesseractThreadRunWorkerCompleted;
- _tesseractThread.WorkerSupportsCancellation = true;
- if (_tesseractAsyncIndex >= 0 && _tesseractAsyncIndex < max)
- _tesseractThread.RunWorkerAsync(GetSubtitleBitmap(_tesseractAsyncIndex));
- }
- else if (_ocrMethodIndex == _ocrMethodImageCompare)
- {
- if (_compareBitmaps == null)
- LoadImageCompareBitmaps();
- _numericUpDownMaxErrorPct = (double)numericUpDownMaxErrorPct.Value;
}
else if (_ocrMethodIndex == _ocrMethodNocr)
{
@@ -5999,9 +5252,11 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
if (start + i < max)
{
var bw = new BackgroundWorker();
- var p = new NOcrThreadParameter(GetSubtitleBitmap(start + i), start + i, _nOcrDb.OcrCharacters, bw, noOfThreads, _unItalicFactor, checkBoxNOcrItalic.Checked, (int)numericUpDownNumberOfPixelsIsSpaceNOCR.Value, checkBoxRightToLeft.Checked);
- p.NOcrLastLowercaseHeight = _nocrLastLowercaseHeight;
- p.NOcrLastUppercaseHeight = _nocrLastUppercaseHeight;
+ var p = new NOcrThreadParameter(GetSubtitleBitmap(start + i), start + i, _nOcrDb.OcrCharacters, bw, noOfThreads, _unItalicFactor, checkBoxNOcrItalic.Checked, (int)numericUpDownNumberOfPixelsIsSpaceNOCR.Value, checkBoxRightToLeft.Checked)
+ {
+ NOcrLastLowercaseHeight = _nocrLastLowercaseHeight,
+ NOcrLastUppercaseHeight = _nocrLastUppercaseHeight
+ };
bw.DoWork += NOcrThreadDoWork;
bw.RunWorkerCompleted += NOcrThreadRunWorkerCompleted;
bw.RunWorkerAsync(p);
@@ -6033,90 +5288,133 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
_mainOcrRunning = true;
subtitleListView1.MultiSelect = false;
mainOcrTimer_Tick(null, null);
-
- if (_ocrMethodIndex == _ocrMethodImageCompare)
- {
- _icThreadsStop = false;
- _icThreadResults = new string[_subtitle.Paragraphs.Count];
- int noOfThreads = Environment.ProcessorCount - 2; // -1 or -2?
- if (noOfThreads >= max)
- noOfThreads = max - 1;
- int start = (int)numericUpDownStartNumber.Value + 5;
- if (noOfThreads > 2)
- noOfThreads = 2; // Threading is not really good - subtitle picture creation should probably be threaded also/instead
- for (int i = 0; i < noOfThreads; i++)
- {
- if (start + i < max)
- {
- Application.DoEvents();
- var bw = new BackgroundWorker();
- var p = new ImageCompareThreadParameter(GetSubtitleBitmap(start + i), start + i, _compareBitmaps, bw, noOfThreads, (int)numericUpDownPixelsIsSpace.Value, checkBoxRightToLeft.Checked, (float)numericUpDownMaxErrorPct.Value);
- bw.DoWork += ImageCompareThreadDoWork;
- bw.RunWorkerCompleted += ImageCompareThreadRunWorkerCompleted;
- bw.RunWorkerAsync(p);
- }
- }
- }
}
- // public List _elapseds = new List();
- private bool MainLoop(int max, int i)
+ private TesseractThreadRunner _tesseractThreadRunner;
+
+ /// <summary>
+ /// Completion callback for a Tesseract image job (this method is handed to the
+ /// <c>TesseractThreadRunner</c> constructor). Caches the raw job result, runs the normal
+ /// Tesseract post-processing for the line, tidies whitespace and line breaks, writes the
+ /// text to the subtitle paragraph and the list view, updates the progress display and
+ /// disposes the job bitmap.
+ /// </summary>
+ /// <param name="index">Zero-based index of the subtitle line the job belongs to.</param>
+ /// <param name="job">The finished job; its <c>Result</c> is cached and its <c>Bitmap</c> is disposed here.</param>
+ public void OcrDone(int index, TesseractThreadRunner.ImageJob job)
{
+     // Cache the raw result before post-processing; presumably OcrViaTesseract reads it
+     // back from _tesseractAsyncStrings instead of running Tesseract again - TODO confirm.
+     _tesseractAsyncStrings[index] = job.Result;
+     string text = OcrViaTesseract(job.Bitmap, index);
+
+     _lastLine = text;
+
+     // NOTE(review): as written, the search and replacement literals of the next four
+     // calls are identical, so they do not change the text - confirm the intended literals.
+     text = text.Replace("-", "-");
+     text = text.Replace("a", "a");
+     text = text.Replace(".", ".");
+     text = text.Replace(" ", " ");
+     text = text.Trim();
+
+     // Strip spaces that sit directly before or after a line break.
+     text = text.Replace(" " + Environment.NewLine, Environment.NewLine);
+     text = text.Replace(Environment.NewLine + " ", Environment.NewLine);
+
+     // Allow at most two lines: the length comparison is true when the text contains at
+     // least two line breaks (assuming Environment.NewLine is two characters long).
+     if (_autoBreakLines && text.Replace(Environment.NewLine, "*").Length + 2 <= text.Length)
+     {
+         text = text.Replace(" " + Environment.NewLine, Environment.NewLine);
+         text = text.Replace(Environment.NewLine + " ", Environment.NewLine);
+         while (text.Contains(Environment.NewLine + Environment.NewLine))
+             text = text.Replace(Environment.NewLine + Environment.NewLine, Environment.NewLine);
+
+         // Still too many lines after collapsing empty ones - let the auto-breaker re-flow the text.
+         if (text.Replace(Environment.NewLine, "*").Length + 2 <= text.Length)
+             text = Utilities.AutoBreakLine(text);
+     }
+
+     if (_dvbSubtitles != null && checkBoxTransportStreamGetColorAndSplit.Checked)
+     {
+         text = Utilities.UnbreakLine(text);
+         if (_dvbSubColor != Color.Transparent)
+             text = "" + text + "";
+     }
+
+     // Count the line exactly once (it used to be incremented a second time further down).
+     _linesOcred++;
+
+     if (_abort)
+     {
+         // Abort requested: show the text and flag the run as stopped. There is no early
+         // return here, so the finished result is still stored by the code below.
+         textBoxCurrentText.Text = text;
+         _mainOcrRunning = false;
+         SetButtonsEnabledAfterOcrDone();
+         _nocrThreadsStop = true;
+     }
+
+     text = text.Trim();
+     text = text.Replace(" ", " ");
+     text = text.Replace(Environment.NewLine + Environment.NewLine, Environment.NewLine);
+     text = text.Replace(" ", " ");
+     text = text.Replace(Environment.NewLine + Environment.NewLine, Environment.NewLine);
+
+     if (index >= subtitleListView1.Items.Count)
+     {
+         // No list row exists for this index - release the bitmap before leaving so it is not leaked.
+         job.Bitmap.Dispose();
+         return;
+     }
+     var item = subtitleListView1.Items[index];
+     item.Selected = true;
+     item.EnsureVisible();
+
+     Paragraph p = _subtitle.GetParagraphOrDefault(index);
+     if (p != null)
+         p.Text = text;
+     if (subtitleListView1.SelectedItems.Count == 1 && subtitleListView1.SelectedItems[0].Index == index)
+         textBoxCurrentText.Text = text;
+     else
+         subtitleListView1.SetText(index, text);
+
+     var max = GetSubtitleCount();
+     GetSubtitleTime(index, out var startTime, out var endTime);
+     labelStatus.Text = $"{index + 1} / {max}: {startTime} - {endTime}";
+     progressBar1.Value = index + 1;
+     labelStatus.Refresh();
+     progressBar1.Refresh();
+
+     job.Bitmap.Dispose();
+     if (index >= max - 1)
+     {
+         // Last subtitle line handled - the OCR run is finished.
+         SetButtonsEnabledAfterOcrDone();
+         _mainOcrRunning = false;
+     }
+ }
+
+ private bool MainLoop(int max, int i)
+ {
+ if (i >= max)
+ {
+ SetButtonsEnabledAfterOcrDone();
+ _mainOcrRunning = false;
+ return true;
+ }
+
+ var bmp = ShowSubtitleImage(i);
+ TimeCode startTime;
+ TimeCode endTime;
+ GetSubtitleTime(i, out startTime, out endTime);
+ labelStatus.Text = $"{i + 1} / {max}: {startTime} - {endTime}";
+ progressBar1.Value = i + 1;
+ labelStatus.Refresh();
+ progressBar1.Refresh();
+ if (_abort)
+ {
+ bmp.Dispose();
+ SetButtonsEnabledAfterOcrDone();
+ _mainOcrRunning = false;
+ return true;
+ }
+
+ _mainOcrBitmap = bmp;
+
+ int j = i;
+ subtitleListView1.Items[j].Selected = true;
+ if (j < max - 1)
+ j++;
+ if (j < max - 1)
+ j++;
+ subtitleListView1.Items[j].EnsureVisible();
+
+ string text = string.Empty;
+ if (_ocrMethodIndex == _ocrMethodBinaryImageCompare)
+ text = SplitAndOcrBinaryImageCompare(bmp, i);
+ else if (_ocrMethodIndex == _ocrMethodNocr)
+ text = OcrViaNOCR(bmp, i);
+ else if (_ocrMethodIndex == _ocrMethodModi)
+ text = CallModi(i);
_lastLine = text;
@@ -6156,7 +5454,6 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
_mainOcrRunning = false;
SetButtonsEnabledAfterOcrDone();
_nocrThreadsStop = true;
- _icThreadsStop = true;
return true;
}
@@ -6177,10 +5474,35 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
return false;
}
+ /// <summary>
+ /// One step of the Tesseract OCR loop: queues subtitle image <paramref name="i"/> as a job on
+ /// the Tesseract thread runner and then asks the runner to check its queue.
+ /// </summary>
+ /// <param name="max">Number of subtitle images; an index at or beyond it only triggers the queue check.</param>
+ /// <param name="i">Index of the subtitle image to queue.</param>
+ /// <returns><c>true</c> only when the run was aborted; otherwise <c>false</c>.</returns>
+ private bool MainLoopTesseract(int max, int i)
+ {
+     if (i < max)
+     {
+         if (_abort)
+         {
+             SetButtonsEnabledAfterOcrDone();
+             _mainOcrRunning = false;
+             return true;
+         }
+
+         var subtitleBitmap = GetSubtitleBitmap(i);
+         _mainOcrBitmap = subtitleBitmap;
+         var engineMode = _tesseractEngineMode.ToString(CultureInfo.InvariantCulture);
+         var useTesseract302 = _ocrMethodIndex == _ocrMethodTesseract302;
+         _tesseractThreadRunner.AddImageJob(subtitleBitmap, i, _languageId, string.Empty, engineMode, useTesseract302);
+     }
+
+     // Reached both after queuing a job and once every image has already been queued.
+     _tesseractThreadRunner.CheckQueue();
+     return false;
+ }
+
private void mainOcrTimer_Tick(object sender, EventArgs e)
{
_mainOcrTimer.Stop();
- bool done = MainLoop(_mainOcrTimerMax, _mainOcrIndex);
+
+ bool done = _ocrMethodIndex == _ocrMethodTesseract || _ocrMethodIndex == _ocrMethodTesseract302 ?
+ MainLoopTesseract(_mainOcrTimerMax, _mainOcrIndex) :
+ MainLoop(_mainOcrTimerMax, _mainOcrIndex);
if (done || _abort)
{
SetButtonsEnabledAfterOcrDone();
@@ -6209,7 +5531,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
public static Bitmap ResizeBitmap(Bitmap b, int width, int height)
{
var result = new Bitmap(width, height);
- using (Graphics g = Graphics.FromImage(result))
+ using (var g = Graphics.FromImage(result))
g.DrawImage(b, 0, 0, width, height);
return result;
}
@@ -6217,186 +5539,47 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
public static Bitmap UnItalic(Bitmap bmp, double factor)
{
int left = (int)(bmp.Height * factor);
- Bitmap unItaliced = new Bitmap(bmp.Width + left + 4, bmp.Height);
-
- Point[] destinationPoints = {
- new Point(0, 0), // destination for upper-left point of original
- new Point(bmp.Width, 0), // destination for upper-right point of original
- new Point(left, bmp.Height) // destination for lower-left point of original
- };
-
+ var unItaliced = new Bitmap(bmp.Width + left + 4, bmp.Height);
using (var g = Graphics.FromImage(unItaliced))
{
- g.DrawImage(bmp, destinationPoints);
+ g.DrawImage(bmp, new[] {
+ new Point(0, 0), // destination for upper-left point of original
+ new Point(bmp.Width, 0), // destination for upper-right point of original
+ new Point(left, bmp.Height) // destination for lower-left point of original
+ });
}
return unItaliced;
}
+ TesseractRunner _tesseractRunner;
+
private string Tesseract3DoOcrViaExe(Bitmap bmp, string language, string psmMode, int tesseractEngineMode)
{
- var directory = _ocrMethodIndex == _ocrMethodTesseract302 ? Configuration.Tesseract302Directory : Configuration.TesseractDirectory;
+ if (_tesseractRunner == null)
+ {
+ _tesseractThreadRunner = new TesseractThreadRunner(OcrDone);
+ _tesseractRunner = new TesseractRunner();
+ }
// change yellow color to white - easier for Tesseract
var nbmp = new NikseBitmap(bmp);
nbmp.ReplaceYellowWithWhite(); // optimized replace
string pngFileName = Path.GetTempPath() + Guid.NewGuid() + ".png";
- string tempTextFileName;
using (var b = nbmp.GetBitmap())
{
b.Save(pngFileName, System.Drawing.Imaging.ImageFormat.Png);
- tempTextFileName = Path.GetTempPath() + Guid.NewGuid();
}
- using (var process = new Process())
+ var result = _tesseractRunner.Run(language, psmMode, tesseractEngineMode.ToString(CultureInfo.InvariantCulture), pngFileName, _ocrMethodIndex == _ocrMethodTesseract302);
+ if (_tesseractRunner.TesseractErrors.Count <= 2 && !string.IsNullOrEmpty(_tesseractRunner.LastError))
{
- process.StartInfo = new ProcessStartInfo(directory + "tesseract.exe") { UseShellExecute = true };
-
- if (_ocrMethodIndex == _ocrMethodTesseract302)
- {
- process.StartInfo.Arguments = "\"" + pngFileName + "\" \"" + tempTextFileName + "\" " + language;
- }
- else
- {
- process.StartInfo.Arguments = "\"" + pngFileName + "\" \"" + tempTextFileName + "\" --oem " + tesseractEngineMode + " -l " + language;
- }
-
- if (!string.IsNullOrEmpty(psmMode))
- process.StartInfo.Arguments += " " + psmMode.Trim();
-
- process.StartInfo.Arguments += " hocr";
- process.StartInfo.WindowStyle = ProcessWindowStyle.Hidden;
-
- if (Configuration.IsRunningOnLinux() || Configuration.IsRunningOnMac())
- {
- process.StartInfo.UseShellExecute = false;
- process.StartInfo.RedirectStandardError = true;
- process.StartInfo.FileName = "tesseract";
- }
- else
- {
- var tessdataPath = Path.Combine(directory, "tessdata");
-
- if (_ocrMethodIndex == _ocrMethodTesseract)
- {
- process.StartInfo.Arguments = " --tessdata-dir \"" + tessdataPath + "\" " + process.StartInfo.Arguments.Trim();
- process.ErrorDataReceived += TesseractErrorReceived;
- }
- process.StartInfo.WorkingDirectory = directory;
- process.StartInfo.UseShellExecute = false;
- process.StartInfo.CreateNoWindow = true;
- process.StartInfo.RedirectStandardError = true;
- process.EnableRaisingEvents = true;
- }
-
- try
- {
- process.Start();
- process.BeginErrorReadLine();
- }
- catch
- {
- _tesseractErrors++;
- if (_tesseractErrors <= 2)
- {
- if (Configuration.IsRunningOnLinux() || Configuration.IsRunningOnMac())
- {
- MessageBox.Show("Unable to start 'Tesseract' - make sure tesseract-ocr 4.x is installed!");
- }
- else
- {
- MessageBox.Show("Unable to start 'Tesseract' (" + Configuration.TesseractDirectory + "tesseract.exe) - make sure Subtitle Edit is install correctly + Visual Studio 2017 C++ runtime");
- }
- throw;
- }
- }
- process.WaitForExit(5000);
- }
-
- string result = string.Empty;
- string outputFileName = tempTextFileName + ".html";
- if (!File.Exists(outputFileName))
- outputFileName = tempTextFileName + ".hocr";
- try
- {
- if (File.Exists(outputFileName))
- {
- result = File.ReadAllText(outputFileName, Encoding.UTF8);
- result = ParseHocr(result);
- File.Delete(outputFileName);
- }
- File.Delete(pngFileName);
- }
- catch
- {
- // ignored
+ MessageBox.Show(_tesseractRunner.LastError);
}
return result;
}
- private void TesseractErrorReceived(object sender, DataReceivedEventArgs e)
- {
- string msg = e.Data;
-
- if (string.IsNullOrEmpty(msg) ||
- msg.StartsWith("Tesseract Open Source OCR Engine", StringComparison.OrdinalIgnoreCase) ||
- msg.Contains("Too few characters", StringComparison.OrdinalIgnoreCase) ||
- msg.Contains("Empty page", StringComparison.OrdinalIgnoreCase) ||
- msg.Contains(" diacritics", StringComparison.OrdinalIgnoreCase) ||
- msg.Contains("Weak margin", StringComparison.OrdinalIgnoreCase))
- {
- return;
- }
-
- _tesseractErrors++;
- if (_tesseractErrors <= 2)
- {
- MessageBox.Show("An error occurred while running tesseract: " + msg);
- }
- }
-
- private static string ParseHocr(string html)
- {
- string s = html.Replace("", "@001_____").Replace("", "@002_____");
-
- int first = s.IndexOf('<');
- while (first >= 0)
- {
- int last = s.IndexOf('>');
- if (last > 0)
- {
- s = s.Remove(first, last - first + 1);
- first = s.IndexOf('<');
- }
- else
- {
- first = -1;
- }
- }
-
- s = s.Trim();
- s = s.Replace("@001_____", "").Replace("@002_____", "");
- while (s.Contains(" "))
- s = s.Replace(" ", " ");
- s = s.Replace(" ", " ");
-
- // html escape decoding
- s = s.Replace("&", "&");
- s = s.Replace("<", "<");
- s = s.Replace(">", ">");
- s = s.Replace(""", "\"");
- s = s.Replace("'", "'");
- s = s.Replace("'", "'");
-
- while (s.Contains("\n\n"))
- s = s.Replace("\n\n", "\n");
- s = s.Replace("\n", "\n");
- s = s.Replace("\n", Environment.NewLine);
-
- return s;
- }
-
private bool HasSingleLetters(string line)
{
if (!_ocrFixEngine.IsDictionaryLoaded || !_ocrFixEngine.SpellCheckDictionaryName.StartsWith("en_", StringComparison.Ordinal))
@@ -6467,10 +5650,8 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
}
else if (_ocrFixEngine != null && !psm.Contains('$') && !psm.Contains('•') && !psm.Contains('€'))
{
- int correctWordsNoFixes;
- int wordsNotFoundNoFixes = _ocrFixEngine.CountUnknownWordsViaDictionary(textWithOutFixes, out correctWordsNoFixes);
- int correctWordsPsm7;
- int wordsNotFoundPsm7 = _ocrFixEngine.CountUnknownWordsViaDictionary(psm, out correctWordsPsm7);
+ int wordsNotFoundNoFixes = _ocrFixEngine.CountUnknownWordsViaDictionary(textWithOutFixes, out var correctWordsNoFixes);
+ int wordsNotFoundPsm7 = _ocrFixEngine.CountUnknownWordsViaDictionary(psm, out var correctWordsPsm7);
if (wordsNotFoundPsm7 <= wordsNotFoundNoFixes && correctWordsPsm7 > correctWordsNoFixes)
{
textWithOutFixes = psm;
@@ -7047,6 +6228,8 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
s = "" + HtmlUtil.RemoveOpenCloseTags(s, HtmlUtil.TagItalic) + "";
s = s.Replace("" + Environment.NewLine + "", Environment.NewLine);
+ s = s.Replace(" a ", " a ");
+
return HtmlUtil.FixInvalidItalicTags(s);
}
@@ -7203,34 +6386,18 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
{
_modiEnabled = false;
}
- if (!_modiEnabled)
- {
- comboBoxOcrMethod.Items.RemoveAt(_ocrMethodModi);
-
- if (_ocrMethodTesseract > _ocrMethodModi)
- _ocrMethodTesseract--;
- if (_ocrMethodBinaryImageCompare > _ocrMethodModi)
- _ocrMethodBinaryImageCompare--;
- if (_ocrMethodNocr > _ocrMethodModi)
- _ocrMethodNocr--;
- if (_ocrMethodTesseract302 > _ocrMethodModi)
- _ocrMethodTesseract302--;
- if (_ocrMethodTesseract302 > _ocrMethodNocr)
- _ocrMethodTesseract302--;
- if (_ocrMethodImageCompare > _ocrMethodModi)
- _ocrMethodImageCompare--;
- }
}
private void InitializeTesseract(string chosenLanguage = null)
{
- if (!Directory.Exists(Configuration.TesseractDirectory) && !Configuration.IsRunningOnLinux() && !Configuration.IsRunningOnMac())
+ if (!Directory.Exists(Configuration.Tesseract302Directory) && !Configuration.IsRunningOnLinux() && !Configuration.IsRunningOnMac() &&
+ Directory.Exists(Configuration.TesseractOriginalDirectory))
{
foreach (string dirPath in Directory.GetDirectories(Configuration.TesseractOriginalDirectory, "*", SearchOption.AllDirectories))
- Directory.CreateDirectory(dirPath.Replace(Configuration.TesseractOriginalDirectory, Configuration.TesseractDirectory));
+ Directory.CreateDirectory(dirPath.Replace(Configuration.TesseractOriginalDirectory, Configuration.Tesseract302Directory));
foreach (string newPath in Directory.GetFiles(Configuration.TesseractOriginalDirectory, "*.*", SearchOption.AllDirectories))
- File.Copy(newPath, newPath.Replace(Configuration.TesseractOriginalDirectory, Configuration.TesseractDirectory), true);
+ File.Copy(newPath, newPath.Replace(Configuration.TesseractOriginalDirectory, Configuration.Tesseract302Directory), true);
}
string dir = _ocrMethodIndex == _ocrMethodTesseract302 ? Configuration.Tesseract302DataDirectory : Configuration.TesseractDataDirectory;
@@ -7281,7 +6448,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
private void InitializeModiLanguages()
{
- foreach (ModiLanguage ml in ModiLanguage.AllLanguages)
+ foreach (var ml in ModiLanguage.AllLanguages)
{
comboBoxModiLanguage.Items.Add(ml);
if (ml.Id == _vobSubOcrSettings.LastModiLanguageId)
@@ -7299,11 +6466,9 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
private void ButtonStopClick(object sender, EventArgs e)
{
- if (_mainOcrTimer != null)
- _mainOcrTimer.Stop();
+ _mainOcrTimer?.Stop();
_abort = true;
_nocrThreadsStop = true;
- _icThreadsStop = true;
buttonStop.Enabled = false;
progressBar1.Visible = false;
labelStatus.Text = string.Empty;
@@ -7363,7 +6528,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
private void ButtonNewCharacterDatabaseClick(object sender, EventArgs e)
{
- using (var newFolder = new VobSubOcrNewFolder(_ocrMethodIndex == _ocrMethodImageCompare))
+ using (var newFolder = new VobSubOcrNewFolder(false))
{
if (newFolder.ShowDialog(this) == DialogResult.OK)
{
@@ -7523,13 +6688,12 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
private void ComboBoxTesseractLanguagesSelectedIndexChanged(object sender, EventArgs e)
{
Configuration.Settings.VobSubOcr.TesseractLastLanguage = (comboBoxTesseractLanguages.SelectedItem as TesseractLanguage).Id;
- if (_ocrFixEngine != null)
- _ocrFixEngine.Dispose();
+ _ocrFixEngine?.Dispose();
_ocrFixEngine = null;
LoadOcrFixEngine(null, null);
}
- private void LoadOcrFixEngine(string threeLetterISOLanguageName, string hunspellName)
+ private void LoadOcrFixEngine(string threeLetterIsoLanguageName, string hunspellName)
{
if (_ocrMethodIndex != _ocrMethodTesseract && _ocrMethodIndex != _ocrMethodTesseract302)
{
@@ -7537,21 +6701,21 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
{
var ci = CultureInfo.GetCultureInfo(LanguageString.Replace("_", "-"));
_languageId = ci.ThreeLetterISOLanguageName;
- threeLetterISOLanguageName = ci.ThreeLetterISOLanguageName;
+ threeLetterIsoLanguageName = ci.ThreeLetterISOLanguageName;
}
catch
{
// ignored
}
}
- else if (string.IsNullOrEmpty(threeLetterISOLanguageName) && comboBoxTesseractLanguages.SelectedItem != null)
+ else if (string.IsNullOrEmpty(threeLetterIsoLanguageName) && comboBoxTesseractLanguages.SelectedItem != null)
{
_languageId = (comboBoxTesseractLanguages.SelectedItem as TesseractLanguage).Id;
- threeLetterISOLanguageName = _languageId;
+ threeLetterIsoLanguageName = _languageId;
}
_ocrFixEngine?.Dispose();
- _ocrFixEngine = new OcrFixEngine(threeLetterISOLanguageName, hunspellName, this, _ocrMethodIndex == _ocrMethodBinaryImageCompare);
+ _ocrFixEngine = new OcrFixEngine(threeLetterIsoLanguageName, hunspellName, this, _ocrMethodIndex == _ocrMethodBinaryImageCompare);
if (_ocrFixEngine.IsDictionaryLoaded)
{
string loadedDictionaryName = _ocrFixEngine.SpellCheckDictionaryName;
@@ -7604,30 +6768,49 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
private void ComboBoxOcrMethodSelectedIndexChanged(object sender, EventArgs e)
{
- _icThreadsStop = true;
+ _abort = true;
_binaryOcrDb = null;
_nOcrDb = null;
_ocrMethodIndex = comboBoxOcrMethod.SelectedIndex;
if (_ocrMethodIndex == _ocrMethodTesseract)
{
+ ResetTesseractThread();
InitializeTesseract();
ShowOcrMethodGroupBox(GroupBoxTesseractMethod);
- Configuration.Settings.VobSubOcr.LastOcrMethod = "Tesseract";
+ Configuration.Settings.VobSubOcr.LastOcrMethod = "Tesseract4";
comboBoxTesseractEngineMode.Visible = true;
labelTesseractEngineMode.Visible = true;
checkBoxTesseractFallback.Text = "Fallback to Tesseract 3.02";
checkBoxTesseractFallback.Visible = File.Exists(Path.Combine(Configuration.Tesseract302Directory, "tesseract.exe"));
+ if (!File.Exists(Path.Combine(Configuration.TesseractDirectory, "tesseract.exe")))
+ {
+ if (MessageBox.Show("Download Tesseract 4 Beta", "Subtitle Edit", MessageBoxButtons.YesNoCancel) == DialogResult.Yes)
+ {
+ using (var form = new DownloadTesseract4())
+ {
+ form.ShowDialog(this);
+ }
+ }
+ else
+ {
+ comboBoxOcrMethod.SelectedIndex = _ocrMethodBinaryImageCompare;
+ return;
+ }
+ }
}
else if (_ocrMethodIndex == _ocrMethodTesseract302)
{
+ ResetTesseractThread();
InitializeTesseract();
ShowOcrMethodGroupBox(GroupBoxTesseractMethod);
Configuration.Settings.VobSubOcr.LastOcrMethod = "Tesseract302";
comboBoxTesseractEngineMode.Visible = false;
labelTesseractEngineMode.Visible = false;
+ checkBoxTesseractFallback.Text = "Fallback to Tesseract 4";
+ checkBoxTesseractFallback.Visible = File.Exists(Path.Combine(Configuration.TesseractDirectory, "tesseract.exe"));
if (!File.Exists(Path.Combine(Configuration.Tesseract302Directory, "tesseract.exe")))
{
- if (MessageBox.Show("Download Tesseract 3.02", null, MessageBoxButtons.YesNo) == DialogResult.Yes)
+ if (MessageBox.Show("Download Tesseract 3.02", "Subtitle Edit", MessageBoxButtons.YesNoCancel) == DialogResult.Yes)
{
using (var form = new DownloadTesseract302())
{
@@ -7636,19 +6819,10 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
}
else
{
- comboBoxOcrMethod.SelectedIndex = _ocrMethodTesseract;
+ comboBoxOcrMethod.SelectedIndex = _ocrMethodBinaryImageCompare;
return;
}
}
- checkBoxTesseractFallback.Text = "Fallback to Tesseract 4";
- checkBoxTesseractFallback.Visible = true;
- }
- else if (_ocrMethodIndex == _ocrMethodImageCompare)
- {
- ShowOcrMethodGroupBox(groupBoxImageCompareMethod);
- Configuration.Settings.VobSubOcr.LastOcrMethod = "BitmapCompare";
- checkBoxPromptForUnknownWords.Checked = false;
- LoadImageCompareCharacterDatabaseList();
}
else if (_ocrMethodIndex == _ocrMethodNocr)
{
@@ -7736,7 +6910,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
saveImageAsToolStripMenuItem.Visible = !enableIfRaisedBySubListView || subtitleListView1.SelectedItems.Count == 1;
// Image compare.
- bool enableIfImageCompare = _ocrMethodIndex == _ocrMethodImageCompare || _ocrMethodIndex == _ocrMethodBinaryImageCompare;
+ bool enableIfImageCompare = _ocrMethodIndex == _ocrMethodBinaryImageCompare;
inspectImageCompareMatchesForCurrentImageToolStripMenuItem.Visible = enableIfImageCompare;
EditLastAdditionsToolStripMenuItem.Visible = enableIfImageCompare && _lastAdditions != null && _lastAdditions.Count > 0;
@@ -7860,16 +7034,13 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
private void ResetTesseractThread()
{
- if (_tesseractThread != null)
+ _tesseractThreadRunner?.Cancel();
+ if (_tesseractAsyncStrings != null)
{
- _tesseractThread.CancelAsync();
- if (_tesseractAsyncStrings != null)
- {
- for (int i = 0; i < _tesseractAsyncStrings.Length; i++)
- _tesseractAsyncStrings[i] = string.Empty;
- }
- _tesseractAsyncIndex = 0;
+ for (int i = 0; i < _tesseractAsyncStrings.Length; i++)
+ _tesseractAsyncStrings[i] = string.Empty;
}
+ _tesseractAsyncIndex = 0;
}
private void PictureBoxColorChooserClick(object sender, EventArgs e)
@@ -7903,11 +7074,11 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
private void checkBoxShowOnlyForced_CheckedChanged(object sender, EventArgs e)
{
- if (_tesseractThread != null)
+ if (_tesseractThreadRunner != null)
{
- _tesseractThread.CancelAsync();
+ _tesseractThreadRunner.Cancel();
int i = 0;
- while (i < 10 && _tesseractThread.IsBusy)
+ while (i < 10)
{
System.Threading.Thread.Sleep(100);
i++;
@@ -8032,7 +7203,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
}
catch
{
- var arr = LanguageString.Split(new char[] { '-', '_' });
+ var arr = LanguageString.Split('-', '_');
if (arr.Length > 1 && arr[0].Length == 2)
{
foreach (var x in CultureInfo.GetCultures(CultureTypes.NeutralCultures))
@@ -8077,7 +7248,6 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
numericUpDownPixelsIsSpace.Value = 11;
_vobSubOcrSettings = vobSubOcrSettings;
- InitializeModi();
InitializeTesseract();
LoadImageCompareCharacterDatabaseList();
@@ -8093,9 +7263,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
private void SetOcrMethod()
{
- if (Configuration.Settings.VobSubOcr.LastOcrMethod == "BitmapCompare" && comboBoxOcrMethod.Items.Count > _ocrMethodBinaryImageCompare)
- comboBoxOcrMethod.SelectedIndex = _ocrMethodBinaryImageCompare; //_ocrMethodImageCompare;
- else if (Configuration.Settings.VobSubOcr.LastOcrMethod == "BinaryImageCompare" && comboBoxOcrMethod.Items.Count > _ocrMethodBinaryImageCompare)
+ if (Configuration.Settings.VobSubOcr.LastOcrMethod == "BinaryImageCompare" && comboBoxOcrMethod.Items.Count > _ocrMethodBinaryImageCompare)
comboBoxOcrMethod.SelectedIndex = _ocrMethodBinaryImageCompare;
else if (Configuration.Settings.VobSubOcr.LastOcrMethod == "MODI" && comboBoxOcrMethod.Items.Count > _ocrMethodModi)
comboBoxOcrMethod.SelectedIndex = _ocrMethodModi;
@@ -8103,6 +7271,8 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
comboBoxOcrMethod.SelectedIndex = _ocrMethodNocr;
else if (Configuration.Settings.VobSubOcr.LastOcrMethod == "Tesseract302" && comboBoxOcrMethod.Items.Count > _ocrMethodTesseract302)
comboBoxOcrMethod.SelectedIndex = _ocrMethodTesseract302;
+ else if (Configuration.Settings.VobSubOcr.LastOcrMethod == "Tesseract4" && comboBoxOcrMethod.Items.Count > _ocrMethodTesseract302)
+ comboBoxOcrMethod.SelectedIndex = _ocrMethodTesseract;
else
comboBoxOcrMethod.SelectedIndex = 0;
}
@@ -8270,12 +7440,11 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
}
else
{
- CompareMatch bestGuess;
CompareMatch match;
if (_binaryOcrDb != null)
- match = GetCompareMatchNew(item, out bestGuess, sourceList, index);
+ match = GetCompareMatchNew(item, out _, sourceList, index);
else
- match = GetCompareMatch(item, parentBitmap, out bestGuess, sourceList, index);
+ match = GetCompareMatch(item, parentBitmap, out _, sourceList, index);
if (match == null)
{
matches.Add(new CompareMatch(Configuration.Settings.Language.VobSubOcr.NoMatch, false, 0, null));
@@ -8326,8 +7495,8 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
Cursor = Cursors.Default;
}
}
- if (_binaryOcrDb != null)
- _binaryOcrDb.LoadCompareImages();
+
+ _binaryOcrDb?.LoadCompareImages();
Cursor = Cursors.Default;
}
@@ -8379,7 +7548,6 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
numericUpDownPixelsIsSpace.Value = vobSubOcrSettings.XOrMorePixelsMakesSpace;
_vobSubOcrSettings = vobSubOcrSettings;
- InitializeModi();
InitializeTesseract();
LoadImageCompareCharacterDatabaseList();
@@ -8442,7 +7610,6 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
numericUpDownPixelsIsSpace.Value = vobSubOcrSettings.XOrMorePixelsMakesSpace;
_vobSubOcrSettings = vobSubOcrSettings;
- InitializeModi();
InitializeTesseract();
LoadImageCompareCharacterDatabaseList();
@@ -8480,7 +7647,6 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
numericUpDownPixelsIsSpace.Value = vobSubOcrSettings.XOrMorePixelsMakesSpace;
_vobSubOcrSettings = vobSubOcrSettings;
- InitializeModi();
InitializeTesseract();
LoadImageCompareCharacterDatabaseList();
@@ -8538,14 +7704,11 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
}
}
- _icThreadsStop = true;
_abort = true;
_nocrThreadsStop = true;
- if (_mainOcrTimer != null)
- _mainOcrTimer.Stop();
+ _mainOcrTimer?.Stop();
- if (_tesseractThread != null)
- _tesseractThread.CancelAsync();
+ _tesseractThreadRunner?.Cancel();
_tesseractAsyncIndex = 10000;
System.Threading.Thread.Sleep(100);
@@ -9015,7 +8178,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
private void buttonLineOcrNewLanguage_Click(object sender, EventArgs e)
{
- using (var newFolder = new VobSubOcrNewFolder(_ocrMethodIndex == _ocrMethodImageCompare))
+ using (var newFolder = new VobSubOcrNewFolder(false))
{
if (newFolder.ShowDialog(this) == DialogResult.OK)
{
@@ -9107,7 +8270,6 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
numericUpDownNumberOfPixelsIsSpaceNOCR.Value = vobSubOcrSettings.XOrMorePixelsMakesSpace;
_vobSubOcrSettings = vobSubOcrSettings;
- InitializeModi();
InitializeTesseract();
LoadImageCompareCharacterDatabaseList();
@@ -9137,7 +8299,6 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
{
if (_ocrMethodIndex == _ocrMethodTesseract)
{
- _icThreadsStop = true;
_abort = true;
_nocrThreadsStop = true;
ResetTesseractThread();
@@ -9300,8 +8461,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
}
var sub = new Subtitle();
- Encoding encoding;
- SubtitleFormat format = sub.LoadSubtitle(fileName, out encoding, null);
+ SubtitleFormat format = sub.LoadSubtitle(fileName, out _, null);
if (format == null)
return;
@@ -9350,7 +8510,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
private void subtitleListView1_DoubleClick(object sender, EventArgs e)
{
- if (subtitleListView1.SelectedItems.Count > 0 && (_ocrMethodIndex == _ocrMethodBinaryImageCompare || _ocrMethodIndex == _ocrMethodImageCompare))
+ if (subtitleListView1.SelectedItems.Count > 0 && _ocrMethodIndex == _ocrMethodBinaryImageCompare)
{
InspectImageCompareMatchesForCurrentImageToolStripMenuItem_Click(null, null);
}
@@ -9378,4 +8538,4 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
}
}
}
-}
+}
\ No newline at end of file
diff --git a/src/Logic/OCR/Tesseract/TesseractMultiRunner.cs b/src/Logic/OCR/Tesseract/TesseractMultiRunner.cs
new file mode 100644
index 000000000..b057c8f2c
--- /dev/null
+++ b/src/Logic/OCR/Tesseract/TesseractMultiRunner.cs
@@ -0,0 +1,299 @@
+using System;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.IO;
+using System.Text;
+using Nikse.SubtitleEdit.Core;
+
+namespace Nikse.SubtitleEdit.Logic.Ocr.Tesseract
+{
+    /// <summary>
+    /// Run multiple images per tesseract call
+    /// </summary>
+    public class TesseractMultiRunner
+    {
+        // stderr lines containing one of these are routine tesseract notices, not errors
+        private static readonly string[] IgnoredMessageParts =
+        {
+            "Too few characters",
+            "Empty page",
+            " diacritics",
+            "Weak margin"
+        };
+
+        private readonly List<string> _tesseractErrors;
+
+        public TesseractMultiRunner()
+        {
+            _tesseractErrors = new List<string>();
+        }
+
+        /// <summary>
+        /// Stores a stderr line from tesseract unless it is empty, the start banner or a known notice.
+        /// </summary>
+        private void TesseractErrorReceived(object sender, DataReceivedEventArgs e)
+        {
+            var msg = e.Data;
+            if (string.IsNullOrEmpty(msg) ||
+                msg.StartsWith("Tesseract Open Source OCR Engine", StringComparison.OrdinalIgnoreCase))
+            {
+                return;
+            }
+
+            foreach (var part in IgnoredMessageParts)
+            {
+                if (msg.IndexOf(part, StringComparison.OrdinalIgnoreCase) >= 0)
+                {
+                    return;
+                }
+            }
+
+            _tesseractErrors.Add(msg);
+        }
+
+        /// <summary>
+        /// Recognizes all images with one tesseract call and returns the text parsed from its hOCR output.
+        /// </summary>
+        /// <param name="bmps">Images to recognize; yellow is replaced with white in each of them.</param>
+        /// <param name="language">Language code passed to tesseract via "-l".</param>
+        /// <param name="psmMode">Optional extra command line text, appended (trimmed) before "hocr".</param>
+        /// <returns>The recognized text, or an empty string if tesseract could not be started or gave no output.</returns>
+        public string Run(List<NikseBitmap> bmps, string language, string psmMode)
+        {
+            if (bmps == null)
+            {
+                throw new ArgumentNullException(nameof(bmps));
+            }
+
+            if (bmps.Count == 0)
+            {
+                return string.Empty; // nothing to recognize, so do not start tesseract at all
+            }
+
+            var tempPath = Path.GetTempPath();
+            var filesToDelete = new List<string>();
+            try
+            {
+                var imageList = new StringBuilder();
+                foreach (var bmp in bmps)
+                {
+                    bmp.ReplaceYellowWithWhite(); // change yellow color to white - easier for Tesseract
+                    var pngFileName = Path.Combine(tempPath, Guid.NewGuid() + ".png");
+                    filesToDelete.Add(pngFileName); // registered first so a failed save is cleaned up too
+                    using (var b = bmp.GetBitmap())
+                    {
+                        b.Save(pngFileName, System.Drawing.Imaging.ImageFormat.Png);
+                    }
+
+                    imageList.AppendLine(pngFileName);
+                }
+
+                // tesseract gets a text file listing one image path per line
+                var inputFileName = Path.Combine(tempPath, Guid.NewGuid() + ".txt");
+                filesToDelete.Add(inputFileName);
+                File.WriteAllText(inputFileName, imageList.ToString());
+
+                // the result is looked for as "<base>.html" or "<base>.hocr"
+                var outputFileName = Path.Combine(tempPath, Guid.NewGuid().ToString());
+                filesToDelete.Add(outputFileName + ".html");
+                filesToDelete.Add(outputFileName + ".hocr");
+
+                using (var process = new Process())
+                {
+                    process.StartInfo = CreateStartInfo(inputFileName, outputFileName, language, psmMode);
+                    process.ErrorDataReceived += TesseractErrorReceived;
+                    if (!TryStart(process))
+                    {
+                        return string.Empty; // waiting on a process that never started would throw
+                    }
+
+                    process.WaitForExit(5000 + bmps.Count * 500);
+                }
+
+                return ReadResult(outputFileName);
+            }
+            finally
+            {
+                DeleteFiles(filesToDelete);
+            }
+        }
+
+        /// <summary>
+        /// Builds the command line; stderr is redirected so it can be read asynchronously.
+        /// </summary>
+        private static ProcessStartInfo CreateStartInfo(string inputFileName, string outputFileName, string language, string psmMode)
+        {
+            var arguments = "\"" + inputFileName + "\" \"" + outputFileName + "\" -l " + language;
+            if (!string.IsNullOrEmpty(psmMode))
+            {
+                arguments += " " + psmMode.Trim();
+            }
+
+            arguments += " hocr";
+
+            var startInfo = new ProcessStartInfo
+            {
+                UseShellExecute = false,
+                RedirectStandardError = true,
+                CreateNoWindow = true,
+                WindowStyle = ProcessWindowStyle.Hidden
+            };
+
+            if (Configuration.IsRunningOnLinux() || Configuration.IsRunningOnMac())
+            {
+                // no bundled copy here: run the "tesseract" command and let it locate its own tessdata
+                startInfo.FileName = "tesseract";
+                startInfo.Arguments = arguments;
+            }
+            else
+            {
+                var tesseractDirectory = Configuration.TesseractDirectory;
+                startInfo.FileName = Path.Combine(tesseractDirectory, "tesseract.exe");
+                startInfo.WorkingDirectory = tesseractDirectory;
+                startInfo.Arguments = "--tessdata-dir \"" + Path.Combine(tesseractDirectory, "tessdata") + "\" " + arguments;
+            }
+
+            return startInfo;
+        }
+
+        /// <summary>
+        /// Starts tesseract and begins reading its stderr; on failure an error is recorded and false is returned.
+        /// </summary>
+        private bool TryStart(Process process)
+        {
+            try
+            {
+                process.Start();
+                process.BeginErrorReadLine();
+                return true;
+            }
+            catch (Exception)
+            {
+                if (_tesseractErrors.Count <= 2)
+                {
+                    if (Configuration.IsRunningOnLinux() || Configuration.IsRunningOnMac())
+                    {
+                        _tesseractErrors.Add("Unable to start 'Tesseract' - make sure tesseract-ocr 4.x is installed!");
+                    }
+                    else
+                    {
+                        _tesseractErrors.Add("Unable to start 'Tesseract' (" + Path.Combine(Configuration.TesseractDirectory, "tesseract.exe") + ") - make sure Subtitle Edit is install correctly + Visual Studio 2017 C++ runtime");
+                    }
+                }
+
+                return false;
+            }
+        }
+
+        /// <summary>
+        /// Reads and parses the first existing result file; returns an empty string if there is none or it is unreadable.
+        /// </summary>
+        private static string ReadResult(string outputFileName)
+        {
+            foreach (var extension in new[] { ".html", ".hocr" })
+            {
+                var resultFileName = outputFileName + extension;
+                if (!File.Exists(resultFileName))
+                {
+                    continue;
+                }
+
+                try
+                {
+                    return ParseHocr(File.ReadAllText(resultFileName, Encoding.UTF8));
+                }
+                catch (IOException)
+                {
+                    return string.Empty; // best effort, e.g. the file is still locked after a timeout
+                }
+                catch (UnauthorizedAccessException)
+                {
+                    return string.Empty;
+                }
+            }
+
+            return string.Empty;
+        }
+
+        /// <summary>
+        /// Deletes the temporary files; a file that cannot be deleted is skipped so the others are still removed.
+        /// </summary>
+        private static void DeleteFiles(IEnumerable<string> fileNames)
+        {
+            foreach (var fileName in fileNames)
+            {
+                try
+                {
+                    File.Delete(fileName); // does nothing when the file does not exist
+                }
+                catch (IOException)
+                {
+                    // ignored - cleanup is best effort
+                }
+                catch (UnauthorizedAccessException)
+                {
+                    // ignored - cleanup is best effort
+                }
+            }
+        }
+
+        /// <summary>
+        /// Converts hOCR markup to plain text with italic tags, decoded entities and platform line breaks.
+        /// </summary>
+        private static string ParseHocr(string html)
+        {
+            if (string.IsNullOrEmpty(html))
+            {
+                return string.Empty;
+            }
+
+            // NOTE(review): the tag literals in this method were empty/garbled in the reviewed text and are
+            // reconstructed here as emphasis-to-italic - confirm against the original implementation.
+            var s = html.Replace("\r\n", "\n").Replace("<em>", "@001_____").Replace("</em>", "@002_____");
+            s = StripTags(s).Trim();
+            s = s.Replace("@001_____", "<i>").Replace("@002_____", "</i>");
+            while (s.Contains("  "))
+            {
+                s = s.Replace("  ", " ");
+            }
+
+            s = s.Replace("</i> <i>", " ");
+
+            // html escape decoding
+            s = System.Net.WebUtility.HtmlDecode(s);
+
+            while (s.Contains("\n\n"))
+            {
+                s = s.Replace("\n\n", "\n");
+            }
+
+            s = s.Replace("</i>\n<i>", "\n");
+            return s.Replace("\n", Environment.NewLine);
+        }
+
+        /// <summary>
+        /// Removes everything from a '&lt;' up to the next '&gt;'; text after an unclosed '&lt;' is kept unchanged.
+        /// </summary>
+        private static string StripTags(string text)
+        {
+            var sb = new StringBuilder(text.Length);
+            var position = 0;
+            while (position < text.Length)
+            {
+                var open = text.IndexOf('<', position);
+                // search for '>' only after the '<' - a stray earlier '>' must not produce a negative length
+                var close = open < 0 ? -1 : text.IndexOf('>', open + 1);
+                if (close < 0)
+                {
+                    sb.Append(text, position, text.Length - position);
+                    break;
+                }
+
+                sb.Append(text, position, open - position);
+                position = close + 1;
+            }
+
+            return sb.ToString();
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/Logic/OCR/Tesseract/TesseractRunner.cs b/src/Logic/OCR/Tesseract/TesseractRunner.cs
new file mode 100644
index 000000000..6804a981d
--- /dev/null
+++ b/src/Logic/OCR/Tesseract/TesseractRunner.cs
@@ -0,0 +1,255 @@
+using System;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.IO;
+using System.Text;
+using Nikse.SubtitleEdit.Core;
+
+namespace Nikse.SubtitleEdit.Logic.Ocr.Tesseract
+{
+    /// <summary>
+    /// Runs tesseract on one image file and returns the text parsed from its hOCR output.
+    /// </summary>
+    public class TesseractRunner
+    {
+        // stderr lines containing one of these are routine tesseract notices, not errors
+        private static readonly string[] IgnoredMessageParts =
+        {
+            "Too few characters",
+            "Empty page",
+            " diacritics",
+            "Weak margin"
+        };
+
+        // stderr arrives on a background thread, so updates of the error state are serialized
+        private readonly object _errorLock = new object();
+
+        public List<string> TesseractErrors { get; set; }
+        public string LastError { get; set; }
+
+        public TesseractRunner()
+        {
+            TesseractErrors = new List<string>();
+        }
+
+        /// <summary>
+        /// Recognizes the text in an image file. The image file is deleted afterwards unless tesseract could not be started.
+        /// </summary>
+        /// <param name="languageCode">Language code passed to tesseract via "-l".</param>
+        /// <param name="psmMode">Value for "--psm"; omitted when null or empty.</param>
+        /// <param name="engineMode">Value for "--oem"; omitted when null or empty, or when <paramref name="run302"/> is true.</param>
+        /// <param name="imageFileName">Path of the image to recognize.</param>
+        /// <param name="run302">Use the executable in <c>Configuration.Tesseract302Directory</c> instead of <c>Configuration.TesseractDirectory</c>.</param>
+        /// <returns>The recognized text, an empty string when no result was produced, or "Error!" when tesseract could not be started.</returns>
+        public string Run(string languageCode, string psmMode, string engineMode, string imageFileName, bool run302 = false)
+        {
+            LastError = null;
+            var dir = run302 ? Configuration.Tesseract302Directory : Configuration.TesseractDirectory;
+            var tempTextFileName = Path.Combine(Path.GetTempPath(), Guid.NewGuid().ToString());
+
+            var arguments = "\"" + imageFileName + "\" \"" + tempTextFileName + "\" -l " + languageCode;
+            if (!string.IsNullOrEmpty(psmMode))
+            {
+                arguments += " --psm " + psmMode;
+            }
+
+            if (!string.IsNullOrEmpty(engineMode) && !run302)
+            {
+                arguments += " --oem " + engineMode;
+            }
+
+            arguments += " hocr";
+
+            using (var process = new Process())
+            {
+                process.StartInfo = new ProcessStartInfo(Path.Combine(dir, "tesseract.exe"))
+                {
+                    WindowStyle = ProcessWindowStyle.Hidden
+                };
+
+                if (run302)
+                {
+                    process.StartInfo.UseShellExecute = true;
+                    process.StartInfo.WorkingDirectory = Configuration.Tesseract302Directory;
+                    process.StartInfo.Arguments = arguments;
+                }
+                else
+                {
+                    // stderr can only be captured without shell execute and with redirection enabled
+                    process.StartInfo.UseShellExecute = false;
+                    process.StartInfo.RedirectStandardError = true;
+                    process.StartInfo.CreateNoWindow = true;
+                    process.StartInfo.Arguments = "--tessdata-dir \"" + Path.Combine(dir, "tessdata") + "\" " + arguments;
+                    process.ErrorDataReceived += TesseractErrorReceived;
+                }
+
+                try
+                {
+                    process.Start();
+                    if (!run302)
+                    {
+                        process.BeginErrorReadLine();
+                    }
+                }
+                catch (Exception exception)
+                {
+                    lock (_errorLock)
+                    {
+                        LastError = exception.Message + Environment.NewLine + exception.StackTrace;
+                        TesseractErrors.Add(LastError);
+                    }
+
+                    return "Error!";
+                }
+
+                if (process.WaitForExit(5000) && !run302)
+                {
+                    process.WaitForExit(); // lets pending stderr events finish before LastError is read
+                }
+            }
+
+            var result = string.Empty;
+            var outputFileName = tempTextFileName + ".html";
+            if (!File.Exists(outputFileName))
+            {
+                outputFileName = tempTextFileName + ".hocr";
+            }
+
+            try
+            {
+                if (File.Exists(outputFileName))
+                {
+                    result = ParseHocr(File.ReadAllText(outputFileName, Encoding.UTF8));
+                }
+            }
+            catch (IOException)
+            {
+                // ignored - an unreadable result is reported as empty text
+            }
+            catch (UnauthorizedAccessException)
+            {
+                // ignored - an unreadable result is reported as empty text
+            }
+
+            // clean up separately so a failed read or delete cannot leave the other file behind
+            TryDelete(outputFileName);
+            TryDelete(imageFileName);
+            return result;
+        }
+
+        /// <summary>
+        /// Deletes a file if possible; cleanup failures are deliberately ignored.
+        /// </summary>
+        private static void TryDelete(string fileName)
+        {
+            try
+            {
+                File.Delete(fileName);
+            }
+            catch (Exception)
+            {
+                // ignored - as before, a cleanup problem must not discard the OCR result
+            }
+        }
+
+        /// <summary>
+        /// Converts hOCR markup to plain text with italic tags, decoded entities and platform line breaks.
+        /// </summary>
+        private static string ParseHocr(string html)
+        {
+            if (string.IsNullOrEmpty(html))
+            {
+                return string.Empty;
+            }
+
+            // NOTE(review): the tag literals in this method were empty/garbled in the reviewed text and are
+            // reconstructed here as emphasis-to-italic - confirm against the original implementation.
+            var s = html.Replace("\r\n", "\n").Replace("<em>", "@001_____").Replace("</em>", "@002_____");
+            s = StripTags(s).Trim();
+            s = s.Replace("@001_____", "<i>").Replace("@002_____", "</i>");
+            while (s.Contains("  "))
+            {
+                s = s.Replace("  ", " ");
+            }
+
+            s = s.Replace("</i> <i>", " ");
+
+            // html escape decoding
+            s = System.Net.WebUtility.HtmlDecode(s);
+
+            while (s.Contains("\n\n"))
+            {
+                s = s.Replace("\n\n", "\n");
+            }
+
+            s = s.Replace("</i>\n<i>", "\n");
+            return s.Replace("\n", Environment.NewLine);
+        }
+
+        /// <summary>
+        /// Removes everything from a '&lt;' up to the next '&gt;'; text after an unclosed '&lt;' is kept unchanged.
+        /// </summary>
+        private static string StripTags(string text)
+        {
+            var sb = new StringBuilder(text.Length);
+            var position = 0;
+            while (position < text.Length)
+            {
+                var open = text.IndexOf('<', position);
+                // search for '>' only after the '<' - a stray earlier '>' must not produce a negative length
+                var close = open < 0 ? -1 : text.IndexOf('>', open + 1);
+                if (close < 0)
+                {
+                    sb.Append(text, position, text.Length - position);
+                    break;
+                }
+
+                sb.Append(text, position, open - position);
+                position = close + 1;
+            }
+
+            return sb.ToString();
+        }
+
+        /// <summary>
+        /// Records a stderr line from tesseract in <see cref="TesseractErrors"/> and <see cref="LastError"/>
+        /// unless it is empty, the start banner or a known notice; stops once more than 100 entries are stored.
+        /// </summary>
+        private void TesseractErrorReceived(object sender, DataReceivedEventArgs e)
+        {
+            var msg = e.Data;
+            if (string.IsNullOrEmpty(msg) ||
+                msg.StartsWith("Tesseract Open Source OCR Engine", StringComparison.OrdinalIgnoreCase))
+            {
+                return;
+            }
+
+            foreach (var part in IgnoredMessageParts)
+            {
+                if (msg.IndexOf(part, StringComparison.OrdinalIgnoreCase) >= 0)
+                {
+                    return;
+                }
+            }
+
+            lock (_errorLock)
+            {
+                if (TesseractErrors.Count > 100)
+                {
+                    return;
+                }
+
+                if (string.IsNullOrEmpty(LastError))
+                {
+                    LastError = msg;
+                }
+                else if (!LastError.Contains(msg))
+                {
+                    LastError = LastError + Environment.NewLine + msg;
+                }
+
+                TesseractErrors.Add(msg);
+            }
+        }
+    }
+}
diff --git a/src/Logic/OCR/Tesseract/TesseractThreadRunner.cs b/src/Logic/OCR/Tesseract/TesseractThreadRunner.cs
new file mode 100644
index 000000000..16788c007
--- /dev/null
+++ b/src/Logic/OCR/Tesseract/TesseractThreadRunner.cs
@@ -0,0 +1,137 @@
+using System;
+using System.Collections.Generic;
+using System.Drawing;
+using System.IO;
+using System.Threading;
+
+namespace Nikse.SubtitleEdit.Logic.Ocr.Tesseract
+{
+    /// <summary>
+    /// Runs single-image tesseract jobs on the thread pool and hands finished jobs back in the order they were added.
+    /// </summary>
+    public class TesseractThreadRunner
+    {
+        public delegate void OcrDone(int index, ImageJob job);
+
+        private readonly OcrDone _callback;
+        private readonly Queue<ImageJob> _jobQueue;
+        private readonly object _queueLock = new object(); // per instance: it only guards this runner's jobs
+        private readonly TesseractRunner _tesseractRunner;
+        private volatile bool _abort; // written by Cancel, read by thread pool workers
+
+        public TesseractThreadRunner(OcrDone callback = null)
+        {
+            _jobQueue = new Queue<ImageJob>();
+            _callback = callback;
+            _tesseractRunner = new TesseractRunner();
+        }
+
+        /// <summary>
+        /// One queued image; AddImageJob sets <see cref="Completed"/> to <see cref="DateTime.MaxValue"/> until OCR has finished.
+        /// </summary>
+        public class ImageJob
+        {
+            public string FileName { get; set; }
+            public int Index { get; set; }
+            public string Result { get; set; }
+            public DateTime Completed { get; set; }
+            public string LanguageCode { get; set; }
+            public string PsmMode { get; set; }
+            public string EngineMode { get; set; }
+            public bool Run302 { get; set; }
+            public Bitmap Bitmap { get; set; }
+        }
+
+        /// <summary>
+        /// Thread pool work item: runs tesseract for one job and marks it completed.
+        /// </summary>
+        private void DoOcr(object j)
+        {
+            var job = (ImageJob)j;
+            if (_abort)
+            {
+                // the runner normally deletes the image after OCR; a cancelled job has to do it here
+                try
+                {
+                    File.Delete(job.FileName);
+                }
+                catch (IOException)
+                {
+                    // ignored - cleanup is best effort
+                }
+                catch (UnauthorizedAccessException)
+                {
+                    // ignored - cleanup is best effort
+                }
+
+                return;
+            }
+
+            job.Result = _tesseractRunner.Run(job.LanguageCode, job.PsmMode, job.EngineMode, job.FileName, job.Run302);
+            lock (_queueLock)
+            {
+                job.Completed = DateTime.UtcNow;
+            }
+        }
+
+        /// <summary>
+        /// Saves the bitmap as a temporary png file and queues it for OCR on the thread pool.
+        /// </summary>
+        public void AddImageJob(Bitmap bmp, int index, string language, string psmMode, string engineMode, bool run302)
+        {
+            var job = new ImageJob
+            {
+                // a fresh name instead of Path.GetTempFileName() + ".png", which left an empty .tmp file behind per job
+                FileName = Path.Combine(Path.GetTempPath(), Guid.NewGuid() + ".png"),
+                Index = index,
+                Completed = DateTime.MaxValue,
+                Bitmap = bmp,
+                LanguageCode = language,
+                PsmMode = psmMode,
+                EngineMode = engineMode,
+                Run302 = run302
+            };
+            bmp.Save(job.FileName, System.Drawing.Imaging.ImageFormat.Png);
+            lock (_queueLock)
+            {
+                _jobQueue.Enqueue(job);
+            }
+
+            ThreadPool.QueueUserWorkItem(DoOcr, job);
+        }
+
+        /// <summary>
+        /// Reports the oldest job through the callback if it has completed; at most one job per call.
+        /// </summary>
+        public void CheckQueue()
+        {
+            ImageJob finished;
+            lock (_queueLock)
+            {
+                if (_jobQueue.Count == 0)
+                {
+                    return;
+                }
+
+                var job = _jobQueue.Peek();
+                if (job == null || job.Completed >= DateTime.UtcNow)
+                {
+                    return;
+                }
+
+                finished = _jobQueue.Dequeue();
+            }
+
+            // invoked outside the lock so workers are not blocked while the callback runs
+            _callback?.Invoke(finished.Index, finished);
+        }
+
+        /// <summary>
+        /// Makes jobs that have not started yet return without running OCR; running jobs are not interrupted.
+        /// </summary>
+        public void Cancel()
+        {
+            _abort = true;
+        }
+    }
+}
diff --git a/src/SubtitleEdit.csproj b/src/SubtitleEdit.csproj
index faae6059a..a0ebdf9b9 100644
--- a/src/SubtitleEdit.csproj
+++ b/src/SubtitleEdit.csproj
@@ -518,6 +518,12 @@
NetworkStart.cs
+
+ Form
+
+
+ DownloadTesseract4.cs
+
Form
@@ -937,6 +943,9 @@
+
+
+
@@ -1207,6 +1216,9 @@
NetworkStart.cs
+
+ DownloadTesseract4.cs
+
DownloadTesseract302.cs