Go back to include Tesseract 3.02 per default

T4 seems to have some problems with line breaks + no italic detection + some bad results
This commit is contained in:
Nikolaj Olsson 2018-09-23 20:24:39 +02:00
parent 0cd466707e
commit 9650bf8b20
19 changed files with 1059 additions and 1160 deletions

Binary file not shown.

View File

@ -269,16 +269,12 @@ Source: ..\Changelog.txt; DestDir: {app};
Source: ..\LICENSE.txt; DestDir: {app}; Flags: ignoreversion; Components: main
Source: Icons\uninstall.ico; DestDir: {app}\Icons; Flags: ignoreversion; Components: main
Source: ..\Tesseract4\tessdata\configs\hocr; DestDir: {app}\Tesseract4\tessdata\configs; Flags: ignoreversion; Components: main
Source: ..\Tesseract4\tessdata\osd.traineddata; DestDir: {app}\Tesseract4\tessdata; Flags: ignoreversion; Components: main
Source: ..\Tesseract4\tesseract.exe; DestDir: {app}\Tesseract4; Flags: ignoreversion; Components: main
Source: ..\Tesseract4\gif.dll; DestDir: {app}\Tesseract4; Flags: ignoreversion; Components: main
Source: ..\Tesseract4\jpeg62.dll; DestDir: {app}\Tesseract4; Flags: ignoreversion; Components: main
Source: ..\Tesseract4\leptonica-1.74.4.dll; DestDir: {app}\Tesseract4; Flags: ignoreversion; Components: main
Source: ..\Tesseract4\libpng16.dll; DestDir: {app}\Tesseract4; Flags: ignoreversion; Components: main
Source: ..\Tesseract4\lzma.dll; DestDir: {app}\Tesseract4; Flags: ignoreversion; Components: main
Source: ..\Tesseract4\tiff.dll; DestDir: {app}\Tesseract4; Flags: ignoreversion; Components: main
Source: ..\Tesseract4\zlib1.dll; DestDir: {app}\Tesseract4; Flags: ignoreversion; Components: main
Source: ..\Tesseract302\tessdata\configs\hocr; DestDir: {app}\Tesseract302\tessdata\configs; Flags: ignoreversion; Components: main
Source: ..\Tesseract302\tessdata\eng.traineddata; DestDir: {app}\Tesseract302\tessdata; Flags: ignoreversion; Components: main
Source: ..\Tesseract302\tessdata\music.traineddata; DestDir: {app}\Tesseract302\tessdata; Flags: ignoreversion; Components: main
Source: ..\Tesseract302\tesseract.exe; DestDir: {app}\Tesseract302; Flags: ignoreversion; Components: main
Source: ..\Tesseract302\msvcp90.dll; DestDir: {app}\Tesseract302; Flags: ignoreversion; Components: main
Source: ..\Tesseract302\msvcr90.dll; DestDir: {app}\Tesseract302; Flags: ignoreversion; Components: main
[Icons]
@ -325,6 +321,12 @@ Type: files; Name: {app}\TessData\eng.word-dawg; Check: IsU
Type: dirifempty; Name: {app}\TessData; Check: IsUpgrade()
Type: files; Name: {app}\Tesseract\leptonlib.dll; Check: IsUpgrade()
Type: files; Name: {app}\tessnet2_32.dll; Check: IsUpgrade()
Type: files; Name: {app}\Tesseract302\tessdata\configs\hocr; Check: IsUpgrade()
Type: files; Name: {app}\Tesseract302\tessdata\eng.traineddata; Check: IsUpgrade()
Type: files; Name: {app}\Tesseract302\tessdata\music.traineddata; Check: IsUpgrade()
Type: files; Name: {app}\Tesseract302\tesseract.exe; Check: IsUpgrade()
Type: files; Name: {app}\Tesseract302\msvcp90.dll; Check: IsUpgrade()
Type: files; Name: {app}\Tesseract302\msvcr90.dll; Check: IsUpgrade()
Type: files; Name: {app}\Icons\SubtitleEdit.srt.ico; Check: IsUpgrade()
Type: files; Name: {app}\DocumentIcons.dll; Check: IsUpgrade()
Type: files; Name: {app}\Settings.xml; Check: IsUpgrade()
@ -596,6 +598,12 @@ begin
DelTree(ExpandConstant('{userappdata}\Subtitle Edit\Tesseract4\tessdata\*.traineddata'), False, True, False);
DeleteFile(ExpandConstant('{userappdata}\Subtitle Edit\Tesseract4\tessdata\configs\hocr'));
DeleteFile(ExpandConstant('{userappdata}\Subtitle Edit\Tesseract302\tessdata\configs\hocr'));
DelTree(ExpandConstant('{userappdata}\Subtitle Edit\Tesseract302\tessdata\*.traineddata'), False, True, False);
DeleteFile(ExpandConstant('{userappdata}\Subtitle Edit\Tesseract302\tesseract.exe'));
DeleteFile(ExpandConstant('{userappdata}\Subtitle Edit\Tesseract302\msvcp90.dll'));
DeleteFile(ExpandConstant('{userappdata}\Subtitle Edit\Tesseract302\msvcr90.dll'));
// Remove possible installed mpv
DeleteFile(ExpandConstant('{userappdata}\Subtitle Edit\mpv-1.dll'));
@ -614,6 +622,9 @@ begin
RemoveDir(ExpandConstant('{userappdata}\Subtitle Edit\Tesseract4\tessdata\configs'));
RemoveDir(ExpandConstant('{userappdata}\Subtitle Edit\Tesseract4\tessdata'));
RemoveDir(ExpandConstant('{userappdata}\Subtitle Edit\Tesseract4'));
RemoveDir(ExpandConstant('{userappdata}\Subtitle Edit\Tesseract302\tessdata\configs'));
RemoveDir(ExpandConstant('{userappdata}\Subtitle Edit\Tesseract302\tessdata'));
RemoveDir(ExpandConstant('{userappdata}\Subtitle Edit\Tesseract302'));
RemoveDir(ExpandConstant('{userappdata}\Subtitle Edit'));
end;

View File

@ -17,7 +17,7 @@ namespace Nikse.SubtitleEdit.Core
public static readonly string BaseDirectory = GetBaseDirectory();
public static readonly string DataDirectory = GetDataDirectory();
public static readonly string TesseractOriginalDirectory = BaseDirectory + "Tesseract4" + Path.DirectorySeparatorChar;
public static readonly string TesseractOriginalDirectory = BaseDirectory + "Tesseract302" + Path.DirectorySeparatorChar;
public static readonly string DictionariesDirectory = DataDirectory + "Dictionaries" + Path.DirectorySeparatorChar;
public static readonly string SpectrogramsDirectory = DataDirectory + "Spectrograms" + Path.DirectorySeparatorChar;
public static readonly string SceneChangesDirectory = DataDirectory + "SceneChanges" + Path.DirectorySeparatorChar;

View File

@ -3665,7 +3665,7 @@ namespace Nikse.SubtitleEdit.Forms
bool isUnicode = currentEncoding == Encoding.Unicode || currentEncoding == Encoding.UTF32 || currentEncoding == Encoding.GetEncoding(12001) || currentEncoding == Encoding.UTF7 || currentEncoding == Encoding.UTF8;
if (!isUnicode && (allText.Contains(new[] { '♪', '♫', '♥', '—', '―', '…' }))) // ANSI & music/unicode symbols
{
if (MessageBox.Show(string.Format(_language.UnicodeMusicSymbolsAnsiWarning), Title, MessageBoxButtons.YesNo) == DialogResult.No)
if (MessageBox.Show(string.Format(_language.UnicodeMusicSymbolsAnsiWarning), Title, MessageBoxButtons.YesNoCancel) != DialogResult.Yes)
return DialogResult.No;
}
@ -3685,7 +3685,7 @@ namespace Nikse.SubtitleEdit.Forms
}
if (containsNegativeTime)
{
if (MessageBox.Show(_language.NegativeTimeWarning, Title, MessageBoxButtons.YesNo) == DialogResult.No)
if (MessageBox.Show(_language.NegativeTimeWarning, Title, MessageBoxButtons.YesNoCancel) != DialogResult.Yes)
return DialogResult.No;
}
@ -3698,7 +3698,7 @@ namespace Nikse.SubtitleEdit.Forms
if (MessageBox.Show(string.Format(_language.OverwriteModifiedFile,
_fileName, fileOnDisk.ToShortDateString(), fileOnDisk.ToString("HH:mm:ss"),
Environment.NewLine, _fileDateTime.ToShortDateString(), _fileDateTime.ToString("HH:mm:ss")),
Title + " - " + _language.FileOnDiskModified, MessageBoxButtons.YesNo) == DialogResult.No)
Title + " - " + _language.FileOnDiskModified, MessageBoxButtons.YesNoCancel) != DialogResult.Yes)
return DialogResult.No;
}
if (fileInfo.IsReadOnly)
@ -3797,7 +3797,7 @@ namespace Nikse.SubtitleEdit.Forms
}
if (containsNegativeTime)
{
if (MessageBox.Show(_language.NegativeTimeWarning, Title, MessageBoxButtons.YesNo) == DialogResult.No)
if (MessageBox.Show(_language.NegativeTimeWarning, Title, MessageBoxButtons.YesNoCancel) != DialogResult.Yes)
return DialogResult.No;
}
@ -3824,7 +3824,7 @@ namespace Nikse.SubtitleEdit.Forms
bool isUnicode = currentEncoding != null && (currentEncoding == Encoding.Unicode || currentEncoding == Encoding.UTF32 || currentEncoding == Encoding.UTF7 || currentEncoding == Encoding.UTF8);
if (!isUnicode && (allText.Contains(new[] { '♪', '♫', '♥', '—', '―', '…' }))) // ANSI & music/unicode symbols
{
if (MessageBox.Show(string.Format(_language.UnicodeMusicSymbolsAnsiWarning), Title, MessageBoxButtons.YesNo) == DialogResult.No)
if (MessageBox.Show(string.Format(_language.UnicodeMusicSymbolsAnsiWarning), Title, MessageBoxButtons.YesNoCancel) != DialogResult.Yes)
return DialogResult.No;
}
@ -4745,7 +4745,7 @@ namespace Nikse.SubtitleEdit.Forms
//if we fail to find the text, we might want to start searching from the top of the file.
if (!found && _findHelper.StartLineIndex >= 1)
{
if (MessageBox.Show(_language.FindContinue, _language.FindContinueTitle, MessageBoxButtons.YesNo) == DialogResult.Yes)
if (MessageBox.Show(_language.FindContinue, _language.FindContinueTitle, MessageBoxButtons.YesNoCancel) == DialogResult.Yes)
{
found = _findHelper.Find(_subtitle, _subtitleAlternate, -1);
}
@ -4827,7 +4827,7 @@ namespace Nikse.SubtitleEdit.Forms
{
if (_findHelper.StartLineIndex >= 1)
{
if (MessageBox.Show(_language.FindContinue, _language.FindContinueTitle, MessageBoxButtons.YesNo) == DialogResult.Yes)
if (MessageBox.Show(_language.FindContinue, _language.FindContinueTitle, MessageBoxButtons.YesNoCancel) == DialogResult.Yes)
{
_findHelper.StartLineIndex = 0;
if (_findHelper.Find(_subtitle, _subtitleAlternate, 0))
@ -5094,7 +5094,7 @@ namespace Nikse.SubtitleEdit.Forms
string msgText = _language.ReplaceContinueNotFound;
if (matches.Count > 0)
msgText = string.Format(_language.ReplaceXContinue, matches.Count);
if (MessageBox.Show(msgText, _language.ReplaceContinueTitle, MessageBoxButtons.YesNo) == DialogResult.Yes)
if (MessageBox.Show(msgText, _language.ReplaceContinueTitle, MessageBoxButtons.YesNoCancel) == DialogResult.Yes)
{
s = result.Substring(0, start - 1);
var rest = result.Remove(0, start - 1);
@ -5234,7 +5234,7 @@ namespace Nikse.SubtitleEdit.Forms
string msgText = _language.ReplaceContinueNotFound;
if (replaceCount > 0)
msgText = string.Format(_language.ReplaceXContinue, replaceCount);
if (MessageBox.Show(msgText, _language.ReplaceContinueTitle, MessageBoxButtons.YesNo) == DialogResult.Yes)
if (MessageBox.Show(msgText, _language.ReplaceContinueTitle, MessageBoxButtons.YesNoCancel) == DialogResult.Yes)
{
stopAtIndex = firstIndex;
_findHelper.MatchInOriginal = false;
@ -5277,7 +5277,7 @@ namespace Nikse.SubtitleEdit.Forms
if (_replaceStartLineIndex >= 1) // Prompt for start over
{
_replaceStartLineIndex = 0;
if (MessageBox.Show(_language.FindContinue, _language.FindContinueTitle, MessageBoxButtons.YesNo) == DialogResult.Yes)
if (MessageBox.Show(_language.FindContinue, _language.FindContinueTitle, MessageBoxButtons.YesNoCancel) == DialogResult.Yes)
{
SubtitleListview1.SelectIndexAndEnsureVisible(0, true);
_findHelper.StartLineIndex = 0;
@ -5361,7 +5361,7 @@ namespace Nikse.SubtitleEdit.Forms
if (_replaceStartLineIndex >= 1)
{
_replaceStartLineIndex = 0;
if (MessageBox.Show(_language.FindContinue, _language.FindContinueTitle, MessageBoxButtons.YesNo) == DialogResult.Yes)
if (MessageBox.Show(_language.FindContinue, _language.FindContinueTitle, MessageBoxButtons.YesNoCancel) == DialogResult.Yes)
{
SubtitleListview1.SelectIndexAndEnsureVisible(0, true);
_findHelper.StartLineIndex = 0;
@ -6008,7 +6008,7 @@ namespace Nikse.SubtitleEdit.Forms
visualSync.ShowDialog(this);
if (visualSync.OkPressed)
{
if (MessageBox.Show(_language.AppendSynchronizedSubtitlePrompt, _language.SubtitleAppendPromptTitle, MessageBoxButtons.YesNo) == DialogResult.Yes)
if (MessageBox.Show(_language.AppendSynchronizedSubtitlePrompt, _language.SubtitleAppendPromptTitle, MessageBoxButtons.YesNoCancel) == DialogResult.Yes)
{
int start = _subtitle.Paragraphs.Count + 1;
var fr = CurrentFrameRate;
@ -6185,7 +6185,7 @@ namespace Nikse.SubtitleEdit.Forms
if (!isSwedish)
promptText = _language.TranslateSwedishToDanishWarning;
if (MessageBox.Show(promptText, Title, MessageBoxButtons.YesNo) == DialogResult.Yes)
if (MessageBox.Show(promptText, Title, MessageBoxButtons.YesNoCancel) == DialogResult.Yes)
{
try
{
@ -6537,6 +6537,12 @@ namespace Nikse.SubtitleEdit.Forms
RefreshSelectedParagraph();
}
/// <summary>
/// Calls <c>MakeHistoryForUndo</c> with the "one line deleted" language string and then
/// calls <c>DeleteSelectedLines</c>, in that order.
/// </summary>
/// <param name="i">Not read by this method's body.</param>
/// <param name="p">Not read by this method's body.</param>
// NOTE(review): neither parameter is used here, so what gets deleted depends on whatever
// DeleteSelectedLines() acts on rather than on index i / paragraph p - confirm this is intended.
public void DeleteLine(int i, Paragraph p)
{
MakeHistoryForUndo(Configuration.Settings.Language.Main.OneLineDeleted);
DeleteSelectedLines();
}
public void FocusParagraph(int index)
{
if (tabControlSubtitle.SelectedIndex == TabControlSourceView)
@ -7445,7 +7451,7 @@ namespace Nikse.SubtitleEdit.Forms
askText = _language.DeleteOneLinePrompt;
}
if (Configuration.Settings.General.PromptDeleteLines && MessageBox.Show(askText, Title, MessageBoxButtons.YesNo) == DialogResult.No)
if (Configuration.Settings.General.PromptDeleteLines && MessageBox.Show(askText, Title, MessageBoxButtons.YesNoCancel) != DialogResult.Yes)
{
_cutText = string.Empty;
return;
@ -11843,7 +11849,7 @@ namespace Nikse.SubtitleEdit.Forms
if (File.Exists(idxFileName))
return true;
var dr = MessageBox.Show(string.Format(_language.IdxFileNotFoundWarning, idxFileName), _title, MessageBoxButtons.YesNo);
var dr = MessageBox.Show(string.Format(_language.IdxFileNotFoundWarning, idxFileName), _title, MessageBoxButtons.YesNoCancel);
return dr == DialogResult.Yes;
}
if (verbose)
@ -14940,7 +14946,7 @@ namespace Nikse.SubtitleEdit.Forms
if (timeCodeSubtitle.Paragraphs.Count != _subtitle.Paragraphs.Count)
{
var text = string.Format(_language.ImportTimeCodesDifferentNumberOfLinesWarning, timeCodeSubtitle.Paragraphs.Count, _subtitle.Paragraphs.Count);
if (MessageBox.Show(this, text, _title, MessageBoxButtons.YesNo) == DialogResult.No)
if (MessageBox.Show(this, text, _title, MessageBoxButtons.YesNoCancel) != DialogResult.Yes)
return;
}
@ -22658,7 +22664,7 @@ namespace Nikse.SubtitleEdit.Forms
newP.EndTime.TotalMilliseconds <= p.EndTime.TotalMilliseconds))
{
// new subs will overlap existing subs
if (MessageBox.Show(_language.PromptInsertSubtitleOverlap, _languageGeneral.Title, MessageBoxButtons.YesNo) == DialogResult.No)
if (MessageBox.Show(_language.PromptInsertSubtitleOverlap, _languageGeneral.Title, MessageBoxButtons.YesNoCancel) != DialogResult.Yes)
{
return;
}

View File

@ -37,9 +37,9 @@
this.labelDescription1.AutoSize = true;
this.labelDescription1.Location = new System.Drawing.Point(21, 27);
this.labelDescription1.Name = "labelDescription1";
this.labelDescription1.Size = new System.Drawing.Size(220, 13);
this.labelDescription1.Size = new System.Drawing.Size(145, 13);
this.labelDescription1.TabIndex = 29;
this.labelDescription1.Text = "Get Tesseract OCR dictionaries from the web";
this.labelDescription1.Text = "Downloading Tesseract OCR";
//
// labelPleaseWait
//
@ -63,7 +63,7 @@
this.Name = "DownloadTesseract302";
this.ShowInTaskbar = false;
this.StartPosition = System.Windows.Forms.FormStartPosition.CenterParent;
this.Text = "DownloadTesseract302";
this.Text = "Download Tesseract 3.02";
this.ResumeLayout(false);
this.PerformLayout();

View File

@ -0,0 +1,77 @@
namespace Nikse.SubtitleEdit.Forms.Ocr
{
/// <summary>
/// Designer-generated half of the <c>DownloadTesseract4</c> form: declares the two labels
/// shown by the dialog and builds the fixed-size, centered dialog layout.
/// </summary>
partial class DownloadTesseract4
{
/// <summary>
/// Required designer variable.
/// </summary>
private System.ComponentModel.IContainer components = null;
/// <summary>
/// Clean up any resources being used.
/// </summary>
/// <param name="disposing">true if managed resources should be disposed; otherwise, false.</param>
protected override void Dispose(bool disposing)
{
// "components" is never assigned in this file, so the null check guards the common case.
if (disposing && (components != null))
{
components.Dispose();
}
base.Dispose(disposing);
}
#region Windows Form Designer generated code
/// <summary>
/// Required method for Designer support - do not modify
/// the contents of this method with the code editor.
/// </summary>
private void InitializeComponent()
{
this.labelDescription1 = new System.Windows.Forms.Label();
this.labelPleaseWait = new System.Windows.Forms.Label();
// Layout is suspended while the controls are configured and resumed at the end.
this.SuspendLayout();
//
// labelDescription1
//
this.labelDescription1.AutoSize = true;
this.labelDescription1.Location = new System.Drawing.Point(21, 27);
this.labelDescription1.Name = "labelDescription1";
this.labelDescription1.Size = new System.Drawing.Size(145, 13);
this.labelDescription1.TabIndex = 29;
this.labelDescription1.Text = "Downloading Tesseract OCR";
//
// labelPleaseWait
//
this.labelPleaseWait.AutoSize = true;
this.labelPleaseWait.Location = new System.Drawing.Point(21, 59);
this.labelPleaseWait.Name = "labelPleaseWait";
this.labelPleaseWait.Size = new System.Drawing.Size(70, 13);
this.labelPleaseWait.TabIndex = 28;
this.labelPleaseWait.Text = "Please wait...";
//
// DownloadTesseract4
//
this.AutoScaleDimensions = new System.Drawing.SizeF(6F, 13F);
this.AutoScaleMode = System.Windows.Forms.AutoScaleMode.Font;
this.ClientSize = new System.Drawing.Size(320, 93);
this.Controls.Add(this.labelDescription1);
this.Controls.Add(this.labelPleaseWait);
// Fixed dialog border, no maximize/minimize boxes, no taskbar entry, centered on its parent.
this.FormBorderStyle = System.Windows.Forms.FormBorderStyle.FixedDialog;
this.MaximizeBox = false;
this.MinimizeBox = false;
this.Name = "DownloadTesseract4";
this.ShowInTaskbar = false;
this.StartPosition = System.Windows.Forms.FormStartPosition.CenterParent;
this.Text = "Download Tesseract 4";
this.ResumeLayout(false);
this.PerformLayout();
}
#endregion
// Static description line of the dialog.
private System.Windows.Forms.Label labelDescription1;
// "Please wait..." line of the dialog.
private System.Windows.Forms.Label labelPleaseWait;
}
}

View File

@ -0,0 +1,70 @@
using System;
using System.IO;
using System.IO.Compression;
using System.Net;
using System.Windows.Forms;
using Nikse.SubtitleEdit.Core;
namespace Nikse.SubtitleEdit.Forms.Ocr
{
/// <summary>
/// Dialog that downloads the Tesseract 4 archive (tar.gz) and unpacks it into
/// <c>Configuration.TesseractDirectory</c>. The download is started from the constructor;
/// <c>DialogResult</c> is set to <c>OK</c> after a successful unpack and to <c>Cancel</c>
/// when the download fails or is cancelled.
/// </summary>
public partial class DownloadTesseract4 : Form
{
    private const string DownloadUrl = "https://github.com/SubtitleEdit/support-files/raw/master/Tesseract4.tar.gz";

    /// <summary>
    /// Builds the dialog and starts the asynchronous download.
    /// </summary>
    public DownloadTesseract4()
    {
        InitializeComponent();
        var wc = new WebClient { Proxy = Utilities.GetProxy() };

        // Attach the handlers before the request is started so that neither the
        // completion nor any progress notification can be missed.
        wc.DownloadDataCompleted += wc_DownloadDataCompleted;
        wc.DownloadProgressChanged += (o, args) =>
        {
            labelPleaseWait.Text = Configuration.Settings.Language.General.PleaseWait + " " + args.ProgressPercentage + "%";
        };
        wc.DownloadDataAsync(new Uri(DownloadUrl));
    }

    /// <summary>
    /// Handles the finished download: gunzips the payload to a temporary tar file,
    /// extracts every entry below the Tesseract directory and closes the dialog.
    /// </summary>
    /// <param name="sender">The <see cref="WebClient"/> that performed the download.</param>
    /// <param name="e">Completion data; <c>Result</c> holds the downloaded bytes on success.</param>
    private void wc_DownloadDataCompleted(object sender, DownloadDataCompletedEventArgs e)
    {
        // The client has done its job - release it whatever the outcome was.
        var client = sender as IDisposable;
        if (client != null)
        {
            client.Dispose();
        }

        // e.Result throws when the request failed or was cancelled, so check both first.
        if (e.Cancelled || e.Error != null)
        {
            MessageBox.Show(Configuration.Settings.Language.GetTesseractDictionaries.DownloadFailed);
            DialogResult = DialogResult.Cancel;
            return;
        }

        string dictionaryFolder = Configuration.TesseractDirectory;
        if (!Directory.Exists(dictionaryFolder))
        {
            Directory.CreateDirectory(dictionaryFolder);
        }

        // Path.GetTempFileName() creates a zero-byte file on disk; appending ".tar" to its
        // name would leave that file behind. Build a unique name without creating anything.
        var tempFileName = Path.Combine(Path.GetTempPath(), Guid.NewGuid() + ".tar");
        try
        {
            using (var ms = new MemoryStream(e.Result))
            using (var fs = new FileStream(tempFileName, FileMode.Create))
            using (var zip = new GZipStream(ms, CompressionMode.Decompress))
            {
                byte[] buffer = new byte[1024];
                int nRead;
                while ((nRead = zip.Read(buffer, 0, buffer.Length)) > 0)
                {
                    fs.Write(buffer, 0, nRead);
                }
            }

            using (var tr = new TarReader(tempFileName))
            {
                foreach (var th in tr.Files)
                {
                    string fn = Path.Combine(dictionaryFolder, th.FileName.Replace('/', Path.DirectorySeparatorChar));
                    if (th.IsFolder)
                    {
                        Directory.CreateDirectory(fn);
                    }
                    else if (th.FileSizeInBytes > 0)
                    {
                        th.WriteData(fn);
                    }
                }
            }
        }
        finally
        {
            // Remove the intermediate tar even when unpacking throws.
            if (File.Exists(tempFileName))
            {
                File.Delete(tempFileName);
            }
        }

        Cursor = Cursors.Default;
        DialogResult = DialogResult.OK;
    }
}
}

View File

@ -0,0 +1,120 @@
<?xml version="1.0" encoding="utf-8"?>
<root>
<!--
Microsoft ResX Schema
Version 2.0
The primary goals of this format is to allow a simple XML format
that is mostly human readable. The generation and parsing of the
various data types are done through the TypeConverter classes
associated with the data types.
Example:
... ado.net/XML headers & schema ...
<resheader name="resmimetype">text/microsoft-resx</resheader>
<resheader name="version">2.0</resheader>
<resheader name="reader">System.Resources.ResXResourceReader, System.Windows.Forms, ...</resheader>
<resheader name="writer">System.Resources.ResXResourceWriter, System.Windows.Forms, ...</resheader>
<data name="Name1"><value>this is my long string</value><comment>this is a comment</comment></data>
<data name="Color1" type="System.Drawing.Color, System.Drawing">Blue</data>
<data name="Bitmap1" mimetype="application/x-microsoft.net.object.binary.base64">
<value>[base64 mime encoded serialized .NET Framework object]</value>
</data>
<data name="Icon1" type="System.Drawing.Icon, System.Drawing" mimetype="application/x-microsoft.net.object.bytearray.base64">
<value>[base64 mime encoded string representing a byte array form of the .NET Framework object]</value>
<comment>This is a comment</comment>
</data>
There are any number of "resheader" rows that contain simple
name/value pairs.
Each data row contains a name, and value. The row also contains a
type or mimetype. Type corresponds to a .NET class that support
text/value conversion through the TypeConverter architecture.
Classes that don't support this are serialized and stored with the
mimetype set.
The mimetype is used for serialized objects, and tells the
ResXResourceReader how to depersist the object. This is currently not
extensible. For a given mimetype the value must be set accordingly:
Note - application/x-microsoft.net.object.binary.base64 is the format
that the ResXResourceWriter will generate, however the reader can
read any of the formats listed below.
mimetype: application/x-microsoft.net.object.binary.base64
value : The object must be serialized with
: System.Runtime.Serialization.Formatters.Binary.BinaryFormatter
: and then encoded with base64 encoding.
mimetype: application/x-microsoft.net.object.soap.base64
value : The object must be serialized with
: System.Runtime.Serialization.Formatters.Soap.SoapFormatter
: and then encoded with base64 encoding.
mimetype: application/x-microsoft.net.object.bytearray.base64
value : The object must be serialized into a byte array
: using a System.ComponentModel.TypeConverter
: and then encoded with base64 encoding.
-->
<xsd:schema id="root" xmlns="" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:msdata="urn:schemas-microsoft-com:xml-msdata">
<xsd:import namespace="http://www.w3.org/XML/1998/namespace" />
<xsd:element name="root" msdata:IsDataSet="true">
<xsd:complexType>
<xsd:choice maxOccurs="unbounded">
<xsd:element name="metadata">
<xsd:complexType>
<xsd:sequence>
<xsd:element name="value" type="xsd:string" minOccurs="0" />
</xsd:sequence>
<xsd:attribute name="name" use="required" type="xsd:string" />
<xsd:attribute name="type" type="xsd:string" />
<xsd:attribute name="mimetype" type="xsd:string" />
<xsd:attribute ref="xml:space" />
</xsd:complexType>
</xsd:element>
<xsd:element name="assembly">
<xsd:complexType>
<xsd:attribute name="alias" type="xsd:string" />
<xsd:attribute name="name" type="xsd:string" />
</xsd:complexType>
</xsd:element>
<xsd:element name="data">
<xsd:complexType>
<xsd:sequence>
<xsd:element name="value" type="xsd:string" minOccurs="0" msdata:Ordinal="1" />
<xsd:element name="comment" type="xsd:string" minOccurs="0" msdata:Ordinal="2" />
</xsd:sequence>
<xsd:attribute name="name" type="xsd:string" use="required" msdata:Ordinal="1" />
<xsd:attribute name="type" type="xsd:string" msdata:Ordinal="3" />
<xsd:attribute name="mimetype" type="xsd:string" msdata:Ordinal="4" />
<xsd:attribute ref="xml:space" />
</xsd:complexType>
</xsd:element>
<xsd:element name="resheader">
<xsd:complexType>
<xsd:sequence>
<xsd:element name="value" type="xsd:string" minOccurs="0" msdata:Ordinal="1" />
</xsd:sequence>
<xsd:attribute name="name" type="xsd:string" use="required" />
</xsd:complexType>
</xsd:element>
</xsd:choice>
</xsd:complexType>
</xsd:element>
</xsd:schema>
<resheader name="resmimetype">
<value>text/microsoft-resx</value>
</resheader>
<resheader name="version">
<value>2.0</value>
</resheader>
<resheader name="reader">
<value>System.Resources.ResXResourceReader, System.Windows.Forms, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089</value>
</resheader>
<resheader name="writer">
<value>System.Resources.ResXResourceWriter, System.Windows.Forms, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089</value>
</resheader>
</root>

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,188 @@
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Text;
using Nikse.SubtitleEdit.Core;
namespace Nikse.SubtitleEdit.Logic.Ocr.Tesseract
{
/// <summary>
/// Run multiple images per tesseract call: every bitmap is written to a temporary PNG,
/// the PNG paths are listed in a temporary text file, and that list file is passed to a
/// single tesseract process whose hOCR output is converted to plain text with
/// <c>&lt;i&gt;</c> tags for italics.
/// </summary>
public class TesseractMultiRunner
{
    private readonly List<string> _tesseractErrors;

    public TesseractMultiRunner()
    {
        _tesseractErrors = new List<string>();
    }

    /// <summary>
    /// Collects stderr lines from tesseract, dropping the banner and a few known
    /// informational messages.
    /// </summary>
    private void TesseractErrorReceived(object sender, DataReceivedEventArgs e)
    {
        var msg = e.Data;
        if (string.IsNullOrEmpty(msg) ||
            msg.StartsWith("Tesseract Open Source OCR Engine", StringComparison.OrdinalIgnoreCase) ||
            msg.Contains("Too few characters", StringComparison.OrdinalIgnoreCase) ||
            msg.Contains("Empty page", StringComparison.OrdinalIgnoreCase) ||
            msg.Contains(" diacritics", StringComparison.OrdinalIgnoreCase) ||
            msg.Contains("Weak margin", StringComparison.OrdinalIgnoreCase))
        {
            return;
        }
        _tesseractErrors.Add(msg);
    }

    /// <summary>
    /// Runs tesseract once over all <paramref name="bmps"/>.
    /// </summary>
    /// <param name="bmps">Images to recognize; each one is modified (yellow replaced by white).</param>
    /// <param name="language">Language code passed to tesseract via <c>-l</c>.</param>
    /// <param name="psmMode">Optional extra argument text appended verbatim (trimmed); may be null or empty.</param>
    /// <returns>The parsed text, or an empty string when no output file was produced or it could not be read.</returns>
    public string Run(List<NikseBitmap> bmps, string language, string psmMode)
    {
        string inputFileName = Path.GetTempPath() + Guid.NewGuid() + ".txt";
        var filesToDelete = new List<string>();
        var sb = new StringBuilder();
        foreach (var bmp in bmps)
        {
            // change yellow color to white - easier for Tesseract
            bmp.ReplaceYellowWithWhite(); // optimized replace
            string pngFileName = Path.GetTempPath() + Guid.NewGuid() + ".png";
            using (var b = bmp.GetBitmap())
            {
                b.Save(pngFileName, System.Drawing.Imaging.ImageFormat.Png);
            }
            filesToDelete.Add(pngFileName);
            sb.AppendLine(pngFileName);
        }
        File.WriteAllText(inputFileName, sb.ToString());
        filesToDelete.Add(inputFileName);

        var outputFileName = Path.GetTempPath() + Guid.NewGuid();

        // Use the configured installation folder (as the Windows branch below already does)
        // instead of a developer-machine path, and join paths with Path.Combine so that a
        // missing trailing separator cannot produce "...Tesseract4tesseract.exe".
        var dir = Configuration.TesseractDirectory;
        string resultFileName;
        using (var process = new Process())
        {
            process.StartInfo = new ProcessStartInfo(Path.Combine(dir, "tesseract.exe"))
            {
                UseShellExecute = true,
                Arguments = "\"" + inputFileName + "\" \"" + outputFileName + "\" -l " + language
            };
            if (!string.IsNullOrEmpty(psmMode))
                process.StartInfo.Arguments += " " + psmMode.Trim();
            process.StartInfo.Arguments += " hocr";

            // --tessdata-dir is added exactly once, in front of the other arguments.
            process.StartInfo.Arguments = " --tessdata-dir \"" + Path.Combine(dir, "tessdata") + "\" " + process.StartInfo.Arguments.Trim();
            process.StartInfo.WindowStyle = ProcessWindowStyle.Hidden;
            if (Configuration.IsRunningOnLinux() || Configuration.IsRunningOnMac())
            {
                process.StartInfo.UseShellExecute = false;
                process.StartInfo.RedirectStandardError = true;
                process.StartInfo.FileName = "tesseract";
            }
            else
            {
                process.StartInfo.WorkingDirectory = dir;
                process.StartInfo.UseShellExecute = false;
                process.StartInfo.CreateNoWindow = true;
                process.StartInfo.RedirectStandardError = true;
                process.ErrorDataReceived += TesseractErrorReceived;
                process.EnableRaisingEvents = true;
            }

            bool started = false;
            try
            {
                process.Start();
                started = true;
                process.BeginErrorReadLine();
            }
            catch (Exception)
            {
                if (_tesseractErrors.Count <= 2)
                {
                    if (Configuration.IsRunningOnLinux() || Configuration.IsRunningOnMac())
                    {
                        _tesseractErrors.Add("Unable to start 'Tesseract' - make sure tesseract-ocr 4.x is installed!");
                    }
                    else
                    {
                        _tesseractErrors.Add("Unable to start 'Tesseract' (" + Configuration.TesseractDirectory + "tesseract.exe) - make sure Subtitle Edit is install correctly + Visual Studio 2017 C++ runtime");
                    }
                }
            }

            // WaitForExit throws when no process is associated, so only wait after a successful start.
            if (started)
            {
                process.WaitForExit(5000 + bmps.Count * 500);
            }

            // tesseract appends the extension itself: ".html" or ".hocr" depending on version.
            resultFileName = outputFileName + ".html";
            if (!File.Exists(resultFileName))
                resultFileName = outputFileName + ".hocr";
            filesToDelete.Add(resultFileName);
        }

        string result = string.Empty;
        try
        {
            if (File.Exists(resultFileName))
            {
                result = ParseHocr(File.ReadAllText(resultFileName, Encoding.UTF8));
            }
        }
        catch (Exception)
        {
            // best effort: an unreadable or unparsable result yields an empty string
            result = string.Empty;
        }

        DeleteFiles(filesToDelete);
        return result;
    }

    /// <summary>
    /// Deletes the given temporary files; a file that cannot be removed is skipped.
    /// </summary>
    private static void DeleteFiles(IEnumerable<string> fileNames)
    {
        foreach (var fileName in fileNames)
        {
            try
            {
                if (File.Exists(fileName))
                {
                    File.Delete(fileName);
                }
            }
            catch (IOException)
            {
                // ignored - temp file is still in use
            }
            catch (UnauthorizedAccessException)
            {
                // ignored - temp file is not deletable
            }
        }
    }

    /// <summary>
    /// Converts hOCR markup to plain text: <c>&lt;em&gt;</c> becomes <c>&lt;i&gt;</c>, all other
    /// tags are removed, runs of spaces and blank lines are collapsed, basic HTML entities
    /// are decoded and line breaks are normalized to <see cref="Environment.NewLine"/>.
    /// </summary>
    private static string ParseHocr(string html)
    {
        // Protect emphasis tags with placeholders so the tag stripping below leaves them alone.
        string s = html.Replace("<em>", "@001_____").Replace("</em>", "@002_____");

        int first = s.IndexOf('<');
        while (first >= 0)
        {
            // Search for the closing bracket from the tag start; a '>' located before
            // the '<' would otherwise give a negative length for Remove.
            int last = s.IndexOf('>', first);
            if (last < 0)
            {
                break;
            }
            s = s.Remove(first, last - first + 1);
            first = s.IndexOf('<', first);
        }

        s = s.Trim();
        s = s.Replace("@001_____", "<i>").Replace("@002_____", "</i>");

        // Collapse runs of spaces into one (look for two spaces, otherwise the loop never ends).
        while (s.Contains("  "))
            s = s.Replace("  ", " ");
        s = s.Replace("</i> <i>", " ");

        // html escape decoding - "&amp;" goes last so "&amp;lt;" becomes "&lt;" and not "<"
        s = s.Replace("&lt;", "<");
        s = s.Replace("&gt;", ">");
        s = s.Replace("&quot;", "\"");
        s = s.Replace("&#39;", "'");
        s = s.Replace("&apos;", "'");
        s = s.Replace("&amp;", "&");

        while (s.Contains("\n\n"))
            s = s.Replace("\n\n", "\n");
        s = s.Replace("</i>\n<i>", "\n");
        s = s.Replace("\n", Environment.NewLine);
        return s;
    }
}
}

View File

@ -0,0 +1,160 @@
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Text;
using Nikse.SubtitleEdit.Core;
namespace Nikse.SubtitleEdit.Logic.Ocr.Tesseract
{
/// <summary>
/// Runs tesseract (4.x, or 3.02 when requested) on a single image file and returns the
/// recognized text parsed from the hOCR output.
/// </summary>
public class TesseractRunner
{
    /// <summary>Error messages collected so far.</summary>
    public List<string> TesseractErrors { get; set; }

    /// <summary>Error text of the latest run; reset to null at the start of <see cref="Run"/>.</summary>
    public string LastError { get; set; }

    public TesseractRunner()
    {
        TesseractErrors = new List<string>();
    }

    /// <summary>
    /// Runs tesseract on <paramref name="imageFileName"/> and deletes the image afterwards.
    /// </summary>
    /// <param name="languageCode">Language code passed via <c>-l</c>.</param>
    /// <param name="psmMode">Value for <c>--psm</c>; omitted when null or empty.</param>
    /// <param name="engineMode">Value for <c>--oem</c>; omitted when null/empty or when <paramref name="run302"/> is true.</param>
    /// <param name="imageFileName">Image to recognize.</param>
    /// <param name="run302">True to use the Tesseract 3.02 installation folder.</param>
    /// <returns>
    /// The parsed text, an empty string when no output was produced, or the literal
    /// <c>"Error!"</c> when the process could not be started.
    /// </returns>
    public string Run(string languageCode, string psmMode, string engineMode, string imageFileName, bool run302 = false)
    {
        LastError = null;
        var dir = run302 ? Configuration.Tesseract302Directory : Configuration.TesseractDirectory;
        string tempTextFileName = Path.GetTempPath() + Guid.NewGuid();
        using (var process = new Process())
        {
            process.StartInfo = new ProcessStartInfo(Path.Combine(dir, "tesseract.exe"))
            {
                UseShellExecute = true,
                Arguments = "\"" + imageFileName + "\" \"" + tempTextFileName + "\" -l " + languageCode
            };
            if (!string.IsNullOrEmpty(psmMode))
            {
                process.StartInfo.Arguments += " --psm " + psmMode;
            }
            if (!string.IsNullOrEmpty(engineMode) && !run302)
            {
                process.StartInfo.Arguments += " --oem " + engineMode;
            }
            process.StartInfo.Arguments += " hocr";
            if (run302)
            {
                process.StartInfo.WorkingDirectory = Configuration.Tesseract302Directory;
            }
            else
            {
                // NOTE(review): the handler is subscribed, but this method neither sets
                // RedirectStandardError nor calls BeginErrorReadLine, and UseShellExecute is
                // true - so the event is presumably never raised. Confirm whether stderr
                // capture is wanted before changing how the process is launched.
                process.ErrorDataReceived += TesseractErrorReceived;
                process.StartInfo.Arguments = " --tessdata-dir \"" + Path.Combine(dir, "tessdata") + "\" " + process.StartInfo.Arguments.Trim();
            }
            process.StartInfo.WindowStyle = ProcessWindowStyle.Hidden;
            try
            {
                process.Start();
            }
            catch (Exception exception)
            {
                LastError = exception.Message + Environment.NewLine + exception.StackTrace;
                TesseractErrors.Add(LastError);
                return "Error!";
            }
            process.WaitForExit(5000);
        }

        string result = string.Empty;

        // tesseract appends the extension itself: ".html" or ".hocr" depending on version.
        string outputFileName = tempTextFileName + ".html";
        if (!File.Exists(outputFileName))
            outputFileName = tempTextFileName + ".hocr";
        try
        {
            if (File.Exists(outputFileName))
            {
                result = File.ReadAllText(outputFileName, Encoding.UTF8);
                result = ParseHocr(result);
                File.Delete(outputFileName);
            }
            File.Delete(imageFileName);
        }
        catch
        {
            // ignored
        }
        return result;
    }

    /// <summary>
    /// Converts hOCR markup to plain text: <c>&lt;em&gt;</c> becomes <c>&lt;i&gt;</c>, all other
    /// tags are removed, runs of spaces and blank lines are collapsed, basic HTML entities
    /// are decoded and line breaks are normalized to <see cref="Environment.NewLine"/>.
    /// </summary>
    private static string ParseHocr(string html)
    {
        // Protect emphasis tags with placeholders so the tag stripping below leaves them alone.
        string s = html.Replace("<em>", "@001_____").Replace("</em>", "@002_____");

        int first = s.IndexOf('<');
        while (first >= 0)
        {
            // Search for the closing bracket from the tag start; a '>' located before
            // the '<' would otherwise give a negative length for Remove.
            int last = s.IndexOf('>', first);
            if (last < 0)
            {
                break;
            }
            s = s.Remove(first, last - first + 1);
            first = s.IndexOf('<', first);
        }

        s = s.Trim();
        s = s.Replace("@001_____", "<i>").Replace("@002_____", "</i>");

        // Collapse runs of spaces into one (look for two spaces, otherwise the loop never ends).
        while (s.Contains("  "))
            s = s.Replace("  ", " ");
        s = s.Replace("</i> <i>", " ");

        // html escape decoding - "&amp;" goes last so "&amp;lt;" becomes "&lt;" and not "<"
        s = s.Replace("&lt;", "<")
            .Replace("&gt;", ">")
            .Replace("&quot;", "\"")
            .Replace("&#39;", "'")
            .Replace("&apos;", "'")
            .Replace("&amp;", "&");

        while (s.Contains("\n\n"))
            s = s.Replace("\n\n", "\n");
        s = s.Replace("</i>\n<i>", "\n");
        s = s.Replace("\n", Environment.NewLine);
        return s;
    }

    /// <summary>
    /// Records a stderr line in <see cref="LastError"/> and <see cref="TesseractErrors"/>,
    /// skipping the banner and a few known informational messages. Nothing more is recorded
    /// once the list holds more than 100 entries.
    /// </summary>
    private void TesseractErrorReceived(object sender, DataReceivedEventArgs e)
    {
        string msg = e.Data;
        if (string.IsNullOrEmpty(msg) ||
            msg.StartsWith("Tesseract Open Source OCR Engine", StringComparison.OrdinalIgnoreCase) ||
            msg.Contains("Too few characters", StringComparison.OrdinalIgnoreCase) ||
            msg.Contains("Empty page", StringComparison.OrdinalIgnoreCase) ||
            msg.Contains(" diacritics", StringComparison.OrdinalIgnoreCase) ||
            msg.Contains("Weak margin", StringComparison.OrdinalIgnoreCase))
        {
            return;
        }

        if (TesseractErrors.Count <= 100)
        {
            if (string.IsNullOrEmpty(LastError))
            {
                LastError = msg;
            }
            else if (!LastError.Contains(msg))
            {
                LastError = LastError + Environment.NewLine + msg;
            }
            TesseractErrors.Add(msg);
        }
    }
}
}

View File

@ -0,0 +1,95 @@
using System;
using System.Collections.Generic;
using System.Drawing;
using System.IO;
using System.Threading;
namespace Nikse.SubtitleEdit.Logic.Ocr.Tesseract
{
/// <summary>
/// Saves bitmaps to temporary PNG files, runs tesseract on them via the thread pool and
/// reports finished jobs through a callback, always starting with the oldest queued job.
/// </summary>
public class TesseractThreadRunner
{
    /// <summary>Signature of the callback invoked from <see cref="CheckQueue"/> for a finished job.</summary>
    public delegate void OcrDone(int index, ImageJob job);

    /// <summary>One OCR request together with its outcome.</summary>
    public class ImageJob
    {
        public string FileName { get; set; }
        public int Index { get; set; }
        public string Result { get; set; }
        public DateTime Completed { get; set; }
        public string LanguageCode { get; set; }
        public string PsmMode { get; set; }
        public string EngineMode { get; set; }
        public bool Run302 { get; set; }
        public Bitmap Bitmap { get; set; }
    }

    private static readonly object QueueLock = new object();
    private readonly Queue<ImageJob> _jobQueue = new Queue<ImageJob>();
    private readonly TesseractRunner _tesseractRunner = new TesseractRunner();
    private readonly OcrDone _callback;
    private bool _abort;

    /// <param name="callback">Optional handler invoked for each finished job.</param>
    public TesseractThreadRunner(OcrDone callback = null)
    {
        _callback = callback;
    }

    /// <summary>
    /// Thread-pool work item: runs tesseract for one job and stamps its completion time.
    /// Does nothing once <see cref="Cancel"/> has been called.
    /// </summary>
    private void DoOcr(object j)
    {
        if (!_abort)
        {
            var imageJob = (ImageJob)j;
            string text = _tesseractRunner.Run(imageJob.LanguageCode, imageJob.PsmMode, imageJob.EngineMode, imageJob.FileName, imageJob.Run302);
            imageJob.Result = text;
            lock (QueueLock)
            {
                imageJob.Completed = DateTime.UtcNow;
            }
        }
    }

    /// <summary>
    /// Writes <paramref name="bmp"/> to a temporary PNG, schedules OCR for it on the
    /// thread pool and appends the job to the queue.
    /// </summary>
    public void AddImageJob(Bitmap bmp, int index, string language, string psmMode, string engineMode, bool run302)
    {
        string pngFileName = Path.GetTempFileName() + ".png";
        var queued = new ImageJob();
        queued.FileName = pngFileName;
        queued.Index = index;
        queued.Completed = DateTime.MaxValue; // "not finished yet" marker
        queued.Bitmap = bmp;
        queued.LanguageCode = language;
        queued.PsmMode = psmMode;
        queued.EngineMode = engineMode;
        queued.Run302 = run302;

        bmp.Save(pngFileName, System.Drawing.Imaging.ImageFormat.Png);
        ThreadPool.QueueUserWorkItem(DoOcr, queued);
        _jobQueue.Enqueue(queued);
    }

    /// <summary>
    /// If the job at the head of the queue has completed, removes it and invokes the
    /// callback for it. At most one job is reported per call.
    /// </summary>
    public void CheckQueue()
    {
        if (_jobQueue.Count > 0)
        {
            lock (QueueLock)
            {
                DateTime now = DateTime.UtcNow;
                ImageJob head = _jobQueue.Peek();
                if (head == null || head.Completed >= now)
                {
                    return;
                }

                _jobQueue.Dequeue();
                if (_callback != null)
                {
                    _callback(head.Index, head);
                }
            }
        }
    }

    /// <summary>Makes work items that have not started yet return without running OCR.</summary>
    public void Cancel()
    {
        _abort = true;
    }
}
}

View File

@ -518,6 +518,12 @@
<Compile Include="Forms\Networking\NetworkStart.Designer.cs">
<DependentUpon>NetworkStart.cs</DependentUpon>
</Compile>
<Compile Include="Forms\Ocr\DownloadTesseract4.cs">
<SubType>Form</SubType>
</Compile>
<Compile Include="Forms\Ocr\DownloadTesseract4.Designer.cs">
<DependentUpon>DownloadTesseract4.cs</DependentUpon>
</Compile>
<Compile Include="Forms\Ocr\DownloadTesseract302.cs">
<SubType>Form</SubType>
</Compile>
@ -937,6 +943,9 @@
<Compile Include="Logic\Ocr\NOcrPoint.cs" />
<Compile Include="Logic\Ocr\PreprocessingSettings.cs" />
<Compile Include="Logic\Ocr\SpellCheckOcrTextResult.cs" />
<Compile Include="Logic\Ocr\Tesseract\TesseractThreadRunner.cs" />
<Compile Include="Logic\Ocr\Tesseract\TesseractRunner.cs" />
<Compile Include="Logic\Ocr\Tesseract\TesseractMultiRunner.cs" />
<Compile Include="Logic\RtfTextConverterRichTextBox.cs" />
<Compile Include="Logic\SpellCheck\Hunspell.cs" />
<Compile Include="Logic\SpellCheck\LinuxHunspell.cs" />
@ -1207,6 +1216,9 @@
<EmbeddedResource Include="Forms\Networking\NetworkStart.resx">
<DependentUpon>NetworkStart.cs</DependentUpon>
</EmbeddedResource>
<EmbeddedResource Include="Forms\Ocr\DownloadTesseract4.resx">
<DependentUpon>DownloadTesseract4.cs</DependentUpon>
</EmbeddedResource>
<EmbeddedResource Include="Forms\Ocr\DownloadTesseract302.resx">
<DependentUpon>DownloadTesseract302.cs</DependentUpon>
</EmbeddedResource>