Fixed Tesseract dictionaries download (Google Code downloads are no longer available)

This commit is contained in:
Nikolaj Olsson 2016-08-30 17:07:19 +02:00
parent e4b86812d8
commit 5a6c2b5f42
4 changed files with 99 additions and 42 deletions

View File

@ -60,7 +60,7 @@
this.labelChooseLanguageAndClickDownload.AutoSize = true;
this.labelChooseLanguageAndClickDownload.Location = new System.Drawing.Point(19, 52);
this.labelChooseLanguageAndClickDownload.Name = "labelChooseLanguageAndClickDownload";
this.labelChooseLanguageAndClickDownload.Size = new System.Drawing.Size(202, 13);
this.labelChooseLanguageAndClickDownload.Size = new System.Drawing.Size(208, 13);
this.labelChooseLanguageAndClickDownload.TabIndex = 23;
this.labelChooseLanguageAndClickDownload.Text = "Choose your language and click download";
//
@ -116,6 +116,7 @@
this.Controls.Add(this.buttonDownload);
this.Controls.Add(this.linkLabelOpenDictionaryFolder);
this.Controls.Add(this.buttonOK);
this.FormBorderStyle = System.Windows.Forms.FormBorderStyle.FixedDialog;
this.KeyPreview = true;
this.MaximizeBox = false;
this.MinimizeBox = false;

View File

@ -15,6 +15,7 @@ namespace Nikse.SubtitleEdit.Forms
private List<string> _dictionaryDownloadLinks = new List<string>();
private List<string> _descriptions = new List<string>();
private string _xmlName = null;
private string _dictionaryFileName = null;
public GetTesseractDictionaries()
{
@ -45,9 +46,17 @@ namespace Nikse.SubtitleEdit.Forms
using (var rdr = new StreamReader(strm))
using (var zip = new GZipStream(rdr.BaseStream, CompressionMode.Decompress))
{
byte[] data = new byte[175000];
zip.Read(data, 0, 175000);
doc.LoadXml(System.Text.Encoding.UTF8.GetString(data));
byte[] data = new byte[195000];
int bytesRead = zip.Read(data, 0, data.Length);
var s = System.Text.Encoding.UTF8.GetString(data, 0, bytesRead).Trim();
try
{
doc.LoadXml(s);
}
catch (Exception exception)
{
MessageBox.Show(exception.Message);
}
}
foreach (XmlNode node in doc.DocumentElement.SelectNodes("Dictionary"))
@ -87,14 +96,26 @@ namespace Nikse.SubtitleEdit.Forms
buttonOK.Enabled = false;
buttonDownload.Enabled = false;
comboBoxDictionaries.Enabled = false;
this.Refresh();
Refresh();
Cursor = Cursors.WaitCursor;
int index = comboBoxDictionaries.SelectedIndex;
string url = _dictionaryDownloadLinks[index];
var wc = new WebClient { Proxy = Utilities.GetProxy() };
wc.DownloadDataCompleted += wc_DownloadDataCompleted;
if (url.EndsWith(".traineddata", StringComparison.OrdinalIgnoreCase))
{
_dictionaryFileName = Path.GetFileName(url);
wc.DownloadDataCompleted += wc_DownloadTrainedDataCompleted;
}
else
{
wc.DownloadDataCompleted += wc_DownloadDataCompleted;
}
wc.DownloadProgressChanged += (o, args) =>
{
labelPleaseWait.Text = Configuration.Settings.Language.General.PleaseWait + " " + args.ProgressPercentage + "%";
};
wc.DownloadDataAsync(new Uri(url));
Cursor = Cursors.Default;
}
@ -155,6 +176,40 @@ namespace Nikse.SubtitleEdit.Forms
MessageBox.Show(string.Format(Configuration.Settings.Language.GetDictionaries.XDownloaded, comboBoxDictionaries.Items[index]));
}
/// <summary>
/// Completion handler for a direct ".traineddata" download: saves the downloaded bytes
/// into the Tesseract data folder under <c>_dictionaryFileName</c>, re-enables the
/// dialog controls and tells the user which dictionary was downloaded.
/// </summary>
/// <param name="sender">The object that raised the event (not used).</param>
/// <param name="e">Download outcome; the payload is read from <c>e.Result</c>.</param>
private void wc_DownloadTrainedDataCompleted(object sender, DownloadDataCompletedEventArgs e)
{
    // e.Result throws when the operation failed or was cancelled, so both
    // outcomes are reported as a failed download before the payload is touched.
    if (e.Error != null || e.Cancelled)
    {
        MessageBox.Show(Configuration.Settings.Language.GetTesseractDictionaries.DownloadFailed);
        DialogResult = DialogResult.Cancel;
        return;
    }

    int index = comboBoxDictionaries.SelectedIndex;
    try
    {
        string dictionaryFolder = Configuration.TesseractDataFolder;

        // CreateDirectory does nothing when the folder already exists,
        // so no separate Directory.Exists check is required.
        Directory.CreateDirectory(dictionaryFolder);

        // Creates the file, or overwrites an existing one, in a single call.
        File.WriteAllBytes(Path.Combine(dictionaryFolder, _dictionaryFileName), e.Result);
    }
    finally
    {
        // Restore the dialog even if writing the file throws; otherwise the
        // controls would stay disabled while the exception propagates.
        Cursor = Cursors.Default;
        labelPleaseWait.Text = string.Empty;
        buttonOK.Enabled = true;
        buttonDownload.Enabled = true;
        comboBoxDictionaries.Enabled = true;
    }

    MessageBox.Show(string.Format(Configuration.Settings.Language.GetDictionaries.XDownloaded, comboBoxDictionaries.Items[index]));
}
private void linkLabelOpenDictionaryFolder_LinkClicked(object sender, LinkLabelLinkClickedEventArgs e)
{
string dictionaryFolder = Configuration.TesseractDataFolder;

View File

@ -1,166 +1,167 @@
<!-- Alternatively available from https://sourceforge.net/projects/tesseract-ocr-alt/files/ -->
<TesseractDictionaries>
<Dictionary>
<!--<Dictionary>
<EnglishName>Arabic</EnglishName>
<DownloadLink>http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.ara.tar.gz</DownloadLink>
<DownloadLink>https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/ara.traineddata</DownloadLink>
<Description>Arabic language data for Tesseract 3.02</Description>
</Dictionary>
</Dictionary>-->
<Dictionary>
<EnglishName>Basque</EnglishName>
<DownloadLink>http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.eus.tar.gz</DownloadLink>
<DownloadLink>https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/eus.traineddata</DownloadLink>
<Description>Basque language data for Tesseract 3.02</Description>
</Dictionary>
<Dictionary>
<EnglishName>Bulgarian</EnglishName>
<DownloadLink>http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.bul.tar.gz</DownloadLink>
<DownloadLink>https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/bul.traineddata</DownloadLink>
<Description>Bulgarian language data for Tesseract 3.02</Description>
</Dictionary>
<Dictionary>
<EnglishName>Catalan</EnglishName>
<DownloadLink>http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.cat.tar.gz</DownloadLink>
<DownloadLink>https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/cat.traineddata</DownloadLink>
<Description>Catalan language data for Tesseract 3.02</Description>
</Dictionary>
<Dictionary>
<EnglishName>Chinese (Simplified)</EnglishName>
<DownloadLink>http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.chi_sim.tar.gz</DownloadLink>
<DownloadLink>https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/chi_sim.traineddata</DownloadLink>
<Description>Chinese (Simplified) language data for Tesseract 3.02</Description>
</Dictionary>
<Dictionary>
<EnglishName>Chinese (Traditional)</EnglishName>
<DownloadLink>http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.chi_tra.tar.gz</DownloadLink>
<DownloadLink>https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/chi_tra.traineddata</DownloadLink>
<Description>Chinese (Traditional) language data for Tesseract 3.02</Description>
</Dictionary>
<Dictionary>
<EnglishName>Czech</EnglishName>
<DownloadLink>http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.ces.tar.gz</DownloadLink>
<DownloadLink>https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/ces.traineddata</DownloadLink>
<Description>Czech language data for Tesseract 3.02</Description>
</Dictionary>
<Dictionary>
<EnglishName>Danish</EnglishName>
<DownloadLink>http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.dan.tar.gz</DownloadLink>
<DownloadLink>https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/dan.traineddata</DownloadLink>
<Description>Danish language data for Tesseract 3.02</Description>
</Dictionary> <Dictionary>
<EnglishName>Dutch</EnglishName>
<DownloadLink>http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.nld.tar.gz</DownloadLink>
<DownloadLink>https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/nld.traineddata</DownloadLink>
<Description>Dutch language data for Tesseract 3.02</Description>
</Dictionary>
<Dictionary>
<EnglishName>English</EnglishName>
<DownloadLink>https://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.eng.tar.gz</DownloadLink>
<DownloadLink>https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/eng.traineddata</DownloadLink>
<Description>English language data for Tesseract 3.02</Description>
</Dictionary>
<Dictionary>
<EnglishName>Finnish</EnglishName>
<DownloadLink>http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.fin.tar.gz</DownloadLink>
<DownloadLink>https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/fin.traineddata</DownloadLink>
<Description>Finnish language data for Tesseract 3.02</Description>
</Dictionary>
<Dictionary>
<EnglishName>French</EnglishName>
<DownloadLink>http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.fra.tar.gz</DownloadLink>
<DownloadLink>https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/fra.traineddata</DownloadLink>
<Description>French language data for Tesseract 3.02</Description>
</Dictionary>
<Dictionary>
<EnglishName>German</EnglishName>
<DownloadLink>https://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.deu.tar.gz</DownloadLink>
<DownloadLink>https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/deu.traineddata</DownloadLink>
<Description>German language data for Tesseract 3.02</Description>
</Dictionary>
<Dictionary>
<EnglishName>Greek</EnglishName>
<DownloadLink>http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.ell.tar.gz</DownloadLink>
<DownloadLink>https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/ell.traineddata</DownloadLink>
<Description>Greek language data for Tesseract 3.02</Description>
</Dictionary>
<Dictionary>
<EnglishName>Hindi</EnglishName>
<DownloadLink>http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.hin.tar.gz</DownloadLink>
<DownloadLink>https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/hin.traineddata</DownloadLink>
<Description>Hindi language data for Tesseract 3.02</Description>
</Dictionary>
<Dictionary>
<EnglishName>Hungarian</EnglishName>
<DownloadLink>http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.hun.tar.gz</DownloadLink>
<DownloadLink>https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/hun.traineddata</DownloadLink>
<Description>Hungarian language data for Tesseract 3.02</Description>
</Dictionary>
<Dictionary>
<EnglishName>Icelandic</EnglishName>
<DownloadLink>http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.isl.tar.gz</DownloadLink>
<DownloadLink>https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/isl.traineddata</DownloadLink>
<Description>Icelandic language data for Tesseract 3.02</Description>
</Dictionary>
<Dictionary>
<EnglishName>Italian</EnglishName>
<DownloadLink>http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.ita.tar.gz</DownloadLink>
<DownloadLink>https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/ita.traineddata</DownloadLink>
<Description>Italian language data for Tesseract 3.02</Description>
</Dictionary>
<Dictionary>
<EnglishName>Japanese</EnglishName>
<DownloadLink>http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.jpn.tar.gz</DownloadLink>
<DownloadLink>https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/jpn.traineddata</DownloadLink>
<Description>Japanese language data for Tesseract 3.02</Description>
</Dictionary>
<Dictionary>
<EnglishName>Korean</EnglishName>
<DownloadLink>http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.kor.tar.gz</DownloadLink>
<DownloadLink>https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/kor.traineddata</DownloadLink>
<Description>Korean language data for Tesseract 3.02</Description>
</Dictionary>
<Dictionary>
<EnglishName>Lithuanian</EnglishName>
<DownloadLink>http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.lit.tar.gz</DownloadLink>
<DownloadLink>https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/lit.traineddata</DownloadLink>
<Description>Lithuanian language data for Tesseract 3.02</Description>
</Dictionary>
<Dictionary>
<EnglishName>Norwegian</EnglishName>
<DownloadLink>http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.nor.tar.gz</DownloadLink>
<DownloadLink>https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/nor.traineddata</DownloadLink>
<Description>Norwegian language data for Tesseract 3.02</Description>
</Dictionary>
<Dictionary>
<EnglishName>Polish</EnglishName>
<DownloadLink>http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.pol.tar.gz</DownloadLink>
<DownloadLink>https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/pol.traineddata</DownloadLink>
<Description>Polish language data for Tesseract 3.02</Description>
</Dictionary>
<Dictionary>
<EnglishName>Portuguese</EnglishName>
<DownloadLink>http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.por.tar.gz</DownloadLink>
<DownloadLink>https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/por.traineddata</DownloadLink>
<Description>Portuguese language data for Tesseract 3.02 </Description>
</Dictionary>
<Dictionary>
<EnglishName>Romanian</EnglishName>
<DownloadLink>http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.ron.tar.gz</DownloadLink>
<DownloadLink>https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/ron.traineddata</DownloadLink>
<Description>Romanian language data for Tesseract 3.02</Description>
</Dictionary>
<Dictionary>
<EnglishName>Russian</EnglishName>
<DownloadLink>http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.rus.tar.gz</DownloadLink>
<DownloadLink>https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/rus.traineddata</DownloadLink>
<Description>Russian Language Data for Tesseract 3.02</Description>
</Dictionary>
<Dictionary>
<EnglishName>Serbian (Latin)</EnglishName>
<DownloadLink>http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.srp.tar.gz</DownloadLink>
<DownloadLink>https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/srp.traineddata</DownloadLink>
<Description>Serbian (Latin) language data for Tesseract 3.02</Description>
</Dictionary>
<Dictionary>
<EnglishName>Spanish</EnglishName>
<DownloadLink>http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.spa.tar.gz</DownloadLink>
<DownloadLink>https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/spa.traineddata</DownloadLink>
<Description>Spanish language data for Tesseract 3.02</Description>
</Dictionary>
<Dictionary>
<EnglishName>Swedish</EnglishName>
<DownloadLink>https://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.swe.tar.gz</DownloadLink>
<DownloadLink>https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/swe.traineddata</DownloadLink>
<Description>Swedish language data for Tesseract 3.02</Description>
</Dictionary>
<Dictionary>
<EnglishName>Thai</EnglishName>
<DownloadLink>http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.tha.tar.gz</DownloadLink>
<DownloadLink>https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/tha.traineddata</DownloadLink>
<Description>Thai language data for Tesseract 3.02</Description>
</Dictionary>
<Dictionary>
<EnglishName>Turkish</EnglishName>
<DownloadLink>https://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.tur.tar.gz</DownloadLink>
<DownloadLink>https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/tur.traineddata</DownloadLink>
<Description>Turkish language data for Tesseract 3.02</Description>
</Dictionary>
<Dictionary>
<EnglishName>Ukrainian</EnglishName>
<DownloadLink>https://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.ukr.tar.gz</DownloadLink>
<DownloadLink>https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/ukr.traineddata</DownloadLink>
<Description>Ukrainian language data for Tesseract 3.02</Description>
</Dictionary>
<Dictionary>
<EnglishName>Vietnamese</EnglishName>
<DownloadLink>http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.vie.tar.gz</DownloadLink>
<DownloadLink>https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/vie.traineddata</DownloadLink>
<Description>Vietnamese Language Data for Tesseract 3.02</Description>
</Dictionary>
</TesseractDictionaries>
</TesseractDictionaries>