diff --git a/src/Forms/GetTesseractDictionaries.Designer.cs b/src/Forms/GetTesseractDictionaries.Designer.cs index f70d09ed7..fc58eb661 100644 --- a/src/Forms/GetTesseractDictionaries.Designer.cs +++ b/src/Forms/GetTesseractDictionaries.Designer.cs @@ -60,7 +60,7 @@ this.labelChooseLanguageAndClickDownload.AutoSize = true; this.labelChooseLanguageAndClickDownload.Location = new System.Drawing.Point(19, 52); this.labelChooseLanguageAndClickDownload.Name = "labelChooseLanguageAndClickDownload"; - this.labelChooseLanguageAndClickDownload.Size = new System.Drawing.Size(202, 13); + this.labelChooseLanguageAndClickDownload.Size = new System.Drawing.Size(208, 13); this.labelChooseLanguageAndClickDownload.TabIndex = 23; this.labelChooseLanguageAndClickDownload.Text = "Choose your language and click download"; // @@ -116,6 +116,7 @@ this.Controls.Add(this.buttonDownload); this.Controls.Add(this.linkLabelOpenDictionaryFolder); this.Controls.Add(this.buttonOK); + this.FormBorderStyle = System.Windows.Forms.FormBorderStyle.FixedDialog; this.KeyPreview = true; this.MaximizeBox = false; this.MinimizeBox = false; diff --git a/src/Forms/GetTesseractDictionaries.cs b/src/Forms/GetTesseractDictionaries.cs index c3c63fd8b..23ee86e1d 100644 --- a/src/Forms/GetTesseractDictionaries.cs +++ b/src/Forms/GetTesseractDictionaries.cs @@ -15,6 +15,7 @@ namespace Nikse.SubtitleEdit.Forms private List _dictionaryDownloadLinks = new List(); private List _descriptions = new List(); private string _xmlName = null; + private string _dictionaryFileName = null; public GetTesseractDictionaries() { @@ -45,9 +46,17 @@ namespace Nikse.SubtitleEdit.Forms using (var rdr = new StreamReader(strm)) using (var zip = new GZipStream(rdr.BaseStream, CompressionMode.Decompress)) { - byte[] data = new byte[175000]; - zip.Read(data, 0, 175000); - doc.LoadXml(System.Text.Encoding.UTF8.GetString(data)); + byte[] data = new byte[195000]; + int bytesRead = zip.Read(data, 0, data.Length); + var s = System.Text.Encoding.UTF8.GetString(data, 0, bytesRead).Trim(); + try + { + doc.LoadXml(s); + } + catch (Exception exception) + { + MessageBox.Show(exception.Message); + } } foreach (XmlNode node in doc.DocumentElement.SelectNodes("Dictionary")) @@ -87,14 +96,26 @@ namespace Nikse.SubtitleEdit.Forms buttonOK.Enabled = false; buttonDownload.Enabled = false; comboBoxDictionaries.Enabled = false; - this.Refresh(); + Refresh(); Cursor = Cursors.WaitCursor; int index = comboBoxDictionaries.SelectedIndex; string url = _dictionaryDownloadLinks[index]; var wc = new WebClient { Proxy = Utilities.GetProxy() }; - wc.DownloadDataCompleted += wc_DownloadDataCompleted; + if (url.EndsWith(".traineddata", StringComparison.OrdinalIgnoreCase)) + { + _dictionaryFileName = Path.GetFileName(url); + wc.DownloadDataCompleted += wc_DownloadTrainedDataCompleted; + } + else + { + wc.DownloadDataCompleted += wc_DownloadDataCompleted; + } + wc.DownloadProgressChanged += (o, args) => + { + labelPleaseWait.Text = Configuration.Settings.Language.General.PleaseWait + " " + args.ProgressPercentage + "%"; + }; wc.DownloadDataAsync(new Uri(url)); Cursor = Cursors.Default; } @@ -155,6 +176,40 @@ namespace Nikse.SubtitleEdit.Forms MessageBox.Show(string.Format(Configuration.Settings.Language.GetDictionaries.XDownloaded, comboBoxDictionaries.Items[index])); } + private void wc_DownloadTrainedDataCompleted(object sender, DownloadDataCompletedEventArgs e) + { + if (e.Error != null) + { + MessageBox.Show(Configuration.Settings.Language.GetTesseractDictionaries.DownloadFailed); + DialogResult = DialogResult.Cancel; + return; + } + + string dictionaryFolder = Configuration.TesseractDataFolder; + if (!Directory.Exists(dictionaryFolder)) + Directory.CreateDirectory(dictionaryFolder); + + int index = comboBoxDictionaries.SelectedIndex; + + using (var ms = new MemoryStream(e.Result)) + using (var fs = new FileStream(Path.Combine(dictionaryFolder, _dictionaryFileName), FileMode.Create)) + { + ms.Position = 0; + byte[] buffer = new byte[1024]; + int nRead; + while ((nRead = ms.Read(buffer, 0, buffer.Length)) > 0) + { + fs.Write(buffer, 0, nRead); + } + } + Cursor = Cursors.Default; + labelPleaseWait.Text = string.Empty; + buttonOK.Enabled = true; + buttonDownload.Enabled = true; + comboBoxDictionaries.Enabled = true; + MessageBox.Show(string.Format(Configuration.Settings.Language.GetDictionaries.XDownloaded, comboBoxDictionaries.Items[index])); + } + private void linkLabelOpenDictionaryFolder_LinkClicked(object sender, LinkLabelLinkClickedEventArgs e) { string dictionaryFolder = Configuration.TesseractDataFolder; diff --git a/src/Resources/TesseractDictionaries.xml b/src/Resources/TesseractDictionaries.xml index 1fdf49428..0aeb7b85d 100644 --- a/src/Resources/TesseractDictionaries.xml +++ b/src/Resources/TesseractDictionaries.xml @@ -1,166 +1,167 @@ + - + Basque - http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.eus.tar.gz + https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/eus.traineddata Basque language data for Tesseract 3.02 Bulgarian - http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.bul.tar.gz + https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/bul.traineddata Bulgarian language data for Tesseract 3.02 Catalan - http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.cat.tar.gz + https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/cat.traineddata Catalan language data for Tesseract 3.02 Chinese (Simplified) - http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.chi_sim.tar.gz + https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/chi_sim.traineddata Chinese (Simplified) language data for Tesseract 3.02 Chinese (Traditional) - http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.chi_tra.tar.gz + https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/chi_tra.traineddata Chinese (Traditional) language data for Tesseract 3.02 Czech - http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.ces.tar.gz + https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/ces.traineddata Czech language data for Tesseract 3.02 Danish - http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.dan.tar.gz + https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/dan.traineddata Danish language data for Tesseract 3.02 Dutch - http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.nld.tar.gz + https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/nld.traineddata Dutch language data for Tesseract 3.02 English - https://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.eng.tar.gz + https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/eng.traineddata English language data for Tesseract 3.02 Finnish - http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.fin.tar.gz + https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/fin.traineddata Finnish language data for Tesseract 3.02 French - http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.fra.tar.gz + https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/fra.traineddata French language data for Tesseract 3.02 German - https://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.deu.tar.gz + https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/deu.traineddata German language data for Tesseract 3.02 Greek - http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.ell.tar.gz + https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/ell.traineddata Greek language data for Tesseract 3.02 Hindi - http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.hin.tar.gz + https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/hin.traineddata Hindi language data for Tesseract 3.02 Hungarian - http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.hun.tar.gz + https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/hun.traineddata Hungarian language data for Tesseract 3.02 Icelandic - http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.isl.tar.gz + https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/isl.traineddata Icelandic language data for Tesseract 3.02 Italian - http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.ita.tar.gz + https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/ita.traineddata Italian language data for Tesseract 3.02 Japanese - http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.jpn.tar.gz + https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/jpn.traineddata Japanese language data for Tesseract 3.02 Korean - http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.kor.tar.gz + https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/kor.traineddata Korean language data for Tesseract 3.02 Lithuanian - http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.lit.tar.gz + https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/lit.traineddata Lithuanian language data for Tesseract 3.02 Norwegian - http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.nor.tar.gz + https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/nor.traineddata Norwegian language data for Tesseract 3.02 Polish - http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.pol.tar.gz + https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/pol.traineddata Polish language data for Tesseract 3.02 Portuguese - http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.por.tar.gz + https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/por.traineddata Portuguese language data for Tesseract 3.02 Romanian - http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.ron.tar.gz + https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/ron.traineddata Romanian language data for Tesseract 3.02 Russian - http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.rus.tar.gz + https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/rus.traineddata Russian Language Data for Tesseract 3.02 Serbian (Latin) - http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.srp.tar.gz + https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/srp.traineddata Serbian (Latin) language data for Tesseract 3.02 Spanish - http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.spa.tar.gz + https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/spa.traineddata Spanish language data for Tesseract 3.02 Swedish - https://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.swe.tar.gz + https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/swe.traineddata Swedish language data for Tesseract 3.02 Thai - http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.tha.tar.gz + https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/tha.traineddata Thai language data for Tesseract 3.02 Turkish - https://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.tur.tar.gz + https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/tur.traineddata Turkish language data for Tesseract 3.02 Ukrainian - https://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.ukr.tar.gz + https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/ukr.traineddata Ukrainian language data for Tesseract 3.02 Vietnamese - http://tesseract-ocr.googlecode.com/files/tesseract-ocr-3.02.vie.tar.gz + https://github.com/tesseract-ocr/tessdata/raw/bf82613055ebc6e63d9e3b438a5c234bfd638c93/vie.traineddata Vietnamese Language Data for Tesseract 3.02 - + \ No newline at end of file diff --git a/src/Resources/TesseractDictionaries.xml.gz b/src/Resources/TesseractDictionaries.xml.gz index 5ea56d432..1d4487357 100644 Binary files a/src/Resources/TesseractDictionaries.xml.gz and b/src/Resources/TesseractDictionaries.xml.gz differ