diff --git a/src/Forms/VobSubOcr.Designer.cs b/src/Forms/VobSubOcr.Designer.cs index aab151134..5ace14ebf 100644 --- a/src/Forms/VobSubOcr.Designer.cs +++ b/src/Forms/VobSubOcr.Designer.cs @@ -255,8 +255,6 @@ namespace Nikse.SubtitleEdit.Forms // checkBoxUseModiInTesseractForUnknownWords // this.checkBoxUseModiInTesseractForUnknownWords.AutoSize = true; - this.checkBoxUseModiInTesseractForUnknownWords.Checked = true; - this.checkBoxUseModiInTesseractForUnknownWords.CheckState = System.Windows.Forms.CheckState.Checked; this.checkBoxUseModiInTesseractForUnknownWords.Location = new System.Drawing.Point(22, 74); this.checkBoxUseModiInTesseractForUnknownWords.Name = "checkBoxUseModiInTesseractForUnknownWords"; this.checkBoxUseModiInTesseractForUnknownWords.Size = new System.Drawing.Size(165, 17); diff --git a/src/Forms/VobSubOcr.cs b/src/Forms/VobSubOcr.cs index d669539f7..de513e643 100644 --- a/src/Forms/VobSubOcr.cs +++ b/src/Forms/VobSubOcr.cs @@ -996,12 +996,20 @@ namespace Nikse.SubtitleEdit.Forms i++; } } + comboBoxModiLanguage.SelectedIndex = -1; } var sb = new StringBuilder(); int badWords = 0; var textWithOutFixes = new StringBuilder(); textWithOutFixes.Append(Tesseract3DoOcrViaExe(bitmap, _languageId)); + + if (textWithOutFixes.ToString().Trim().Length == 0) + { + textWithOutFixes = new StringBuilder(); + textWithOutFixes.Append(TesseractResizeAndRetry(bitmap)); + } + sb.Append(textWithOutFixes.ToString()); int numberOfWords = sb.ToString().Split((" " + Environment.NewLine).ToCharArray(), StringSplitOptions.RemoveEmptyEntries).Length; @@ -1012,6 +1020,22 @@ namespace Nikse.SubtitleEdit.Forms line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, checkBoxGuessUnknownWords.Checked); int wordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(line); + if (wordsNotFound > 0) + { + string newText = TesseractResizeAndRetry(bitmap); + newText = _ocrFixEngine.FixOcrErrors(newText, index, _lastLine, true, checkBoxGuessUnknownWords.Checked); + int newWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(newText); + if (newWordsNotFound < wordsNotFound) + { + wordsNotFound = newWordsNotFound; + textWithOutFixes = new StringBuilder(); + textWithOutFixes.Append(newText); + sb = new StringBuilder(); + sb.Append(newText); + } + } + + if (wordsNotFound > 0 || sb.ToString().Replace("~", string.Empty).Trim().Length == 0) { _ocrFixEngine.AutoGuessesUsed.Clear(); @@ -1024,7 +1048,7 @@ namespace Nikse.SubtitleEdit.Forms if (modiText.Length == 0) modiText = CallModi(index); // retry... strange MODI if (modiText.Length == 0) - modiText = CallModi(index); // retry... strange MODI + modiText = CallModi(index); // retry... strange MODI if (modiText.Length > 1) { @@ -1106,6 +1130,18 @@ namespace Nikse.SubtitleEdit.Forms return line; } + private string TesseractResizeAndRetry(Bitmap bitmap) + { + string result = Tesseract3DoOcrViaExe(ResizeBitmap(bitmap, bitmap.Width * 2, bitmap.Height * 2), _languageId); + if (result.Trim().Length == 0) + { + result = Tesseract3DoOcrViaExe(ResizeBitmap(bitmap, bitmap.Width * 3, bitmap.Height * 2), _languageId); + if (result.ToString().Trim().Length == 0) + result = Tesseract3DoOcrViaExe(ResizeBitmap(bitmap, bitmap.Width * 4, bitmap.Height * 2), _languageId); + } + return result.TrimEnd(); + } + private void LogOcrFix(int index, string oldLine, string newLine) { listBoxLog.Items.Add(string.Format("#{0}: {1} -> {2}", index+1, oldLine.Replace(Environment.NewLine, " "), newLine.Replace(Environment.NewLine, " "))); @@ -1452,7 +1488,7 @@ namespace Nikse.SubtitleEdit.Forms saveFileDialog1.Title = Configuration.Settings.Language.VobSubOcr.SaveSubtitleImageAs; saveFileDialog1.AddExtension = true; saveFileDialog1.FileName = "Image" + _selectedIndex; - saveFileDialog1.Filter = "PNG image|*.png|BMP image|*.bmp|GIF image|*.gif"; + saveFileDialog1.Filter = "PNG image|*.png|BMP image|*.bmp|GIF image|*.gif|TIFF image|*.tiff"; saveFileDialog1.FilterIndex = 0; DialogResult result = saveFileDialog1.ShowDialog(this); @@ -1471,8 +1507,10 @@ namespace Nikse.SubtitleEdit.Forms bmp.Save(saveFileDialog1.FileName, System.Drawing.Imaging.ImageFormat.Png); else if (saveFileDialog1.FilterIndex == 1) bmp.Save(saveFileDialog1.FileName); - else + else if (saveFileDialog1.FilterIndex == 2) bmp.Save(saveFileDialog1.FileName, System.Drawing.Imaging.ImageFormat.Gif); + else + bmp.Save(saveFileDialog1.FileName, System.Drawing.Imaging.ImageFormat.Tiff); } catch (Exception exception) { diff --git a/src/Logic/OCR/OcrFixEngine.cs b/src/Logic/OCR/OcrFixEngine.cs index 3c18775ee..e98b00499 100644 --- a/src/Logic/OCR/OcrFixEngine.cs +++ b/src/Logic/OCR/OcrFixEngine.cs @@ -575,24 +575,36 @@ namespace Nikse.SubtitleEdit.Logic.OCR if (autoFix && useAutoGuess) { + List guesses = new List(); + if (word.Length > 5) { - foreach (string guess in CreateGuessesFromLetters(word)) + guesses = (List)CreateGuessesFromLetters(word); + } + else + { + guesses.Add(word.Replace("fi", "fi")); + guesses.Add(word.Replace("fi", "fj")); + guesses.Add(word.Replace("fl", "fl")); + if (!word.EndsWith("€") && !word.StartsWith("€")) + guesses.Add(word.Replace("€", "e")); + guesses.Add(word.Replace("fi", "fj")); + } + foreach (string guess in guesses) + { + if (IsWordOrWordsCorrect(_hunspell, guess)) { - if (IsWordOrWordsCorrect(_hunspell, guess)) + var regex = new Regex(@"\b" + word + @"\b"); + Match match = regex.Match(line); + if (match.Success) { - var regex = new Regex(@"\b" + word + @"\b"); - Match match = regex.Match(line); - if (match.Success) - { - if (log) - AutoGuessesUsed.Add(string.Format("#{0}: {1} -> {2} in line via '{3}': {4}", index + 1, word, guess, "OCRFixReplaceList.xml", line.Replace(Environment.NewLine, " "))); + if (log) + AutoGuessesUsed.Add(string.Format("#{0}: {1} -> {2} in line via '{3}': {4}", index + 1, word, guess, "OCRFixReplaceList.xml", line.Replace(Environment.NewLine, " "))); - line = line.Remove(match.Index, match.Value.Length).Insert(match.Index, guess); - wordsNotFound--; - correct = true; - break; - } + line = line.Remove(match.Index, match.Value.Length).Insert(match.Index, guess); + wordsNotFound--; + correct = true; + break; } } } @@ -636,8 +648,8 @@ namespace Nikse.SubtitleEdit.Logic.OCR case OcrSpellCheck.Action.AddToUserDictionary: if (_userWordListXmlFileName != null) { - _userWordList.Add(_spellCheck.Word); - Utilities.AddToUserDictionary(_spellCheck.Word, _fiveLetterWordListLanguageName); + _userWordList.Add(_spellCheck.Word.Trim().ToLower()); + Utilities.AddToUserDictionary(_spellCheck.Word.Trim().ToLower(), _fiveLetterWordListLanguageName); } result.Word = _spellCheck.Word; result.Fixed = true; @@ -800,7 +812,7 @@ namespace Nikse.SubtitleEdit.Logic.OCR private static string AddToGuessList(List list, string word, int index, string letter, string replaceLetters) { - if (string.IsNullOrEmpty(word) || index < 0 || index >= word.Length) + if (string.IsNullOrEmpty(word) || index < 0 || index + letter.Length - 1 >= word.Length) return word; string s = word.Remove(index, letter.Length);