diff --git a/src/Forms/VobSubOcr.Designer.cs b/src/Forms/VobSubOcr.Designer.cs index 757f1414b..20ed4be9f 100644 --- a/src/Forms/VobSubOcr.Designer.cs +++ b/src/Forms/VobSubOcr.Designer.cs @@ -255,6 +255,8 @@ namespace Nikse.SubtitleEdit.Forms // checkBoxUseModiInTesseractForUnknownWords // this.checkBoxUseModiInTesseractForUnknownWords.AutoSize = true; + this.checkBoxUseModiInTesseractForUnknownWords.Checked = true; + this.checkBoxUseModiInTesseractForUnknownWords.CheckState = System.Windows.Forms.CheckState.Checked; this.checkBoxUseModiInTesseractForUnknownWords.Enabled = false; this.checkBoxUseModiInTesseractForUnknownWords.Location = new System.Drawing.Point(22, 74); this.checkBoxUseModiInTesseractForUnknownWords.Name = "checkBoxUseModiInTesseractForUnknownWords"; diff --git a/src/Forms/VobSubOcr.cs b/src/Forms/VobSubOcr.cs index b8f3b6fdd..9d9302947 100644 --- a/src/Forms/VobSubOcr.cs +++ b/src/Forms/VobSubOcr.cs @@ -999,43 +999,38 @@ namespace Nikse.SubtitleEdit.Forms comboBoxModiLanguage.SelectedIndex = -1; } - var sb = new StringBuilder(); int badWords = 0; - var textWithOutFixes = new StringBuilder(); - textWithOutFixes.Append(Tesseract3DoOcrViaExe(bitmap, _languageId)); + string textWithOutFixes = Tesseract3DoOcrViaExe(bitmap, _languageId); if (textWithOutFixes.ToString().Trim().Length == 0) { - textWithOutFixes = new StringBuilder(); - textWithOutFixes.Append(TesseractResizeAndRetry(bitmap)); + textWithOutFixes = TesseractResizeAndRetry(bitmap); } - sb.Append(textWithOutFixes.ToString()); - int numberOfWords = sb.ToString().Split((" " + Environment.NewLine).ToCharArray(), StringSplitOptions.RemoveEmptyEntries).Length; + int numberOfWords = textWithOutFixes.ToString().Split((" " + Environment.NewLine).ToCharArray(), StringSplitOptions.RemoveEmptyEntries).Length; - string line = sb.ToString().Trim(); + string line = textWithOutFixes.ToString().Trim(); if (_ocrFixEngine.IsDictionaryLoaded) { if (checkBoxAutoFixCommonErrors.Checked) line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, checkBoxGuessUnknownWords.Checked); - int wordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(line); + int correctWords; + int wordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(line, out correctWords); - if (wordsNotFound > 0) + if (wordsNotFound > 0 || correctWords == 0) { - string newText = TesseractResizeAndRetry(bitmap); - newText = _ocrFixEngine.FixOcrErrors(newText, index, _lastLine, true, checkBoxGuessUnknownWords.Checked); - int newWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(newText); + string newUnfixedText = TesseractResizeAndRetry(bitmap); + string newText = _ocrFixEngine.FixOcrErrors(newUnfixedText, index, _lastLine, true, checkBoxGuessUnknownWords.Checked); + int newWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(newText, out correctWords); if (newWordsNotFound < wordsNotFound) { wordsNotFound = newWordsNotFound; - textWithOutFixes = new StringBuilder(); - textWithOutFixes.Append(newText); - sb = new StringBuilder(); - sb.Append(newText); + textWithOutFixes = newUnfixedText; + line = newText; } } - if (wordsNotFound > 0 || sb.ToString().Replace("~", string.Empty).Trim().Length == 0) + if (wordsNotFound > 0 || correctWords == 0 || textWithOutFixes.ToString().Replace("~", string.Empty).Trim().Length == 0) { _ocrFixEngine.AutoGuessesUsed.Clear(); _ocrFixEngine.UnknownWordsFound.Clear(); @@ -1051,13 +1046,13 @@ namespace Nikse.SubtitleEdit.Forms if (modiText.Length > 1) { - int modiWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(modiText); + int modiWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(modiText, out correctWords); if (modiWordsNotFound > 0) { string modiTextOcrFixed = modiText; if (checkBoxAutoFixCommonErrors.Checked) modiTextOcrFixed = _ocrFixEngine.FixOcrErrors(modiText, index, _lastLine, false, checkBoxGuessUnknownWords.Checked); - int modiOcrCorrectedWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(modiTextOcrFixed); + int modiOcrCorrectedWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(modiTextOcrFixed, out correctWords); if (modiOcrCorrectedWordsNotFound < modiWordsNotFound) modiText = modiTextOcrFixed; } diff --git a/src/Logic/OCR/OcrFixEngine.cs b/src/Logic/OCR/OcrFixEngine.cs index e98b00499..4029fcb5d 100644 --- a/src/Logic/OCR/OcrFixEngine.cs +++ b/src/Logic/OCR/OcrFixEngine.cs @@ -583,12 +583,15 @@ namespace Nikse.SubtitleEdit.Logic.OCR } else { + if (i==0) + guesses.Add(word.Replace(@"\/", "V")); + else + guesses.Add(word.Replace(@"\/", "v")); guesses.Add(word.Replace("fi", "fi")); guesses.Add(word.Replace("fi", "fj")); guesses.Add(word.Replace("fl", "fl")); if (!word.EndsWith("€") && !word.StartsWith("€")) guesses.Add(word.Replace("€", "e")); - guesses.Add(word.Replace("fi", "fj")); } foreach (string guess in guesses) { @@ -884,8 +887,9 @@ namespace Nikse.SubtitleEdit.Logic.OCR return false; } - public int CountUnknownWordsViaDictionary(string line) + public int CountUnknownWordsViaDictionary(string line, out int numberOfCorrectWords) { + numberOfCorrectWords = 0; if (_hunspell == null) return 0; @@ -900,8 +904,11 @@ namespace Nikse.SubtitleEdit.Logic.OCR if (!correct) correct = _hunspell.Spell(word.Trim('\'')); - if (!correct) + if (correct) + numberOfCorrectWords++; + else wordsNotFound++; + } } return wordsNotFound;