From 5af5982b6bc0bcf7f1dc79d093934140144567d2 Mon Sep 17 00:00:00 2001 From: niksedk Date: Mon, 26 Oct 2015 20:01:03 +0100 Subject: [PATCH] OCR fix regarding Tesseract - thx jpsdr :) --- src/Forms/VobSubOcr.cs | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/Forms/VobSubOcr.cs b/src/Forms/VobSubOcr.cs index 5b7815438..a3a39a785 100644 --- a/src/Forms/VobSubOcr.cs +++ b/src/Forms/VobSubOcr.cs @@ -5830,8 +5830,20 @@ namespace Nikse.SubtitleEdit.Forms !psm.Contains('Y') && textWithOutFixes.Contains('Y') || !psm.Contains('\'') && textWithOutFixes.Contains('\'') || !psm.Contains('€') && textWithOutFixes.Contains('€')) - + { textWithOutFixes = psm; + } + else if (_ocrFixEngine != null && !psm.Contains('$') && !psm.Contains('•') && !psm.Contains('€')) + { + int correctWordsNoFixes; + int wordsNotFoundNoFixes = _ocrFixEngine.CountUnknownWordsViaDictionary(textWithOutFixes, out correctWordsNoFixes); + int correctWordsPsm7; + int wordsNotFoundPsm7 = _ocrFixEngine.CountUnknownWordsViaDictionary(psm, out correctWordsPsm7); + if (wordsNotFoundPsm7 <= wordsNotFoundNoFixes && correctWordsPsm7 > correctWordsNoFixes) + { + textWithOutFixes = psm; + } + } } else if (psm.Length == textWithOutFixes.Length && (!psm.Contains('0') && textWithOutFixes.Contains('0') || // these chars are often mistaken