OCR fix regarding Tesseract - thx jpsdr :)

2024-11-22 11:12:36 +01:00 · 2015-10-26 20:01:03 +01:00 · 2015-10-26 20:01:03 +01:00 · 5af5982b6b
commit 5af5982b6b
parent 49716f16cf
1 changed file with 13 additions and 1 deletion
--- a/src/Forms/VobSubOcr.cs
+++ b/src/Forms/VobSubOcr.cs
@@ -5830,9 +5830,21 @@ namespace Nikse.SubtitleEdit.Forms
                            !psm.Contains('Y') && textWithOutFixes.Contains('Y') ||
                            !psm.Contains('\'') && textWithOutFixes.Contains('\'') ||
                            !psm.Contains('€') && textWithOutFixes.Contains('€'))
-
+                        {
                            textWithOutFixes = psm;
                        }
+                        else if (_ocrFixEngine != null && !psm.Contains('$') && !psm.Contains('•') && !psm.Contains('€'))
+                        {
+                            int correctWordsNoFixes;
+                            int wordsNotFoundNoFixes = _ocrFixEngine.CountUnknownWordsViaDictionary(textWithOutFixes, out correctWordsNoFixes);
+                            int correctWordsPsm7;
+                            int wordsNotFoundPsm7 = _ocrFixEngine.CountUnknownWordsViaDictionary(psm, out correctWordsPsm7);
+                            if (wordsNotFoundPsm7 <= wordsNotFoundNoFixes && correctWordsPsm7 > correctWordsNoFixes)
+                            {
+                                textWithOutFixes = psm;
+                            }
+                        }
+                    }
                    else if (psm.Length == textWithOutFixes.Length &&
                             (!psm.Contains('0') && textWithOutFixes.Contains('0') ||  // these chars are often mistaken
                              !psm.Contains('9') && textWithOutFixes.Contains('9') ||