diff --git a/Ocr/Latin.nocr b/Ocr/Latin.nocr index 7688fb2e4..04f6b28b4 100644 Binary files a/Ocr/Latin.nocr and b/Ocr/Latin.nocr differ diff --git a/src/libse/Dictionaries/OcrFixReplaceList.cs b/src/libse/Dictionaries/OcrFixReplaceList.cs index 752736af6..711c3357f 100644 --- a/src/libse/Dictionaries/OcrFixReplaceList.cs +++ b/src/libse/Dictionaries/OcrFixReplaceList.cs @@ -444,7 +444,26 @@ namespace Nikse.SubtitleEdit.Core.Dictionaries previousGuesses = new List(list); } - return list; + // do not keep one letter consonants + var results = new List(); + foreach (var s in list) + { + var keep = true; + var words = s.Split(' '); + foreach (var w in words) + { + if (w.Length == 1 && char.IsLetter(w[0]) && !"aeiouæøåöüäAEIOUÆØÅÖÜÄ".Contains(w)) + { + keep = false; + } + } + if (keep) + { + results.Add(s); + } + } + + return results; } public string FixCommonWordErrors(string input) diff --git a/src/ui/Forms/BinaryEdit/BinEdit.cs b/src/ui/Forms/BinaryEdit/BinEdit.cs index 864fb381b..3a292b1b5 100644 --- a/src/ui/Forms/BinaryEdit/BinEdit.cs +++ b/src/ui/Forms/BinaryEdit/BinEdit.cs @@ -131,7 +131,12 @@ namespace Nikse.SubtitleEdit.Forms.BinaryEdit { _nOcrFileName = "Latin"; } + _nOcrFileName = Path.Combine(Configuration.OcrDirectory, _nOcrFileName + ".nocr"); + _ocrLowercaseHeightsTotal = 0; + _ocrLowercaseHeightsTotalCount = 0; + _ocrUppercaseHeightsTotal = 0; + _ocrUppercaseHeightsTotalCount = 0; } private void OpenBinSubtitle(string fileName) @@ -1848,11 +1853,6 @@ namespace Nikse.SubtitleEdit.Forms.BinaryEdit videoPlayerContainer1.CurrentPosition = p.StartTime.TotalSeconds; } - private void ocrTextsToolStripMenuItem_Click(object sender, EventArgs e) - { - - } - private static void OcrParagraph(Extra extra, IBinaryParagraphWithPosition s, NOcrDb nOcrDb, Paragraph p) { var bmp = extra.Bitmap != null ? (Bitmap)extra.Bitmap.Clone() : s.GetBitmap(); @@ -1872,7 +1872,7 @@ namespace Nikse.SubtitleEdit.Forms.BinaryEdit else { var match = nOcrDb.GetMatch(item.NikseBitmap, item.Top, true, 40); - sb.Append(match != null ? match.Text : "*"); + sb.Append(match != null ? FixUppercaseLowercaseIssues(item, match) : "*"); } } @@ -2271,6 +2271,96 @@ namespace Nikse.SubtitleEdit.Forms.BinaryEdit } } + private static readonly HashSet UppercaseLikeLowercase = new HashSet { "V", "W", "U", "S", "Z", "O", "X", "Ø", "C" }; + private static readonly HashSet LowercaseLikeUppercase = new HashSet { "v", "w", "u", "s", "z", "o", "x", "ø", "c" }; + private static readonly HashSet UppercaseWithAccent = new HashSet { "Č", "Š", "Ž", "Ś", "Ż", "Ś", "Ö", "Ü", "Ú", "Ï", "Í", "Ç", "Ì", "Ò", "Ù", "Ó", "Í" }; + private static readonly HashSet LowercaseWithAccent = new HashSet { "č", "š", "ž", "ś", "ż", "ś", "ö", "ü", "ú", "ï", "í", "ç", "ì", "ò", "ù", "ó", "í" }; + + private static long _ocrLowercaseHeightsTotal; + private static int _ocrLowercaseHeightsTotalCount; + private static long _ocrUppercaseHeightsTotal; + private static int _ocrUppercaseHeightsTotalCount; + + /// + /// Fix uppercase/lowercase issues (not I/l) + /// + private static string FixUppercaseLowercaseIssues(ImageSplitterItem targetItem, NOcrChar result) + { + if (result.Text == "e" || result.Text == "a" || result.Text == "d" || result.Text == "t") + { + _ocrLowercaseHeightsTotalCount++; + _ocrLowercaseHeightsTotal += targetItem.NikseBitmap.Height; + if (_ocrUppercaseHeightsTotalCount < 3) + { + _ocrUppercaseHeightsTotalCount++; + _ocrUppercaseHeightsTotal += targetItem.NikseBitmap.Height + 10; + } + } + + if (result.Text == "E" || result.Text == "H" || result.Text == "R" || result.Text == "D" || result.Text == "T" || result.Text == "M") + { + _ocrUppercaseHeightsTotalCount++; + _ocrUppercaseHeightsTotal += targetItem.NikseBitmap.Height; + if (_ocrLowercaseHeightsTotalCount < 3 && targetItem.NikseBitmap.Height > 20) + { + _ocrLowercaseHeightsTotalCount++; + _ocrLowercaseHeightsTotal += targetItem.NikseBitmap.Height - 10; + } + } + + if (_ocrLowercaseHeightsTotalCount <= 2 || _ocrUppercaseHeightsTotalCount <= 2) + { + return result.Text; + } + + // Latin letters where lowercase versions look like uppercase version + if (UppercaseLikeLowercase.Contains(result.Text)) + { + var averageLowercase = _ocrLowercaseHeightsTotal / _ocrLowercaseHeightsTotalCount; + var averageUppercase = _ocrUppercaseHeightsTotal / _ocrUppercaseHeightsTotalCount; + if (Math.Abs(averageLowercase - targetItem.NikseBitmap.Height) < Math.Abs(averageUppercase - targetItem.NikseBitmap.Height)) + { + return result.Text.ToLowerInvariant(); + } + + return result.Text; + } + + if (LowercaseLikeUppercase.Contains(result.Text)) + { + var averageLowercase = _ocrLowercaseHeightsTotal / _ocrLowercaseHeightsTotalCount; + var averageUppercase = _ocrUppercaseHeightsTotal / _ocrUppercaseHeightsTotalCount; + if (Math.Abs(averageLowercase - targetItem.NikseBitmap.Height) > Math.Abs(averageUppercase - targetItem.NikseBitmap.Height)) + { + return result.Text.ToUpperInvariant(); + } + + return result.Text; + } + + if (UppercaseWithAccent.Contains(result.Text)) + { + var averageUppercase = _ocrUppercaseHeightsTotal / (double)_ocrUppercaseHeightsTotalCount; + if (targetItem.NikseBitmap.Height < averageUppercase + 3) + { + return result.Text.ToLowerInvariant(); + } + + return result.Text; + } + + if (LowercaseWithAccent.Contains(result.Text)) + { + var averageUppercase = _ocrUppercaseHeightsTotal / (double)_ocrUppercaseHeightsTotalCount; + if (targetItem.NikseBitmap.Height > averageUppercase + 4) + { + return result.Text.ToUpperInvariant(); + } + } + + return result.Text; + } + private void quickOCRTextsforOverviewOnlyToolStripMenuItem_Click(object sender, EventArgs e) { if (subtitleListView1.Items.Count < 1) diff --git a/src/ui/Logic/Ocr/OcrFixEngine.cs b/src/ui/Logic/Ocr/OcrFixEngine.cs index 748f4946b..f421e3b8e 100644 --- a/src/ui/Logic/Ocr/OcrFixEngine.cs +++ b/src/ui/Logic/Ocr/OcrFixEngine.cs @@ -1432,7 +1432,12 @@ namespace Nikse.SubtitleEdit.Logic.Ocr if (!correct && word.Length > 3 && !word.EndsWith("ss", StringComparison.Ordinal) && !string.IsNullOrEmpty(_threeLetterIsoLanguageName) && (_threeLetterIsoLanguageName == "eng" || _threeLetterIsoLanguageName == "dan" || _threeLetterIsoLanguageName == "swe" || _threeLetterIsoLanguageName == "nld")) { - correct = DoSpell(word.TrimEnd('s')); + var w = word.TrimEnd('s'); + correct = DoSpell(w); + if (!correct && w.EndsWith('\'')) + { + correct = DoSpell(w.Remove(w.Length - 1, 1)); + } } } else