Ocr work (minor)

This commit is contained in:
Nikolaj Olsson 2021-01-05 07:54:12 +01:00
parent e26df65a33
commit 7bb0f1f39c
4 changed files with 122 additions and 8 deletions

Binary file not shown.

View File

@ -444,7 +444,26 @@ namespace Nikse.SubtitleEdit.Core.Dictionaries
previousGuesses = new List<string>(list);
}
return list;
// do not keep one letter consonants
var results = new List<string>();
foreach (var s in list)
{
var keep = true;
var words = s.Split(' ');
foreach (var w in words)
{
if (w.Length == 1 && char.IsLetter(w[0]) && !"aeiouæøåöüäAEIOUÆØÅÖÜÄ".Contains(w))
{
keep = false;
}
}
if (keep)
{
results.Add(s);
}
}
return results;
}
public string FixCommonWordErrors(string input)

View File

@ -131,7 +131,12 @@ namespace Nikse.SubtitleEdit.Forms.BinaryEdit
{
_nOcrFileName = "Latin";
}
_nOcrFileName = Path.Combine(Configuration.OcrDirectory, _nOcrFileName + ".nocr");
_ocrLowercaseHeightsTotal = 0;
_ocrLowercaseHeightsTotalCount = 0;
_ocrUppercaseHeightsTotal = 0;
_ocrUppercaseHeightsTotalCount = 0;
}
private void OpenBinSubtitle(string fileName)
@ -1848,11 +1853,6 @@ namespace Nikse.SubtitleEdit.Forms.BinaryEdit
videoPlayerContainer1.CurrentPosition = p.StartTime.TotalSeconds;
}
/// <summary>
/// Click handler with an empty body: invoking it performs no action.
/// NOTE(review): presumably wired to an "OCR texts" menu item - the event wiring is not visible here; confirm before relying on it.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event arguments (unused).</param>
private void ocrTextsToolStripMenuItem_Click(object sender, EventArgs e)
{
}
private static void OcrParagraph(Extra extra, IBinaryParagraphWithPosition s, NOcrDb nOcrDb, Paragraph p)
{
var bmp = extra.Bitmap != null ? (Bitmap)extra.Bitmap.Clone() : s.GetBitmap();
@ -1872,7 +1872,7 @@ namespace Nikse.SubtitleEdit.Forms.BinaryEdit
else
{
var match = nOcrDb.GetMatch(item.NikseBitmap, item.Top, true, 40);
sb.Append(match != null ? match.Text : "*");
sb.Append(match != null ? FixUppercaseLowercaseIssues(item, match) : "*");
}
}
@ -2271,6 +2271,96 @@ namespace Nikse.SubtitleEdit.Forms.BinaryEdit
}
}
// Uppercase letters whose glyph has the same shape as its lowercase counterpart;
// for these, case is decided from the glyph height in FixUppercaseLowercaseIssues.
private static readonly HashSet<string> UppercaseLikeLowercase = new HashSet<string> { "V", "W", "U", "S", "Z", "O", "X", "Ø", "C" };

// Lowercase counterparts of the letters in UppercaseLikeLowercase.
private static readonly HashSet<string> LowercaseLikeUppercase = new HashSet<string> { "v", "w", "u", "s", "z", "o", "x", "ø", "c" };

// Uppercase letters carrying a diacritic, checked against the average uppercase height.
// NOTE(review): the original initializer listed "Ś" and "Í" twice (a no-op for a HashSet);
// the duplicates are removed here. One of them may have been meant as another letter
// (e.g. "Ź") - confirm with the author before adding anything.
private static readonly HashSet<string> UppercaseWithAccent = new HashSet<string> { "Č", "Š", "Ž", "Ś", "Ż", "Ö", "Ü", "Ú", "Ï", "Í", "Ç", "Ì", "Ò", "Ù", "Ó" };

// Lowercase counterparts of the letters in UppercaseWithAccent (same duplicate clean-up applied).
private static readonly HashSet<string> LowercaseWithAccent = new HashSet<string> { "č", "š", "ž", "ś", "ż", "ö", "ü", "ú", "ï", "í", "ç", "ì", "ò", "ù", "ó" };

// Running sum and sample count of glyph heights recorded for lowercase reference letters.
// Static, so the values are shared by every caller of FixUppercaseLowercaseIssues.
private static long _ocrLowercaseHeightsTotal;
private static int _ocrLowercaseHeightsTotalCount;

// Running sum and sample count of glyph heights recorded for uppercase reference letters.
private static long _ocrUppercaseHeightsTotal;
private static int _ocrUppercaseHeightsTotalCount;
/// <summary>
/// Fix uppercase/lowercase issues (not I/l) by comparing the height of the matched glyph
/// with the running average heights of known lowercase and known uppercase letters.
/// </summary>
/// <remarks>
/// Every call may update the static height statistics: "e", "a", "d", "t" feed the lowercase
/// totals and "E", "H", "R", "D", "T", "M" feed the uppercase totals. No correction is attempted
/// until more than two samples exist for both cases.
/// </remarks>
/// <param name="targetItem">The split image item whose bitmap height is measured.</param>
/// <param name="result">The nOCR match whose text may need its case corrected.</param>
/// <returns>The matched text, possibly converted to lower or upper case.</returns>
private static string FixUppercaseLowercaseIssues(ImageSplitterItem targetItem, NOcrChar result)
{
    var text = result.Text;

    // Reference lowercase letters: record the height as a lowercase sample.
    if (text == "e" || text == "a" || text == "d" || text == "t")
    {
        _ocrLowercaseHeightsTotalCount++;
        _ocrLowercaseHeightsTotal += targetItem.NikseBitmap.Height;

        // While uppercase samples are scarce, seed them with an estimate (lowercase height + 10).
        if (_ocrUppercaseHeightsTotalCount < 3)
        {
            _ocrUppercaseHeightsTotalCount++;
            _ocrUppercaseHeightsTotal += targetItem.NikseBitmap.Height + 10;
        }
    }

    // Reference uppercase letters: record the height as an uppercase sample.
    if (text == "E" || text == "H" || text == "R" || text == "D" || text == "T" || text == "M")
    {
        _ocrUppercaseHeightsTotalCount++;
        _ocrUppercaseHeightsTotal += targetItem.NikseBitmap.Height;

        // While lowercase samples are scarce, seed them with an estimate (uppercase height - 10),
        // but only for glyphs taller than 20 so the estimate stays clearly positive.
        if (_ocrLowercaseHeightsTotalCount < 3 && targetItem.NikseBitmap.Height > 20)
        {
            _ocrLowercaseHeightsTotalCount++;
            _ocrLowercaseHeightsTotal += targetItem.NikseBitmap.Height - 10;
        }
    }

    // Not enough statistics yet - leave the match untouched (this also guards the divisions below).
    if (_ocrLowercaseHeightsTotalCount <= 2 || _ocrUppercaseHeightsTotalCount <= 2)
    {
        return text;
    }

    // Latin letters where the lowercase version looks like the uppercase version:
    // pick the case whose average height is nearest to this glyph's height.
    var isUppercaseLookAlike = UppercaseLikeLowercase.Contains(text);
    var isLowercaseLookAlike = !isUppercaseLookAlike && LowercaseLikeUppercase.Contains(text);
    if (isUppercaseLookAlike || isLowercaseLookAlike)
    {
        // Floating-point averages (as in the accent branches below); integer division would
        // truncate both averages downwards and skew the comparison towards uppercase.
        var averageLowercase = _ocrLowercaseHeightsTotal / (double)_ocrLowercaseHeightsTotalCount;
        var averageUppercase = _ocrUppercaseHeightsTotal / (double)_ocrUppercaseHeightsTotalCount;
        var distanceToLowercase = Math.Abs(averageLowercase - targetItem.NikseBitmap.Height);
        var distanceToUppercase = Math.Abs(averageUppercase - targetItem.NikseBitmap.Height);

        if (isUppercaseLookAlike)
        {
            // On a tie the recognized case is kept.
            return distanceToLowercase < distanceToUppercase ? text.ToLowerInvariant() : text;
        }

        return distanceToLowercase > distanceToUppercase ? text.ToUpperInvariant() : text;
    }

    // Accented uppercase match that is not at least 3 above the average uppercase height:
    // treat it as the lowercase letter.
    if (UppercaseWithAccent.Contains(text))
    {
        var averageUppercase = _ocrUppercaseHeightsTotal / (double)_ocrUppercaseHeightsTotalCount;
        return targetItem.NikseBitmap.Height < averageUppercase + 3 ? text.ToLowerInvariant() : text;
    }

    // Accented lowercase match that is more than 4 above the average uppercase height:
    // treat it as the uppercase letter.
    if (LowercaseWithAccent.Contains(text))
    {
        var averageUppercase = _ocrUppercaseHeightsTotal / (double)_ocrUppercaseHeightsTotalCount;
        if (targetItem.NikseBitmap.Height > averageUppercase + 4)
        {
            return text.ToUpperInvariant();
        }
    }

    return text;
}
private void quickOCRTextsforOverviewOnlyToolStripMenuItem_Click(object sender, EventArgs e)
{
if (subtitleListView1.Items.Count < 1)

View File

@ -1432,7 +1432,12 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
if (!correct && word.Length > 3 && !word.EndsWith("ss", StringComparison.Ordinal) && !string.IsNullOrEmpty(_threeLetterIsoLanguageName) &&
(_threeLetterIsoLanguageName == "eng" || _threeLetterIsoLanguageName == "dan" || _threeLetterIsoLanguageName == "swe" || _threeLetterIsoLanguageName == "nld"))
{
correct = DoSpell(word.TrimEnd('s'));
var w = word.TrimEnd('s');
correct = DoSpell(w);
if (!correct && w.EndsWith('\''))
{
correct = DoSpell(w.Remove(w.Length - 1, 1));
}
}
}
else