Work on OCR

This commit is contained in:
Nikolaj Olsson 2017-12-22 21:03:10 +01:00
parent 7ee5bca54a
commit f42261af54
5 changed files with 71 additions and 10 deletions

View File

@ -1209,6 +1209,7 @@ This file is generated/updated by Multi Translator
<name>Coleman</name>
<name>Colette</name>
<name>Colin</name>
<name>Colin Firth</name>
<name>Colleen</name>
<name>Collier</name>
<name>Colliers</name>
@ -2067,6 +2068,7 @@ This file is generated/updated by Multi Translator
<name>Galactica</name>
<name>Galacticas</name>
<name>Galadriel</name>
<name>Galahad</name>
<name>Gale</name>
<name>Galilei</name>
<name>Galileo</name>
@ -3589,6 +3591,7 @@ This file is generated/updated by Multi Translator
<name>Marissa</name>
<name>Marissas</name>
<name>Marjorie</name>
<name>Mark Hamill</name>
<name>Mark Zuckerberg</name>
<name>Marko</name>
<name>Markus</name>
@ -3832,6 +3835,7 @@ This file is generated/updated by Multi Translator
<name>Mias</name>
<name>Micah</name>
<name>Michael</name>
<name>Michael Caine</name>
<name>Michaela</name>
<name>Michaels</name>
<name>Michas</name>

View File

@ -8,13 +8,15 @@
public int Top { get; set; }
public NikseBitmap NikseBitmap { get; set; }
public string SpecialCharacter { get; set; }
public bool CouldBeSpace { get; set; }
public ImageSplitterItem(int x, int y, NikseBitmap bitmap)
/// <summary>
/// Creates an item that wraps a bitmap part together with its position.
/// No special character is associated with an item built this way.
/// </summary>
/// <param name="x">Value stored in the X property.</param>
/// <param name="y">Value stored in the Y property.</param>
/// <param name="bitmap">Image stored in the NikseBitmap property.</param>
/// <param name="couldBeSpace">
/// Value stored in the CouldBeSpace property; defaults to false so existing
/// three-argument callers keep working.
/// NOTE(review): appears to flag a part whose gap fell just short of the
/// splitter's space threshold - confirm against the image splitter.
/// </param>
public ImageSplitterItem(int x, int y, NikseBitmap bitmap, bool couldBeSpace = false)
{
X = x;
Y = y;
NikseBitmap = bitmap;
// Bitmap-based items carry no special character text.
SpecialCharacter = null;
CouldBeSpace = couldBeSpace;
}
public ImageSplitterItem(string specialCharacter)

View File

@ -4464,7 +4464,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
string name = SaveCompareItemNew(item, text, _vobSubOcrCharacter.IsItalic, null);
var addition = new ImageCompareAddition(name, text, item.NikseBitmap, _vobSubOcrCharacter.IsItalic, listViewIndex);
_lastAdditions.Add(addition);
matches.Add(new CompareMatch(text, _vobSubOcrCharacter.IsItalic, 0, null));
matches.Add(new CompareMatch(text, _vobSubOcrCharacter.IsItalic, 0, null, item));
SetBinOcrLowercaseUppercase(item.NikseBitmap.Height, text);
}
else if (result == DialogResult.Abort)
@ -4473,13 +4473,13 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
}
else
{
matches.Add(new CompareMatch("*", false, 0, null));
matches.Add(new CompareMatch("*", false, 0, null, item));
}
_italicCheckedLast = _vobSubOcrCharacter.IsItalic;
}
else // found image match
{
matches.Add(new CompareMatch(match.Text, match.Italic, 0, null));
matches.Add(new CompareMatch(match.Text, match.Italic, 0, null, item));
if (match.ExpandCount > 0)
index += match.ExpandCount - 1;
}
@ -4516,6 +4516,41 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
int correctWords;
int wordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(line, out correctWords);
// smaller space pixels for italic
if (correctWords > 0 && wordsNotFound > 0 && line.Contains("<i>", StringComparison.Ordinal) && matches.Any(p=>p?.ImageSplitterItem?.CouldBeSpace == true))
{
int j = 0;
while (j < matches.Count)
{
if (matches[j]?.ImageSplitterItem?.CouldBeSpace == true)
{
matches[j].ImageSplitterItem.CouldBeSpace = false;
matches.Insert(j, new CompareMatch(" ", false, 0, string.Empty, new ImageSplitterItem(" ")));
}
j++;
}
var tempLine = GetStringWithItalicTags(matches);
var oldAutoGuessesUsed = new List<string>(_ocrFixEngine.AutoGuessesUsed);
var oldUnknownWordsFound = new List<string>(_ocrFixEngine.UnknownWordsFound);
_ocrFixEngine.AutoGuessesUsed.Clear();
_ocrFixEngine.UnknownWordsFound.Clear();
if (checkBoxAutoFixCommonErrors.Checked)
tempLine = _ocrFixEngine.FixOcrErrors(tempLine, listViewIndex, _lastLine, true, autoGuessLevel);
int tempCorrectWords;
int tempWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(tempLine, out tempCorrectWords);
if (tempWordsNotFound == 0 && tempCorrectWords > 0)
{
wordsNotFound = tempWordsNotFound;
correctWords = tempCorrectWords;
line = tempLine;
}
else
{
_ocrFixEngine.AutoGuessesUsed = oldAutoGuessesUsed;
_ocrFixEngine.UnknownWordsFound = oldUnknownWordsFound;
}
}
if (wordsNotFound > 0 || correctWords == 0 || textWithOutFixes != null && string.IsNullOrWhiteSpace(textWithOutFixes.Replace("~", string.Empty)))
{
_ocrFixEngine.AutoGuessesUsed.Clear();
@ -5999,12 +6034,12 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
string text = string.Empty;
// var sw = Stopwatch.StartNew();
if (_ocrMethodIndex == _ocrMethodTesseract)
if (_ocrMethodIndex == _ocrMethodBinaryImageCompare)
text = SplitAndOcrBinaryImageCompare(bmp, i);
else if (_ocrMethodIndex == _ocrMethodTesseract)
text = OcrViaTesseract(bmp, i);
else if (_ocrMethodIndex == _ocrMethodImageCompare)
text = SplitAndOcrBitmapNormal(bmp, i);
else if (_ocrMethodIndex == _ocrMethodBinaryImageCompare)
text = SplitAndOcrBinaryImageCompare(bmp, i);
else if (_ocrMethodIndex == _ocrMethodNocr)
text = OcrViaNOCR(bmp, i);
else if (_ocrMethodIndex == _ocrMethodModi)
@ -7470,7 +7505,10 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
}
else
{
comboBoxDictionaries.SelectedIndex = 0;
if (comboBoxDictionaries.SelectedIndex < 0)
comboBoxDictionaries.SelectedIndex = 0;
else
comboBoxDictionaries_SelectedIndexChanged(null, null);
}
if (_modiEnabled && checkBoxUseModiInTesseractForUnknownWords.Checked)

View File

@ -774,11 +774,16 @@ namespace Nikse.SubtitleEdit.Logic
int addY;
b1 = CropTopAndBottom(b1, out addY);
var couldBeSpace = false;
if (spacePixels >= xOrMorePixelsMakesSpace && parts.Count > 0)
parts.Add(new ImageSplitterItem(" ") { Y = addY + lineSplitterItem.Y });
else if (xOrMorePixelsMakesSpace > 9 && spacePixels >= xOrMorePixelsMakesSpace - 2 && parts.Count > 0)
couldBeSpace = true;
else if (xOrMorePixelsMakesSpace > 3 && spacePixels >= xOrMorePixelsMakesSpace - 1 && parts.Count > 0)
couldBeSpace = true;
if (b1.Width > 0 && b1.Height > 0)
parts.Add(new ImageSplitterItem(startX + lineSplitterItem.X, addY + lineSplitterItem.Y, b1)); //y is what?
parts.Add(new ImageSplitterItem(startX + lineSplitterItem.X, addY + lineSplitterItem.Y, b1, couldBeSpace)); //y is what?
// remove pixels before next letter;
const int begin = 0;

View File

@ -931,6 +931,18 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
}
}
if (input.EndsWith(". \"</i>", StringComparison.Ordinal))
input = input.Remove(input.Length - 6, 1);
if (input.Contains(". \"</i>" + Environment.NewLine, StringComparison.Ordinal))
{
idx = input.IndexOf(". \"</i>" + Environment.NewLine);
if (idx > 0)
{
input = input.Remove(idx + 1, 1);
}
}
return input;
}