mirror of
https://github.com/SubtitleEdit/subtitleedit.git
synced 2024-11-22 03:02:35 +01:00
Work on OCR
This commit is contained in:
parent
7ee5bca54a
commit
f42261af54
@ -1209,6 +1209,7 @@ This file is generated/updated by Multi Translator
|
||||
<name>Coleman</name>
|
||||
<name>Colette</name>
|
||||
<name>Colin</name>
|
||||
<name>Colin Firth</name>
|
||||
<name>Colleen</name>
|
||||
<name>Collier</name>
|
||||
<name>Colliers</name>
|
||||
@ -2067,6 +2068,7 @@ This file is generated/updated by Multi Translator
|
||||
<name>Galactica</name>
|
||||
<name>Galacticas</name>
|
||||
<name>Galadriel</name>
|
||||
<name>Galahad</name>
|
||||
<name>Gale</name>
|
||||
<name>Galilei</name>
|
||||
<name>Galileo</name>
|
||||
@ -3589,6 +3591,7 @@ This file is generated/updated by Multi Translator
|
||||
<name>Marissa</name>
|
||||
<name>Marissas</name>
|
||||
<name>Marjorie</name>
|
||||
<name>Mark Hamill</name>
|
||||
<name>Mark Zuckerberg</name>
|
||||
<name>Marko</name>
|
||||
<name>Markus</name>
|
||||
@ -3832,6 +3835,7 @@ This file is generated/updated by Multi Translator
|
||||
<name>Mias</name>
|
||||
<name>Micah</name>
|
||||
<name>Michael</name>
|
||||
<name>Michael Caine</name>
|
||||
<name>Michaela</name>
|
||||
<name>Michaels</name>
|
||||
<name>Michas</name>
|
||||
|
@ -8,13 +8,15 @@
|
||||
public int Top { get; set; }
|
||||
public NikseBitmap NikseBitmap { get; set; }
|
||||
public string SpecialCharacter { get; set; }
|
||||
public bool CouldBeSpace { get; set; }
|
||||
|
||||
public ImageSplitterItem(int x, int y, NikseBitmap bitmap)
|
||||
public ImageSplitterItem(int x, int y, NikseBitmap bitmap, bool couldBeSpace = false)
|
||||
{
|
||||
X = x;
|
||||
Y = y;
|
||||
NikseBitmap = bitmap;
|
||||
SpecialCharacter = null;
|
||||
CouldBeSpace = couldBeSpace;
|
||||
}
|
||||
|
||||
public ImageSplitterItem(string specialCharacter)
|
||||
|
@ -4464,7 +4464,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
|
||||
string name = SaveCompareItemNew(item, text, _vobSubOcrCharacter.IsItalic, null);
|
||||
var addition = new ImageCompareAddition(name, text, item.NikseBitmap, _vobSubOcrCharacter.IsItalic, listViewIndex);
|
||||
_lastAdditions.Add(addition);
|
||||
matches.Add(new CompareMatch(text, _vobSubOcrCharacter.IsItalic, 0, null));
|
||||
matches.Add(new CompareMatch(text, _vobSubOcrCharacter.IsItalic, 0, null, item));
|
||||
SetBinOcrLowercaseUppercase(item.NikseBitmap.Height, text);
|
||||
}
|
||||
else if (result == DialogResult.Abort)
|
||||
@ -4473,13 +4473,13 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
|
||||
}
|
||||
else
|
||||
{
|
||||
matches.Add(new CompareMatch("*", false, 0, null));
|
||||
matches.Add(new CompareMatch("*", false, 0, null, item));
|
||||
}
|
||||
_italicCheckedLast = _vobSubOcrCharacter.IsItalic;
|
||||
}
|
||||
else // found image match
|
||||
{
|
||||
matches.Add(new CompareMatch(match.Text, match.Italic, 0, null));
|
||||
matches.Add(new CompareMatch(match.Text, match.Italic, 0, null, item));
|
||||
if (match.ExpandCount > 0)
|
||||
index += match.ExpandCount - 1;
|
||||
}
|
||||
@ -4495,7 +4495,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
|
||||
}
|
||||
}
|
||||
|
||||
string line = GetStringWithItalicTags(matches);
|
||||
string line = GetStringWithItalicTags(matches);
|
||||
|
||||
if (checkBoxAutoFixCommonErrors.Checked && _ocrFixEngine != null)
|
||||
line = _ocrFixEngine.FixOcrErrorsViaHardcodedRules(line, _lastLine, null); // TODO: Add abbreviations list
|
||||
@ -4516,6 +4516,41 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
|
||||
int correctWords;
|
||||
int wordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(line, out correctWords);
|
||||
|
||||
// smaller space pixels for italic
|
||||
if (correctWords > 0 && wordsNotFound > 0 && line.Contains("<i>", StringComparison.Ordinal) && matches.Any(p=>p?.ImageSplitterItem?.CouldBeSpace == true))
|
||||
{
|
||||
int j = 0;
|
||||
while (j < matches.Count)
|
||||
{
|
||||
if (matches[j]?.ImageSplitterItem?.CouldBeSpace == true)
|
||||
{
|
||||
matches[j].ImageSplitterItem.CouldBeSpace = false;
|
||||
matches.Insert(j, new CompareMatch(" ", false, 0, string.Empty, new ImageSplitterItem(" ")));
|
||||
}
|
||||
j++;
|
||||
}
|
||||
var tempLine = GetStringWithItalicTags(matches);
|
||||
var oldAutoGuessesUsed = new List<string>(_ocrFixEngine.AutoGuessesUsed);
|
||||
var oldUnknownWordsFound = new List<string>(_ocrFixEngine.UnknownWordsFound);
|
||||
_ocrFixEngine.AutoGuessesUsed.Clear();
|
||||
_ocrFixEngine.UnknownWordsFound.Clear();
|
||||
if (checkBoxAutoFixCommonErrors.Checked)
|
||||
tempLine = _ocrFixEngine.FixOcrErrors(tempLine, listViewIndex, _lastLine, true, autoGuessLevel);
|
||||
int tempCorrectWords;
|
||||
int tempWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(tempLine, out tempCorrectWords);
|
||||
if (tempWordsNotFound == 0 && tempCorrectWords > 0)
|
||||
{
|
||||
wordsNotFound = tempWordsNotFound;
|
||||
correctWords = tempCorrectWords;
|
||||
line = tempLine;
|
||||
}
|
||||
else
|
||||
{
|
||||
_ocrFixEngine.AutoGuessesUsed = oldAutoGuessesUsed;
|
||||
_ocrFixEngine.UnknownWordsFound = oldUnknownWordsFound;
|
||||
}
|
||||
}
|
||||
|
||||
if (wordsNotFound > 0 || correctWords == 0 || textWithOutFixes != null && string.IsNullOrWhiteSpace(textWithOutFixes.Replace("~", string.Empty)))
|
||||
{
|
||||
_ocrFixEngine.AutoGuessesUsed.Clear();
|
||||
@ -5999,12 +6034,12 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
|
||||
|
||||
string text = string.Empty;
|
||||
// var sw = Stopwatch.StartNew();
|
||||
if (_ocrMethodIndex == _ocrMethodTesseract)
|
||||
if (_ocrMethodIndex == _ocrMethodBinaryImageCompare)
|
||||
text = SplitAndOcrBinaryImageCompare(bmp, i);
|
||||
else if (_ocrMethodIndex == _ocrMethodTesseract)
|
||||
text = OcrViaTesseract(bmp, i);
|
||||
else if (_ocrMethodIndex == _ocrMethodImageCompare)
|
||||
text = SplitAndOcrBitmapNormal(bmp, i);
|
||||
else if (_ocrMethodIndex == _ocrMethodBinaryImageCompare)
|
||||
text = SplitAndOcrBinaryImageCompare(bmp, i);
|
||||
else if (_ocrMethodIndex == _ocrMethodNocr)
|
||||
text = OcrViaNOCR(bmp, i);
|
||||
else if (_ocrMethodIndex == _ocrMethodModi)
|
||||
@ -7470,7 +7505,10 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
|
||||
}
|
||||
else
|
||||
{
|
||||
comboBoxDictionaries.SelectedIndex = 0;
|
||||
if (comboBoxDictionaries.SelectedIndex < 0)
|
||||
comboBoxDictionaries.SelectedIndex = 0;
|
||||
else
|
||||
comboBoxDictionaries_SelectedIndexChanged(null, null);
|
||||
}
|
||||
|
||||
if (_modiEnabled && checkBoxUseModiInTesseractForUnknownWords.Checked)
|
||||
|
@ -774,11 +774,16 @@ namespace Nikse.SubtitleEdit.Logic
|
||||
int addY;
|
||||
b1 = CropTopAndBottom(b1, out addY);
|
||||
|
||||
var couldBeSpace = false;
|
||||
if (spacePixels >= xOrMorePixelsMakesSpace && parts.Count > 0)
|
||||
parts.Add(new ImageSplitterItem(" ") { Y = addY + lineSplitterItem.Y });
|
||||
else if (xOrMorePixelsMakesSpace > 9 && spacePixels >= xOrMorePixelsMakesSpace - 2 && parts.Count > 0)
|
||||
couldBeSpace = true;
|
||||
else if (xOrMorePixelsMakesSpace > 3 && spacePixels >= xOrMorePixelsMakesSpace - 1 && parts.Count > 0)
|
||||
couldBeSpace = true;
|
||||
|
||||
if (b1.Width > 0 && b1.Height > 0)
|
||||
parts.Add(new ImageSplitterItem(startX + lineSplitterItem.X, addY + lineSplitterItem.Y, b1)); //y is what?
|
||||
parts.Add(new ImageSplitterItem(startX + lineSplitterItem.X, addY + lineSplitterItem.Y, b1, couldBeSpace)); //y is what?
|
||||
|
||||
// remove pixels before next letter;
|
||||
const int begin = 0;
|
||||
|
@ -931,6 +931,18 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
|
||||
}
|
||||
}
|
||||
|
||||
if (input.EndsWith(". \"</i>", StringComparison.Ordinal))
|
||||
input = input.Remove(input.Length - 6, 1);
|
||||
|
||||
if (input.Contains(". \"</i>" + Environment.NewLine, StringComparison.Ordinal))
|
||||
{
|
||||
idx = input.IndexOf(". \"</i>" + Environment.NewLine);
|
||||
if (idx > 0)
|
||||
{
|
||||
input = input.Remove(idx + 1, 1);
|
||||
}
|
||||
}
|
||||
|
||||
return input;
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user