Fix bug in OCR auto guesses + fix new italic space detection for nOcr

This commit is contained in:
Nikolaj Olsson 2020-05-18 19:02:34 +02:00
parent 728597ef45
commit 32b8d875dc
3 changed files with 106 additions and 38 deletions

View File

@ -338,55 +338,62 @@ namespace Nikse.SubtitleEdit.Core.Dictionaries
return newText; return newText;
} }
private static string AddToGuessList(List<string> list, string word, int index, string letter, string replaceLetters) private static void AddToGuessList(List<string> list, string guess)
{ {
if (string.IsNullOrEmpty(word) || index < 0 || index + letter.Length - 1 >= word.Length) if (string.IsNullOrEmpty(guess))
{ {
return word; return;
} }
string s = word.Remove(index, letter.Length); if (!list.Contains(guess))
if (index >= s.Length)
{ {
s += replaceLetters; list.Add(guess);
} }
else
{
s = s.Insert(index, replaceLetters);
}
if (!list.Contains(s))
{
list.Add(s);
}
return s;
} }
public IEnumerable<string> CreateGuessesFromLetters(string word) public IEnumerable<string> CreateGuessesFromLetters(string word)
{ {
var list = new List<string>(); var list = new List<string>();
var previousGuesses = new List<string>();
foreach (string letter in _partialWordReplaceList.Keys) foreach (string letter in _partialWordReplaceList.Keys)
{ {
string s = word; var indexes = new List<int>();
int i = 0; for (int i = 1; i < word.Length - letter.Length; i++)
while (s.Contains(letter) && i < 10)
{ {
int index = s.FastIndexOf(letter); if (word.Substring(i).StartsWith(letter, StringComparison.Ordinal))
s = AddToGuessList(list, s, index, letter, _partialWordReplaceList[letter]); {
AddToGuessList(list, word, index, letter, _partialWordReplaceList[letter]); indexes.Add(i);
i++; var guess = word.Remove(i, letter.Length).Insert(i, _partialWordReplaceList[letter]);
AddToGuessList(list, guess);
}
} }
s = word;
i = 0; if (indexes.Count > 1)
while (s.Contains(letter) && i < 10)
{ {
int index = s.LastIndexOf(letter, StringComparison.Ordinal); var multiGuess = word;
s = AddToGuessList(list, s, index, letter, _partialWordReplaceList[letter]); for (int i = indexes.Count-1; i >= 0; i--)
AddToGuessList(list, word, index, letter, _partialWordReplaceList[letter]); {
i++; var idx = indexes[i];
multiGuess = multiGuess.Remove(idx, letter.Length).Insert(idx, _partialWordReplaceList[letter]);
AddToGuessList(list, multiGuess);
}
} }
foreach (var previousGuess in previousGuesses)
{
for (int i = 1; i < previousGuess.Length - letter.Length; i++)
{
if (previousGuess.Substring(i).StartsWith(letter, StringComparison.Ordinal))
{
var guess = previousGuess.Remove(i, letter.Length).Insert(i, _partialWordReplaceList[letter]);
AddToGuessList(list, guess);
}
}
}
previousGuesses = new List<string>(list);
} }
return list; return list;
} }

View File

@ -251,9 +251,11 @@
this.Controls.Add(this.buttonOK); this.Controls.Add(this.buttonOK);
this.Controls.Add(this.buttonCancel); this.Controls.Add(this.buttonCancel);
this.KeyPreview = true; this.KeyPreview = true;
this.MinimumSize = new System.Drawing.Size(840, 460);
this.Name = "VobSubNOcrCharacterInspect"; this.Name = "VobSubNOcrCharacterInspect";
this.ShowIcon = false; this.ShowIcon = false;
this.ShowInTaskbar = false; this.ShowInTaskbar = false;
this.StartPosition = System.Windows.Forms.FormStartPosition.CenterParent;
this.Text = "VobSubNOcrCharacterInspect"; this.Text = "VobSubNOcrCharacterInspect";
this.KeyDown += new System.Windows.Forms.KeyEventHandler(this.VobSubNOcrCharacterInspect_KeyDown); this.KeyDown += new System.Windows.Forms.KeyEventHandler(this.VobSubNOcrCharacterInspect_KeyDown);
this.groupBoxInspectItems.ResumeLayout(false); this.groupBoxInspectItems.ResumeLayout(false);

View File

@ -3122,7 +3122,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
var expandedResult = NOcrFindExpandedMatch(parentBitmap, targetItem, nOcrDb.OcrCharactersExpanded); var expandedResult = NOcrFindExpandedMatch(parentBitmap, targetItem, nOcrDb.OcrCharactersExpanded);
if (expandedResult != null) if (expandedResult != null)
{ {
return new CompareMatch(expandedResult.Text, expandedResult.Italic, expandedResult.ExpandCount, null, expandedResult); return new CompareMatch(expandedResult.Text, expandedResult.Italic, expandedResult.ExpandCount, null, expandedResult) { ImageSplitterItem = targetItem };
} }
var result = NOcrFindBestMatchNew(targetItem, targetItem.Y - targetItem.ParentY, out var italic, nOcrDb, deepSeek); var result = NOcrFindBestMatchNew(targetItem, targetItem.Y - targetItem.ParentY, out var italic, nOcrDb, deepSeek);
@ -4399,11 +4399,11 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
line = _nocrThreadResults[listViewIndex]; line = _nocrThreadResults[listViewIndex];
} }
var matches = new List<CompareMatch>();
var nbmpInput = new NikseBitmap(bitmap);
if (string.IsNullOrEmpty(line)) if (string.IsNullOrEmpty(line))
{ {
var nbmpInput = new NikseBitmap(bitmap);
var matches = new List<CompareMatch>();
int minLineHeight = GetLastBinOcrLowercaseHeight() - 3; int minLineHeight = GetLastBinOcrLowercaseHeight() - 3;
if (minLineHeight < 5) if (minLineHeight < 5)
@ -4512,7 +4512,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
//string name = SaveCompareItem(item.NikseBitmap, text, _vobSubOcrNOcrCharacter.IsItalic, 0); //string name = SaveCompareItem(item.NikseBitmap, text, _vobSubOcrNOcrCharacter.IsItalic, 0);
//var addition = new ImageCompareAddition(name, text, item.NikseBitmap, _vobSubOcrNOcrCharacter.IsItalic, listViewIndex); //var addition = new ImageCompareAddition(name, text, item.NikseBitmap, _vobSubOcrNOcrCharacter.IsItalic, listViewIndex);
//_lastAdditions.Add(addition); //_lastAdditions.Add(addition);
matches.Add(new CompareMatch(text, _vobSubOcrNOcrCharacter.IsItalic, 0, null)); matches.Add(new CompareMatch(text, _vobSubOcrNOcrCharacter.IsItalic, 0, null) { ImageSplitterItem = item });
} }
else if (result == DialogResult.Abort) else if (result == DialogResult.Abort)
{ {
@ -4527,7 +4527,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
} }
else // found image match else // found image match
{ {
matches.Add(new CompareMatch(match.Text, match.Italic, 0, null)); matches.Add(new CompareMatch(match.Text, match.Italic, 0, null) { ImageSplitterItem = item });
if (match.ExpandCount > 0) if (match.ExpandCount > 0)
{ {
index += match.ExpandCount - 1; index += match.ExpandCount - 1;
@ -4553,6 +4553,8 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
} }
line = MatchesToItalicStringConverter.GetStringWithItalicTags(matches); line = MatchesToItalicStringConverter.GetStringWithItalicTags(matches);
} }
line = FixNocrHardcodedStuff(line); line = FixNocrHardcodedStuff(line);
@ -4575,6 +4577,54 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
line = _ocrFixEngine.FixUnknownWordsViaGuessOrPrompt(out wordsNotFound, line, listViewIndex, bitmap, checkBoxAutoFixCommonErrors.Checked, checkBoxPromptForUnknownWords.Checked, true, GetAutoGuessLevel()); line = _ocrFixEngine.FixUnknownWordsViaGuessOrPrompt(out wordsNotFound, line, listViewIndex, bitmap, checkBoxAutoFixCommonErrors.Checked, checkBoxPromptForUnknownWords.Checked, true, GetAutoGuessLevel());
} }
// smaller space pixels for italic
if (wordsNotFound > 0 && line.Contains("<i>", StringComparison.Ordinal))
{
AddItalicCouldBeSpace(matches, nbmpInput, _unItalicFactor, _numericUpDownPixelsIsSpace);
}
if (wordsNotFound > 0 && line.Contains("<i>", StringComparison.Ordinal) && matches.Any(p => p?.ImageSplitterItem?.CouldBeSpaceBefore == true))
{
int j = 0;
while (j < matches.Count)
{
var match = matches[j];
if (match.ImageSplitterItem?.CouldBeSpaceBefore == true)
{
match.ImageSplitterItem.CouldBeSpaceBefore = false;
if (match.Italic)
{
matches.Insert(j, new CompareMatch(" ", false, 0, string.Empty, new ImageSplitterItem(" ")));
}
}
j++;
}
var tempLine = MatchesToItalicStringConverter.GetStringWithItalicTags(matches);
var oldAutoGuessesUsed = new List<LogItem>(_ocrFixEngine.AutoGuessesUsed);
var oldUnknownWordsFound = new List<LogItem>(_ocrFixEngine.UnknownWordsFound);
_ocrFixEngine.AutoGuessesUsed.Clear();
_ocrFixEngine.UnknownWordsFound.Clear();
if (checkBoxAutoFixCommonErrors.Checked)
{
tempLine = _ocrFixEngine.FixOcrErrors(tempLine, listViewIndex, _lastLine, true, GetAutoGuessLevel());
}
int tempWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(tempLine, out var tempCorrectWords);
if (tempWordsNotFound <= wordsNotFound && tempCorrectWords > correctWords)
{
wordsNotFound = tempWordsNotFound;
correctWords = tempCorrectWords;
line = tempLine;
}
else
{
_ocrFixEngine.AutoGuessesUsed = oldAutoGuessesUsed;
_ocrFixEngine.UnknownWordsFound = oldUnknownWordsFound;
}
}
if (_ocrFixEngine.Abort) if (_ocrFixEngine.Abort)
{ {
ButtonStopClick(null, null); ButtonStopClick(null, null);
@ -4590,7 +4640,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
_ocrFixEngine.AutoGuessesUsed.Clear(); _ocrFixEngine.AutoGuessesUsed.Clear();
// Log unkown words guess (found via spelling dictionaries) // Log unknown words guess (found via spelling dictionaries)
LogUnknownWords(); LogUnknownWords();
ColorLineByNumberOfUnknownWords(listViewIndex, wordsNotFound, line); ColorLineByNumberOfUnknownWords(listViewIndex, wordsNotFound, line);
@ -4608,6 +4658,11 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
private string FixNocrHardcodedStuff(string input) private string FixNocrHardcodedStuff(string input)
{ {
if (!Configuration.Settings.Tools.OcrFixUseHardcodedRules)
{
return input;
}
var line = input; var line = input;
if (LanguageString.StartsWith("en", StringComparison.OrdinalIgnoreCase)) if (LanguageString.StartsWith("en", StringComparison.OrdinalIgnoreCase))
@ -9177,6 +9232,10 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
{ {
InspectImageCompareMatchesForCurrentImageToolStripMenuItem_Click(null, null); InspectImageCompareMatchesForCurrentImageToolStripMenuItem_Click(null, null);
} }
else if (subtitleListView1.SelectedItems.Count > 0 && _ocrMethodIndex == _ocrMethodNocr)
{
toolStripMenuItemInspectNOcrMatches_Click(null, null);
}
} }
private void comboBoxTesseractEngineMode_SelectedIndexChanged(object sender, EventArgs e) private void comboBoxTesseractEngineMode_SelectedIndexChanged(object sender, EventArgs e)