mirror of
https://github.com/SubtitleEdit/subtitleedit.git
synced 2024-10-27 22:42:38 +01:00
Fix bug in OCR auto guesses +fix new italic space detect for nOcr
This commit is contained in:
parent
728597ef45
commit
32b8d875dc
@ -338,55 +338,62 @@ namespace Nikse.SubtitleEdit.Core.Dictionaries
|
||||
return newText;
|
||||
}
|
||||
|
||||
private static string AddToGuessList(List<string> list, string word, int index, string letter, string replaceLetters)
|
||||
private static void AddToGuessList(List<string> list, string guess)
|
||||
{
|
||||
if (string.IsNullOrEmpty(word) || index < 0 || index + letter.Length - 1 >= word.Length)
|
||||
if (string.IsNullOrEmpty(guess))
|
||||
{
|
||||
return word;
|
||||
return;
|
||||
}
|
||||
|
||||
string s = word.Remove(index, letter.Length);
|
||||
if (index >= s.Length)
|
||||
if (!list.Contains(guess))
|
||||
{
|
||||
s += replaceLetters;
|
||||
list.Add(guess);
|
||||
}
|
||||
else
|
||||
{
|
||||
s = s.Insert(index, replaceLetters);
|
||||
}
|
||||
|
||||
if (!list.Contains(s))
|
||||
{
|
||||
list.Add(s);
|
||||
}
|
||||
|
||||
return s;
|
||||
}
|
||||
|
||||
public IEnumerable<string> CreateGuessesFromLetters(string word)
|
||||
{
|
||||
var list = new List<string>();
|
||||
var previousGuesses = new List<string>();
|
||||
foreach (string letter in _partialWordReplaceList.Keys)
|
||||
{
|
||||
string s = word;
|
||||
int i = 0;
|
||||
while (s.Contains(letter) && i < 10)
|
||||
var indexes = new List<int>();
|
||||
for (int i = 1; i < word.Length - letter.Length; i++)
|
||||
{
|
||||
int index = s.FastIndexOf(letter);
|
||||
s = AddToGuessList(list, s, index, letter, _partialWordReplaceList[letter]);
|
||||
AddToGuessList(list, word, index, letter, _partialWordReplaceList[letter]);
|
||||
i++;
|
||||
if (word.Substring(i).StartsWith(letter, StringComparison.Ordinal))
|
||||
{
|
||||
indexes.Add(i);
|
||||
var guess = word.Remove(i, letter.Length).Insert(i, _partialWordReplaceList[letter]);
|
||||
AddToGuessList(list, guess);
|
||||
}
|
||||
}
|
||||
s = word;
|
||||
i = 0;
|
||||
while (s.Contains(letter) && i < 10)
|
||||
|
||||
if (indexes.Count > 1)
|
||||
{
|
||||
int index = s.LastIndexOf(letter, StringComparison.Ordinal);
|
||||
s = AddToGuessList(list, s, index, letter, _partialWordReplaceList[letter]);
|
||||
AddToGuessList(list, word, index, letter, _partialWordReplaceList[letter]);
|
||||
i++;
|
||||
var multiGuess = word;
|
||||
for (int i = indexes.Count-1; i >= 0; i--)
|
||||
{
|
||||
var idx = indexes[i];
|
||||
multiGuess = multiGuess.Remove(idx, letter.Length).Insert(idx, _partialWordReplaceList[letter]);
|
||||
AddToGuessList(list, multiGuess);
|
||||
}
|
||||
}
|
||||
|
||||
foreach (var previousGuess in previousGuesses)
|
||||
{
|
||||
for (int i = 1; i < previousGuess.Length - letter.Length; i++)
|
||||
{
|
||||
if (previousGuess.Substring(i).StartsWith(letter, StringComparison.Ordinal))
|
||||
{
|
||||
var guess = previousGuess.Remove(i, letter.Length).Insert(i, _partialWordReplaceList[letter]);
|
||||
AddToGuessList(list, guess);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
previousGuesses = new List<string>(list);
|
||||
}
|
||||
|
||||
return list;
|
||||
}
|
||||
|
||||
|
@ -251,9 +251,11 @@
|
||||
this.Controls.Add(this.buttonOK);
|
||||
this.Controls.Add(this.buttonCancel);
|
||||
this.KeyPreview = true;
|
||||
this.MinimumSize = new System.Drawing.Size(840, 460);
|
||||
this.Name = "VobSubNOcrCharacterInspect";
|
||||
this.ShowIcon = false;
|
||||
this.ShowInTaskbar = false;
|
||||
this.StartPosition = System.Windows.Forms.FormStartPosition.CenterParent;
|
||||
this.Text = "VobSubNOcrCharacterInspect";
|
||||
this.KeyDown += new System.Windows.Forms.KeyEventHandler(this.VobSubNOcrCharacterInspect_KeyDown);
|
||||
this.groupBoxInspectItems.ResumeLayout(false);
|
||||
|
@ -3122,7 +3122,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
|
||||
var expandedResult = NOcrFindExpandedMatch(parentBitmap, targetItem, nOcrDb.OcrCharactersExpanded);
|
||||
if (expandedResult != null)
|
||||
{
|
||||
return new CompareMatch(expandedResult.Text, expandedResult.Italic, expandedResult.ExpandCount, null, expandedResult);
|
||||
return new CompareMatch(expandedResult.Text, expandedResult.Italic, expandedResult.ExpandCount, null, expandedResult) { ImageSplitterItem = targetItem };
|
||||
}
|
||||
|
||||
var result = NOcrFindBestMatchNew(targetItem, targetItem.Y - targetItem.ParentY, out var italic, nOcrDb, deepSeek);
|
||||
@ -4399,11 +4399,11 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
|
||||
line = _nocrThreadResults[listViewIndex];
|
||||
}
|
||||
|
||||
var matches = new List<CompareMatch>();
|
||||
var nbmpInput = new NikseBitmap(bitmap);
|
||||
|
||||
if (string.IsNullOrEmpty(line))
|
||||
{
|
||||
var nbmpInput = new NikseBitmap(bitmap);
|
||||
|
||||
var matches = new List<CompareMatch>();
|
||||
|
||||
int minLineHeight = GetLastBinOcrLowercaseHeight() - 3;
|
||||
if (minLineHeight < 5)
|
||||
@ -4512,7 +4512,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
|
||||
//string name = SaveCompareItem(item.NikseBitmap, text, _vobSubOcrNOcrCharacter.IsItalic, 0);
|
||||
//var addition = new ImageCompareAddition(name, text, item.NikseBitmap, _vobSubOcrNOcrCharacter.IsItalic, listViewIndex);
|
||||
//_lastAdditions.Add(addition);
|
||||
matches.Add(new CompareMatch(text, _vobSubOcrNOcrCharacter.IsItalic, 0, null));
|
||||
matches.Add(new CompareMatch(text, _vobSubOcrNOcrCharacter.IsItalic, 0, null) { ImageSplitterItem = item });
|
||||
}
|
||||
else if (result == DialogResult.Abort)
|
||||
{
|
||||
@ -4527,7 +4527,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
|
||||
}
|
||||
else // found image match
|
||||
{
|
||||
matches.Add(new CompareMatch(match.Text, match.Italic, 0, null));
|
||||
matches.Add(new CompareMatch(match.Text, match.Italic, 0, null) { ImageSplitterItem = item });
|
||||
if (match.ExpandCount > 0)
|
||||
{
|
||||
index += match.ExpandCount - 1;
|
||||
@ -4553,6 +4553,8 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
|
||||
}
|
||||
|
||||
line = MatchesToItalicStringConverter.GetStringWithItalicTags(matches);
|
||||
|
||||
|
||||
}
|
||||
|
||||
line = FixNocrHardcodedStuff(line);
|
||||
@ -4575,6 +4577,54 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
|
||||
line = _ocrFixEngine.FixUnknownWordsViaGuessOrPrompt(out wordsNotFound, line, listViewIndex, bitmap, checkBoxAutoFixCommonErrors.Checked, checkBoxPromptForUnknownWords.Checked, true, GetAutoGuessLevel());
|
||||
}
|
||||
|
||||
|
||||
// smaller space pixels for italic
|
||||
if (wordsNotFound > 0 && line.Contains("<i>", StringComparison.Ordinal))
|
||||
{
|
||||
AddItalicCouldBeSpace(matches, nbmpInput, _unItalicFactor, _numericUpDownPixelsIsSpace);
|
||||
}
|
||||
if (wordsNotFound > 0 && line.Contains("<i>", StringComparison.Ordinal) && matches.Any(p => p?.ImageSplitterItem?.CouldBeSpaceBefore == true))
|
||||
{
|
||||
int j = 0;
|
||||
while (j < matches.Count)
|
||||
{
|
||||
var match = matches[j];
|
||||
if (match.ImageSplitterItem?.CouldBeSpaceBefore == true)
|
||||
{
|
||||
match.ImageSplitterItem.CouldBeSpaceBefore = false;
|
||||
if (match.Italic)
|
||||
{
|
||||
matches.Insert(j, new CompareMatch(" ", false, 0, string.Empty, new ImageSplitterItem(" ")));
|
||||
}
|
||||
}
|
||||
|
||||
j++;
|
||||
}
|
||||
var tempLine = MatchesToItalicStringConverter.GetStringWithItalicTags(matches);
|
||||
var oldAutoGuessesUsed = new List<LogItem>(_ocrFixEngine.AutoGuessesUsed);
|
||||
var oldUnknownWordsFound = new List<LogItem>(_ocrFixEngine.UnknownWordsFound);
|
||||
_ocrFixEngine.AutoGuessesUsed.Clear();
|
||||
_ocrFixEngine.UnknownWordsFound.Clear();
|
||||
if (checkBoxAutoFixCommonErrors.Checked)
|
||||
{
|
||||
tempLine = _ocrFixEngine.FixOcrErrors(tempLine, listViewIndex, _lastLine, true, GetAutoGuessLevel());
|
||||
}
|
||||
|
||||
int tempWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(tempLine, out var tempCorrectWords);
|
||||
if (tempWordsNotFound <= wordsNotFound && tempCorrectWords > correctWords)
|
||||
{
|
||||
wordsNotFound = tempWordsNotFound;
|
||||
correctWords = tempCorrectWords;
|
||||
line = tempLine;
|
||||
}
|
||||
else
|
||||
{
|
||||
_ocrFixEngine.AutoGuessesUsed = oldAutoGuessesUsed;
|
||||
_ocrFixEngine.UnknownWordsFound = oldUnknownWordsFound;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (_ocrFixEngine.Abort)
|
||||
{
|
||||
ButtonStopClick(null, null);
|
||||
@ -4590,7 +4640,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
|
||||
|
||||
_ocrFixEngine.AutoGuessesUsed.Clear();
|
||||
|
||||
// Log unkown words guess (found via spelling dictionaries)
|
||||
// Log unknown words guess (found via spelling dictionaries)
|
||||
LogUnknownWords();
|
||||
|
||||
ColorLineByNumberOfUnknownWords(listViewIndex, wordsNotFound, line);
|
||||
@ -4608,6 +4658,11 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
|
||||
|
||||
private string FixNocrHardcodedStuff(string input)
|
||||
{
|
||||
if (!Configuration.Settings.Tools.OcrFixUseHardcodedRules)
|
||||
{
|
||||
return input;
|
||||
}
|
||||
|
||||
var line = input;
|
||||
|
||||
if (LanguageString.StartsWith("en", StringComparison.OrdinalIgnoreCase))
|
||||
@ -9177,6 +9232,10 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
|
||||
{
|
||||
InspectImageCompareMatchesForCurrentImageToolStripMenuItem_Click(null, null);
|
||||
}
|
||||
else if (subtitleListView1.SelectedItems.Count > 0 && _ocrMethodIndex == _ocrMethodNocr)
|
||||
{
|
||||
toolStripMenuItemInspectNOcrMatches_Click(null, null);
|
||||
}
|
||||
}
|
||||
|
||||
private void comboBoxTesseractEngineMode_SelectedIndexChanged(object sender, EventArgs e)
|
||||
|
Loading…
Reference in New Issue
Block a user