Work on OCR/italic

This commit is contained in:
Nikolaj Olsson 2020-05-17 23:06:01 +02:00
parent 0519c9dbb5
commit a4310aec3d
4 changed files with 24 additions and 32 deletions

View File

@ -3332,6 +3332,8 @@
<RegEx find="^_\.\.(\p{L})" replaceWith="...$1" />
<RegEx find=" l([!?\.])" replaceWith=" I$1" />
<RegEx find="\b\|\b" replaceWith="I" />
<RegEx find="\b1 (know|will|almost|didn't|get|got|have|apologize|paid|like|think|would|hope|shall|chose|choose|won|am|was|don't|just|start|run|saw|said|believe|try|ever|need|certainly|can't|anticipated|did|can|rang|heard|gave|came|decided|should|took|wanted|read|thought|was|still|do|love|want|overstepped|accept|authorized|owe|understand|made|guess|bumped|wasn't|mean|admire|had|spent|told|see|walk|were|help|definitely|could|say|take|brought|assume|proposed|realized|loved|base|left|change|changed|rule|feel|date|dated|imagine|went|kind|couldn't|wouldn't|work|care|make|lost)\b" replaceWith="I $1" />
<RegEx find="\b(1|l) (know|will|almost|didn't|get|got|have|apologize|paid|like|think|would|hope|shall|chose|choose|won|am|was|don't|just|start|run|saw|said|believe|try|ever|need|certainly|can't|anticipated|did|can|rang|heard|gave|came|decided|should|took|wanted|read|thought|was|still|do|love|want|overstepped|accept|authorized|owe|understand|made|guess|bumped|wasn't|mean|admire|had|spent|told|see|walk|were|help|definitely|could|say|take|brought|assume|proposed|realized|loved|base|left|change|changed|rule|feel|date|dated|imagine|went|kind|couldn't|wouldn't|work|care|make|lost)\b" replaceWith="I $2" />
<RegEx find=",\.\." replaceWith="..." />
<RegEx find="\bI KEA\b" replaceWith="IKEA" />
</RegularExpressions>
</OCRFixReplaceList>

View File

@ -8,15 +8,15 @@
public int Top { get; set; }
public NikseBitmap NikseBitmap { get; set; }
public string SpecialCharacter { get; set; }
public bool CouldBeSpace { get; set; }
public bool CouldBeSpaceBefore { get; set; }
public ImageSplitterItem(int x, int y, NikseBitmap bitmap, bool couldBeSpace = false)
public ImageSplitterItem(int x, int y, NikseBitmap bitmap, bool couldBeSpaceBefore = false)
{
X = x;
Y = y;
NikseBitmap = bitmap;
SpecialCharacter = null;
CouldBeSpace = couldBeSpace;
CouldBeSpaceBefore = couldBeSpaceBefore;
}
public ImageSplitterItem(string specialCharacter)

View File

@ -4035,10 +4035,10 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
{
break;
}
if (result == DialogResult.OK && _vobSubOcrCharacter.ShrinkSelection)
{
shrinkSelection = true;
shrinkSelection = true;
index--;
if (expandSelectionList.Count > 0)
{
@ -4186,15 +4186,19 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
{
AddItalicCouldBeSpace(matches, parentBitmap, _unItalicFactor);
}
if (wordsNotFound > 0 && line.Contains("<i>", StringComparison.Ordinal) && matches.Any(p => p?.ImageSplitterItem?.CouldBeSpace == true))
if (wordsNotFound > 0 && line.Contains("<i>", StringComparison.Ordinal) && matches.Any(p => p?.ImageSplitterItem?.CouldBeSpaceBefore == true))
{
int j = 0;
while (j < matches.Count)
{
if (matches[j]?.ImageSplitterItem?.CouldBeSpace == true)
var match = matches[j];
if (match.ImageSplitterItem?.CouldBeSpaceBefore == true)
{
matches[j].ImageSplitterItem.CouldBeSpace = false;
matches.Insert(j, new CompareMatch(" ", false, 0, string.Empty, new ImageSplitterItem(" ")));
match.ImageSplitterItem.CouldBeSpaceBefore = false;
if (match.Italic)
{
matches.Insert(j, new CompareMatch(" ", false, 0, string.Empty, new ImageSplitterItem(" ")));
}
}
j++;
@ -4211,8 +4215,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
}
int tempWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(tempLine, out var tempCorrectWords);
//if (tempWordsNotFound == 0 && tempCorrectWords > 0)
if (tempWordsNotFound < wordsNotFound && tempCorrectWords > 0)
if (tempWordsNotFound <= wordsNotFound && tempCorrectWords > correctWords)
{
wordsNotFound = tempWordsNotFound;
correctWords = tempCorrectWords;
@ -4275,9 +4278,9 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
{
var match = matches[i];
var matchNext = matches[i + 1];
if (!match.Italic || !matchNext.Italic ||
if (!match.Italic || !matchNext.Italic || match.Text == "," ||
string.IsNullOrWhiteSpace(match.Text) || string.IsNullOrWhiteSpace(matchNext.Text) ||
match.ImageSplitterItem == null || match.ImageSplitterItem.CouldBeSpace)
match.ImageSplitterItem == null || match.ImageSplitterItem.CouldBeSpaceBefore)
{
continue;
}
@ -4285,7 +4288,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
int blankVerticalLines = IsVerticalAngledLineTransparent(parentBitmap, match, matchNext, unItalicFactor);
if (blankVerticalLines >= _numericUpDownPixelsIsSpace)
{
matchNext.ImageSplitterItem.CouldBeSpace = true; // TODO: Rename to "could be space before"
matchNext.ImageSplitterItem.CouldBeSpaceBefore = true;
}
}
}
@ -4295,7 +4298,6 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
int blanks = 0;
var min = match.ImageSplitterItem.X + match.ImageSplitterItem.NikseBitmap.Width;
var max = next.ImageSplitterItem.X + next.ImageSplitterItem.NikseBitmap.Width / 2;
bool abort = false;
for (int startX = min; startX < max; startX++)
{
var lineBlank = true;
@ -4305,35 +4307,23 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
if (x >= 0)
{
var color = parentBitmap.GetPixel((int)Math.Round(x), y);
if (color.A == 0)
if (color.A != 0)
{
// parentBitmap.SetPixel((int)Math.Round(x), y, Color.LawnGreen);
}
else
{
// parentBitmap.SetPixel((int)Math.Round(x), y, Color.Red);
lineBlank = false;
if (blanks > 0)
{
abort = true;
break;
return blanks;
}
}
}
}
if (abort)
{
break;
}
if (lineBlank)
{
blanks++;
}
}
//parentBitmap.GetBitmap().Save(@"J:\Temp\" + DateTime.UtcNow.Ticks + "_" + match.Text + "_" + blanks + ".bmp");
return blanks;
}

View File

@ -2004,10 +2004,10 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
}
int wordsNotFound = 0;
var words = HtmlUtil.RemoveOpenCloseTags(line, HtmlUtil.TagItalic).Split(SpellCheckWordLists.SplitChars.ToArray(), StringSplitOptions.RemoveEmptyEntries);
var words = HtmlUtil.RemoveOpenCloseTags(line, HtmlUtil.TagItalic).Split(" \r\n\t".ToCharArray(), StringSplitOptions.RemoveEmptyEntries);
for (int i = 0; i < words.Length; i++)
{
string word = words[i];
string word = words[i].Trim(SpellCheckWordLists.SplitChars.ToArray());
if (word.Length >= minLength)
{
if (!IsWordKnownOrNumber(word, line))