mirror of
https://github.com/SubtitleEdit/subtitleedit.git
synced 2024-10-27 14:32:35 +01:00
Work on OCR/italic
This commit is contained in:
parent
0519c9dbb5
commit
a4310aec3d
@ -3332,6 +3332,8 @@
|
||||
<RegEx find="^_\.\.(\p{L})" replaceWith="...$1" />
|
||||
<RegEx find=" l([!?\.])" replaceWith=" I$1" />
|
||||
<RegEx find="\b\|\b" replaceWith="I" />
|
||||
<RegEx find="\b1 (know|will|almost|didn't|get|got|have|apologize|paid|like|think|would|hope|shall|chose|choose|won|am|was|don't|just|start|run|saw|said|believe|try|ever|need|certainly|can't|anticipated|did|can|rang|heard|gave|came|decided|should|took|wanted|read|thought|was|still|do|love|want|overstepped|accept|authorized|owe|understand|made|guess|bumped|wasn't|mean|admire|had|spent|told|see|walk|were|help|definitely|could|say|take|brought|assume|proposed|realized|loved|base|left|change|changed|rule|feel|date|dated|imagine|went|kind|couldn't|wouldn't|work|care|make|lost)\b" replaceWith="I $1" />
|
||||
<RegEx find="\b(1|l) (know|will|almost|didn't|get|got|have|apologize|paid|like|think|would|hope|shall|chose|choose|won|am|was|don't|just|start|run|saw|said|believe|try|ever|need|certainly|can't|anticipated|did|can|rang|heard|gave|came|decided|should|took|wanted|read|thought|was|still|do|love|want|overstepped|accept|authorized|owe|understand|made|guess|bumped|wasn't|mean|admire|had|spent|told|see|walk|were|help|definitely|could|say|take|brought|assume|proposed|realized|loved|base|left|change|changed|rule|feel|date|dated|imagine|went|kind|couldn't|wouldn't|work|care|make|lost)\b" replaceWith="I $2" />
|
||||
<RegEx find=",\.\." replaceWith="..." />
|
||||
<RegEx find="\bI KEA\b" replaceWith="IKEA" />
|
||||
</RegularExpressions>
|
||||
</OCRFixReplaceList>
|
@ -8,15 +8,15 @@
|
||||
public int Top { get; set; }
|
||||
public NikseBitmap NikseBitmap { get; set; }
|
||||
public string SpecialCharacter { get; set; }
|
||||
public bool CouldBeSpace { get; set; }
|
||||
public bool CouldBeSpaceBefore { get; set; }
|
||||
|
||||
public ImageSplitterItem(int x, int y, NikseBitmap bitmap, bool couldBeSpace = false)
|
||||
public ImageSplitterItem(int x, int y, NikseBitmap bitmap, bool couldBeSpaceBefore = false)
|
||||
{
|
||||
X = x;
|
||||
Y = y;
|
||||
NikseBitmap = bitmap;
|
||||
SpecialCharacter = null;
|
||||
CouldBeSpace = couldBeSpace;
|
||||
CouldBeSpaceBefore = couldBeSpaceBefore;
|
||||
}
|
||||
|
||||
public ImageSplitterItem(string specialCharacter)
|
||||
|
@ -4035,10 +4035,10 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
if (result == DialogResult.OK && _vobSubOcrCharacter.ShrinkSelection)
|
||||
{
|
||||
shrinkSelection = true;
|
||||
shrinkSelection = true;
|
||||
index--;
|
||||
if (expandSelectionList.Count > 0)
|
||||
{
|
||||
@ -4186,15 +4186,19 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
|
||||
{
|
||||
AddItalicCouldBeSpace(matches, parentBitmap, _unItalicFactor);
|
||||
}
|
||||
if (wordsNotFound > 0 && line.Contains("<i>", StringComparison.Ordinal) && matches.Any(p => p?.ImageSplitterItem?.CouldBeSpace == true))
|
||||
if (wordsNotFound > 0 && line.Contains("<i>", StringComparison.Ordinal) && matches.Any(p => p?.ImageSplitterItem?.CouldBeSpaceBefore == true))
|
||||
{
|
||||
int j = 0;
|
||||
while (j < matches.Count)
|
||||
{
|
||||
if (matches[j]?.ImageSplitterItem?.CouldBeSpace == true)
|
||||
var match = matches[j];
|
||||
if (match.ImageSplitterItem?.CouldBeSpaceBefore == true)
|
||||
{
|
||||
matches[j].ImageSplitterItem.CouldBeSpace = false;
|
||||
matches.Insert(j, new CompareMatch(" ", false, 0, string.Empty, new ImageSplitterItem(" ")));
|
||||
match.ImageSplitterItem.CouldBeSpaceBefore = false;
|
||||
if (match.Italic)
|
||||
{
|
||||
matches.Insert(j, new CompareMatch(" ", false, 0, string.Empty, new ImageSplitterItem(" ")));
|
||||
}
|
||||
}
|
||||
|
||||
j++;
|
||||
@ -4211,8 +4215,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
|
||||
}
|
||||
|
||||
int tempWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(tempLine, out var tempCorrectWords);
|
||||
//if (tempWordsNotFound == 0 && tempCorrectWords > 0)
|
||||
if (tempWordsNotFound < wordsNotFound && tempCorrectWords > 0)
|
||||
if (tempWordsNotFound <= wordsNotFound && tempCorrectWords > correctWords)
|
||||
{
|
||||
wordsNotFound = tempWordsNotFound;
|
||||
correctWords = tempCorrectWords;
|
||||
@ -4275,9 +4278,9 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
|
||||
{
|
||||
var match = matches[i];
|
||||
var matchNext = matches[i + 1];
|
||||
if (!match.Italic || !matchNext.Italic ||
|
||||
if (!match.Italic || !matchNext.Italic || match.Text == "," ||
|
||||
string.IsNullOrWhiteSpace(match.Text) || string.IsNullOrWhiteSpace(matchNext.Text) ||
|
||||
match.ImageSplitterItem == null || match.ImageSplitterItem.CouldBeSpace)
|
||||
match.ImageSplitterItem == null || match.ImageSplitterItem.CouldBeSpaceBefore)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
@ -4285,7 +4288,7 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
|
||||
int blankVerticalLines = IsVerticalAngledLineTransparent(parentBitmap, match, matchNext, unItalicFactor);
|
||||
if (blankVerticalLines >= _numericUpDownPixelsIsSpace)
|
||||
{
|
||||
matchNext.ImageSplitterItem.CouldBeSpace = true; // TODO: Rename to "could be space before"
|
||||
matchNext.ImageSplitterItem.CouldBeSpaceBefore = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -4295,7 +4298,6 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
|
||||
int blanks = 0;
|
||||
var min = match.ImageSplitterItem.X + match.ImageSplitterItem.NikseBitmap.Width;
|
||||
var max = next.ImageSplitterItem.X + next.ImageSplitterItem.NikseBitmap.Width / 2;
|
||||
bool abort = false;
|
||||
for (int startX = min; startX < max; startX++)
|
||||
{
|
||||
var lineBlank = true;
|
||||
@ -4305,35 +4307,23 @@ namespace Nikse.SubtitleEdit.Forms.Ocr
|
||||
if (x >= 0)
|
||||
{
|
||||
var color = parentBitmap.GetPixel((int)Math.Round(x), y);
|
||||
if (color.A == 0)
|
||||
if (color.A != 0)
|
||||
{
|
||||
// parentBitmap.SetPixel((int)Math.Round(x), y, Color.LawnGreen);
|
||||
}
|
||||
else
|
||||
{
|
||||
// parentBitmap.SetPixel((int)Math.Round(x), y, Color.Red);
|
||||
lineBlank = false;
|
||||
if (blanks > 0)
|
||||
{
|
||||
abort = true;
|
||||
break;
|
||||
return blanks;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (abort)
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
if (lineBlank)
|
||||
{
|
||||
blanks++;
|
||||
}
|
||||
}
|
||||
|
||||
//parentBitmap.GetBitmap().Save(@"J:\Temp\" + DateTime.UtcNow.Ticks + "_" + match.Text + "_" + blanks + ".bmp");
|
||||
return blanks;
|
||||
}
|
||||
|
||||
|
@ -2004,10 +2004,10 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
|
||||
}
|
||||
|
||||
int wordsNotFound = 0;
|
||||
var words = HtmlUtil.RemoveOpenCloseTags(line, HtmlUtil.TagItalic).Split(SpellCheckWordLists.SplitChars.ToArray(), StringSplitOptions.RemoveEmptyEntries);
|
||||
var words = HtmlUtil.RemoveOpenCloseTags(line, HtmlUtil.TagItalic).Split(" \r\n\t".ToCharArray(), StringSplitOptions.RemoveEmptyEntries);
|
||||
for (int i = 0; i < words.Length; i++)
|
||||
{
|
||||
string word = words[i];
|
||||
string word = words[i].Trim(SpellCheckWordLists.SplitChars.ToArray());
|
||||
if (word.Length >= minLength)
|
||||
{
|
||||
if (!IsWordKnownOrNumber(word, line))
|
||||
|
Loading…
Reference in New Issue
Block a user