mirror of
https://github.com/SubtitleEdit/subtitleedit.git
synced 2024-11-25 04:33:04 +01:00
Bugfix in ocr + a few minor improvements
git-svn-id: https://subtitleedit.googlecode.com/svn/trunk@119 99eadd0c-20b8-1223-b5c4-2a2b2df33de2
This commit is contained in:
parent
81bac71766
commit
0e4e91281f
2
src/Forms/VobSubOcr.Designer.cs
generated
2
src/Forms/VobSubOcr.Designer.cs
generated
@ -255,6 +255,8 @@ namespace Nikse.SubtitleEdit.Forms
|
|||||||
// checkBoxUseModiInTesseractForUnknownWords
|
// checkBoxUseModiInTesseractForUnknownWords
|
||||||
//
|
//
|
||||||
this.checkBoxUseModiInTesseractForUnknownWords.AutoSize = true;
|
this.checkBoxUseModiInTesseractForUnknownWords.AutoSize = true;
|
||||||
|
this.checkBoxUseModiInTesseractForUnknownWords.Checked = true;
|
||||||
|
this.checkBoxUseModiInTesseractForUnknownWords.CheckState = System.Windows.Forms.CheckState.Checked;
|
||||||
this.checkBoxUseModiInTesseractForUnknownWords.Enabled = false;
|
this.checkBoxUseModiInTesseractForUnknownWords.Enabled = false;
|
||||||
this.checkBoxUseModiInTesseractForUnknownWords.Location = new System.Drawing.Point(22, 74);
|
this.checkBoxUseModiInTesseractForUnknownWords.Location = new System.Drawing.Point(22, 74);
|
||||||
this.checkBoxUseModiInTesseractForUnknownWords.Name = "checkBoxUseModiInTesseractForUnknownWords";
|
this.checkBoxUseModiInTesseractForUnknownWords.Name = "checkBoxUseModiInTesseractForUnknownWords";
|
||||||
|
@ -999,43 +999,38 @@ namespace Nikse.SubtitleEdit.Forms
|
|||||||
comboBoxModiLanguage.SelectedIndex = -1;
|
comboBoxModiLanguage.SelectedIndex = -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
var sb = new StringBuilder();
|
|
||||||
int badWords = 0;
|
int badWords = 0;
|
||||||
var textWithOutFixes = new StringBuilder();
|
string textWithOutFixes = Tesseract3DoOcrViaExe(bitmap, _languageId);
|
||||||
textWithOutFixes.Append(Tesseract3DoOcrViaExe(bitmap, _languageId));
|
|
||||||
|
|
||||||
if (textWithOutFixes.ToString().Trim().Length == 0)
|
if (textWithOutFixes.ToString().Trim().Length == 0)
|
||||||
{
|
{
|
||||||
textWithOutFixes = new StringBuilder();
|
textWithOutFixes = TesseractResizeAndRetry(bitmap);
|
||||||
textWithOutFixes.Append(TesseractResizeAndRetry(bitmap));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
sb.Append(textWithOutFixes.ToString());
|
int numberOfWords = textWithOutFixes.ToString().Split((" " + Environment.NewLine).ToCharArray(), StringSplitOptions.RemoveEmptyEntries).Length;
|
||||||
int numberOfWords = sb.ToString().Split((" " + Environment.NewLine).ToCharArray(), StringSplitOptions.RemoveEmptyEntries).Length;
|
|
||||||
|
|
||||||
string line = sb.ToString().Trim();
|
string line = textWithOutFixes.ToString().Trim();
|
||||||
if (_ocrFixEngine.IsDictionaryLoaded)
|
if (_ocrFixEngine.IsDictionaryLoaded)
|
||||||
{
|
{
|
||||||
if (checkBoxAutoFixCommonErrors.Checked)
|
if (checkBoxAutoFixCommonErrors.Checked)
|
||||||
line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, checkBoxGuessUnknownWords.Checked);
|
line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, checkBoxGuessUnknownWords.Checked);
|
||||||
int wordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(line);
|
int correctWords;
|
||||||
|
int wordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(line, out correctWords);
|
||||||
|
|
||||||
if (wordsNotFound > 0)
|
if (wordsNotFound > 0 || correctWords == 0)
|
||||||
{
|
{
|
||||||
string newText = TesseractResizeAndRetry(bitmap);
|
string newUnfixedText = TesseractResizeAndRetry(bitmap);
|
||||||
newText = _ocrFixEngine.FixOcrErrors(newText, index, _lastLine, true, checkBoxGuessUnknownWords.Checked);
|
string newText = _ocrFixEngine.FixOcrErrors(newUnfixedText, index, _lastLine, true, checkBoxGuessUnknownWords.Checked);
|
||||||
int newWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(newText);
|
int newWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(newText, out correctWords);
|
||||||
if (newWordsNotFound < wordsNotFound)
|
if (newWordsNotFound < wordsNotFound)
|
||||||
{
|
{
|
||||||
wordsNotFound = newWordsNotFound;
|
wordsNotFound = newWordsNotFound;
|
||||||
textWithOutFixes = new StringBuilder();
|
textWithOutFixes = newUnfixedText;
|
||||||
textWithOutFixes.Append(newText);
|
line = newText;
|
||||||
sb = new StringBuilder();
|
|
||||||
sb.Append(newText);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (wordsNotFound > 0 || sb.ToString().Replace("~", string.Empty).Trim().Length == 0)
|
if (wordsNotFound > 0 || correctWords == 0 || textWithOutFixes.ToString().Replace("~", string.Empty).Trim().Length == 0)
|
||||||
{
|
{
|
||||||
_ocrFixEngine.AutoGuessesUsed.Clear();
|
_ocrFixEngine.AutoGuessesUsed.Clear();
|
||||||
_ocrFixEngine.UnknownWordsFound.Clear();
|
_ocrFixEngine.UnknownWordsFound.Clear();
|
||||||
@ -1051,13 +1046,13 @@ namespace Nikse.SubtitleEdit.Forms
|
|||||||
|
|
||||||
if (modiText.Length > 1)
|
if (modiText.Length > 1)
|
||||||
{
|
{
|
||||||
int modiWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(modiText);
|
int modiWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(modiText, out correctWords);
|
||||||
if (modiWordsNotFound > 0)
|
if (modiWordsNotFound > 0)
|
||||||
{
|
{
|
||||||
string modiTextOcrFixed = modiText;
|
string modiTextOcrFixed = modiText;
|
||||||
if (checkBoxAutoFixCommonErrors.Checked)
|
if (checkBoxAutoFixCommonErrors.Checked)
|
||||||
modiTextOcrFixed = _ocrFixEngine.FixOcrErrors(modiText, index, _lastLine, false, checkBoxGuessUnknownWords.Checked);
|
modiTextOcrFixed = _ocrFixEngine.FixOcrErrors(modiText, index, _lastLine, false, checkBoxGuessUnknownWords.Checked);
|
||||||
int modiOcrCorrectedWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(modiTextOcrFixed);
|
int modiOcrCorrectedWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(modiTextOcrFixed, out correctWords);
|
||||||
if (modiOcrCorrectedWordsNotFound < modiWordsNotFound)
|
if (modiOcrCorrectedWordsNotFound < modiWordsNotFound)
|
||||||
modiText = modiTextOcrFixed;
|
modiText = modiTextOcrFixed;
|
||||||
}
|
}
|
||||||
|
@ -583,12 +583,15 @@ namespace Nikse.SubtitleEdit.Logic.OCR
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
if (i==0)
|
||||||
|
guesses.Add(word.Replace(@"\/", "V"));
|
||||||
|
else
|
||||||
|
guesses.Add(word.Replace(@"\/", "v"));
|
||||||
guesses.Add(word.Replace("fi", "fi"));
|
guesses.Add(word.Replace("fi", "fi"));
|
||||||
guesses.Add(word.Replace("fi", "fj"));
|
guesses.Add(word.Replace("fi", "fj"));
|
||||||
guesses.Add(word.Replace("fl", "fl"));
|
guesses.Add(word.Replace("fl", "fl"));
|
||||||
if (!word.EndsWith("€") && !word.StartsWith("€"))
|
if (!word.EndsWith("€") && !word.StartsWith("€"))
|
||||||
guesses.Add(word.Replace("€", "e"));
|
guesses.Add(word.Replace("€", "e"));
|
||||||
guesses.Add(word.Replace("fi", "fj"));
|
|
||||||
}
|
}
|
||||||
foreach (string guess in guesses)
|
foreach (string guess in guesses)
|
||||||
{
|
{
|
||||||
@ -884,8 +887,9 @@ namespace Nikse.SubtitleEdit.Logic.OCR
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
public int CountUnknownWordsViaDictionary(string line)
|
public int CountUnknownWordsViaDictionary(string line, out int numberOfCorrectWords)
|
||||||
{
|
{
|
||||||
|
numberOfCorrectWords = 0;
|
||||||
if (_hunspell == null)
|
if (_hunspell == null)
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
@ -900,8 +904,11 @@ namespace Nikse.SubtitleEdit.Logic.OCR
|
|||||||
if (!correct)
|
if (!correct)
|
||||||
correct = _hunspell.Spell(word.Trim('\''));
|
correct = _hunspell.Spell(word.Trim('\''));
|
||||||
|
|
||||||
if (!correct)
|
if (correct)
|
||||||
|
numberOfCorrectWords++;
|
||||||
|
else
|
||||||
wordsNotFound++;
|
wordsNotFound++;
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return wordsNotFound;
|
return wordsNotFound;
|
||||||
|
Loading…
Reference in New Issue
Block a user