Some minor improvements to the OCR (the image is resized and re-run through Tesseract if the first result is bad)

Tesseract now has "Try modi for unknown words" disabled, as MODI is almost never better than Tesseract!


git-svn-id: https://subtitleedit.googlecode.com/svn/trunk@112 99eadd0c-20b8-1223-b5c4-2a2b2df33de2
This commit is contained in:
niksedk 2010-10-29 21:25:06 +00:00
parent 1441b8ca05
commit b6d5e0b970
3 changed files with 69 additions and 21 deletions

View File

@ -255,8 +255,6 @@ namespace Nikse.SubtitleEdit.Forms
// checkBoxUseModiInTesseractForUnknownWords
//
this.checkBoxUseModiInTesseractForUnknownWords.AutoSize = true;
this.checkBoxUseModiInTesseractForUnknownWords.Checked = true;
this.checkBoxUseModiInTesseractForUnknownWords.CheckState = System.Windows.Forms.CheckState.Checked;
this.checkBoxUseModiInTesseractForUnknownWords.Location = new System.Drawing.Point(22, 74);
this.checkBoxUseModiInTesseractForUnknownWords.Name = "checkBoxUseModiInTesseractForUnknownWords";
this.checkBoxUseModiInTesseractForUnknownWords.Size = new System.Drawing.Size(165, 17);

View File

@ -996,12 +996,20 @@ namespace Nikse.SubtitleEdit.Forms
i++;
}
}
comboBoxModiLanguage.SelectedIndex = -1;
}
var sb = new StringBuilder();
int badWords = 0;
var textWithOutFixes = new StringBuilder();
textWithOutFixes.Append(Tesseract3DoOcrViaExe(bitmap, _languageId));
if (textWithOutFixes.ToString().Trim().Length == 0)
{
textWithOutFixes = new StringBuilder();
textWithOutFixes.Append(TesseractResizeAndRetry(bitmap));
}
sb.Append(textWithOutFixes.ToString());
int numberOfWords = sb.ToString().Split((" " + Environment.NewLine).ToCharArray(), StringSplitOptions.RemoveEmptyEntries).Length;
@ -1012,6 +1020,22 @@ namespace Nikse.SubtitleEdit.Forms
line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, checkBoxGuessUnknownWords.Checked);
int wordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(line);
if (wordsNotFound > 0)
{
string newText = TesseractResizeAndRetry(bitmap);
newText = _ocrFixEngine.FixOcrErrors(newText, index, _lastLine, true, checkBoxGuessUnknownWords.Checked);
int newWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(newText);
if (newWordsNotFound < wordsNotFound)
{
wordsNotFound = newWordsNotFound;
textWithOutFixes = new StringBuilder();
textWithOutFixes.Append(newText);
sb = new StringBuilder();
sb.Append(newText);
}
}
if (wordsNotFound > 0 || sb.ToString().Replace("~", string.Empty).Trim().Length == 0)
{
_ocrFixEngine.AutoGuessesUsed.Clear();
@ -1106,6 +1130,18 @@ namespace Nikse.SubtitleEdit.Forms
return line;
}
/// <summary>
/// Re-runs Tesseract on enlarged copies of <paramref name="bitmap"/> until one
/// attempt yields non-blank text or all attempts have been made.
/// </summary>
/// <param name="bitmap">The subtitle image to enlarge and OCR again.</param>
/// <returns>
/// The text of the first attempt that is not blank after trimming, or the text of the
/// last attempt if every attempt was blank; trailing whitespace is removed.
/// </returns>
private string TesseractResizeAndRetry(Bitmap bitmap)
{
    // The height is always doubled; only the width factor grows (2x, 3x, 4x) between attempts.
    const int heightFactor = 2;
    const int firstWidthFactor = 2;
    const int lastWidthFactor = 4;

    string result = string.Empty;
    for (int widthFactor = firstWidthFactor; widthFactor <= lastWidthFactor; widthFactor++)
    {
        // Dispose each enlarged copy as soon as it has been OCR'ed so the GDI+ handle is not leaked.
        // NOTE(review): assumes ResizeBitmap returns a new image owned by this method and that
        // Tesseract3DoOcrViaExe does not keep a reference to it after returning - confirm.
        using (var resized = ResizeBitmap(bitmap, bitmap.Width * widthFactor, bitmap.Height * heightFactor))
        {
            result = Tesseract3DoOcrViaExe(resized, _languageId);
        }

        if (result.Trim().Length > 0)
        {
            break;
        }
    }
    return result.TrimEnd();
}
private void LogOcrFix(int index, string oldLine, string newLine)
{
listBoxLog.Items.Add(string.Format("#{0}: {1} -> {2}", index+1, oldLine.Replace(Environment.NewLine, " "), newLine.Replace(Environment.NewLine, " ")));
@ -1452,7 +1488,7 @@ namespace Nikse.SubtitleEdit.Forms
saveFileDialog1.Title = Configuration.Settings.Language.VobSubOcr.SaveSubtitleImageAs;
saveFileDialog1.AddExtension = true;
saveFileDialog1.FileName = "Image" + _selectedIndex;
saveFileDialog1.Filter = "PNG image|*.png|BMP image|*.bmp|GIF image|*.gif";
saveFileDialog1.Filter = "PNG image|*.png|BMP image|*.bmp|GIF image|*.gif|TIFF image|*.tiff";
saveFileDialog1.FilterIndex = 0;
DialogResult result = saveFileDialog1.ShowDialog(this);
@ -1471,8 +1507,10 @@ namespace Nikse.SubtitleEdit.Forms
bmp.Save(saveFileDialog1.FileName, System.Drawing.Imaging.ImageFormat.Png);
else if (saveFileDialog1.FilterIndex == 1)
bmp.Save(saveFileDialog1.FileName);
else
else if (saveFileDialog1.FilterIndex == 2)
bmp.Save(saveFileDialog1.FileName, System.Drawing.Imaging.ImageFormat.Gif);
else
bmp.Save(saveFileDialog1.FileName, System.Drawing.Imaging.ImageFormat.Tiff);
}
catch (Exception exception)
{

View File

@ -575,9 +575,22 @@ namespace Nikse.SubtitleEdit.Logic.OCR
if (autoFix && useAutoGuess)
{
List<string> guesses = new List<string>();
if (word.Length > 5)
{
foreach (string guess in CreateGuessesFromLetters(word))
guesses = (List<string>)CreateGuessesFromLetters(word);
}
else
{
guesses.Add(word.Replace("fi", "fi"));
guesses.Add(word.Replace("fi", "fj"));
guesses.Add(word.Replace("fl", "fl"));
if (!word.EndsWith("€") && !word.StartsWith("€"))
guesses.Add(word.Replace("€", "e"));
guesses.Add(word.Replace("fi", "fj"));
}
foreach (string guess in guesses)
{
if (IsWordOrWordsCorrect(_hunspell, guess))
{
@ -596,7 +609,6 @@ namespace Nikse.SubtitleEdit.Logic.OCR
}
}
}
}
if (!correct && promptForFixingErrors)
{
List<string> suggestions = _hunspell.Suggest(word);
@ -636,8 +648,8 @@ namespace Nikse.SubtitleEdit.Logic.OCR
case OcrSpellCheck.Action.AddToUserDictionary:
if (_userWordListXmlFileName != null)
{
_userWordList.Add(_spellCheck.Word);
Utilities.AddToUserDictionary(_spellCheck.Word, _fiveLetterWordListLanguageName);
_userWordList.Add(_spellCheck.Word.Trim().ToLower());
Utilities.AddToUserDictionary(_spellCheck.Word.Trim().ToLower(), _fiveLetterWordListLanguageName);
}
result.Word = _spellCheck.Word;
result.Fixed = true;
@ -800,7 +812,7 @@ namespace Nikse.SubtitleEdit.Logic.OCR
private static string AddToGuessList(List<string> list, string word, int index, string letter, string replaceLetters)
{
if (string.IsNullOrEmpty(word) || index < 0 || index >= word.Length)
if (string.IsNullOrEmpty(word) || index < 0 || index + letter.Length - 1 >= word.Length)
return word;
string s = word.Remove(index, letter.Length);