mirror of
https://github.com/SubtitleEdit/subtitleedit.git
synced 2024-11-22 03:02:35 +01:00
Some minor improvements to the OCR (the image is resized and Tesseract is retried if the first result is bad)
Tesseract now has "Try modi for unknown words" disabled, as MODI is almost never better than Tesseract! git-svn-id: https://subtitleedit.googlecode.com/svn/trunk@112 99eadd0c-20b8-1223-b5c4-2a2b2df33de2
This commit is contained in:
parent
1441b8ca05
commit
b6d5e0b970
2
src/Forms/VobSubOcr.Designer.cs
generated
2
src/Forms/VobSubOcr.Designer.cs
generated
@ -255,8 +255,6 @@ namespace Nikse.SubtitleEdit.Forms
|
||||
// checkBoxUseModiInTesseractForUnknownWords
|
||||
//
|
||||
this.checkBoxUseModiInTesseractForUnknownWords.AutoSize = true;
|
||||
this.checkBoxUseModiInTesseractForUnknownWords.Checked = true;
|
||||
this.checkBoxUseModiInTesseractForUnknownWords.CheckState = System.Windows.Forms.CheckState.Checked;
|
||||
this.checkBoxUseModiInTesseractForUnknownWords.Location = new System.Drawing.Point(22, 74);
|
||||
this.checkBoxUseModiInTesseractForUnknownWords.Name = "checkBoxUseModiInTesseractForUnknownWords";
|
||||
this.checkBoxUseModiInTesseractForUnknownWords.Size = new System.Drawing.Size(165, 17);
|
||||
|
@ -996,12 +996,20 @@ namespace Nikse.SubtitleEdit.Forms
|
||||
i++;
|
||||
}
|
||||
}
|
||||
comboBoxModiLanguage.SelectedIndex = -1;
|
||||
}
|
||||
|
||||
var sb = new StringBuilder();
|
||||
int badWords = 0;
|
||||
var textWithOutFixes = new StringBuilder();
|
||||
textWithOutFixes.Append(Tesseract3DoOcrViaExe(bitmap, _languageId));
|
||||
|
||||
if (textWithOutFixes.ToString().Trim().Length == 0)
|
||||
{
|
||||
textWithOutFixes = new StringBuilder();
|
||||
textWithOutFixes.Append(TesseractResizeAndRetry(bitmap));
|
||||
}
|
||||
|
||||
sb.Append(textWithOutFixes.ToString());
|
||||
int numberOfWords = sb.ToString().Split((" " + Environment.NewLine).ToCharArray(), StringSplitOptions.RemoveEmptyEntries).Length;
|
||||
|
||||
@ -1012,6 +1020,22 @@ namespace Nikse.SubtitleEdit.Forms
|
||||
line = _ocrFixEngine.FixOcrErrors(line, index, _lastLine, true, checkBoxGuessUnknownWords.Checked);
|
||||
int wordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(line);
|
||||
|
||||
if (wordsNotFound > 0)
|
||||
{
|
||||
string newText = TesseractResizeAndRetry(bitmap);
|
||||
newText = _ocrFixEngine.FixOcrErrors(newText, index, _lastLine, true, checkBoxGuessUnknownWords.Checked);
|
||||
int newWordsNotFound = _ocrFixEngine.CountUnknownWordsViaDictionary(newText);
|
||||
if (newWordsNotFound < wordsNotFound)
|
||||
{
|
||||
wordsNotFound = newWordsNotFound;
|
||||
textWithOutFixes = new StringBuilder();
|
||||
textWithOutFixes.Append(newText);
|
||||
sb = new StringBuilder();
|
||||
sb.Append(newText);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (wordsNotFound > 0 || sb.ToString().Replace("~", string.Empty).Trim().Length == 0)
|
||||
{
|
||||
_ocrFixEngine.AutoGuessesUsed.Clear();
|
||||
@ -1024,7 +1048,7 @@ namespace Nikse.SubtitleEdit.Forms
|
||||
if (modiText.Length == 0)
|
||||
modiText = CallModi(index); // retry... strange MODI
|
||||
if (modiText.Length == 0)
|
||||
modiText = CallModi(index); // retry... strange MODI
|
||||
modiText = CallModi(index); // retry... strange MODI
|
||||
|
||||
if (modiText.Length > 1)
|
||||
{
|
||||
@ -1106,6 +1130,18 @@ namespace Nikse.SubtitleEdit.Forms
|
||||
return line;
|
||||
}
|
||||
|
||||
/// <summary>
/// Re-runs Tesseract on enlarged copies of <paramref name="bitmap"/> until one attempt
/// yields non-blank text.
/// </summary>
/// <remarks>
/// Attempts are made in order with the width scaled by 2, 3 and 4; the height is always
/// doubled. The first attempt whose trimmed result is non-empty wins; if every attempt is
/// blank, the (blank) result of the last attempt is used.
/// </remarks>
/// <param name="bitmap">Source image to OCR; it is not modified or disposed here.</param>
/// <returns>The OCR text of the winning attempt with trailing whitespace removed.</returns>
private string TesseractResizeAndRetry(Bitmap bitmap)
{
    // Width multipliers tried in order; the height multiplier stays fixed at 2.
    int[] widthFactors = { 2, 3, 4 };
    string result = string.Empty;

    foreach (int widthFactor in widthFactors)
    {
        // The enlarged copy is only needed for this one OCR call, so release it right away
        // instead of leaving it to the finalizer.
        // NOTE(review): assumes ResizeBitmap always returns a new image (never the input
        // instance) and that Tesseract3DoOcrViaExe does not keep a reference - confirm.
        using (var resized = ResizeBitmap(bitmap, bitmap.Width * widthFactor, bitmap.Height * 2))
        {
            result = Tesseract3DoOcrViaExe(resized, _languageId);
        }

        if (result.Trim().Length > 0)
            break;
    }

    return result.TrimEnd();
}
|
||||
|
||||
private void LogOcrFix(int index, string oldLine, string newLine)
|
||||
{
|
||||
listBoxLog.Items.Add(string.Format("#{0}: {1} -> {2}", index+1, oldLine.Replace(Environment.NewLine, " "), newLine.Replace(Environment.NewLine, " ")));
|
||||
@ -1452,7 +1488,7 @@ namespace Nikse.SubtitleEdit.Forms
|
||||
saveFileDialog1.Title = Configuration.Settings.Language.VobSubOcr.SaveSubtitleImageAs;
|
||||
saveFileDialog1.AddExtension = true;
|
||||
saveFileDialog1.FileName = "Image" + _selectedIndex;
|
||||
saveFileDialog1.Filter = "PNG image|*.png|BMP image|*.bmp|GIF image|*.gif";
|
||||
saveFileDialog1.Filter = "PNG image|*.png|BMP image|*.bmp|GIF image|*.gif|TIFF image|*.tiff";
|
||||
saveFileDialog1.FilterIndex = 0;
|
||||
|
||||
DialogResult result = saveFileDialog1.ShowDialog(this);
|
||||
@ -1471,8 +1507,10 @@ namespace Nikse.SubtitleEdit.Forms
|
||||
bmp.Save(saveFileDialog1.FileName, System.Drawing.Imaging.ImageFormat.Png);
|
||||
else if (saveFileDialog1.FilterIndex == 1)
|
||||
bmp.Save(saveFileDialog1.FileName);
|
||||
else
|
||||
else if (saveFileDialog1.FilterIndex == 2)
|
||||
bmp.Save(saveFileDialog1.FileName, System.Drawing.Imaging.ImageFormat.Gif);
|
||||
else
|
||||
bmp.Save(saveFileDialog1.FileName, System.Drawing.Imaging.ImageFormat.Tiff);
|
||||
}
|
||||
catch (Exception exception)
|
||||
{
|
||||
|
@ -575,24 +575,36 @@ namespace Nikse.SubtitleEdit.Logic.OCR
|
||||
|
||||
if (autoFix && useAutoGuess)
|
||||
{
|
||||
List<string> guesses = new List<string>();
|
||||
|
||||
if (word.Length > 5)
|
||||
{
|
||||
foreach (string guess in CreateGuessesFromLetters(word))
|
||||
guesses = (List<string>)CreateGuessesFromLetters(word);
|
||||
}
|
||||
else
|
||||
{
|
||||
guesses.Add(word.Replace("fi", "fi"));
|
||||
guesses.Add(word.Replace("fi", "fj"));
|
||||
guesses.Add(word.Replace("fl", "fl"));
|
||||
if (!word.EndsWith("€") && !word.StartsWith("€"))
|
||||
guesses.Add(word.Replace("€", "e"));
|
||||
guesses.Add(word.Replace("fi", "fj"));
|
||||
}
|
||||
foreach (string guess in guesses)
|
||||
{
|
||||
if (IsWordOrWordsCorrect(_hunspell, guess))
|
||||
{
|
||||
if (IsWordOrWordsCorrect(_hunspell, guess))
|
||||
var regex = new Regex(@"\b" + word + @"\b");
|
||||
Match match = regex.Match(line);
|
||||
if (match.Success)
|
||||
{
|
||||
var regex = new Regex(@"\b" + word + @"\b");
|
||||
Match match = regex.Match(line);
|
||||
if (match.Success)
|
||||
{
|
||||
if (log)
|
||||
AutoGuessesUsed.Add(string.Format("#{0}: {1} -> {2} in line via '{3}': {4}", index + 1, word, guess, "OCRFixReplaceList.xml", line.Replace(Environment.NewLine, " ")));
|
||||
if (log)
|
||||
AutoGuessesUsed.Add(string.Format("#{0}: {1} -> {2} in line via '{3}': {4}", index + 1, word, guess, "OCRFixReplaceList.xml", line.Replace(Environment.NewLine, " ")));
|
||||
|
||||
line = line.Remove(match.Index, match.Value.Length).Insert(match.Index, guess);
|
||||
wordsNotFound--;
|
||||
correct = true;
|
||||
break;
|
||||
}
|
||||
line = line.Remove(match.Index, match.Value.Length).Insert(match.Index, guess);
|
||||
wordsNotFound--;
|
||||
correct = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -636,8 +648,8 @@ namespace Nikse.SubtitleEdit.Logic.OCR
|
||||
case OcrSpellCheck.Action.AddToUserDictionary:
|
||||
if (_userWordListXmlFileName != null)
|
||||
{
|
||||
_userWordList.Add(_spellCheck.Word);
|
||||
Utilities.AddToUserDictionary(_spellCheck.Word, _fiveLetterWordListLanguageName);
|
||||
_userWordList.Add(_spellCheck.Word.Trim().ToLower());
|
||||
Utilities.AddToUserDictionary(_spellCheck.Word.Trim().ToLower(), _fiveLetterWordListLanguageName);
|
||||
}
|
||||
result.Word = _spellCheck.Word;
|
||||
result.Fixed = true;
|
||||
@ -800,7 +812,7 @@ namespace Nikse.SubtitleEdit.Logic.OCR
|
||||
|
||||
private static string AddToGuessList(List<string> list, string word, int index, string letter, string replaceLetters)
|
||||
{
|
||||
if (string.IsNullOrEmpty(word) || index < 0 || index >= word.Length)
|
||||
if (string.IsNullOrEmpty(word) || index < 0 || index + letter.Length - 1 >= word.Length)
|
||||
return word;
|
||||
|
||||
string s = word.Remove(index, letter.Length);
|
||||
|
Loading…
Reference in New Issue
Block a user