Some OCR improvements + hack for buggy nhunspell

git-svn-id: https://subtitleedit.googlecode.com/svn/trunk@378 99eadd0c-20b8-1223-b5c4-2a2b2df33de2
This commit is contained in:
niksedk 2011-03-30 17:45:36 +00:00
parent b5360a5b97
commit fa15215da9
9 changed files with 194 additions and 186 deletions

View File

@ -92,48 +92,41 @@ namespace Nikse.SubtitleEdit.Forms
private static void HighLightWord(RichTextBox richTextBoxParagraph, string word)
{
bool startApos = false;
if (word.StartsWith("'") && word.Length > 1)
if (word != null && richTextBoxParagraph.Text.Contains(word))
{
startApos = true;
word = word.Substring(1);
}
Regex regex = Utilities.MakeWordSearchRegex(word);
Match match = regex.Match(richTextBoxParagraph.Text);
if (!match.Success)
{
regex = Utilities.MakeWordSearchRegexWithNumbers(word);
match = regex.Match(richTextBoxParagraph.Text);
}
while (match.Success)
{
if (startApos)
for (int i = 0; i < richTextBoxParagraph.Text.Length; i++)
{
richTextBoxParagraph.SelectionStart = match.Index-1;
richTextBoxParagraph.SelectionLength = match.Length+1;
while (richTextBoxParagraph.SelectedText != "'" + match.Value && richTextBoxParagraph.SelectionStart > 0)
{
richTextBoxParagraph.SelectionStart = richTextBoxParagraph.SelectionStart - 1;
richTextBoxParagraph.SelectionLength = match.Length+1;
if (richTextBoxParagraph.Text.Substring(i).StartsWith(word))
{
bool startOk = i == 0;
if (!startOk)
startOk = (" <>-\"”“[]'`´¶()♪¿¡.…—!?,:;/" + Environment.NewLine).Contains(richTextBoxParagraph.Text.Substring(i - 1, 1));
if (startOk)
{
bool endOK = (i + word.Length == richTextBoxParagraph.Text.Length);
if (!endOK)
endOK = (" <>-\"”“[]'`´¶()♪¿¡.…—!?,:;/" + Environment.NewLine).Contains(richTextBoxParagraph.Text.Substring(i + word.Length, 1));
if (endOK)
{
richTextBoxParagraph.SelectionStart = i+ 1;
richTextBoxParagraph.SelectionLength = word.Length;
while (richTextBoxParagraph.SelectedText != word && richTextBoxParagraph.SelectionStart > 0)
{
richTextBoxParagraph.SelectionStart = richTextBoxParagraph.SelectionStart - 1;
richTextBoxParagraph.SelectionLength = word.Length;
}
if (richTextBoxParagraph.SelectedText == word)
{
richTextBoxParagraph.SelectionColor = Color.Red;
}
}
}
}
}
else
{
richTextBoxParagraph.SelectionStart = match.Index + 1;
richTextBoxParagraph.SelectionLength = match.Length;
while (richTextBoxParagraph.SelectedText != match.Value && richTextBoxParagraph.SelectionStart > 0)
{
richTextBoxParagraph.SelectionStart = richTextBoxParagraph.SelectionStart - 1;
richTextBoxParagraph.SelectionLength = match.Length;
}
}
richTextBoxParagraph.SelectionColor = Color.Red;
match = match.NextMatch();
}
richTextBoxParagraph.SelectionLength = 0;
richTextBoxParagraph.SelectionStart = 0;
richTextBoxParagraph.SelectionLength = 0;
richTextBoxParagraph.SelectionStart = 0;
}
}
private void ButtonEditWholeTextClick(object sender, EventArgs e)

View File

@ -550,7 +550,11 @@ namespace Nikse.SubtitleEdit.Forms
{
_mainWindow.FocusParagraph(_currentIndex);
List<string> suggestions = _hunspell.Suggest(_currentWord);
List<string> suggestions = new List<string>();
if (_currentWord.Length > 4 || !_currentWord.Contains("'")) //TODO: get fixed nhunspell
suggestions = _hunspell.Suggest(_currentWord); //TODO: 0.9.6 fails on "Lt'S"
if (AutoFixNames && _currentWord.Length > 1 && suggestions.Contains(_currentWord.Substring(0, 1).ToUpper() + _currentWord.Substring(1)))
{
ChangeWord = _currentWord.Substring(0, 1).ToUpper() + _currentWord.Substring(1);

View File

@ -67,6 +67,7 @@ namespace Nikse.SubtitleEdit.Forms
this.buttonStop = new System.Windows.Forms.Button();
this.buttonStartOcr = new System.Windows.Forms.Button();
this.groupBoxOcrAutoFix = new System.Windows.Forms.GroupBox();
this.comboBoxDictionaries = new System.Windows.Forms.ComboBox();
this.checkBoxGuessUnknownWords = new System.Windows.Forms.CheckBox();
this.tabControlLogs = new System.Windows.Forms.TabControl();
this.tabPageAllFixes = new System.Windows.Forms.TabPage();
@ -94,7 +95,6 @@ namespace Nikse.SubtitleEdit.Forms
this.checkBoxShowOnlyForced = new System.Windows.Forms.CheckBox();
this.checkBoxUseTimeCodesFromIdx = new System.Windows.Forms.CheckBox();
this.folderBrowserDialog1 = new System.Windows.Forms.FolderBrowserDialog();
this.comboBoxDictionaries = new System.Windows.Forms.ComboBox();
this.subtitleListView1 = new Nikse.SubtitleEdit.Controls.SubtitleListView();
((System.ComponentModel.ISupportInitialize)(this.pictureBoxSubtitleImage)).BeginInit();
this.contextMenuStripListview.SuspendLayout();
@ -183,8 +183,8 @@ namespace Nikse.SubtitleEdit.Forms
//
// progressBar1
//
this.progressBar1.Anchor = ((System.Windows.Forms.AnchorStyles)(((System.Windows.Forms.AnchorStyles.Bottom | System.Windows.Forms.AnchorStyles.Left)
| System.Windows.Forms.AnchorStyles.Right)));
this.progressBar1.Anchor = ((System.Windows.Forms.AnchorStyles)(((System.Windows.Forms.AnchorStyles.Bottom | System.Windows.Forms.AnchorStyles.Left)
| System.Windows.Forms.AnchorStyles.Right)));
this.progressBar1.Location = new System.Drawing.Point(12, 552);
this.progressBar1.Name = "progressBar1";
this.progressBar1.Size = new System.Drawing.Size(827, 10);
@ -271,7 +271,7 @@ namespace Nikse.SubtitleEdit.Forms
this.checkBoxUseModiInTesseractForUnknownWords.Enabled = false;
this.checkBoxUseModiInTesseractForUnknownWords.Location = new System.Drawing.Point(22, 74);
this.checkBoxUseModiInTesseractForUnknownWords.Name = "checkBoxUseModiInTesseractForUnknownWords";
this.checkBoxUseModiInTesseractForUnknownWords.Size = new System.Drawing.Size(167, 17);
this.checkBoxUseModiInTesseractForUnknownWords.Size = new System.Drawing.Size(165, 17);
this.checkBoxUseModiInTesseractForUnknownWords.TabIndex = 39;
this.checkBoxUseModiInTesseractForUnknownWords.Text = "Try MODI for unknown words";
this.checkBoxUseModiInTesseractForUnknownWords.UseVisualStyleBackColor = true;
@ -346,7 +346,7 @@ namespace Nikse.SubtitleEdit.Forms
this.checkBoxRightToLeft.AutoSize = true;
this.checkBoxRightToLeft.Location = new System.Drawing.Point(128, 112);
this.checkBoxRightToLeft.Name = "checkBoxRightToLeft";
this.checkBoxRightToLeft.Size = new System.Drawing.Size(83, 17);
this.checkBoxRightToLeft.Size = new System.Drawing.Size(80, 17);
this.checkBoxRightToLeft.TabIndex = 40;
this.checkBoxRightToLeft.Text = "Right to left";
this.checkBoxRightToLeft.UseVisualStyleBackColor = true;
@ -502,9 +502,9 @@ namespace Nikse.SubtitleEdit.Forms
//
// groupBoxOcrAutoFix
//
this.groupBoxOcrAutoFix.Anchor = ((System.Windows.Forms.AnchorStyles)((((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Bottom)
| System.Windows.Forms.AnchorStyles.Left)
| System.Windows.Forms.AnchorStyles.Right)));
this.groupBoxOcrAutoFix.Anchor = ((System.Windows.Forms.AnchorStyles)((((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Bottom)
| System.Windows.Forms.AnchorStyles.Left)
| System.Windows.Forms.AnchorStyles.Right)));
this.groupBoxOcrAutoFix.Controls.Add(this.comboBoxDictionaries);
this.groupBoxOcrAutoFix.Controls.Add(this.checkBoxGuessUnknownWords);
this.groupBoxOcrAutoFix.Controls.Add(this.tabControlLogs);
@ -520,6 +520,18 @@ namespace Nikse.SubtitleEdit.Forms
this.groupBoxOcrAutoFix.TabStop = false;
this.groupBoxOcrAutoFix.Text = "OCR auto correction / spellchecking";
//
// comboBoxDictionaries
//
this.comboBoxDictionaries.Anchor = ((System.Windows.Forms.AnchorStyles)(((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Left)
| System.Windows.Forms.AnchorStyles.Right)));
this.comboBoxDictionaries.DropDownStyle = System.Windows.Forms.ComboBoxStyle.DropDownList;
this.comboBoxDictionaries.FormattingEnabled = true;
this.comboBoxDictionaries.Location = new System.Drawing.Point(127, 15);
this.comboBoxDictionaries.Name = "comboBoxDictionaries";
this.comboBoxDictionaries.Size = new System.Drawing.Size(171, 21);
this.comboBoxDictionaries.TabIndex = 41;
this.comboBoxDictionaries.SelectedIndexChanged += new System.EventHandler(this.comboBoxDictionaries_SelectedIndexChanged);
//
// checkBoxGuessUnknownWords
//
this.checkBoxGuessUnknownWords.AutoSize = true;
@ -527,16 +539,16 @@ namespace Nikse.SubtitleEdit.Forms
this.checkBoxGuessUnknownWords.CheckState = System.Windows.Forms.CheckState.Checked;
this.checkBoxGuessUnknownWords.Location = new System.Drawing.Point(11, 83);
this.checkBoxGuessUnknownWords.Name = "checkBoxGuessUnknownWords";
this.checkBoxGuessUnknownWords.Size = new System.Drawing.Size(164, 17);
this.checkBoxGuessUnknownWords.Size = new System.Drawing.Size(162, 17);
this.checkBoxGuessUnknownWords.TabIndex = 39;
this.checkBoxGuessUnknownWords.Text = "Try to guess unknown words";
this.checkBoxGuessUnknownWords.UseVisualStyleBackColor = true;
//
// tabControlLogs
//
this.tabControlLogs.Anchor = ((System.Windows.Forms.AnchorStyles)((((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Bottom)
| System.Windows.Forms.AnchorStyles.Left)
| System.Windows.Forms.AnchorStyles.Right)));
this.tabControlLogs.Anchor = ((System.Windows.Forms.AnchorStyles)((((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Bottom)
| System.Windows.Forms.AnchorStyles.Left)
| System.Windows.Forms.AnchorStyles.Right)));
this.tabControlLogs.Controls.Add(this.tabPageAllFixes);
this.tabControlLogs.Controls.Add(this.tabPageSuggestions);
this.tabControlLogs.Controls.Add(this.tabPageUnknownWords);
@ -559,9 +571,9 @@ namespace Nikse.SubtitleEdit.Forms
//
// listBoxLog
//
this.listBoxLog.Anchor = ((System.Windows.Forms.AnchorStyles)((((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Bottom)
| System.Windows.Forms.AnchorStyles.Left)
| System.Windows.Forms.AnchorStyles.Right)));
this.listBoxLog.Anchor = ((System.Windows.Forms.AnchorStyles)((((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Bottom)
| System.Windows.Forms.AnchorStyles.Left)
| System.Windows.Forms.AnchorStyles.Right)));
this.listBoxLog.Font = new System.Drawing.Font("Tahoma", 8.25F, System.Drawing.FontStyle.Regular, System.Drawing.GraphicsUnit.Point, ((byte)(0)));
this.listBoxLog.FormattingEnabled = true;
this.listBoxLog.Location = new System.Drawing.Point(5, 6);
@ -583,9 +595,9 @@ namespace Nikse.SubtitleEdit.Forms
//
// listBoxLogSuggestions
//
this.listBoxLogSuggestions.Anchor = ((System.Windows.Forms.AnchorStyles)((((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Bottom)
| System.Windows.Forms.AnchorStyles.Left)
| System.Windows.Forms.AnchorStyles.Right)));
this.listBoxLogSuggestions.Anchor = ((System.Windows.Forms.AnchorStyles)((((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Bottom)
| System.Windows.Forms.AnchorStyles.Left)
| System.Windows.Forms.AnchorStyles.Right)));
this.listBoxLogSuggestions.Font = new System.Drawing.Font("Tahoma", 8.25F, System.Drawing.FontStyle.Regular, System.Drawing.GraphicsUnit.Point, ((byte)(0)));
this.listBoxLogSuggestions.FormattingEnabled = true;
this.listBoxLogSuggestions.Location = new System.Drawing.Point(5, 6);
@ -606,9 +618,9 @@ namespace Nikse.SubtitleEdit.Forms
//
// listBoxUnknownWords
//
this.listBoxUnknownWords.Anchor = ((System.Windows.Forms.AnchorStyles)((((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Bottom)
| System.Windows.Forms.AnchorStyles.Left)
| System.Windows.Forms.AnchorStyles.Right)));
this.listBoxUnknownWords.Anchor = ((System.Windows.Forms.AnchorStyles)((((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Bottom)
| System.Windows.Forms.AnchorStyles.Left)
| System.Windows.Forms.AnchorStyles.Right)));
this.listBoxUnknownWords.Font = new System.Drawing.Font("Tahoma", 8.25F, System.Drawing.FontStyle.Regular, System.Drawing.GraphicsUnit.Point, ((byte)(0)));
this.listBoxUnknownWords.FormattingEnabled = true;
this.listBoxUnknownWords.Location = new System.Drawing.Point(5, 6);
@ -633,7 +645,7 @@ namespace Nikse.SubtitleEdit.Forms
this.checkBoxPromptForUnknownWords.CheckState = System.Windows.Forms.CheckState.Checked;
this.checkBoxPromptForUnknownWords.Location = new System.Drawing.Point(11, 61);
this.checkBoxPromptForUnknownWords.Name = "checkBoxPromptForUnknownWords";
this.checkBoxPromptForUnknownWords.Size = new System.Drawing.Size(255, 17);
this.checkBoxPromptForUnknownWords.Size = new System.Drawing.Size(246, 17);
this.checkBoxPromptForUnknownWords.TabIndex = 38;
this.checkBoxPromptForUnknownWords.Text = "Prompt for unknown words (requires dictionary)";
this.checkBoxPromptForUnknownWords.UseVisualStyleBackColor = true;
@ -645,7 +657,7 @@ namespace Nikse.SubtitleEdit.Forms
this.checkBoxAutoBreakLines.CheckState = System.Windows.Forms.CheckState.Checked;
this.checkBoxAutoBreakLines.Location = new System.Drawing.Point(11, 105);
this.checkBoxAutoBreakLines.Name = "checkBoxAutoBreakLines";
this.checkBoxAutoBreakLines.Size = new System.Drawing.Size(208, 17);
this.checkBoxAutoBreakLines.Size = new System.Drawing.Size(200, 17);
this.checkBoxAutoBreakLines.TabIndex = 37;
this.checkBoxAutoBreakLines.Text = "Auto break subtitle, if line number > 2";
this.checkBoxAutoBreakLines.UseVisualStyleBackColor = true;
@ -666,7 +678,7 @@ namespace Nikse.SubtitleEdit.Forms
this.checkBoxAutoFixCommonErrors.CheckState = System.Windows.Forms.CheckState.Checked;
this.checkBoxAutoFixCommonErrors.Location = new System.Drawing.Point(11, 39);
this.checkBoxAutoFixCommonErrors.Name = "checkBoxAutoFixCommonErrors";
this.checkBoxAutoFixCommonErrors.Size = new System.Drawing.Size(139, 17);
this.checkBoxAutoFixCommonErrors.Size = new System.Drawing.Size(137, 17);
this.checkBoxAutoFixCommonErrors.TabIndex = 34;
this.checkBoxAutoFixCommonErrors.Text = "Fix common OCR errors";
this.checkBoxAutoFixCommonErrors.UseVisualStyleBackColor = true;
@ -692,7 +704,7 @@ namespace Nikse.SubtitleEdit.Forms
this.checkBoxEmphasis2Transparent.AutoSize = true;
this.checkBoxEmphasis2Transparent.Location = new System.Drawing.Point(437, 19);
this.checkBoxEmphasis2Transparent.Name = "checkBoxEmphasis2Transparent";
this.checkBoxEmphasis2Transparent.Size = new System.Drawing.Size(85, 17);
this.checkBoxEmphasis2Transparent.Size = new System.Drawing.Size(83, 17);
this.checkBoxEmphasis2Transparent.TabIndex = 6;
this.checkBoxEmphasis2Transparent.Text = "Transparent";
this.checkBoxEmphasis2Transparent.UseVisualStyleBackColor = true;
@ -703,7 +715,7 @@ namespace Nikse.SubtitleEdit.Forms
this.checkBoxEmphasis1Transparent.AutoSize = true;
this.checkBoxEmphasis1Transparent.Location = new System.Drawing.Point(304, 19);
this.checkBoxEmphasis1Transparent.Name = "checkBoxEmphasis1Transparent";
this.checkBoxEmphasis1Transparent.Size = new System.Drawing.Size(85, 17);
this.checkBoxEmphasis1Transparent.Size = new System.Drawing.Size(83, 17);
this.checkBoxEmphasis1Transparent.TabIndex = 5;
this.checkBoxEmphasis1Transparent.Text = "Transparent";
this.checkBoxEmphasis1Transparent.UseVisualStyleBackColor = true;
@ -714,7 +726,7 @@ namespace Nikse.SubtitleEdit.Forms
this.checkBoxPatternTransparent.AutoSize = true;
this.checkBoxPatternTransparent.Location = new System.Drawing.Point(167, 19);
this.checkBoxPatternTransparent.Name = "checkBoxPatternTransparent";
this.checkBoxPatternTransparent.Size = new System.Drawing.Size(85, 17);
this.checkBoxPatternTransparent.Size = new System.Drawing.Size(83, 17);
this.checkBoxPatternTransparent.TabIndex = 4;
this.checkBoxPatternTransparent.Text = "Transparent";
this.checkBoxPatternTransparent.UseVisualStyleBackColor = true;
@ -799,23 +811,10 @@ namespace Nikse.SubtitleEdit.Forms
this.checkBoxUseTimeCodesFromIdx.UseVisualStyleBackColor = true;
this.checkBoxUseTimeCodesFromIdx.CheckedChanged += new System.EventHandler(this.checkBoxUseTimeCodesFromIdx_CheckedChanged);
//
// comboBoxDictionaries
//
this.comboBoxDictionaries.Anchor = ((System.Windows.Forms.AnchorStyles)(((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Left)
| System.Windows.Forms.AnchorStyles.Right)));
this.comboBoxDictionaries.DropDownStyle = System.Windows.Forms.ComboBoxStyle.DropDownList;
this.comboBoxDictionaries.FormattingEnabled = true;
this.comboBoxDictionaries.Location = new System.Drawing.Point(127, 15);
this.comboBoxDictionaries.Name = "comboBoxDictionaries";
this.comboBoxDictionaries.Size = new System.Drawing.Size(171, 21);
this.comboBoxDictionaries.TabIndex = 41;
this.comboBoxDictionaries.Visible = false;
this.comboBoxDictionaries.SelectedIndexChanged += new System.EventHandler(this.comboBoxDictionaries_SelectedIndexChanged);
//
// subtitleListView1
//
this.subtitleListView1.Anchor = ((System.Windows.Forms.AnchorStyles)(((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Bottom)
| System.Windows.Forms.AnchorStyles.Left)));
this.subtitleListView1.Anchor = ((System.Windows.Forms.AnchorStyles)(((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Bottom)
| System.Windows.Forms.AnchorStyles.Left)));
this.subtitleListView1.ContextMenuStrip = this.contextMenuStripListview;
this.subtitleListView1.FirstVisibleIndex = -1;
this.subtitleListView1.Font = new System.Drawing.Font("Tahoma", 9F, System.Drawing.FontStyle.Regular, System.Drawing.GraphicsUnit.Point, ((byte)(0)));

View File

@ -145,8 +145,10 @@ namespace Nikse.SubtitleEdit.Forms
groupBoxSubtitleImage.Text = string.Empty;
labelFixesMade.Text = string.Empty;
labelFixesMade.Left = checkBoxAutoFixCommonErrors.Left + checkBoxAutoFixCommonErrors.Width;
labelDictionaryLoaded.Text = string.Empty;
comboBoxDictionaries.Visible = false;
labelDictionaryLoaded.Text = string.Format(Configuration.Settings.Language.VobSubOcr.DictionaryX, string.Empty);
comboBoxDictionaries.Left = labelDictionaryLoaded.Left + labelDictionaryLoaded.Width;
groupBoxImageCompareMethod.Text = language.OcrViaImageCompare;
groupBoxModiMethod.Text = language.OcrViaModi;
checkBoxAutoFixCommonErrors.Text = language.FixOcrErrors;
@ -1272,54 +1274,13 @@ namespace Nikse.SubtitleEdit.Forms
private string OcrViaTessnet(Bitmap bitmap, int index)
{
if (_ocrFixEngine == null)
{
_languageId = (comboBoxTesseractLanguages.SelectedItem as TesseractLanguage).Id;
_ocrFixEngine = new OcrFixEngine(_languageId, this);
if (_ocrFixEngine.IsDictionaryLoaded)
{
labelDictionaryLoaded.Text = string.Format(Configuration.Settings.Language.VobSubOcr.DictionaryX, string.Empty); // _ocrFixEngine.DictionaryCulture.NativeName);
string loadedDictionaryName = _ocrFixEngine.SpellCheckDictionaryName;
int i = 0;
comboBoxDictionaries.SelectedIndexChanged -= comboBoxDictionaries_SelectedIndexChanged;
foreach (string item in comboBoxDictionaries.Items)
{
if (item.Contains("[" + loadedDictionaryName + "]"))
comboBoxDictionaries.SelectedIndex = i;
i++;
}
comboBoxDictionaries.SelectedIndexChanged += comboBoxDictionaries_SelectedIndexChanged;
comboBoxDictionaries.Left = labelDictionaryLoaded.Left + labelDictionaryLoaded.Width;
comboBoxDictionaries.Width = groupBoxOcrAutoFix.Width - (comboBoxDictionaries.Left + 5);
comboBoxDictionaries.Visible = true;
}
else
{
labelDictionaryLoaded.Text = string.Format(Configuration.Settings.Language.VobSubOcr.DictionaryX, Configuration.Settings.Language.General.None);
comboBoxDictionaries.SelectedIndex = 0;
}
if (_modiEnabled && checkBoxUseModiInTesseractForUnknownWords.Checked)
{
string tesseractLanguageText = (comboBoxTesseractLanguages.SelectedItem as TesseractLanguage).Text;
int i = 0;
foreach (var modiLanguage in comboBoxModiLanguage.Items)
{
if ((modiLanguage as ModiLanguage).Text == tesseractLanguageText)
comboBoxModiLanguage.SelectedIndex = i;
i++;
}
}
comboBoxModiLanguage.SelectedIndex = -1;
}
LoadOcrFixEngine();
int badWords = 0;
string textWithOutFixes = Tesseract3DoOcrViaExe(bitmap, _languageId);
if (textWithOutFixes.ToString().Trim().Length == 0)
{
textWithOutFixes = TesseractResizeAndRetry(bitmap);
}
int numberOfWords = textWithOutFixes.ToString().Split((" " + Environment.NewLine).ToCharArray(), StringSplitOptions.RemoveEmptyEntries).Length;
@ -1344,10 +1305,11 @@ namespace Nikse.SubtitleEdit.Forms
}
}
if (wordsNotFound > 0 || correctWords == 0 || textWithOutFixes.ToString().Replace("~", string.Empty).Trim().Length == 0)
{
if (wordsNotFound > 0 || correctWords == 0 || textWithOutFixes != null && textWithOutFixes.ToString().Replace("~", string.Empty).Trim().Length == 0)
{
_ocrFixEngine.AutoGuessesUsed.Clear();
_ocrFixEngine.UnknownWordsFound.Clear();
if (_modiEnabled && checkBoxUseModiInTesseractForUnknownWords.Checked)
{
// which is best - modi or tesseract - we find out here
@ -1726,6 +1688,45 @@ namespace Nikse.SubtitleEdit.Forms
{
Configuration.Settings.VobSubOcr.TesseractLastLanguage = (comboBoxTesseractLanguages.SelectedItem as TesseractLanguage).Id;
_ocrFixEngine = null;
LoadOcrFixEngine();
}
/// <summary>
/// Creates a new <c>OcrFixEngine</c> for the Tesseract language currently selected in
/// <c>comboBoxTesseractLanguages</c> and updates the dictionary and MODI language
/// combo boxes accordingly.
/// </summary>
private void LoadOcrFixEngine()
{
    _languageId = (comboBoxTesseractLanguages.SelectedItem as TesseractLanguage).Id;
    _ocrFixEngine = new OcrFixEngine(_languageId, this);

    if (!_ocrFixEngine.IsDictionaryLoaded)
    {
        // No dictionary was loaded: fall back to the first entry of the dictionary list.
        comboBoxDictionaries.SelectedIndex = 0;
    }
    else
    {
        // Dictionary items are matched on the loaded dictionary name wrapped in brackets.
        string bracketedDictionaryName = "[" + _ocrFixEngine.SpellCheckDictionaryName + "]";
        int dictionaryIndex = 0;

        // The handler is detached while the selection is changed from code, so
        // comboBoxDictionaries_SelectedIndexChanged does not run for these assignments.
        comboBoxDictionaries.SelectedIndexChanged -= comboBoxDictionaries_SelectedIndexChanged;
        foreach (string dictionaryItem in comboBoxDictionaries.Items)
        {
            if (dictionaryItem.Contains(bracketedDictionaryName))
                comboBoxDictionaries.SelectedIndex = dictionaryIndex;
            dictionaryIndex++;
        }
        comboBoxDictionaries.SelectedIndexChanged += comboBoxDictionaries_SelectedIndexChanged;

        // Place the combo box directly after the label and let it fill the rest of the group box.
        comboBoxDictionaries.Left = labelDictionaryLoaded.Left + labelDictionaryLoaded.Width;
        comboBoxDictionaries.Width = groupBoxOcrAutoFix.Width - (comboBoxDictionaries.Left + 5);
    }

    if (_modiEnabled && checkBoxUseModiInTesseractForUnknownWords.Checked)
    {
        // Select the MODI language whose text equals the selected Tesseract language text.
        string tesseractLanguageText = (comboBoxTesseractLanguages.SelectedItem as TesseractLanguage).Text;
        int modiIndex = 0;
        foreach (var modiLanguage in comboBoxModiLanguage.Items)
        {
            if ((modiLanguage as ModiLanguage).Text == tesseractLanguageText)
                comboBoxModiLanguage.SelectedIndex = modiIndex;
            modiIndex++;
        }
    }

    // NOTE(review): this runs unconditionally, so it also clears a selection made by
    // the loop above - confirm that this is intended.
    comboBoxModiLanguage.SelectedIndex = -1;
}
private void ComboBoxOcrMethodSelectedIndexChanged(object sender, EventArgs e)

View File

@ -698,12 +698,7 @@ namespace Nikse.SubtitleEdit.Logic.OCR
{
if (newText.Contains(from))
{
var regex = new Regex(@"\b" + from + @"\b");
Match match = regex.Match(newText);
if (match.Success)
{
newText = newText.Remove(match.Index, match.Value.Length).Insert(match.Index, _partialLineReplaceList[from]);
}
newText = ReplaceWord(newText, from, _partialLineReplaceList[from]);
}
}
return newText;
@ -724,7 +719,6 @@ namespace Nikse.SubtitleEdit.Logic.OCR
bool correct = _hunspell.Spell(word);
if (!correct)
correct = _hunspell.Spell(word.Trim('\''));
if (!correct)
{
wordsNotFound++;
@ -734,10 +728,13 @@ namespace Nikse.SubtitleEdit.Logic.OCR
if (autoFix && useAutoGuess)
{
List<string> guesses = new List<string>();
if (word.Length > 5)
{
guesses = (List<string>)CreateGuessesFromLetters(word);
string wordWithCasingChanged = GetWordWithDominatedCasing(word);
if (_hunspell.Spell(word.ToLower()))
guesses.Insert(0, wordWithCasingChanged);
}
else
{
@ -755,14 +752,14 @@ namespace Nikse.SubtitleEdit.Logic.OCR
{
if (IsWordOrWordsCorrect(_hunspell, guess))
{
var regex = new Regex(@"\b" + word + @"\b");
Match match = regex.Match(line);
if (match.Success)
string replacedLine = ReplaceWord(line, word, guess);
if (replacedLine != line)
{
if (log)
AutoGuessesUsed.Add(string.Format("#{0}: {1} -> {2} in line via '{3}': {4}", index + 1, word, guess, "OCRFixReplaceList.xml", line.Replace(Environment.NewLine, " ")));
line = line.Remove(match.Index, match.Value.Length).Insert(match.Index, guess);
//line = line.Remove(match.Index, match.Value.Length).Insert(match.Index, guess);
line = replacedLine;
wordsNotFound--;
correct = true;
break;
@ -772,7 +769,11 @@ namespace Nikse.SubtitleEdit.Logic.OCR
}
if (!correct && promptForFixingErrors)
{
List<string> suggestions = _hunspell.Suggest(word);
List<string> suggestions = new List<string>();
if (word.Length > 4 || !word.Contains("'")) //TODO: get fixed nhunspell
suggestions = _hunspell.Suggest(word); // 0.9.6 fails on "Lt'S"
SpellcheckOcrTextResult res = SpellcheckOcrText(line, bitmap, words, i, word, suggestions);
if (res.FixedWholeLine)
{
@ -790,6 +791,25 @@ namespace Nikse.SubtitleEdit.Logic.OCR
return line;
}
/// <summary>
/// Converts <paramref name="word"/> to a single casing chosen by majority: the result is
/// all upper case when the word has more upper-case than lower-case letters, otherwise
/// (including ties) it is all lower case.
/// </summary>
/// <param name="word">The word to normalise.</param>
/// <returns>The word in upper case or in lower case.</returns>
private string GetWordWithDominatedCasing(string word)
{
    // The letter sets come from Utilities.GetLetters; characters in neither set are not counted.
    string upperAlphabet = Utilities.GetLetters(true, false, false);
    string lowerAlphabet = Utilities.GetLetters(false, true, false);

    int upperCount = 0;
    int lowerCount = 0;
    foreach (char letter in word)
    {
        if (lowerAlphabet.IndexOf(letter) >= 0)
            lowerCount++;
        else if (upperAlphabet.IndexOf(letter) >= 0)
            upperCount++;
    }

    return upperCount > lowerCount ? word.ToUpper() : word.ToLower();
}
/// <summary>
/// Spellcheck for ocr
/// </summary>
@ -874,44 +894,43 @@ namespace Nikse.SubtitleEdit.Logic.OCR
}
if (result.Fixed)
{
var regEx = Utilities.MakeWordSearchRegex(word);
Match match = regEx.Match(line);
if (match.Success)
{
result.Line = line.Remove(match.Index, word.Length).Insert(match.Index, result.Word);
}
else // some word containing a number or other strange character
{
if (line.EndsWith(" " + word))
{
result.Line = line.Substring(0, line.Length - word.Length) + result.Word;
}
else if (line.StartsWith(word + " ") || line.StartsWith(word + ",") || line.StartsWith(word + "."))
{
result.Line = result.Word + line.Substring(word.Length);
}
else
{
regEx = Utilities.MakeWordSearchRegexWithNumbers(word);
match = regEx.Match(line);
if (match.Success)
{
int startIndex = match.Index;
if (match.Value.StartsWith(" "))
startIndex++;
result.Line = line.Remove(startIndex, word.Length).Insert(startIndex, result.Word);
}
else
{
result.Fixed = false;
MessageBox.Show("Unable to find word via regex: " + word);
}
}
}
result.Line = ReplaceWord(line, word, result.Word);
}
return result;
}
/// <summary>
/// Replaces every whole-word occurrence of <paramref name="word"/> in
/// <paramref name="text"/> with <paramref name="newWord"/>.
/// An occurrence counts as a whole word when it starts at the beginning of the text or
/// directly after a separator character, and ends at the end of the text or directly
/// before a separator character.
/// </summary>
/// <param name="text">The text to search. A null text yields an empty string.</param>
/// <param name="word">The word to look for (case-sensitive, ordinal comparison).</param>
/// <param name="newWord">The replacement for each whole-word occurrence.</param>
/// <returns>
/// The text with all whole-word occurrences replaced. When <paramref name="word"/> is null,
/// empty or does not occur in the text, the text is returned unchanged, so callers can
/// compare the result with the input to find out whether anything was replaced.
/// </returns>
private string ReplaceWord(string text, string word, string newWord)
{
    if (text == null)
        return string.Empty;

    // Nothing to replace: hand back the input instead of an empty string, which would
    // otherwise wipe the caller's line.
    if (string.IsNullOrEmpty(word) || !text.Contains(word))
        return text;

    string separators = " <>-\"”“[]'`´¶()♪¿¡.…—!?,:;/" + Environment.NewLine;
    StringBuilder sb = new StringBuilder(text.Length);
    int i = 0;
    while (i < text.Length)
    {
        int end = i + word.Length;

        // Ordinal comparison keeps the match consistent with the Contains check above and
        // guarantees that exactly word.Length characters were matched.
        if (end <= text.Length && string.CompareOrdinal(text, i, word, 0, word.Length) == 0)
        {
            bool startOk = i == 0 || separators.IndexOf(text[i - 1]) >= 0;
            bool endOk = end == text.Length || separators.IndexOf(text[end]) >= 0;
            if (startOk && endOk)
            {
                sb.Append(newWord);
                i = end; // continue after the replaced word
                continue;
            }
        }

        sb.Append(text[i]);
        i++;
    }
    return sb.ToString();
}
private void SaveWordToWordList(string word)
{
try

View File

@ -37,7 +37,7 @@ namespace Nikse.SubtitleEdit.Logic.SubtitleFormats
public override string Name
{
get { return "D-Cinema Subtitle"; }
get { return "D-Cinema"; }
}
public override bool HasLineNumber

View File

@ -43,16 +43,6 @@ namespace Nikse.SubtitleEdit.Logic.SubtitleFormats
public override string ToText(Subtitle subtitle, string title)
{
const string paragraphWriteFormat = "{0:00}:{1:00}:{2:00}.{3:00}, {4:00}:{5:00}:{6:00}.{7:00}{8}{9}";
//00:00:07.00, 00:00:12.00
//Welche Auswirkung Mikroversicherungen auf unsere Klienten hat? Lassen wir sie für sich selber sprechen!
//
//00:00:22.00, 00:00:27.00
//Arme Menschen in Uganda leben oft in schlechten Unterkünften.
var sb = new StringBuilder();
sb.AppendLine(" " + subtitle.Paragraphs.Count.ToString() + " 4 1234 ");
@ -95,7 +85,7 @@ SRPSKI
"{2}" + Environment.NewLine +
"{3}", p.StartTime.TotalMilliseconds, p.EndTime.TotalMilliseconds, firstLine, secondLine));
}
return sb.ToString().Trim(); //.Replace(Environment.NewLine, "\n");
return sb.ToString().Trim();
}
private int RoundTo2Cifres(int milliseconds)

View File

@ -1062,6 +1062,7 @@ namespace Nikse.SubtitleEdit.Logic
}
}
sb.Append("*" + new Pac().Extension + ";");
sb.Append("*" + new Cavena890().Extension + ";");
sb.Append("*.sup");
sb.Append("|" + Configuration.Settings.Language.General.AllFiles + "|*.*");
return sb.ToString();

View File

@ -517,6 +517,7 @@
<Compile Include="Logic\StripableText.cs" />
<Compile Include="Logic\SubtitleFormats\AdobeEncore.cs" />
<Compile Include="Logic\SubtitleFormats\AdobeEncoreTabs.cs" />
<Compile Include="Logic\SubtitleFormats\Cavena890.cs" />
<Compile Include="Logic\SubtitleFormats\DCSubtitle.cs" />
<Compile Include="Logic\SubtitleFormats\FinalCutProTextXml.cs" />
<Compile Include="Logic\SubtitleFormats\FinalCutProXml.cs" />