Initial version

git-svn-id: https://subtitleedit.googlecode.com/svn/trunk@15 99eadd0c-20b8-1223-b5c4-2a2b2df33de2
This commit is contained in:
niksedk 2010-10-12 11:22:45 +00:00
parent 8f89e3146e
commit 885fd6ac11
9 changed files with 1103 additions and 0 deletions

View File

@ -0,0 +1,9 @@
namespace Nikse.SubtitleEdit.Logic.Enums
{
/// <summary>
/// Kind of text search to perform.
/// </summary>
public enum FindType
{
    /// <summary>Plain search (presumably case-insensitive - confirm against the find logic).</summary>
    Normal = 0,

    /// <summary>Plain search where letter case must match.</summary>
    CaseSensitive = 1,

    /// <summary>Search where the search text is a regular expression.</summary>
    RegEx = 2
}
}

View File

@ -0,0 +1,15 @@
namespace Nikse.SubtitleEdit.Logic.Enums
{
/// <summary>
/// Choices available while spell checking a word
/// (presumably the result of the spell check dialog - confirm against its callers).
/// </summary>
public enum SpellCheckAction
{
    Change = 0,
    ChangeAll = 1,
    Skip = 2,
    SkipAll = 3,
    AddToDictionary = 4,
    Abort = 5,
    ChangeLanguage = 6,
    AddToNamesEtc = 7,
    ChangeWholeText = 8
}
}

View File

@ -0,0 +1,18 @@
using System;
using System.Collections.Generic;
using System.Text;
namespace Nikse.SubtitleEdit.Logic.Enums
{
/// <summary>
/// Property by which a list of subtitle lines can be ordered.
/// </summary>
public enum SubtitleSortCriteria
{
    Number = 0,
    StartTime = 1,
    EndTime = 2,
    Duration = 3,
    Text = 4,
    TextMaxLineLength = 5,
    TextTotalLength = 6,
    TextNumberOfLines = 7
}
}

View File

@ -0,0 +1,50 @@
using System;
using System.Collections.Generic;
using System.Text;
using System.Windows.Forms;
namespace Nikse.SubtitleEdit.Logic.OCR
{
/// <summary>
/// A selectable OCR language: a numeric id plus the text shown to the user.
/// NOTE(review): the ids look like Microsoft Office Document Imaging (MODI)
/// language constants - confirm against the MODI documentation.
/// </summary>
public class ModiLanguage
{
    /// <summary>Id used by the "Default" entry of <see cref="AllLanguages"/>.</summary>
    public const int DefaultLanguageId = 2048;

    /// <summary>Numeric language id.</summary>
    public int Id { get; set; }

    /// <summary>Display name of the language.</summary>
    public string Text { get; set; }

    /// <summary>Returns <see cref="Text"/>, so instances display by name (e.g. in list controls).</summary>
    public override string ToString()
    {
        return Text;
    }

    /// <summary>
    /// All known languages, "Default" first. A new list with new instances is
    /// built on every access, so callers may modify the result freely.
    /// </summary>
    public static IEnumerable<ModiLanguage> AllLanguages
    {
        get
        {
            return new List<ModiLanguage>
            {
                new ModiLanguage { Id = DefaultLanguageId, Text = "Default" },
                new ModiLanguage { Id = 2052, Text = "Chinese simplified" },
                new ModiLanguage { Id = 1028, Text = "Chinese traditional" },
                new ModiLanguage { Id = 5, Text = "Czech" }, // was misspelled "Chech"
                new ModiLanguage { Id = 6, Text = "Danish" },
                new ModiLanguage { Id = 19, Text = "Dutch" },
                new ModiLanguage { Id = 9, Text = "English" },
                new ModiLanguage { Id = 11, Text = "Finnish" },
                new ModiLanguage { Id = 12, Text = "French" },
                new ModiLanguage { Id = 7, Text = "German" },
                new ModiLanguage { Id = 8, Text = "Greek" },
                new ModiLanguage { Id = 14, Text = "Hungarian" },
                new ModiLanguage { Id = 16, Text = "Italian" },
                new ModiLanguage { Id = 17, Text = "Japanese" },
                new ModiLanguage { Id = 18, Text = "Korean" },
                new ModiLanguage { Id = 20, Text = "Norwegian" }, // was misspelled "Norweigian"
                new ModiLanguage { Id = 21, Text = "Polish" },
                new ModiLanguage { Id = 22, Text = "Portuguese" },
                new ModiLanguage { Id = 25, Text = "Russian" },
                new ModiLanguage { Id = 10, Text = "Spanish" },
                new ModiLanguage { Id = 29, Text = "Swedish" },
                new ModiLanguage { Id = 31, Text = "Turkish" }
            };
        }
    }
}
}

View File

@ -0,0 +1,47 @@
using System.Collections.Generic;
namespace Nikse.SubtitleEdit.Logic.OCR
{
/// <summary>
/// A set of OCR characters, each holding the images recorded for it.
/// </summary>
public class OcrAlphabet
{
    /// <summary>The characters of this alphabet; the list is created empty by the constructor.</summary>
    public List<OcrCharacter> OcrCharacters { get; private set; }

    public OcrAlphabet()
    {
        OcrCharacters = new List<OcrCharacter>();
    }

    /// <summary>
    /// Finds the largest image area (width * height in pixels) over every image
    /// of every character.
    /// </summary>
    /// <returns>The largest area, or 0 when there are no images.</returns>
    public int CalculateMaximumSize()
    {
        int largestArea = 0;
        foreach (OcrCharacter character in OcrCharacters)
        {
            foreach (OcrImage image in character.OcrImages)
            {
                largestArea = System.Math.Max(largestArea, image.Bmp.Width * image.Bmp.Height);
            }
        }
        return largestArea;
    }

    /// <summary>
    /// Looks up the first character whose text equals <paramref name="text"/>.
    /// </summary>
    /// <param name="text">Text of the character to find.</param>
    /// <param name="addIfNotExists">When true, a missing character is created, added and returned.</param>
    /// <returns>The existing or newly added character; null when missing and not added.</returns>
    public OcrCharacter GetOcrCharacter(string text, bool addIfNotExists)
    {
        OcrCharacter existing = OcrCharacters.Find(candidate => candidate.Text == text);
        if (existing != null || !addIfNotExists)
            return existing;

        OcrCharacter created = new OcrCharacter(text);
        OcrCharacters.Add(created);
        return created;
    }
}
}

View File

@ -0,0 +1,16 @@
using System.Collections.Generic;
namespace Nikse.SubtitleEdit.Logic.OCR
{
/// <summary>
/// One character (or text fragment) together with the images recorded for it.
/// </summary>
public class OcrCharacter
{
    /// <summary>
    /// Creates a character with the given text and an empty image list.
    /// </summary>
    /// <param name="text">Text this character represents.</param>
    public OcrCharacter(string text)
    {
        OcrImages = new List<OcrImage>();
        Text = text;
    }

    /// <summary>Images belonging to this character.</summary>
    public List<OcrImage> OcrImages { get; set; }

    /// <summary>Text this character represents; fixed at construction.</summary>
    public string Text { get; private set; }
}
}

View File

@ -0,0 +1,900 @@
using System;
using System.Collections.Generic;
using System.Drawing;
using System.Globalization;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
using System.Windows.Forms;
using System.Xml;
using NHunspell;
using Nikse.SubtitleEdit.Forms;
namespace Nikse.SubtitleEdit.Logic.OCR
{
public class OcrFixEngine
{
// Dictionaries/spellchecking/fixing
// Replace lists, each read by LoadReplaceLists from one node of the
// "<language>_OCRFixReplaceList.xml" file (key = "from" text, value = "to" text).
// "WholeWords" node: exact word replacements, used by FixCommonWordErrors.
Dictionary<string, string> _wordReplaceList;
// "PartialLines" node: replaced inside a line on word boundaries.
Dictionary<string, string> _partialLineReplaceList;
// "BeginLines" node: replaced at the start of a line or sentence.
Dictionary<string, string> _beginLineReplaceList;
// "EndLines" node: replaced at the end of a line.
Dictionary<string, string> _endLineReplaceList;
// "WholeLines" node: replaced only when the whole line matches.
Dictionary<string, string> _wholeLineReplaceList;
// "PartialWords" node: letter substitutions used by CreateGuessesFromLetters.
Dictionary<string, string> _partialWordReplaceList;
// Full path of the replace list xml file (set even when the file does not exist).
string _replaceListXmlFileName;
// Value returned by Utilities.LoadUserWordList; stays null when no dictionary was loaded.
string _userWordListXmlFileName;
// Two letter ISO language name of the culture matched in LoadSpellingDictionaries.
string _languageName;
// Names/noise words filled by Utilities.LoadNamesEtcWordLists.
List<string> _namesEtcList = new List<string>();
// Upper-cased copy of _namesEtcList.
List<string> _namesEtcListUppercase = new List<string>();
List<string> _namesEtcMultiWordList = new List<string>(); // case sensitive phrases
// User words; IsWordKnownOrNumber looks words up here in lower case.
List<string> _userWordList = new List<string>();
// Words that are never reported as unknown (music symbol, "*", "%", "#", "+" and "skip all" choices).
List<string> _wordSkipList = new List<string>();
// Spell checker; null until a dictionary has been loaded successfully.
Hunspell _hunspell;
// Dialog shown when the user is prompted about an unknown word.
readonly OcrSpellCheck _spellCheck;
// Owner window for the spell check dialog.
readonly Form _parentForm;
// Set to true when the user chooses "Abort" in the spell check dialog.
public bool Abort { get; set; }
// Log lines describing automatic guesses that were applied.
public List<string> AutoGuessesUsed { get; set; }
// Log lines describing words that were not found in any dictionary.
public List<string> UnknownWordsFound { get; set; }
// True once LoadSpellingDictionaries has created the spell checker.
public bool IsDictionaryLoaded { get; private set; }
// Neutral culture of the loaded dictionary (only set when a dictionary was loaded).
public CultureInfo DictionaryCulture { get; private set; }
/// <summary>
/// Creates an OCR fix engine that corrects text using replace lists, spelling
/// dictionaries and a number of hardcoded rules.
/// </summary>
/// <param name="threeLetterIsoLanguageName">Three letter ISO language name, e.g. "eng" for English</param>
/// <param name="parentForm">Owner of the spell check dialog; the dialog is centered over this form</param>
public OcrFixEngine(string threeLetterIsoLanguageName, Form parentForm)
{
    AutoGuessesUsed = new List<string>();
    UnknownWordsFound = new List<string>();
    _parentForm = parentForm;

    // The dialog is positioned manually so it can be centered over the parent form.
    _spellCheck = new OcrSpellCheck();
    _spellCheck.StartPosition = FormStartPosition.Manual;
    int centeredLeft = parentForm.Left + (parentForm.Width / 2 - _spellCheck.Width / 2);
    int centeredTop = parentForm.Top + (parentForm.Height / 2 - _spellCheck.Height / 2);
    _spellCheck.Location = new Point(centeredLeft, centeredTop);

    LoadReplaceLists(threeLetterIsoLanguageName);
    LoadSpellingDictionaries(threeLetterIsoLanguageName); // Hunspell etc.
}
/// <summary>
/// Resets all replace lists to empty and, when the language's
/// "_OCRFixReplaceList.xml" file exists in the dictionary folder, fills them from it.
/// </summary>
/// <param name="languageId">Prefix of the replace list file name (the three letter language name passed to the constructor).</param>
private void LoadReplaceLists(string languageId)
{
    _wordReplaceList = new Dictionary<string, string>();
    _partialLineReplaceList = new Dictionary<string, string>();
    _beginLineReplaceList = new Dictionary<string, string>();
    _endLineReplaceList = new Dictionary<string, string>();
    _wholeLineReplaceList = new Dictionary<string, string>();
    _partialWordReplaceList = new Dictionary<string, string>();

    _replaceListXmlFileName = Utilities.DictionaryFolder + languageId + "_OCRFixReplaceList.xml";
    if (!File.Exists(_replaceListXmlFileName))
        return; // no file: keep the empty lists

    var replaceListDocument = new XmlDocument();
    replaceListDocument.Load(_replaceListXmlFileName);

    _wordReplaceList = LoadReplaceList(replaceListDocument, "WholeWords");
    _partialWordReplaceList = LoadReplaceList(replaceListDocument, "PartialWords");
    _partialLineReplaceList = LoadReplaceList(replaceListDocument, "PartialLines");
    _beginLineReplaceList = LoadReplaceList(replaceListDocument, "BeginLines");
    _endLineReplaceList = LoadReplaceList(replaceListDocument, "EndLines");
    _wholeLineReplaceList = LoadReplaceList(replaceListDocument, "WholeLines");
}
/// <summary>
/// Finds the neutral culture with the given three letter ISO name and loads the
/// first matching "xx_??.dic" dictionary from the dictionary folder, together with
/// the names list and the user word list. <see cref="IsDictionaryLoaded"/> stays
/// false when no culture, folder or dictionary file is found.
/// </summary>
/// <param name="threeLetterIsoLanguageName">Three letter ISO language name, e.g. "eng".</param>
private void LoadSpellingDictionaries(string threeLetterIsoLanguageName)
{
    IsDictionaryLoaded = false;

    CultureInfo[] neutralCultures = CultureInfo.GetCultures(CultureTypes.NeutralCultures);
    foreach (CultureInfo culture in neutralCultures)
    {
        if (culture.ThreeLetterISOLanguageName != threeLetterIsoLanguageName)
            continue;

        _languageName = culture.TwoLetterISOLanguageName;

        string dictionaryFolder = Utilities.DictionaryFolder;
        if (!Directory.Exists(dictionaryFolder))
            return;

        string[] files = Directory.GetFiles(Utilities.DictionaryFolder, culture.TwoLetterISOLanguageName + "_??.dic");
        if (files.Length == 0)
            return;

        // First five characters of the file name, i.e. the "xx_YY" part of "xx_YY.dic".
        string languageName = Path.GetFileName(files[0]).Substring(0, 5);
        string dictionary = Utilities.DictionaryFolder + languageName;

        // Symbols that must never be reported as unknown words
        _wordSkipList = new List<string>();
        _wordSkipList.AddRange(new[] { Configuration.Settings.Tools.MusicSymbol, "*", "%", "#", "+" });

        // Load names etc list (names/noise words)
        _namesEtcList = new List<string>();
        _namesEtcMultiWordList = new List<string>();
        Utilities.LoadNamesEtcWordLists(_namesEtcList, _namesEtcMultiWordList, languageName);
        _namesEtcListUppercase = _namesEtcList.ConvertAll(name => name.ToUpper());

        // Load user words
        _userWordList = new List<string>();
        _userWordListXmlFileName = Utilities.LoadUserWordList(_userWordList, languageName);

        // Load NHunspell spellchecker
        _hunspell = new Hunspell(dictionary + ".aff", dictionary + ".dic");
        IsDictionaryLoaded = true;
        DictionaryCulture = culture;
        return; // only the first matching culture is used
    }
}
/// <summary>
/// Reads the from/to pairs stored below one named child node of the document root.
/// </summary>
/// <param name="doc">Replace list document; may be empty (no root element).</param>
/// <param name="name">XPath of the section node relative to the root, e.g. "WholeWords".</param>
/// <returns>
/// Dictionary mapping each "from" attribute to its "to" attribute. Child nodes missing
/// either attribute are skipped, and for duplicate "from" values the first one wins.
/// Empty when the root element or the section is missing.
/// </returns>
internal static Dictionary<string, string> LoadReplaceList(XmlDocument doc, string name)
{
    var replacements = new Dictionary<string, string>();

    if (doc.DocumentElement == null)
        return replacements;

    XmlNode section = doc.DocumentElement.SelectSingleNode(name);
    if (section == null)
        return replacements;

    foreach (XmlNode entry in section.ChildNodes)
    {
        XmlAttributeCollection attributes = entry.Attributes; // null for e.g. comment nodes
        if (attributes == null)
            continue;

        XmlAttribute toAttribute = attributes["to"];
        XmlAttribute fromAttribute = attributes["from"];
        if (toAttribute == null || fromAttribute == null)
            continue;

        string key = fromAttribute.InnerText;
        if (!replacements.ContainsKey(key))
            replacements.Add(key, toAttribute.InnerText);
    }
    return replacements;
}
/// <summary>
/// Runs the complete fix pipeline on one subtitle text: line level fixes, word level
/// fixes for every word, line level fixes again and finally spell check based guessing
/// (auto fix without prompting the user).
/// </summary>
/// <param name="text">Text to fix.</param>
/// <param name="index">Zero based index of the text; only used in log entries.</param>
/// <param name="lastLine">Previous text, or null when there is none.</param>
/// <param name="logSuggestions">True to log unknown words and used guesses.</param>
/// <param name="useAutoGuess">True to replace unknown words by dictionary verified guesses.</param>
/// <returns>The fixed text.</returns>
public string FixOcrErrors(string text, int index, string lastLine, bool logSuggestions, bool useAutoGuess)
{
    const string wordSeparators = " ,.!?:;()[]{}+-$£\"#&%\r\n";

    text = FixCommenOcrLineErrors(text, lastLine);

    var fixedText = new StringBuilder();
    var currentWord = new StringBuilder();
    string previousWord = null;
    foreach (char ch in text)
    {
        if (wordSeparators.IndexOf(ch) < 0)
        {
            currentWord.Append(ch);
            continue;
        }

        // A separator ends the current word: fix it, then copy the separator unchanged.
        if (currentWord.Length > 0)
        {
            previousWord = FixCommonWordErrors(currentWord.ToString(), previousWord);
            fixedText.Append(previousWord);
            currentWord.Length = 0;
        }
        fixedText.Append(ch);
    }

    if (currentWord.Length > 0) // last word
        fixedText.Append(FixCommonWordErrors(currentWord.ToString(), previousWord));

    text = FixCommenOcrLineErrors(fixedText.ToString(), lastLine);

    int wordsNotFound;
    return FixUnknownWordsViaGuessOrPrompt(out wordsNotFound, text, index, null, true, false, logSuggestions, useAutoGuess);
}
/// <summary>
/// Fixes common OCR errors inside a single word: normalizes dashes and apostrophes,
/// replaces some typical misreads, applies the whole word replace list and fixes
/// 'I'/'1'/'0' inside lower case words. Leading and trailing punctuation is kept.
/// </summary>
/// <param name="word">Word to fix, possibly surrounded by punctuation.</param>
/// <param name="lastWord">Previously fixed word (currently not used).</param>
/// <returns>The fixed word including its original leading/trailing punctuation.</returns>
private string FixCommonWordErrors(string word, string lastWord)
{
    while (word.Contains("--"))
        word = word.Replace("--", "-");

    // Typographic apostrophes -> plain apostrophe. The original character literals were
    // lost (empty) in this file; they are restored here as U+2019/U+2018, and the C1
    // code points U+0092/U+0091 (the same quotes read as Latin-1) are handled as well.
    // NOTE(review): confirm these are the characters the original source contained.
    word = word.Replace('\u2019', '\'')
               .Replace('\u0092', '\'')
               .Replace('`', '\'')
               .Replace('\u2018', '\'')
               .Replace('\u0091', '\'')
               .Replace('\u2014', '-') // em dash
               .Replace("|", "l")
               .Replace("vx/", "w");

    if (word.Contains("¤"))
    {
        // '¤' directly after a letter is a misread 'o'
        var regex = new Regex("[A-ZÆØÅÄÖÉÈÀÙÂÊÎÔÛËÏa-zæøåäöéèàùâêîôûëï]¤");
        if (regex.IsMatch(word))
            word = word.Replace("¤", "o");
    }

    // Peel off leading punctuation; it is put back in front of the result.
    string pre = string.Empty;
    pre += StripLeadingTokens(ref word, Environment.NewLine);
    pre += StripLeadingTokens(ref word, "-");
    pre += StripLeadingTokens(ref word, ".");
    pre += StripLeadingTokens(ref word, "\"");
    if (word.StartsWith("(", StringComparison.Ordinal) && word.Length > 1) // only one opening parenthesis
    {
        pre += "(";
        word = word.Substring(1);
    }

    // Peel off trailing punctuation. Every stripped group sits closer to the word than
    // the groups stripped before it, so it must be put IN FRONT of what was collected
    // so far (appending reversed the order, e.g. 'word."' came back as 'word".').
    string post = string.Empty;
    foreach (string token in new[] { Environment.NewLine, "\"", ".", ",", "?", "!", ")" })
        post = StripTrailingTokens(ref word, token) + post;

    if (word.Contains("?"))
    {
        // Question mark glued to the next word: insert a space after it
        var regex = new Regex(@"\S\?[A-ZÆØÅÄÖÉÈÀÙÂÊÎÔÛËÏa-zæøåäöéèàùâêîôûëï]");
        Match match = regex.Match(word);
        if (match.Success)
            word = word.Insert(match.Index + 2, " ");
    }

    string replaced;
    if (TryReplaceWholeWord(pre, word, post, out replaced))
        return replaced;

    // uppercase I or 1 inside lowercase word (will be replaced by lowercase L)
    word = FixIor1InsideLowerCaseWord(word);

    // 0 (zero) inside lowercase word (will be replaced by lowercase o)
    word = Fix0InsideLowerCaseWord(word);

    // again, as the zero fix may have created new "lower case letter + I/1" pairs
    word = FixIor1InsideLowerCaseWord(word);

    // Retry word replace list
    if (TryReplaceWholeWord(pre, word, post, out replaced))
        return replaced;

    return pre + word + post;
}

/// <summary>
/// Removes repeated occurrences of <paramref name="token"/> from the start of
/// <paramref name="word"/> (always leaving at least one character) and returns what was removed.
/// </summary>
private static string StripLeadingTokens(ref string word, string token)
{
    var stripped = new StringBuilder();
    while (word.Length > token.Length && word.StartsWith(token, StringComparison.Ordinal))
    {
        stripped.Append(token);
        word = word.Substring(token.Length);
    }
    return stripped.ToString();
}

/// <summary>
/// Removes repeated occurrences of <paramref name="token"/> from the end of
/// <paramref name="word"/> (always leaving at least one character) and returns what was removed.
/// </summary>
private static string StripTrailingTokens(ref string word, string token)
{
    var stripped = new StringBuilder();
    while (word.Length > token.Length && word.EndsWith(token, StringComparison.Ordinal))
    {
        stripped.Append(token);
        word = word.Substring(0, word.Length - token.Length);
    }
    return stripped.ToString();
}

/// <summary>
/// Looks the word up in the whole word replace list: first the bare word, then the word
/// with its trailing punctuation, then the word with all surrounding punctuation.
/// The first list entry matching any of these wins.
/// </summary>
private bool TryReplaceWholeWord(string pre, string word, string post, out string replaced)
{
    foreach (KeyValuePair<string, string> entry in _wordReplaceList)
    {
        if (word == entry.Key)
        {
            replaced = pre + entry.Value + post;
            return true;
        }
        if (word + post == entry.Key)
        {
            replaced = pre + entry.Value;
            return true;
        }
        if (pre + word + post == entry.Key)
        {
            replaced = entry.Value;
            return true;
        }
    }
    replaced = null;
    return false;
}
/// <summary>
/// Replaces a '0' (zero) that OCR most likely misread for a lower case 'o':
/// a zero directly after a lower case letter, or a zero directly before a lower case
/// letter unless that zero is the end of a number (e.g. "10th", "100th", "2000s").
/// Words that start and end with a digit, and words that look like hex numbers, are
/// returned unchanged. A zero at the very start of the word is only considered when
/// the word contains another zero further on.
/// </summary>
/// <param name="word">Word to check (must not be null).</param>
/// <returns>The word with misread zeros replaced by 'o'.</returns>
public static string Fix0InsideLowerCaseWord(string word)
{
    var startEndEndsWithNumber = new Regex(@"^\d+.+\d$");
    if (startEndEndsWithNumber.IsMatch(word))
        return word;

    // NOTE(review): the character class has no upper case 'C' - confirm whether that is intended.
    var hexNumber = new Regex(@"^#?[\dABDEFabcdef]+$");
    if (hexNumber.IsMatch(word))
        return word;

    if (word.LastIndexOf('0') <= 0)
        return word;

    // Zero after a lower case letter. The search restarts from the beginning so that
    // runs like "g00d" are fixed completely ("go0d" -> "good").
    var zeroAfterLetter = new Regex(@"[a-zæøåäöé][0]");
    Match match = zeroAfterLetter.Match(word);
    while (match.Success)
    {
        int zeroIndex = match.Index + 1;
        word = word.Remove(zeroIndex, 1).Insert(zeroIndex, "o");
        match = zeroAfterLetter.Match(word);
    }

    // Zero before a lower case letter
    var zeroBeforeLetter = new Regex(@"0[a-zæøåäöé]");
    match = zeroBeforeLetter.Match(word);
    while (match.Success)
    {
        // Walk back over the zeros in front of this one; if they are preceded by 1-9,
        // the zero is the tail of a number and must stay ("100th" must not become "10oth").
        int before = match.Index - 1;
        while (before >= 0 && word[before] == '0')
            before--;
        bool endsNumber = before >= 0 && "123456789".IndexOf(word[before]) >= 0;

        if (!endsNumber)
            word = word.Remove(match.Index, 1).Insert(match.Index, "o");

        match = zeroBeforeLetter.Match(word, match.Index + 1);
    }
    return word;
}
/// <summary>
/// Replaces an upper case 'I' or a digit '1' that directly follows a lower case letter
/// by a lower case 'l' (e.g. "heIlo" -> "hello"). Words that start and end with a digit,
/// and words that look like hex numbers, are returned unchanged.
/// </summary>
/// <param name="word">Word to check (must not be null).</param>
/// <returns>The word with misread 'I'/'1' characters replaced by 'l'.</returns>
public static string FixIor1InsideLowerCaseWord(string word)
{
    if (new Regex(@"^\d+.+\d$").IsMatch(word))
        return word;

    if (new Regex(@"^#?[\dABDEFabcdef]+$").IsMatch(word))
        return word;

    // Only an 'I' or '1' after the first position can follow a lower case letter.
    bool hasCandidate = word.LastIndexOf('I') > 0 || word.LastIndexOf('1') > 0;
    if (!hasCandidate)
        return word;

    var lowerThenIor1 = new Regex(@"[a-zæøåäöé][I1]", RegexOptions.Compiled);
    for (Match hit = lowerThenIor1.Match(word); hit.Success; hit = lowerThenIor1.Match(word, hit.Index + 1))
    {
        // The new 'l' is itself a lower case letter, so the next search (starting at it)
        // also catches a directly following 'I' or '1'.
        int position = hit.Index + 1;
        word = word.Remove(position, 1).Insert(position, "l");
    }
    return word;
}
/// <summary>
/// Applies the line level fixes: hardcoded rules, the line replace lists, a misread
/// trailing "u." (really "...") and misread music notes.
/// </summary>
/// <param name="input">Line to fix.</param>
/// <param name="lastLine">Previous line, or null when there is none.</param>
/// <returns>The fixed line.</returns>
private string FixCommenOcrLineErrors(string input, string lastLine)
{
    input = FixOcrErrorViaLineReplaceList(FixOcrErrorsViaHardcodedRules(input, lastLine));

    // e.g. "selectionsu." -> "selections..."
    if (input.EndsWith("u.") && _hunspell != null)
    {
        string[] parts = input.Split(" .".ToCharArray(), StringSplitOptions.RemoveEmptyEntries);
        string finalWord = parts.Length > 0 ? parts[parts.Length - 1].Trim() : null;
        if (finalWord != null && finalWord.Length > 2)
        {
            // Only when the word is wrong with the 'u' but correct without it
            bool wrongWithU = !IsWordOrWordsCorrect(_hunspell, finalWord);
            if (wrongWithU && IsWordOrWordsCorrect(_hunspell, finalWord.Substring(0, finalWord.Length - 1)))
                input = input.Remove(input.Length - 2) + "...";
        }
    }

    // music notes
    bool wrappedInNoteMarks = input.StartsWith(".'") && input.EndsWith(".'");
    if (wrappedInNoteMarks)
        input = input.Replace(".'", Configuration.Settings.Tools.MusicSymbol);

    return input;
}
/// <summary>
/// Applies fixed OCR correction rules to a line:
/// upper cases the first character when the previous line ended a sentence (or is missing),
/// turns a trailing <c>".</c> into <c>...</c> when it is the only quote around,
/// joins "digit space 1", turns two apostrophes into a double quote,
/// changes 'I' after a lower case letter into 'l' and 'l' between upper case letters into 'I'.
/// </summary>
/// <param name="input">Line to fix; null or empty is returned unchanged.</param>
/// <param name="lastLine">Previous line, or null when there is none.</param>
/// <returns>The fixed line.</returns>
public static string FixOcrErrorsViaHardcodedRules(string input, string lastLine)
{
    // The original checked "input != null" further down but dereferenced input before
    // that, so null always threw; nothing can be fixed in a null/empty line anyway.
    if (string.IsNullOrEmpty(input))
        return input;

    bool startsSentence = lastLine == null ||
                          lastLine.EndsWith(".") ||
                          lastLine.EndsWith("!") ||
                          lastLine.EndsWith("?");
    if (startsSentence)
    {
        string first = input.Substring(0, 1);
        string firstUpper = first.ToUpper();
        if (first != firstUpper)
            input = firstUpper + input.Substring(1);
    }

    // lines ending with ". should often end at ... (if no other quotes exist nearby)
    if ((lastLine == null || !lastLine.Contains("\"")) &&
        input.EndsWith("\".") && input.IndexOf("\"") == input.LastIndexOf("\""))
    {
        input = input.Remove(input.Length - 2) + "...";
    }

    // change '<number><space>1' to '<number>1'
    var regEx = new Regex(@"\d\ 1");
    Match match = regEx.Match(input);
    while (match.Success)
    {
        input = input.Remove(match.Index + 1, 1); // drop the space
        match = regEx.Match(input);
    }

    // change '' to "
    input = input.Replace("''", "\"");

    // change 'sequeI of' to 'sequel of'
    regEx = new Regex("[a-zæøåöäé]I.");
    match = regEx.Match(input);
    while (match.Success)
    {
        input = input.Substring(0, match.Index + 1) + "l" + input.Substring(match.Index + 2);
        match = regEx.Match(input);
    }

    // change 'NlCE' to 'NICE'
    regEx = new Regex("[A-ZÆØÅÄÖÉÁ]l[A-ZÆØÅÄÖÉÁ]");
    match = regEx.Match(input);
    while (match.Success)
    {
        input = input.Substring(0, match.Index + 1) + "I" + input.Substring(match.Index + 2);
        match = regEx.Match(input);
    }

    return input;
}
/// <summary>
/// Applies the line based replace lists to a line, in this order: whole line,
/// begin of line/sentence, end of line, partial line (on word boundaries).
/// </summary>
/// <param name="input">Line to fix; null is returned unchanged.</param>
/// <returns>The line after all matching replacements.</returns>
public string FixOcrErrorViaLineReplaceList(string input)
{
    if (input == null)
        return null;

    // Whole line
    string wholeLineReplacement;
    if (_wholeLineReplaceList.TryGetValue(input, out wholeLineReplacement))
        return wholeLineReplacement;

    // Leading punctuation is set aside so "begin line" entries can match behind it.
    string newText = input;
    string pre = string.Empty;
    while (newText.Length > 1 && " -\"['¶(".Contains(newText.Substring(0, 1)))
    {
        pre += newText.Substring(0, 1);
        newText = newText.Substring(1);
    }

    // begin line
    string newLine = Environment.NewLine;
    foreach (KeyValuePair<string, string> entry in _beginLineReplaceList)
    {
        string from = entry.Key;
        string to = entry.Value;

        // Ordinal: the text is cut by from.Length, so the match must be character exact.
        if (newText.StartsWith(from, StringComparison.Ordinal))
            newText = to + newText.Substring(from.Length);

        // Start of a new sentence inside the line
        foreach (string terminator in new[] { ". ", "! ", "? " })
            newText = newText.Replace(terminator + from, terminator + to);

        // Start of a new sentence on the next line. The original tested for "." + newline
        // but replaced ". " + newline only, so a sentence end directly followed by the
        // line break was never fixed; both forms are handled now for all three marks.
        foreach (string mark in new[] { ".", "!", "?" })
        {
            newText = newText.Replace(mark + newLine + from, mark + newLine + to);
            newText = newText.Replace(mark + " " + newLine + from, mark + " " + newLine + to);
        }
    }
    newText = pre + newText;

    // end line
    foreach (KeyValuePair<string, string> entry in _endLineReplaceList)
    {
        if (newText.EndsWith(entry.Key, StringComparison.Ordinal))
            newText = newText.Substring(0, newText.Length - entry.Key.Length) + entry.Value;
    }

    // partial line (first occurrence on word boundaries only)
    foreach (KeyValuePair<string, string> entry in _partialLineReplaceList)
    {
        if (!newText.Contains(entry.Key))
            continue;

        // The entry is literal text: escape it so characters like '(' '?' or '.' in a
        // list entry neither break the pattern nor match something else.
        var regex = new Regex(@"\b" + Regex.Escape(entry.Key) + @"\b");
        Match match = regex.Match(newText);
        if (match.Success)
            newText = newText.Remove(match.Index, match.Value.Length).Insert(match.Index, entry.Value);
    }
    return newText;
}
/// <summary>
/// Spell checks every word of a line. Unknown words are counted, optionally logged,
/// optionally replaced by a dictionary verified guess and optionally shown to the
/// user in the spell check dialog.
/// </summary>
/// <param name="wordsNotFound">Number of unknown words that remain unfixed.</param>
/// <param name="line">Line to check.</param>
/// <param name="index">Zero based index of the line; only used in log entries.</param>
/// <param name="bitmap">Image passed on to the spell check dialog; may be null.</param>
/// <param name="autoFix">True to allow automatic guessing (together with <paramref name="useAutoGuess"/>).</param>
/// <param name="promptForFixingErrors">True to show the spell check dialog for words that are still unknown.</param>
/// <param name="log">True to add entries to <see cref="UnknownWordsFound"/> and <see cref="AutoGuessesUsed"/>.</param>
/// <param name="useAutoGuess">True to allow automatic guessing (together with <paramref name="autoFix"/>).</param>
/// <returns>The line after fixing; unchanged when no dictionary is loaded.</returns>
public string FixUnknownWordsViaGuessOrPrompt(out int wordsNotFound, string line, int index, Bitmap bitmap, bool autoFix, bool promptForFixingErrors, bool log, bool useAutoGuess)
{
    wordsNotFound = 0;
    if (_hunspell == null)
        return line;

    string[] words = line.Split((Environment.NewLine + " ,.!?:;()[]{}+-$£\"“#&%").ToCharArray(), StringSplitOptions.RemoveEmptyEntries);
    for (int i = 0; i < words.Length; i++)
    {
        string word = words[i];
        if (IsWordKnownOrNumber(word, line))
            continue;

        bool correct = _hunspell.Spell(word) || _hunspell.Spell(word.Trim('\''));
        if (correct)
            continue;

        wordsNotFound++;
        if (log)
            UnknownWordsFound.Add(string.Format("#{0}: {1}", index + 1, word));

        // Guessing is only tried for words longer than five characters.
        if (autoFix && useAutoGuess && word.Length > 5)
        {
            foreach (string guess in CreateGuessesFromLetters(word))
            {
                if (!IsWordOrWordsCorrect(_hunspell, guess))
                    continue;

                // The word is literal text: escape it, as characters like '*' or '\'
                // are not split away above and would corrupt the pattern.
                var regex = new Regex(@"\b" + Regex.Escape(word) + @"\b");
                Match match = regex.Match(line);
                if (!match.Success)
                    continue;

                if (log)
                    AutoGuessesUsed.Add(string.Format("#{0}: {1} -> {2} in line via '{3}': {4}", index + 1, word, guess, "OCRFixReplaceList.xml", line.Replace(Environment.NewLine, " ")));

                line = line.Remove(match.Index, match.Value.Length).Insert(match.Index, guess);
                wordsNotFound--;
                correct = true;
                break;
            }
        }

        if (!correct && promptForFixingErrors)
        {
            List<string> suggestions = _hunspell.Suggest(word);
            SpellcheckOcrTextResult res = SpellcheckOcrText(line, bitmap, words, i, word, suggestions);
            if (res.FixedWholeLine)
                return res.Line; // user replaced, skipped or aborted the whole line

            if (res.Fixed)
            {
                line = res.Line;
                wordsNotFound--;
            }
        }
    }
    return line;
}
/// <summary>
/// Shows the spell check dialog for one unknown word and applies the action the user chose.
/// </summary>
/// <param name="line">Line containing the unknown word.</param>
/// <param name="bitmap">Image passed on to the dialog.</param>
/// <param name="words">All words of the line (passed on to the dialog).</param>
/// <param name="i">Index of the unknown word in <paramref name="words"/> (passed on to the dialog).</param>
/// <param name="word">The unknown word.</param>
/// <param name="suggestions">Spelling suggestions passed on to the dialog.</param>
/// <returns>
/// Result object: Fixed is true when the word was replaced or accepted, FixedWholeLine is
/// true when the whole line was replaced, skipped or the check was aborted, Line holds the
/// resulting line and Word the word taken from the dialog.
/// </returns>
private SpellcheckOcrTextResult SpellcheckOcrText(string line, Bitmap bitmap, string[] words, int i, string word, List<string> suggestions)
{
var result = new SpellcheckOcrTextResult { Fixed = false, FixedWholeLine = false, Line = null, Word = null };
_spellCheck.Initialize(word, suggestions, line, words, i, bitmap);
_spellCheck.ShowDialog(_parentForm);
switch (_spellCheck.ActionResult)
{
case OcrSpellCheck.Action.Abort:
// Remember the abort and hand the line back unchanged
Abort = true;
result.FixedWholeLine = true;
result.Line = line;
break;
case OcrSpellCheck.Action.AddToUserDictionary:
if (_userWordListXmlFileName != null)
{
// NOTE(review): the word is stored as typed while IsWordKnownOrNumber looks words up
// in lower case - confirm whether it should be lower cased here.
_userWordList.Add(_spellCheck.Word);
Utilities.AddToUserDictionary(_spellCheck.Word, _languageName);
}
result.Word = _spellCheck.Word;
result.Fixed = true;
result.Line = line;
// Word unchanged in the dialog: nothing to replace in the line
if (word == result.Word)
return result;
break;
case OcrSpellCheck.Action.AddToNames:
result.Word = _spellCheck.Word;
result.Fixed = true;
try
{
string s = _spellCheck.Word.Trim();
if (s.Contains(" "))
_namesEtcMultiWordList.Add(s);
else
_namesEtcList.Add(s);
Utilities.AddWordToLocalNamesEtcList(s, _languageName);
}
catch
{
// Could not store the name: skip the word for the rest of this session instead
_wordSkipList.Add(_spellCheck.Word);
}
result.Line = line;
// Word unchanged in the dialog: nothing to replace in the line
if (word == result.Word)
return result;
break;
case OcrSpellCheck.Action.AllwaysUseSuggestion:
// Store "word -> dialog word" in the replace list, then replace in the line below
SaveWordToWordList(word);
result.Fixed = true;
result.Word = _spellCheck.Word;
break;
case OcrSpellCheck.Action.ChangeAndSave:
// Same handling as AllwaysUseSuggestion
SaveWordToWordList(word);
result.Fixed = true;
result.Word = _spellCheck.Word;
break;
case OcrSpellCheck.Action.ChangeOnce:
result.Fixed = true;
result.Word = _spellCheck.Word;
break;
case OcrSpellCheck.Action.ChangeWholeText:
// The dialog's paragraph text replaces the complete line
result.Line = _spellCheck.Paragraph;
result.FixedWholeLine = true;
break;
case OcrSpellCheck.Action.SkipAll:
_wordSkipList.Add(_spellCheck.Word);
break;
case OcrSpellCheck.Action.SkipOnce:
break;
case OcrSpellCheck.Action.SkipWholeText:
result.Line = line;
result.FixedWholeLine = true;
break;
case OcrSpellCheck.Action.UseSuggestion:
result.Word = _spellCheck.Word;
result.Fixed = true;
break;
default:
break;
}
// Replace the first occurrence of the unknown word in the line by the chosen word
if (result.Fixed)
{
var regEx = Utilities.MakeWordSearchRegex(word);
Match match = regEx.Match(line);
if (match.Success)
{
result.Line = line.Remove(match.Index, word.Length).Insert(match.Index, result.Word);
}
else // some word containing a number or other strange character
{
if (line.EndsWith(" " + word))
{
result.Line = line.Substring(0, line.Length - word.Length) + result.Word;
}
else if (line.StartsWith(word + " ") || line.StartsWith(word + ",") || line.StartsWith(word + "."))
{
result.Line = result.Word + line.Substring(word.Length);
}
else
{
regEx = Utilities.MakeWordSearchRegexWithNumbers(word);
match = regEx.Match(line);
if (match.Success)
{
// Step over a leading space included in the match
int startIndex = match.Index;
if (match.Value.StartsWith(" "))
startIndex++;
result.Line = line.Remove(startIndex, word.Length).Insert(startIndex, result.Word);
}
else
{
// Word could not be located in the line: report it and mark it as not fixed
result.Fixed = false;
MessageBox.Show("Unable to find word via regex: " + word);
}
}
}
}
return result;
}
/// <summary>
/// Stores the replacement "word -> word currently in the spell check dialog" in the
/// in-memory whole word list and appends it to the "WholeWords" node of the replace
/// list xml file. On any error the exception is shown and the word is added to the
/// skip list instead.
/// </summary>
/// <param name="word">The misread word (the "from" part of the replacement).</param>
private void SaveWordToWordList(string word)
{
    const string emptyReplaceListXml = "<ReplaceList><WholeWords/><PartialLines/><BeginLines/><EndLines/><WholeLines/><Letters/></ReplaceList>";
    try
    {
        if (_replaceListXmlFileName == null)
            return;

        var doc = new XmlDocument();
        if (!File.Exists(_replaceListXmlFileName))
        {
            doc.LoadXml(emptyReplaceListXml);
        }
        else
        {
            try
            {
                doc.Load(_replaceListXmlFileName);
            }
            catch
            {
                // Unreadable file: start over from an empty replace list
                doc.LoadXml(emptyReplaceListXml);
            }
        }

        string replacement = _spellCheck.Word;
        if (!_wordReplaceList.ContainsKey(word))
            _wordReplaceList.Add(word, replacement);

        XmlNode wholeWordsNode = doc.DocumentElement.SelectSingleNode("WholeWords");
        if (wholeWordsNode == null)
            return; // nowhere to store the entry; the file is left untouched

        XmlElement entry = doc.CreateElement("Word");
        entry.SetAttribute("from", word);
        entry.SetAttribute("to", _spellCheck.Word);
        wholeWordsNode.AppendChild(entry);
        doc.Save(_replaceListXmlFileName);
    }
    catch (Exception exception)
    {
        MessageBox.Show(exception + Environment.NewLine + exception.StackTrace);
        _wordSkipList.Add(word);
    }
}
/// <summary>
/// Checks whether a word, or every word of a space separated phrase, is correct.
/// A part is correct when the spell checker accepts it, when the part itself is a
/// known word or number, or when the complete phrase is known.
/// </summary>
/// <param name="hunspell">Spell checker to use.</param>
/// <param name="word">Word or space separated words to check.</param>
/// <returns>True when all parts are correct.</returns>
public bool IsWordOrWordsCorrect(Hunspell hunspell, string word)
{
    foreach (string s in word.Split(' '))
    {
        if (hunspell.Spell(s))
            continue;

        // The original only tested the complete phrase here, so a phrase containing a
        // known name or user word the spell checker does not know (e.g. one part of a
        // two word guess) was rejected. The failing part is tested first; the phrase
        // test is kept so everything accepted before is still accepted.
        if (!IsWordKnownOrNumber(s, word) && !IsWordKnownOrNumber(word, word))
            return false;
    }
    return true;
}
/// <summary>
/// Builds a guess by replacing the <paramref name="letter"/>.Length characters at
/// <paramref name="index"/> of <paramref name="word"/> with <paramref name="replaceLetters"/>
/// and adds it to <paramref name="list"/> unless it is already there.
/// </summary>
/// <param name="list">List collecting the guesses.</param>
/// <param name="word">Word to build the guess from.</param>
/// <param name="index">Position of the characters to replace.</param>
/// <param name="letter">Text being replaced; only its length is used here.</param>
/// <param name="replaceLetters">Text to insert instead.</param>
/// <returns>
/// The guess, or <paramref name="word"/> unchanged when it is null/empty or the
/// characters to replace do not lie completely inside the word.
/// </returns>
private static string AddToGuessList(List<string> list, string word, int index, string letter, string replaceLetters)
{
    if (string.IsNullOrEmpty(word) || index < 0 || index >= word.Length)
        return word;

    // Callers also pass an index found in a different (already modified) string, so
    // the range can run past the end of this word; Remove would throw in that case.
    if (index + letter.Length > word.Length)
        return word;

    // After the removal index is at most the new length, so Insert also covers the
    // "append at the end" case.
    string s = word.Remove(index, letter.Length).Insert(index, replaceLetters);
    if (!list.Contains(s))
        list.Add(s);
    return s;
}
/// <summary>
/// Creates candidate corrections for a word by applying the partial word replace list:
/// for every entry, occurrences are replaced one after another, once searching from the
/// start and once searching from the end of the word (at most 10 rounds each).
/// </summary>
/// <param name="word">Unknown word to create guesses for.</param>
/// <returns>The distinct guesses in the order they were created.</returns>
public IEnumerable<string> CreateGuessesFromLetters(string word)
{
    var guesses = new List<string>();
    foreach (KeyValuePair<string, string> entry in _partialWordReplaceList)
    {
        string letter = entry.Key;
        string replacement = entry.Value;

        foreach (bool searchFromEnd in new[] { false, true })
        {
            string current = word;
            for (int round = 0; current.Contains(letter) && round < 10; round++)
            {
                int position = searchFromEnd ? current.LastIndexOf(letter) : current.IndexOf(letter);

                // Cumulative guess: keeps the replacements of earlier rounds
                current = AddToGuessList(guesses, current, position, letter, replacement);

                // Single guess: the same position applied to the untouched word
                AddToGuessList(guesses, word, position, letter, replacement);
            }
        }
    }
    return guesses;
}
/// <summary>
/// Tells whether a word needs no spell checking: it is a number (optionally with a
/// leading apostrophe or currency signs), or it is found in the skip list, the names
/// lists or the user word list.
/// </summary>
/// <param name="word">Word to look up.</param>
/// <param name="line">Line containing the word; used for the multi word names lookup.</param>
/// <returns>True when the word is a number or a known word.</returns>
public bool IsWordKnownOrNumber(string word, string line)
{
    string withoutCurrency = word.TrimStart('\'')
                                 .Replace("$", string.Empty)
                                 .Replace("£", string.Empty)
                                 .Replace("¢", string.Empty);
    double number;
    if (double.TryParse(withoutCurrency, out number))
        return true;

    string unquoted = word.Trim('\'');
    return _wordSkipList.Contains(word)
        || _namesEtcList.Contains(word)
        || _namesEtcList.Contains(unquoted)
        || _userWordList.Contains(word.ToLower())
        || _userWordList.Contains(unquoted.ToLower())
        || (word.Length > 2 && _namesEtcListUppercase.Contains(word)) // name written in upper case
        || Utilities.IsInNamesEtcMultiWordList(_namesEtcMultiWordList, line, word);
}
/// <summary>
/// Counts the words of a line that are neither known words/numbers nor accepted by
/// the spell checker (with or without surrounding apostrophes).
/// </summary>
/// <param name="line">Line to check.</param>
/// <returns>Number of unknown words; 0 when no dictionary is loaded.</returns>
public int CountUnknownWordsViaDictionary(string line)
{
    if (_hunspell == null)
        return 0;

    char[] separators = (Environment.NewLine + " ,.!?:;()[]{}+-$£\"#&%").ToCharArray();
    int unknownCount = 0;
    foreach (string word in line.Split(separators, StringSplitOptions.RemoveEmptyEntries))
    {
        if (IsWordKnownOrNumber(word, line))
            continue;

        if (_hunspell.Spell(word) || _hunspell.Spell(word.Trim('\'')))
            continue;

        unknownCount++;
    }
    return unknownCount;
}
}
}

37
src/Logic/OCR/OcrImage.cs Normal file
View File

@ -0,0 +1,37 @@
using System.Drawing;
namespace Nikse.SubtitleEdit.Logic.OCR
{
/// <summary>
/// A bitmap of one OCR character plus a flag telling whether it is italic.
/// </summary>
public class OcrImage
{
    /// <summary>True when the image shows an italic character.</summary>
    public bool Italic { get; set; }

    /// <summary>The character image.</summary>
    public Bitmap Bmp { get; set; }

    /// <summary>
    /// Converts the image to a fixed size array of numbers: pixels are read row by row,
    /// a fully transparent pixel becomes -0.5 and any other pixel its brightness
    /// (R + G + B) / 766. Pixels beyond <paramref name="size"/> are ignored and unused
    /// array elements stay 0.
    /// </summary>
    /// <param name="size">Length of the returned array.</param>
    /// <returns>Array of exactly <paramref name="size"/> values.</returns>
    public double[] GetTrainingData(int size)
    {
        var data = new double[size];
        int i = 0;

        // Stop as soon as the array is full; the remaining pixels cannot be stored.
        for (int y = 0; y < Bmp.Height && i < size; y++)
        {
            for (int x = 0; x < Bmp.Width && i < size; x++)
            {
                Color c = Bmp.GetPixel(x, y);

                // "c == Color.Transparent" never matched: Color equality also compares the
                // name/known-color state, and GetPixel returns unnamed colors. The alpha
                // channel is tested instead.
                // NOTE(review): this treats every fully transparent pixel as transparent,
                // whatever its RGB values - confirm that is the intent.
                if (c.A == 0)
                {
                    data[i] = -0.5;
                }
                else
                {
                    // All three channels (the original added R twice and left out G)
                    int value = c.R + c.G + c.B;
                    data[i] = value / 766.0;
                }
                i++;
            }
        }
        return data;
    }
}
}

View File

@ -0,0 +1,11 @@

namespace Nikse.SubtitleEdit.Logic.OCR
{
/// <summary>
/// Outcome of prompting the user about one unknown word during OCR spell checking.
/// </summary>
public class SpellcheckOcrTextResult
{
    /// <summary>The resulting line text.</summary>
    public string Line { get; set; }

    /// <summary>The word chosen or entered for the unknown word.</summary>
    public string Word { get; set; }

    /// <summary>True when the unknown word was replaced or accepted.</summary>
    public bool Fixed { get; set; }

    /// <summary>True when the whole line was handled at once and no further words of it should be checked.</summary>
    public bool FixedWholeLine { get; set; }
}
}