SubtitleEdit/libse/Dictionaries/OcrFixReplaceList.cs
Nikolaj Olsson 1342b062fb Merge pull request #1801 from ivandrofly/patch-ocr
[OCR] - Fix from prev PR.
2016-06-18 23:47:09 +02:00

938 lines
37 KiB
C#
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
using System.Xml;
namespace Nikse.SubtitleEdit.Core.Dictionaries
{
public class OcrFixReplaceList
{
// '?' glued between a non-whitespace character and a following letter (e.g. "What?No").
private static readonly Regex RegExQuestion = new Regex(@"\S\?[A-ZÆØÅÄÖÉÈÀÙÂÊÎÔÛËÏa-zæøåäöéèàùâêîôûëï]", RegexOptions.Compiled);
// Lowercase letter directly followed by 'I' or '1'.
private static readonly Regex RegExIandZero = new Regex(@"[a-zæøåöääöéèàùâêîôûëï][I1]", RegexOptions.Compiled);
// Lowercase letter directly followed by '0'.
private static readonly Regex RegExTime1 = new Regex(@"[a-zæøåöääöéèàùâêîôûëï]0", RegexOptions.Compiled);
// '0' directly followed by a lowercase letter.
private static readonly Regex RegExTime2 = new Regex(@"0[a-zæøåöääöéèàùâêîôûëï]", RegexOptions.Compiled);
// Optional '#' followed only by hexadecimal digits (the uppercase 'C' was missing from the class before).
private static readonly Regex HexNumber = new Regex(@"^#?[\dABCDEFabcdef]+$", RegexOptions.Compiled);
// Text that begins with one or more digits and ends with a digit.
private static readonly Regex StartsAndEndsWithNumber = new Regex(@"^\d+.+\d$", RegexOptions.Compiled);

// Replace lists, keyed by the text to find; each is loaded from the XML section named in the constructor.
public readonly Dictionary<string, string> WordReplaceList;
public readonly Dictionary<string, string> PartialLineWordBoundaryReplaceList;
private readonly Dictionary<string, string> _partialLineAlwaysReplaceList;
private readonly Dictionary<string, string> _beginLineReplaceList;
private readonly Dictionary<string, string> _endLineReplaceList;
private readonly Dictionary<string, string> _wholeLineReplaceList;
private readonly Dictionary<string, string> _partialWordReplaceListAlways;
private readonly Dictionary<string, string> _partialWordReplaceList;
// Regular expression pattern -> replacement text.
private readonly Dictionary<string, string> _regExList;
// Path of the main replace-list XML file; the user file name is derived from it.
private readonly string _replaceListXmlFileName;
/// <summary>
/// Loads all replace lists from <paramref name="replaceListXmlFileName"/> and then applies the
/// additions and removals stored in the companion user file.
/// </summary>
/// <param name="replaceListXmlFileName">Path of the main replace-list XML file.</param>
public OcrFixReplaceList(string replaceListXmlFileName)
{
    _replaceListXmlFileName = replaceListXmlFileName;

    var doc = LoadXmlReplaceListDocument();
    var userDoc = LoadXmlReplaceListUserDocument();

    // Each list is assigned exactly once; LoadReplaceList always returns a (possibly empty) dictionary.
    WordReplaceList = LoadReplaceList(doc, "WholeWords");
    _partialWordReplaceListAlways = LoadReplaceList(doc, "PartialWordsAlways");
    _partialWordReplaceList = LoadReplaceList(doc, "PartialWords");
    PartialLineWordBoundaryReplaceList = LoadReplaceList(doc, "PartialLines");
    _partialLineAlwaysReplaceList = LoadReplaceList(doc, "PartialAlwaysLines");
    _beginLineReplaceList = LoadReplaceList(doc, "BeginLines");
    _endLineReplaceList = LoadReplaceList(doc, "EndLines");
    _wholeLineReplaceList = LoadReplaceList(doc, "WholeLines");
    _regExList = LoadRegExList(doc, "RegularExpressions");

    ApplyUserChanges(WordReplaceList, userDoc, "WholeWords");
    ApplyUserChanges(PartialLineWordBoundaryReplaceList, userDoc, "PartialLines");
    // Whole-line entries present in the user file are honoured as well.
    ApplyUserChanges(_wholeLineReplaceList, userDoc, "WholeLines");
}

/// <summary>
/// Removes the keys listed in the user document's "Removed" + <paramref name="listName"/> section
/// from <paramref name="target"/>, then adds the user's own entries that are not already present.
/// </summary>
private static void ApplyUserChanges(Dictionary<string, string> target, XmlDocument userDoc, string listName)
{
    foreach (var removedEntry in LoadReplaceList(userDoc, "Removed" + listName))
        target.Remove(removedEntry.Key); // no-op when the key is absent

    foreach (var userEntry in LoadReplaceList(userDoc, listName))
    {
        if (!target.ContainsKey(userEntry.Key))
            target.Add(userEntry.Key, userEntry.Value);
    }
}
/// <summary>
/// Creates a replace list for the file named "&lt;languageId&gt;_OCRFixReplaceList.xml",
/// appended directly to <c>Configuration.DictionariesFolder</c>.
/// </summary>
public static OcrFixReplaceList FromLanguageId(string languageId)
{
    string fileName = languageId + "_OCRFixReplaceList.xml";
    string fullPath = Configuration.DictionariesFolder + fileName;
    return new OcrFixReplaceList(fullPath);
}
/// <summary>
/// Reads the "from"/"to" attribute pairs of the children of the element <paramref name="name"/>
/// directly under the document root. The first occurrence of a "from" value wins.
/// </summary>
/// <returns>The collected pairs; empty when the section does not exist.</returns>
private static Dictionary<string, string> LoadReplaceList(XmlDocument doc, string name)
{
    var result = new Dictionary<string, string>();
    if (IsValidXmlDocument(doc, name))
    {
        XmlNode section = doc.DocumentElement.SelectSingleNode(name);
        foreach (XmlNode entry in section.ChildNodes)
        {
            if (!HasValidAttributes(entry, false))
                continue;

            string replacement = entry.Attributes["to"].Value;
            string key = entry.Attributes["from"].Value;
            if (!result.ContainsKey(key))
                result.Add(key, replacement);
        }
    }
    return result;
}
/// <summary>
/// Reads the "find"/"replaceWith" attribute pairs of the children of the element
/// <paramref name="name"/> directly under the document root. The first occurrence of a "find" value wins.
/// </summary>
/// <returns>Pattern to replacement map; empty when the section does not exist.</returns>
private static Dictionary<string, string> LoadRegExList(XmlDocument doc, string name)
{
    var result = new Dictionary<string, string>();
    if (IsValidXmlDocument(doc, name))
    {
        XmlNode section = doc.DocumentElement.SelectSingleNode(name);
        foreach (XmlNode entry in section.ChildNodes)
        {
            if (!HasValidAttributes(entry, true))
                continue;

            string replacement = entry.Attributes["replaceWith"].Value;
            string pattern = entry.Attributes["find"].Value;
            if (!result.ContainsKey(pattern))
                result.Add(pattern, replacement);
        }
    }
    return result;
}
/// <summary>
/// Returns true when the document has a root element and
/// <paramref name="elementName"/> can be selected from that root.
/// </summary>
private static bool IsValidXmlDocument(XmlDocument doc, string elementName)
{
    XmlElement root = doc.DocumentElement;
    return root != null && root.SelectSingleNode(elementName) != null;
}
/// <summary>
/// Checks that a node carries the attribute pair required for its kind.
/// Regex entries need "find" and "replaceWith" and a pattern accepted by <c>Utilities.IsValidRegex</c>;
/// plain entries need "from" and "to" with different values.
/// </summary>
private static bool HasValidAttributes(XmlNode node, bool isRegex)
{
    if (node == null || node.Attributes == null)
        return false;

    XmlAttributeCollection attributes = node.Attributes;
    if (isRegex)
    {
        XmlAttribute find = attributes["find"];
        if (find == null || attributes["replaceWith"] == null)
            return false;
        return Utilities.IsValidRegex(find.Value);
    }

    XmlAttribute source = attributes["from"];
    XmlAttribute target = attributes["to"];
    if (source == null || target == null)
        return false;
    // An entry that maps a text to itself is useless and treated as invalid.
    return source.Value != target.Value;
}
/// <summary>
/// Applies the line-level replace lists to <paramref name="input"/> in this order: whole line,
/// begin of line, end of line, partial line on word boundaries, partial line always, regular expressions.
/// </summary>
/// <param name="input">Text to fix; null is returned unchanged.</param>
/// <returns>The fixed text.</returns>
public string FixOcrErrorViaLineReplaceList(string input)
{
    if (input == null)
        return null;

    // Whole line: a direct dictionary lookup replaces the former scan over all keys.
    string wholeLineReplacement;
    if (_wholeLineReplaceList.TryGetValue(input, out wholeLineReplacement))
        return wholeLineReplacement;

    string newText = input;
    string pre = string.Empty;
    if (newText.StartsWith("<i>", StringComparison.Ordinal))
    {
        pre += "<i>";
        newText = newText.Remove(0, 3);
    }
    // Leading punctuation is set aside so that begin-of-line rules see the first real character.
    while (newText.Length > 1 && @" -""['¶(".Contains(newText[0]))
    {
        pre += newText[0];
        newText = newText.Substring(1);
    }
    if (newText.StartsWith("<i>", StringComparison.Ordinal))
    {
        pre += "<i>";
        newText = newText.Remove(0, 3);
    }

    // Begin of line (also applied after ". ", "! " and "? ").
    var sb = new StringBuilder();
    foreach (string line in newText.SplitToLines())
    {
        string s = line;
        foreach (var rule in _beginLineReplaceList)
        {
            string from = rule.Key;
            string to = rule.Value;
            if (!s.Contains(from))
                continue;

            if (s.StartsWith(from, StringComparison.Ordinal))
                s = s.Remove(0, from.Length).Insert(0, to);
            s = s.Replace(". " + from, ". " + to);
            s = s.Replace("! " + from, "! " + to);
            s = s.Replace("? " + from, "? " + to);
            s = s.Replace(". " + Environment.NewLine + from, ". " + Environment.NewLine + to);
            s = s.Replace("! " + Environment.NewLine + from, "! " + Environment.NewLine + to);
            s = s.Replace("? " + Environment.NewLine + from, "? " + Environment.NewLine + to);
            if (s.StartsWith("\"" + from, StringComparison.Ordinal) && !from.StartsWith('"'))
                s = s.Replace("\"" + from, "\"" + to);
        }
        sb.AppendLine(s);
    }
    newText = pre + sb.ToString().TrimEnd(Utilities.NewLineChars);

    // End of line: a trailing </i> is set aside while the end rules run.
    string post = string.Empty;
    if (newText.EndsWith("</i>", StringComparison.Ordinal))
    {
        newText = newText.Remove(newText.Length - 4, 4);
        post = "</i>";
    }
    foreach (var rule in _endLineReplaceList)
    {
        if (newText.EndsWith(rule.Key, StringComparison.Ordinal))
        {
            int position = newText.Length - rule.Key.Length;
            newText = newText.Remove(position).Insert(position, rule.Value);
        }
    }
    newText += post;

    foreach (var rule in PartialLineWordBoundaryReplaceList)
    {
        if (newText.FastIndexOf(rule.Key) >= 0)
            newText = ReplaceWord(newText, rule.Key, rule.Value);
    }
    foreach (var rule in _partialLineAlwaysReplaceList)
    {
        if (newText.FastIndexOf(rule.Key) >= 0)
            newText = newText.Replace(rule.Key, rule.Value);
    }
    foreach (var rule in _regExList)
    {
        newText = Regex.Replace(newText, rule.Key, rule.Value, RegexOptions.Multiline);
    }
    return newText;
}
/// <summary>
/// Replaces the <c>letter.Length</c> characters of <paramref name="word"/> at <paramref name="index"/>
/// with <paramref name="replaceLetters"/>, records the result in <paramref name="list"/> if it is new,
/// and returns it.
/// </summary>
/// <returns>The new guess, or <paramref name="word"/> unchanged when it is null/empty or the range does not fit.</returns>
private static string AddToGuessList(List<string> list, string word, int index, string letter, string replaceLetters)
{
    if (string.IsNullOrEmpty(word))
        return word;
    if (index < 0 || index + letter.Length - 1 >= word.Length)
        return word;

    string remainder = word.Remove(index, letter.Length);
    string guess = index >= remainder.Length
        ? remainder + replaceLetters
        : remainder.Insert(index, replaceLetters);

    if (!list.Contains(guess))
        list.Add(guess);
    return guess;
}
/// <summary>
/// Builds candidate spellings of <paramref name="word"/> by applying each partial-word replacement,
/// first walking occurrences from the left and then from the right (at most 10 steps each way).
/// </summary>
/// <returns>The distinct guesses in the order they were produced.</returns>
public IEnumerable<string> CreateGuessesFromLetters(string word)
{
    var guesses = new List<string>();
    foreach (var rule in _partialWordReplaceList)
    {
        string letter = rule.Key;
        string replacement = rule.Value;

        // Left to right: accumulate replacements in "current", and also try the same index on the untouched word.
        string current = word;
        for (int step = 0; current.Contains(letter) && step < 10; step++)
        {
            int index = current.FastIndexOf(letter);
            current = AddToGuessList(guesses, current, index, letter, replacement);
            AddToGuessList(guesses, word, index, letter, replacement);
        }

        // Right to left, same scheme.
        current = word;
        for (int step = 0; current.Contains(letter) && step < 10; step++)
        {
            int index = current.LastIndexOf(letter, StringComparison.Ordinal);
            current = AddToGuessList(guesses, current, index, letter, replacement);
            AddToGuessList(guesses, word, index, letter, replacement);
        }
    }
    return guesses;
}
/// <summary>
/// Fixes common OCR errors in a single word: optional hard-coded character fixes, the
/// "always" partial-word list, then the whole-word replace list (tried before and after the
/// hard-coded I/1/0/l fixes). Leading and trailing markup/punctuation is set aside and re-attached.
/// </summary>
/// <param name="word">The word to fix, possibly with surrounding punctuation or italic tags.</param>
/// <returns>The fixed word with its prefix and suffix in their original order.</returns>
public string FixCommonWordErrors(string word)
{
    if (Configuration.Settings.Tools.OcrFixUseHardcodedRules)
    {
        // NOTE(review): in the reviewed source the "fi" replacement was a no-op and two char literals
        // were empty; they are reconstructed here as the fi-ligature (U+FB01) and the typographic
        // single quotes (U+2019, U+2018) - confirm against the original file.
        word = word.Replace("\uFB01", "fi");
        word = word.Replace('ν', 'v'); // NOTE: first 'v' is a special unicode character!!!!
        word = word.Replace('\u2019', '\'');
        word = word.Replace('`', '\'');
        word = word.Replace('\u2018', '\'');
        word = word.Replace('—', '-');
        while (word.Contains("--"))
            word = word.Replace("--", "-");
        word = word.Replace('|', 'l');
        word = word.Replace("vx/", "w");
        if (word.Contains('¤'))
        {
            if (Regex.IsMatch(word, "[A-ZÆØÅÄÖÉÈÀÙÂÊÎÔÛËÏa-zæøåäöéèàùâêîôûëï]¤"))
                word = word.Replace('¤', 'o');
        }
    }

    //always replace list
    foreach (var rule in _partialWordReplaceListAlways)
        word = word.Replace(rule.Key, rule.Value);

    string newLine = Environment.NewLine; // its length differs between platforms
    string pre = string.Empty;
    string post = string.Empty;

    // Prefix: stripped left to right, so it is appended to "pre".
    if (word.StartsWith("<i>", StringComparison.Ordinal))
    {
        pre += "<i>";
        word = word.Remove(0, 3);
    }
    while (word.Length > newLine.Length && word.StartsWith(newLine, StringComparison.Ordinal))
    {
        pre += newLine;
        word = word.Substring(newLine.Length);
    }
    foreach (char c in "-.\"")
    {
        while (word.Length > 1 && word[0] == c)
        {
            pre += c;
            word = word.Substring(1);
        }
    }
    if (word.Length > 1 && word[0] == '(')
    {
        pre += "(";
        word = word.Substring(1);
    }
    if (word.StartsWith("<i>", StringComparison.Ordinal))
    {
        pre += "<i>";
        word = word.Remove(0, 3);
    }

    // Suffix: stripped right to left, so each piece is PREPENDED to keep the original order.
    while (word.Length > newLine.Length && word.EndsWith(newLine, StringComparison.Ordinal))
    {
        post = newLine + post;
        word = word.Substring(0, word.Length - newLine.Length);
    }
    foreach (char c in "\".,?!)")
    {
        while (word.Length > 1 && word[word.Length - 1] == c)
        {
            post = c + post;
            word = word.Substring(0, word.Length - 1);
        }
    }
    if (word.EndsWith("</i>", StringComparison.Ordinal))
    {
        post = "</i>" + post;
        word = word.Remove(word.Length - 4, 4);
    }

    if (word.Length == 0)
        return pre + word + post;

    // "What?No" -> "What? No"
    if (word.Contains('?'))
    {
        var match = RegExQuestion.Match(word);
        if (match.Success)
            word = word.Insert(match.Index + 2, " ");
    }

    string replaced;
    if (TryGetWholeWordReplacement(pre, word, post, out replaced))
        return replaced;

    if (Configuration.Settings.Tools.OcrFixUseHardcodedRules)
    {
        // uppercase I or 1 inside lowercase word (will be replaced by lowercase L)
        word = FixIor1InsideLowerCaseWord(word);
        // digit 0 inside lowercase word (will be replaced by lowercase o)
        word = Fix0InsideLowerCaseWord(word);
        // run again: the 0 fix may have exposed new letter + I/1 pairs
        word = FixIor1InsideLowerCaseWord(word);
        word = FixLowerCaseLInsideUpperCaseWord(word); // eg. SCARLETTl => SCARLETTI
    }

    // Retry the whole-word replace list with the corrected word.
    if (TryGetWholeWordReplacement(pre, word, post, out replaced))
        return replaced;

    // Return the corrected word (previously a stale copy taken before the fixes was returned).
    return pre + word + post;
}

/// <summary>
/// Scans the whole-word replace list for a key equal to the bare word, the word plus suffix,
/// or prefix + word + suffix, and builds the replacement text for the first key that matches.
/// </summary>
private bool TryGetWholeWordReplacement(string pre, string word, string post, out string result)
{
    string whole = pre + word + post;
    foreach (var rule in WordReplaceList)
    {
        string from = rule.Key;
        if (word.Length == from.Length)
        {
            if (string.CompareOrdinal(word, from) == 0)
            {
                result = pre + rule.Value + post;
                return true;
            }
        }
        else if (word.Length + post.Length == from.Length)
        {
            if (string.CompareOrdinal(word + post, from) == 0)
            {
                result = pre + rule.Value;
                return true;
            }
        }
        if (whole.Length == from.Length && string.CompareOrdinal(whole, from) == 0)
        {
            result = rule.Value;
            return true;
        }
    }
    result = null;
    return false;
}
/// <summary>
/// Replaces every lowercase 'l' with uppercase 'I' when the word is longer than three characters,
/// is unchanged by upper-casing once its 'l' characters are removed, and contains none of
/// '&lt;', '&gt;' or an apostrophe (eg. SCARLETTl => SCARLETTI).
/// </summary>
public static string FixLowerCaseLInsideUpperCaseWord(string word)
{
    if (word.Length <= 3)
        return word;

    string withoutL = word.Replace("l", string.Empty);
    if (withoutL.ToUpper() != withoutL)
        return word;

    // Markup and contractions are left alone.
    if (word.IndexOfAny(new[] { '<', '>', '\'' }) >= 0)
        return word;

    return word.Replace('l', 'I');
}
/// <summary>
/// Replaces an uppercase 'I' or digit '1' that directly follows a lowercase letter with a
/// lowercase 'l' (e.g. "heIlo" -> "hello"). Words that start with digits and end with a digit,
/// contain any of the digits 2-9, or look like hex numbers are returned unchanged, and an 'I'
/// right after "Mc" or "Mac" (as in "McIntyre") is kept.
/// </summary>
public static string FixIor1InsideLowerCaseWord(string word)
{
    if (StartsAndEndsWithNumber.IsMatch(word))
        return word;
    if (word.Contains(new[] { '2', '3', '4', '5', '6', '7', '8', '9' }))
        return word;
    if (HexNumber.IsMatch(word))
        return word;

    if (word.LastIndexOf('I') > 0 || word.LastIndexOf('1') > 0)
    {
        var match = RegExIandZero.Match(word);
        while (match.Success)
        {
            int suspectIndex = match.Index + 1;
            char suspect = word[suspectIndex];
            if (suspect == 'I' || suspect == '1')
            {
                // The exception applies to 'I' only. The previous expression was inverted and
                // enabled the fix solely for a '1' after "Mc", so ordinary words were never fixed.
                bool isNamePrefix = suspect == 'I' &&
                                    ((match.Index >= 1 && word.Substring(match.Index - 1).StartsWith("Mc", StringComparison.Ordinal)) ||
                                     (match.Index >= 2 && word.Substring(match.Index - 2).StartsWith("Mac", StringComparison.Ordinal)));
                if (!isNamePrefix)
                    word = word.Remove(suspectIndex, 1).Insert(suspectIndex, "l");
            }
            match = RegExIandZero.Match(word, match.Index + 1);
        }
    }
    return word;
}
/// <summary>
/// Replaces the digit '0' with the letter 'o' where it touches a lowercase letter.
/// Words that start with digits and end with a digit, contain any of the digits 1-9,
/// end in am/pm/a.m/p.m, or look like hex numbers are returned unchanged.
/// </summary>
public static string Fix0InsideLowerCaseWord(string word)
{
    if (StartsAndEndsWithNumber.IsMatch(word))
        return word;

    bool hasOtherDigits = word.Contains(new[] { '1', '2', '3', '4', '5', '6', '7', '8', '9' });
    bool endsLikeTime = word.EndsWith("a.m", StringComparison.Ordinal) ||
                        word.EndsWith("p.m", StringComparison.Ordinal) ||
                        word.EndsWith("am", StringComparison.Ordinal) ||
                        word.EndsWith("pm", StringComparison.Ordinal);
    if (hasOtherDigits || endsLikeTime)
        return word;

    if (HexNumber.IsMatch(word))
        return word;

    if (word.LastIndexOf('0') <= 0)
        return word;

    // Pass 1: lowercase letter followed by '0'. Each replacement removes the match, so restarting
    // the search from the beginning terminates.
    for (Match m = RegExTime1.Match(word); m.Success; m = RegExTime1.Match(word))
    {
        int zeroIndex = m.Index + 1;
        if (word[zeroIndex] == '0')
            word = word.Remove(zeroIndex, 1).Insert(zeroIndex, "o");
    }

    // Pass 2: '0' followed by a lowercase letter, unless the '0' comes right after a digit 1-9.
    const string expectedDigits = "123456789";
    for (Match m = RegExTime2.Match(word); m.Success; m = RegExTime2.Match(word, m.Index + 1))
    {
        int zeroIndex = m.Index;
        if (word[zeroIndex] != '0')
            continue;

        bool followsDigit = zeroIndex > 0 && expectedDigits.Contains(word[zeroIndex - 1]);
        if (!followsDigit)
            word = word.Remove(zeroIndex, 1).Insert(zeroIndex, "o");
    }
    return word;
}
/// <summary>
/// Applies the "always" partial-word list and then a single whole-word replace-list lookup.
/// Leading and trailing markup/punctuation is set aside and re-attached.
/// </summary>
/// <param name="word">The word to fix, possibly with surrounding punctuation or italic tags.</param>
/// <returns>The replacement when a whole-word rule matches; otherwise prefix + word + suffix.</returns>
public string FixCommonWordErrorsQuick(string word)
{
    //always replace list
    foreach (var rule in _partialWordReplaceListAlways)
        word = word.Replace(rule.Key, rule.Value);

    string newLine = Environment.NewLine; // its length differs between platforms
    string pre = string.Empty;
    string post = string.Empty;

    // Prefix: stripped left to right, so it is appended to "pre".
    if (word.StartsWith("<i>", StringComparison.Ordinal))
    {
        pre += "<i>";
        word = word.Remove(0, 3);
    }
    while (word.Length > newLine.Length && word.StartsWith(newLine, StringComparison.Ordinal))
    {
        pre += newLine;
        word = word.Substring(newLine.Length);
    }
    foreach (char c in "-.\"")
    {
        while (word.Length > 1 && word[0] == c)
        {
            pre += c;
            word = word.Substring(1);
        }
    }
    if (word.Length > 1 && word[0] == '(')
    {
        pre += "(";
        word = word.Substring(1);
    }
    if (word.StartsWith("<i>", StringComparison.Ordinal))
    {
        pre += "<i>";
        word = word.Remove(0, 3);
    }

    // Suffix: stripped right to left, so each piece is PREPENDED to keep the original order.
    while (word.Length > newLine.Length && word.EndsWith(newLine, StringComparison.Ordinal))
    {
        post = newLine + post;
        word = word.Substring(0, word.Length - newLine.Length);
    }
    foreach (char c in "\".,?!)")
    {
        while (word.Length > 1 && word[word.Length - 1] == c)
        {
            post = c + post;
            word = word.Substring(0, word.Length - 1);
        }
    }
    if (word.EndsWith("</i>", StringComparison.Ordinal))
    {
        post = "</i>" + post;
        word = word.Remove(word.Length - 4, 4);
    }

    string preWordPost = pre + word + post;
    if (word.Length == 0)
        return preWordPost;

    foreach (var rule in WordReplaceList)
    {
        string from = rule.Key;
        if (word.Length == from.Length)
        {
            if (string.CompareOrdinal(word, from) == 0)
                return pre + rule.Value + post;
        }
        else if (word.Length + post.Length == from.Length)
        {
            if (string.CompareOrdinal(word + post, from) == 0)
                return pre + rule.Value;
        }
        if (preWordPost.Length == from.Length && string.CompareOrdinal(preWordPost, from) == 0)
            return rule.Value;
    }
    return preWordPost;
}
/// <summary>
/// Removes an entry from the user's replace list file and, on success, from the in-memory list.
/// Text containing a space is handled as a partial line, anything else as a whole word.
/// </summary>
/// <returns>True when the file-level delete reported a change.</returns>
public bool RemoveWordOrPartial(string word)
{
    bool isPartialLine = word.Contains(' ');
    if (isPartialLine)
    {
        if (!DeletePartialLineFromWordList(word))
            return false;
        PartialLineWordBoundaryReplaceList.Remove(word); // no-op when absent
        return true;
    }

    if (!DeleteWordFromWordList(word))
        return false;
    WordReplaceList.Remove(word); // no-op when absent
    return true;
}
/// <summary>
/// Reloads the main and user "WholeWords" lists from disk and deletes
/// <paramref name="fromWord"/> via <see cref="DeleteFromList"/> using "Word" elements.
/// </summary>
private bool DeleteWordFromWordList(string fromWord)
{
    const string section = "WholeWords";
    Dictionary<string, string> mainList = LoadReplaceList(LoadXmlReplaceListDocument(), section);
    XmlDocument userDoc = LoadXmlReplaceListUserDocument();
    Dictionary<string, string> userList = LoadReplaceList(userDoc, section);
    return DeleteFromList(fromWord, userDoc, section, "Word", mainList, userList);
}
/// <summary>
/// Reloads the main and user "PartialLines" lists from disk and deletes
/// <paramref name="fromWord"/> via <see cref="DeleteFromList"/> using "LinePart" elements.
/// </summary>
private bool DeletePartialLineFromWordList(string fromWord)
{
    const string section = "PartialLines";
    Dictionary<string, string> mainList = LoadReplaceList(LoadXmlReplaceListDocument(), section);
    XmlDocument userDoc = LoadXmlReplaceListUserDocument();
    Dictionary<string, string> userList = LoadReplaceList(userDoc, section);
    return DeleteFromList(fromWord, userDoc, section, "LinePart", mainList, userList);
}
/// <summary>
/// Deletes <paramref name="word"/> from the user document. A user-defined entry is dropped by
/// rewriting the <paramref name="replaceListName"/> section from <paramref name="userDictionary"/>;
/// an entry of the main list is suppressed by appending it to the "Removed" + list-name section.
/// The user document is saved after each change.
/// </summary>
/// <returns>True when at least one of the two sections was changed and saved.</returns>
/// <exception cref="ArgumentNullException">Either dictionary is null.</exception>
private bool DeleteFromList(string word, XmlDocument userDoc, string replaceListName, string elementName, Dictionary<string, string> dictionary, Dictionary<string, string> userDictionary)
{
    if (dictionary == null)
        throw new ArgumentNullException("dictionary");
    if (userDictionary == null)
        throw new ArgumentNullException("userDictionary");

    bool removed = false;

    // Remove() reports whether the key existed, which replaces the ContainsKey check.
    if (userDictionary.Remove(word))
    {
        XmlNode listNode = userDoc.DocumentElement.SelectSingleNode(replaceListName);
        if (listNode != null)
        {
            listNode.RemoveAll();
            foreach (var entry in userDictionary)
            {
                XmlElement element = userDoc.CreateElement(elementName);
                element.SetAttribute("to", entry.Value);
                element.SetAttribute("from", entry.Key);
                listNode.AppendChild(element);
            }
            userDoc.Save(ReplaceListXmlFileNameUser);
            removed = true;
        }
    }

    if (dictionary.ContainsKey(word))
    {
        XmlNode removedNode = userDoc.DocumentElement.SelectSingleNode("Removed" + replaceListName);
        if (removedNode != null)
        {
            XmlElement element = userDoc.CreateElement(elementName);
            element.SetAttribute("to", string.Empty);
            element.SetAttribute("from", word);
            removedNode.AppendChild(element);
            userDoc.Save(ReplaceListXmlFileNameUser);
            removed = true;
        }
    }

    return removed;
}
/// <summary>
/// Loads the main replace-list XML file. When the file is missing or cannot be loaded,
/// an empty skeleton document is returned instead.
/// </summary>
private XmlDocument LoadXmlReplaceListDocument()
{
    const string xmlText = "<ReplaceList><WholeWords/><PartialLines/><BeginLines/><EndLines/><WholeLines/></ReplaceList>";
    var doc = new XmlDocument();
    bool loadedFromFile = false;
    if (File.Exists(_replaceListXmlFileName))
    {
        try
        {
            doc.Load(_replaceListXmlFileName);
            loadedFromFile = true;
        }
        catch
        {
            // Best effort: any load failure falls back to the skeleton below.
        }
    }
    if (!loadedFromFile)
        doc.LoadXml(xmlText);
    return doc;
}
/// <summary>
/// Path of the user replace-list file: same directory and extension as the main file,
/// with "_User" appended to the file name.
/// </summary>
private string ReplaceListXmlFileNameUser
{
    get
    {
        string directory = Path.GetDirectoryName(_replaceListXmlFileName);
        string baseName = Path.GetFileNameWithoutExtension(_replaceListXmlFileName);
        string extension = Path.GetExtension(_replaceListXmlFileName);
        return Path.Combine(directory, baseName + "_User" + extension);
    }
}
/// <summary>
/// Loads the user replace-list XML file. When the file is missing or cannot be loaded,
/// an empty skeleton document (including the "Removed..." sections) is returned instead.
/// </summary>
private XmlDocument LoadXmlReplaceListUserDocument()
{
    const string xmlText = "<ReplaceList><WholeWords/><PartialLines/><BeginLines/><EndLines/><WholeLines/><RemovedWholeWords/><RemovedPartialLines/><RemovedBeginLines/><RemovedEndLines/><RemovedWholeLines/></ReplaceList>";
    var doc = new XmlDocument();
    string userFileName = ReplaceListXmlFileNameUser;
    bool loadedFromFile = false;
    if (File.Exists(userFileName))
    {
        try
        {
            doc.Load(userFileName);
            loadedFromFile = true;
        }
        catch
        {
            // Best effort: any load failure falls back to the skeleton below.
        }
    }
    if (!loadedFromFile)
        doc.LoadXml(xmlText);
    return doc;
}
/// <summary>
/// Adds a replacement to the user's replace list file and, on success, to the in-memory list.
/// A <paramref name="fromWord"/> containing a space is stored as a partial line, anything else as a whole word.
/// </summary>
/// <returns>True when the file-level save reported success.</returns>
public bool AddWordOrPartial(string fromWord, string toWord)
{
    bool isPartialLine = fromWord.Contains(' ');
    if (isPartialLine)
    {
        if (!SavePartialLineToWordList(fromWord, toWord))
            return false;
        if (!PartialLineWordBoundaryReplaceList.ContainsKey(fromWord))
            PartialLineWordBoundaryReplaceList.Add(fromWord, toWord);
        return true;
    }

    if (!SaveWordToWordList(fromWord, toWord))
        return false;
    if (!WordReplaceList.ContainsKey(fromWord))
        WordReplaceList.Add(fromWord, toWord);
    return true;
}
/// <summary>
/// Reloads the main and user "WholeWords" lists from disk and stores the pair
/// via <see cref="SaveToList"/> as a "Word" element.
/// </summary>
private bool SaveWordToWordList(string fromWord, string toWord)
{
    const string section = "WholeWords";
    Dictionary<string, string> mainList = LoadReplaceList(LoadXmlReplaceListDocument(), section);
    XmlDocument userDoc = LoadXmlReplaceListUserDocument();
    Dictionary<string, string> userList = LoadReplaceList(userDoc, section);
    return SaveToList(fromWord, toWord, userDoc, section, "Word", mainList, userList);
}
/// <summary>
/// Reloads the main and user "PartialLines" lists from disk and stores the pair
/// via <see cref="SaveToList"/> as a "LinePart" element.
/// </summary>
private bool SavePartialLineToWordList(string fromWord, string toWord)
{
    const string section = "PartialLines";
    Dictionary<string, string> mainList = LoadReplaceList(LoadXmlReplaceListDocument(), section);
    XmlDocument userDoc = LoadXmlReplaceListUserDocument();
    Dictionary<string, string> userList = LoadReplaceList(userDoc, section);
    return SaveToList(fromWord, toWord, userDoc, section, "LinePart", mainList, userList);
}
/// <summary>
/// Adds the pair to <paramref name="userDictionary"/> and, when the
/// <paramref name="replaceListName"/> section exists in the user document, appends a matching
/// element there and saves the user file.
/// </summary>
/// <returns>False when the user list already has <paramref name="fromWord"/>; otherwise true.</returns>
/// <exception cref="ArgumentNullException">Either dictionary is null.</exception>
private bool SaveToList(string fromWord, string toWord, XmlDocument userDoc, string replaceListName, string elementName, Dictionary<string, string> dictionary, Dictionary<string, string> userDictionary)
{
    if (dictionary == null)
        throw new ArgumentNullException("dictionary");
    if (userDictionary == null)
        throw new ArgumentNullException("userDictionary");

    if (userDictionary.ContainsKey(fromWord))
        return false;
    userDictionary.Add(fromWord, toWord);

    XmlNode listNode = userDoc.DocumentElement.SelectSingleNode(replaceListName);
    if (listNode == null)
        return true; // nothing to persist to, the in-memory add still counts

    XmlElement element = userDoc.CreateElement(elementName);
    element.SetAttribute("from", fromWord);
    element.SetAttribute("to", toWord);
    listNode.AppendChild(element);
    userDoc.Save(ReplaceListXmlFileNameUser);
    return true;
}
/// <summary>
/// Adds a whole-line replacement to the in-memory list and appends it to the "WholeLines"
/// section of the user document, which is then saved to the user file.
/// </summary>
/// <param name="fromLine">The line text to replace.</param>
/// <param name="toLine">The replacement line.</param>
public void AddToWholeLineList(string fromLine, string toLine)
{
    var userDocument = LoadXmlReplaceListUserDocument();
    if (!_wholeLineReplaceList.ContainsKey(fromLine))
        _wholeLineReplaceList.Add(fromLine, toLine);

    XmlNode wholeLinesNode = userDocument.DocumentElement.SelectSingleNode("WholeLines");
    if (wholeLinesNode != null)
    {
        XmlNode newNode = userDocument.CreateNode(XmlNodeType.Element, "Line", null);
        XmlAttribute aFrom = userDocument.CreateAttribute("from");
        XmlAttribute aTo = userDocument.CreateAttribute("to");
        aTo.InnerText = toLine;
        aFrom.InnerText = fromLine;
        newNode.Attributes.Append(aFrom);
        newNode.Attributes.Append(aTo);
        wholeLinesNode.AppendChild(newNode);
        // The document was loaded from the user file, so it must be written back there.
        // Saving it under the main file name replaced the main replace list with the user's entries.
        userDocument.Save(ReplaceListXmlFileNameUser);
    }
}
/// <summary>
/// Replaces occurrences of <paramref name="word"/> in <paramref name="text"/> with
/// <paramref name="newWord"/>, but only where the occurrence is delimited on both sides by the
/// start/end of the text or one of the boundary characters (or a newline character).
/// A <paramref name="word"/> starting with a space satisfies the start condition, and a
/// <paramref name="newWord"/> ending with a space satisfies the end condition.
/// </summary>
/// <returns>
/// The text with replacements applied. When <paramref name="word"/> is null or does not occur,
/// <paramref name="text"/> is returned unchanged (previously an empty string was returned);
/// a null <paramref name="text"/> yields an empty string.
/// </returns>
public static string ReplaceWord(string text, string word, string newWord)
{
    if (text == null)
        return string.Empty;
    if (word == null || !text.Contains(word))
        return text;

    const string startChars = @" ¡¿<>-""”“()[]'`´¶♪¿¡.…—!?,:;/";
    string boundaryChars = startChars + Environment.NewLine; // hoisted out of the loop
    bool wordStartsWithSpace = word.Length > 0 && word[0] == ' ';
    bool newWordEndsWithSpace = !string.IsNullOrEmpty(newWord) && newWord[newWord.Length - 1] == ' ';

    var sb = new StringBuilder(text.Length);
    int appendFrom = 0; // characters before this index were already consumed by a replacement
    for (int i = 0; i < text.Length; i++)
    {
        // Ordinal, allocation-free comparison in place of Substring(i).StartsWith(word).
        if (i >= appendFrom && string.CompareOrdinal(text, i, word, 0, word.Length) == 0)
        {
            bool startOk = i == 0 || boundaryChars.IndexOf(text[i - 1]) >= 0 || wordStartsWithSpace;
            if (startOk)
            {
                int end = i + word.Length;
                bool endOk = end == text.Length || boundaryChars.IndexOf(text[end]) >= 0 || newWordEndsWithSpace;
                if (endOk)
                {
                    sb.Append(newWord);
                    appendFrom = end;
                }
            }
        }
        if (i >= appendFrom)
            sb.Append(text[i]);
    }
    return sb.ToString();
}
}
}