Some more minor fixes in "Fix common errors"

git-svn-id: https://subtitleedit.googlecode.com/svn/trunk@193 99eadd0c-20b8-1223-b5c4-2a2b2df33de2
This commit is contained in:
niksedk 2010-12-25 15:13:57 +00:00
parent 560a146afa
commit 442724cf67
3 changed files with 72 additions and 17 deletions

View File

@ -774,6 +774,10 @@ namespace Nikse.SubtitleEdit.Forms
public void FixUnneededSpaces()
{
const string zeroWhiteSpace = "\u200B";
const string zeroWidthNoBreakSpace = "\uFEFF";
string fixAction = _language.UnneededSpace;
int doubleSpaces = 0;
for (int i = 0; i < _subtitle.Paragraphs.Count; i++)
@ -782,6 +786,10 @@ namespace Nikse.SubtitleEdit.Forms
string oldText = p.Text;
p.Text = p.Text.Trim();
p.Text = p.Text.Replace(zeroWhiteSpace, string.Empty);
p.Text = p.Text.Replace(zeroWidthNoBreakSpace, string.Empty);
p.Text = p.Text.Replace("", string.Empty); // some kind of hidden space!!!
while (p.Text.Contains(" "))
{
p.Text = p.Text.Replace(" ", " ");
@ -1318,6 +1326,12 @@ namespace Nikse.SubtitleEdit.Forms
else if (match.Index > Environment.NewLine.Length + 1 && Environment.NewLine.Contains(st.StrippedText[match.Index - 1].ToString()))
{
if ((Utilities.GetLetters(true, true, true) + ",").Contains(st.StrippedText[match.Index - (Environment.NewLine.Length + 1)].ToString()))
{
string next = string.Empty;
if (match.Length >= 2)
next = match.Value.Substring(1, 1);
if (Utilities.LowerCaseVowels.Contains(next))
{
st.StrippedText = st.StrippedText.Remove(match.Index, 1).Insert(match.Index, "l");
p.Text = st.MergedString;
@ -1326,6 +1340,7 @@ namespace Nikse.SubtitleEdit.Forms
AddFixToListView(p, i + 1, fixAction, oldText, p.Text);
}
}
}
else if (match.Index > 1 && ((st.StrippedText[match.Index - 1] == '\"') || (st.StrippedText[match.Index - 1] == '>') || (st.StrippedText[match.Index - 1] == '-')))
{
}
@ -1347,6 +1362,11 @@ namespace Nikse.SubtitleEdit.Forms
AddFixToListView(p, i + 1, fixAction, oldText, p.Text);
}
else
{
if (before == " " && !Utilities.LowerCaseVowels.Contains(after.ToLower()))
{
}
else
{
st.StrippedText = st.StrippedText.Remove(match.Index, 1).Insert(match.Index, "l");
p.Text = st.MergedString;
@ -1356,6 +1376,7 @@ namespace Nikse.SubtitleEdit.Forms
}
}
}
}
}
}
@ -1699,7 +1720,7 @@ namespace Nikse.SubtitleEdit.Forms
{
string text = p.Text.Substring(indexOfNewLine + 2);
StripableText st = new StripableText(text);
if (st.StrippedText.Length > 0 && st.StrippedText[0].ToString() != st.StrippedText[0].ToString().ToUpper())
if (st.StrippedText.Length > 0 && st.StrippedText[0].ToString() != st.StrippedText[0].ToString().ToUpper() && !st.Pre.EndsWith("[") && !st.Pre.Contains("..."))
{
text = st.Pre + st.StrippedText.Remove(0, 1).Insert(0, st.StrippedText[0].ToString().ToUpper()) + st.Post;
@ -1858,7 +1879,12 @@ namespace Nikse.SubtitleEdit.Forms
prev = s[match.Index - 1].ToString();
if (match.Index + 1 < s.Length)
next = s[match.Index + 1].ToString();
if (prev != ">" && next != ">" && next != "}")
string wholePrev = string.Empty;
if (match.Index > 1)
wholePrev = s.Substring(0, match.Index - 1);
if (prev != ">" && next != ">" && next != "}" && !wholePrev.Trim().EndsWith("..."))
{
string temp = s.Substring(0, match.Index) + "I";
if (match.Index + 1 < oldText.Length)

View File

@ -752,7 +752,7 @@ namespace Nikse.SubtitleEdit.Forms
}
string line = GetStringWithItalicTags(matches);
if (checkBoxAutoFixCommonErrors.Checked)
line = OcrFixEngine.FixOcrErrorsViaHardcodedRules(line, _lastLine);
line = OcrFixEngine.FixOcrErrorsViaHardcodedRules(line, _lastLine, null); // TODO: add abbreviations list
return line;
}

View File

@ -27,6 +27,7 @@ namespace Nikse.SubtitleEdit.Logic.OCR
List<string> _namesEtcList = new List<string>();
List<string> _namesEtcListUppercase = new List<string>();
List<string> _namesEtcMultiWordList = new List<string>(); // case sensitive phrases
List<string> _abbreviationList;
List<string> _userWordList = new List<string>();
List<string> _wordSkipList = new List<string>();
Hunspell _hunspell;
@ -120,6 +121,19 @@ namespace Nikse.SubtitleEdit.Logic.OCR
_userWordList = new List<string>();
_userWordListXmlFileName = Utilities.LoadUserWordList(_userWordList, _fiveLetterWordListLanguageName);
// Find abbreviations
_abbreviationList = new List<string>();
foreach (string name in _namesEtcList)
{
if (name.EndsWith("."))
_abbreviationList.Add(name);
}
foreach (string name in _userWordList)
{
if (name.EndsWith("."))
_abbreviationList.Add(name);
}
// Load NHunspell spellchecker
_hunspell = new Hunspell(dictionary + ".aff", dictionary + ".dic");
IsDictionaryLoaded = true;
@ -455,7 +469,7 @@ namespace Nikse.SubtitleEdit.Logic.OCR
private string FixCommenOcrLineErrors(string input, string lastLine)
{
input = FixOcrErrorsViaHardcodedRules(input, lastLine);
input = FixOcrErrorsViaHardcodedRules(input, lastLine, _abbreviationList);
input = FixOcrErrorViaLineReplaceList(input);
// e.g. "selectionsu." -> "selections..."
@ -481,16 +495,31 @@ namespace Nikse.SubtitleEdit.Logic.OCR
return input;
}
public static string FixOcrErrorsViaHardcodedRules(string input, string lastLine)
/// <summary>
/// Abbreviations that are always recognised, in addition to the caller-supplied list.
/// Kept in a separate read-only array so the caller's list is never modified.
/// </summary>
private static readonly string[] BuiltInAbbreviations = { "a.m.", "p.m.", "o.r." };

/// <summary>
/// Determines whether <paramref name="line"/> ends with a known abbreviation
/// preceded by a space (compared case-insensitively via lower-casing).
/// </summary>
/// <param name="line">The text line to inspect; may be null or empty.</param>
/// <param name="abbreviationList">
/// Caller-supplied abbreviations (each normally ending with a period).
/// The list is only read, never modified.
/// </param>
/// <returns>
/// <c>true</c> if the line ends with " " followed by a built-in or supplied abbreviation;
/// <c>false</c> if no abbreviation matches, or if <paramref name="line"/> is null/empty
/// or <paramref name="abbreviationList"/> is null.
/// </returns>
private static bool EndsWithAbbreviation(string line, List<string> abbreviationList)
{
    if (string.IsNullOrEmpty(line) || abbreviationList == null)
        return false;

    // Lower-case the line once instead of once per candidate abbreviation.
    string lowerLine = line.ToLower();

    foreach (string abbreviation in BuiltInAbbreviations)
    {
        if (lowerLine.EndsWith(" " + abbreviation))
            return true;
    }

    foreach (string abbreviation in abbreviationList)
    {
        // Skip null entries rather than throwing on ToLower().
        if (abbreviation != null && lowerLine.EndsWith(" " + abbreviation.ToLower()))
            return true;
    }
    return false;
}
public static string FixOcrErrorsViaHardcodedRules(string input, string lastLine, List<string> abbreviationList)
{
if (lastLine == null ||
lastLine.EndsWith(".") ||
lastLine.EndsWith("!") ||
lastLine.EndsWith("?"))
{
if (lastLine == null || !lastLine.EndsWith("..."))
if (lastLine == null || (!lastLine.EndsWith("...") && !EndsWithAbbreviation(lastLine, abbreviationList)))
{
if (input.Length > 0 && input[0].ToString() != input[0].ToString().ToUpper())
input = input.Remove(0, 1).Insert(0, input[0].ToString().ToUpper());
}