Some more minor fixes in "Fix common errors"

git-svn-id: https://subtitleedit.googlecode.com/svn/trunk@193 99eadd0c-20b8-1223-b5c4-2a2b2df33de2
This commit is contained in:
niksedk 2010-12-25 15:13:57 +00:00
parent 560a146afa
commit 442724cf67
3 changed files with 72 additions and 17 deletions

View File

@ -774,6 +774,10 @@ namespace Nikse.SubtitleEdit.Forms
public void FixUnneededSpaces() public void FixUnneededSpaces()
{ {
const string zeroWhiteSpace = "\u200B";
const string zeroWidthNoBreakSpace = "\uFEFF";
string fixAction = _language.UnneededSpace; string fixAction = _language.UnneededSpace;
int doubleSpaces = 0; int doubleSpaces = 0;
for (int i = 0; i < _subtitle.Paragraphs.Count; i++) for (int i = 0; i < _subtitle.Paragraphs.Count; i++)
@ -782,6 +786,10 @@ namespace Nikse.SubtitleEdit.Forms
string oldText = p.Text; string oldText = p.Text;
p.Text = p.Text.Trim(); p.Text = p.Text.Trim();
p.Text = p.Text.Replace(zeroWhiteSpace, string.Empty);
p.Text = p.Text.Replace(zeroWidthNoBreakSpace, string.Empty);
p.Text = p.Text.Replace("", string.Empty); // some kind of hidden space!!!
while (p.Text.Contains(" ")) while (p.Text.Contains(" "))
{ {
p.Text = p.Text.Replace(" ", " "); p.Text = p.Text.Replace(" ", " ");
@ -1319,11 +1327,18 @@ namespace Nikse.SubtitleEdit.Forms
{ {
if ((Utilities.GetLetters(true, true, true) + ",").Contains(st.StrippedText[match.Index - (Environment.NewLine.Length + 1)].ToString())) if ((Utilities.GetLetters(true, true, true) + ",").Contains(st.StrippedText[match.Index - (Environment.NewLine.Length + 1)].ToString()))
{ {
st.StrippedText = st.StrippedText.Remove(match.Index, 1).Insert(match.Index, "l"); string next = string.Empty;
p.Text = st.MergedString; if (match.Length >= 2)
uppercaseIsInsideLowercaseWords++; next = match.Value.Substring(1, 1);
_totalFixes++;
AddFixToListView(p, i + 1, fixAction, oldText, p.Text); if (Utilities.LowerCaseVowels.Contains(next))
{
st.StrippedText = st.StrippedText.Remove(match.Index, 1).Insert(match.Index, "l");
p.Text = st.MergedString;
uppercaseIsInsideLowercaseWords++;
_totalFixes++;
AddFixToListView(p, i + 1, fixAction, oldText, p.Text);
}
} }
} }
else if (match.Index > 1 && ((st.StrippedText[match.Index - 1] == '\"') || (st.StrippedText[match.Index - 1] == '>') || (st.StrippedText[match.Index - 1] == '-'))) else if (match.Index > 1 && ((st.StrippedText[match.Index - 1] == '\"') || (st.StrippedText[match.Index - 1] == '>') || (st.StrippedText[match.Index - 1] == '-')))
@ -1348,11 +1363,17 @@ namespace Nikse.SubtitleEdit.Forms
} }
else else
{ {
st.StrippedText = st.StrippedText.Remove(match.Index, 1).Insert(match.Index, "l"); if (before == " " && !Utilities.LowerCaseVowels.Contains(after.ToLower()))
p.Text = st.MergedString; {
uppercaseIsInsideLowercaseWords++; }
_totalFixes++; else
AddFixToListView(p, i + 1, fixAction, oldText, p.Text); {
st.StrippedText = st.StrippedText.Remove(match.Index, 1).Insert(match.Index, "l");
p.Text = st.MergedString;
uppercaseIsInsideLowercaseWords++;
_totalFixes++;
AddFixToListView(p, i + 1, fixAction, oldText, p.Text);
}
} }
} }
} }
@ -1699,7 +1720,7 @@ namespace Nikse.SubtitleEdit.Forms
{ {
string text = p.Text.Substring(indexOfNewLine + 2); string text = p.Text.Substring(indexOfNewLine + 2);
StripableText st = new StripableText(text); StripableText st = new StripableText(text);
if (st.StrippedText.Length > 0 && st.StrippedText[0].ToString() != st.StrippedText[0].ToString().ToUpper()) if (st.StrippedText.Length > 0 && st.StrippedText[0].ToString() != st.StrippedText[0].ToString().ToUpper() && !st.Pre.EndsWith("[") && !st.Pre.Contains("..."))
{ {
text = st.Pre + st.StrippedText.Remove(0, 1).Insert(0, st.StrippedText[0].ToString().ToUpper()) + st.Post; text = st.Pre + st.StrippedText.Remove(0, 1).Insert(0, st.StrippedText[0].ToString().ToUpper()) + st.Post;
@ -1858,7 +1879,12 @@ namespace Nikse.SubtitleEdit.Forms
prev = s[match.Index - 1].ToString(); prev = s[match.Index - 1].ToString();
if (match.Index + 1 < s.Length) if (match.Index + 1 < s.Length)
next = s[match.Index + 1].ToString(); next = s[match.Index + 1].ToString();
if (prev != ">" && next != ">" && next != "}")
string wholePrev = string.Empty;
if (match.Index > 1)
wholePrev = s.Substring(0, match.Index - 1);
if (prev != ">" && next != ">" && next != "}" && !wholePrev.Trim().EndsWith("..."))
{ {
string temp = s.Substring(0, match.Index) + "I"; string temp = s.Substring(0, match.Index) + "I";
if (match.Index + 1 < oldText.Length) if (match.Index + 1 < oldText.Length)

View File

@ -752,7 +752,7 @@ namespace Nikse.SubtitleEdit.Forms
} }
string line = GetStringWithItalicTags(matches); string line = GetStringWithItalicTags(matches);
if (checkBoxAutoFixCommonErrors.Checked) if (checkBoxAutoFixCommonErrors.Checked)
line = OcrFixEngine.FixOcrErrorsViaHardcodedRules(line, _lastLine); line = OcrFixEngine.FixOcrErrorsViaHardcodedRules(line, _lastLine, null); // TODO: add abbreviations list
return line; return line;
} }

View File

@ -27,6 +27,7 @@ namespace Nikse.SubtitleEdit.Logic.OCR
List<string> _namesEtcList = new List<string>(); List<string> _namesEtcList = new List<string>();
List<string> _namesEtcListUppercase = new List<string>(); List<string> _namesEtcListUppercase = new List<string>();
List<string> _namesEtcMultiWordList = new List<string>(); // case sensitive phrases List<string> _namesEtcMultiWordList = new List<string>(); // case sensitive phrases
List<string> _abbreviationList;
List<string> _userWordList = new List<string>(); List<string> _userWordList = new List<string>();
List<string> _wordSkipList = new List<string>(); List<string> _wordSkipList = new List<string>();
Hunspell _hunspell; Hunspell _hunspell;
@ -120,6 +121,19 @@ namespace Nikse.SubtitleEdit.Logic.OCR
_userWordList = new List<string>(); _userWordList = new List<string>();
_userWordListXmlFileName = Utilities.LoadUserWordList(_userWordList, _fiveLetterWordListLanguageName); _userWordListXmlFileName = Utilities.LoadUserWordList(_userWordList, _fiveLetterWordListLanguageName);
// Find abbreviations
_abbreviationList = new List<string>();
foreach (string name in _namesEtcList)
{
if (name.EndsWith("."))
_abbreviationList.Add(name);
}
foreach (string name in _userWordList)
{
if (name.EndsWith("."))
_abbreviationList.Add(name);
}
// Load NHunspell spellchecker // Load NHunspell spellchecker
_hunspell = new Hunspell(dictionary + ".aff", dictionary + ".dic"); _hunspell = new Hunspell(dictionary + ".aff", dictionary + ".dic");
IsDictionaryLoaded = true; IsDictionaryLoaded = true;
@ -455,7 +469,7 @@ namespace Nikse.SubtitleEdit.Logic.OCR
private string FixCommenOcrLineErrors(string input, string lastLine) private string FixCommenOcrLineErrors(string input, string lastLine)
{ {
input = FixOcrErrorsViaHardcodedRules(input, lastLine); input = FixOcrErrorsViaHardcodedRules(input, lastLine, _abbreviationList);
input = FixOcrErrorViaLineReplaceList(input); input = FixOcrErrorViaLineReplaceList(input);
// e.g. "selectionsu." -> "selections..." // e.g. "selectionsu." -> "selections..."
@ -481,16 +495,31 @@ namespace Nikse.SubtitleEdit.Logic.OCR
return input; return input;
} }
/// <summary>
/// Abbreviations that are always recognized, in addition to the ones supplied by the caller.
/// Kept in a read-only array so the caller's list never has to be modified.
/// </summary>
private static readonly string[] BuiltInAbbreviations = { "a.m.", "p.m.", "o.r." };

/// <summary>
/// Determines whether <paramref name="line"/> ends with a known abbreviation that is
/// preceded by a space (e.g. "... at 5 p.m."), compared case-insensitively.
/// </summary>
/// <param name="line">The text line to inspect; null or empty yields false.</param>
/// <param name="abbreviationList">
/// Caller-supplied abbreviations. When null the method returns false without
/// checking the built-in abbreviations. The list is only read, never modified.
/// </param>
/// <returns>True if the line ends with " " + an abbreviation; otherwise false.</returns>
private static bool EndsWithAbbreviation(string line, List<string> abbreviationList)
{
    if (string.IsNullOrEmpty(line) || abbreviationList == null)
        return false;

    // The built-ins are checked separately instead of being appended to
    // abbreviationList: appending on every call made the shared list grow
    // without bound and filled it with duplicates.
    foreach (string abbreviation in BuiltInAbbreviations)
    {
        if (line.EndsWith(" " + abbreviation, System.StringComparison.CurrentCultureIgnoreCase))
            return true;
    }

    foreach (string abbreviation in abbreviationList)
    {
        if (abbreviation == null)
            continue; // a null entry cannot match anything; skip it instead of throwing

        if (line.EndsWith(" " + abbreviation, System.StringComparison.CurrentCultureIgnoreCase))
            return true;
    }
    return false;
}
public static string FixOcrErrorsViaHardcodedRules(string input, string lastLine, List<string> abbreviationList)
{ {
if (lastLine == null || if (lastLine == null ||
lastLine.EndsWith(".") || lastLine.EndsWith(".") ||
lastLine.EndsWith("!") || lastLine.EndsWith("!") ||
lastLine.EndsWith("?")) lastLine.EndsWith("?"))
{ {
if (lastLine == null || !lastLine.EndsWith("...")) if (lastLine == null || (!lastLine.EndsWith("...") && !EndsWithAbbreviation(lastLine, abbreviationList)))
{ {
if (input.Length > 0 && input[0].ToString() != input[0].ToString().ToUpper()) if (input.Length > 0 && input[0].ToString() != input[0].ToString().ToUpper())
input = input.Remove(0, 1).Insert(0, input[0].ToString().ToUpper()); input = input.Remove(0, 1).Insert(0, input[0].ToString().ToUpper());
} }