SubtitleEdit/libse/Forms/FixCommonErrors/FixMissingSpaces.cs

337 lines
17 KiB
C#
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

using System;
using System.Text.RegularExpressions;
namespace Nikse.SubtitleEdit.Core.Forms.FixCommonErrors
{
public class FixMissingSpaces : IFixCommonError
{
// A comma directly followed by a non-whitespace character, where the character before the comma
// is neither whitespace nor a digit (presumably so numbers such as "1,000" are left alone - confirm).
private static readonly Regex FixMissingSpacesReComma = new Regex(@"[^\s\d],[^\s]", RegexOptions.Compiled);
// Two lowercase ASCII letters, a period, then an ASCII letter - always a 4-character match, e.g. "nd.T".
private static readonly Regex FixMissingSpacesRePeriod = new Regex(@"[a-z][a-z][.][a-zA-Z]", RegexOptions.Compiled);
// A "?" preceded by a character that is neither whitespace nor a digit and followed by an ASCII letter.
private static readonly Regex FixMissingSpacesReQuestionMark = new Regex(@"[^\s\d]\?[a-zA-Z]", RegexOptions.Compiled);
// Same shape as the question-mark pattern, but for "!".
private static readonly Regex FixMissingSpacesReExclamation = new Regex(@"[^\s\d]\![a-zA-Z]", RegexOptions.Compiled);
// Same shape as the question-mark pattern, but for ":".
private static readonly Regex FixMissingSpacesReColon = new Regex(@"[^\s\d]\:[a-zA-Z]", RegexOptions.Compiled);
// A word character followed by ".com", ".net" or ".org" ending at a word boundary; used to skip the period fix for texts containing urls.
private static readonly Regex Url = new Regex(@"\w\.(?:com|net|org)\b", RegexOptions.Compiled);
/// <summary>
/// Walks every paragraph of <paramref name="subtitle"/> and inserts missing spaces:
/// after "," "?" "!" ":" and "." when text follows directly, after a leading dialog dash in
/// two-line paragraphs, around a single pair of double quotes, next to music symbols (# ♪ ♫),
/// after "...", around italic tags, and - for French - before "!?:;".
/// Paragraph texts are modified in place.
/// </summary>
/// <param name="subtitle">Subtitle whose paragraphs are inspected and updated.</param>
/// <param name="callbacks">
/// Supplies the language code, is asked via <c>AllowFix</c> before a paragraph is changed,
/// receives every applied change via <c>AddFixToListView</c> and the final count via <c>UpdateFixStatus</c>.
/// </param>
public void Fix(Subtitle subtitle, IFixCallbacks callbacks)
{
    var language = Configuration.Settings.Language.FixCommonErrors;
    string languageCode = callbacks.Language;
    string fixAction = language.FixMissingSpace;
    int missingSpaces = 0;
    const string expectedChars = @"""”<.";
    for (int i = 0; i < subtitle.Paragraphs.Count; i++)
    {
        Paragraph p = subtitle.Paragraphs[i];

        // missing space after comma ","
        Match match = FixMissingSpacesReComma.Match(p.Text);
        while (match.Success)
        {
            bool doFix = !expectedChars.Contains(p.Text[match.Index + 2]);
            if (doFix && languageCode == "el")
            {
                // Greek "ό,τι" / "ο,τι" is written without a space after the comma.
                string fromMatch = p.Text.Substring(match.Index);
                if (fromMatch.StartsWith("ό,τι", StringComparison.Ordinal) || fromMatch.StartsWith("ο,τι", StringComparison.Ordinal))
                    doFix = false;
            }
            if (doFix && callbacks.AllowFix(p, fixAction))
            {
                missingSpaces++;
                string oldText = p.Text;
                p.Text = p.Text.Replace(match.Value, match.Value[0] + ", " + match.Value[match.Value.Length - 1]);
                callbacks.AddFixToListView(p, fixAction, oldText, p.Text);
            }
            // Re-match against the CURRENT text. NextMatch() would keep scanning the string as it
            // was before the edit, so match.Index would no longer line up with p.Text, and
            // overlapping hits such as the second comma in "a,b,c" would be skipped.
            match = FixMissingSpacesReComma.Match(p.Text, match.Index + 1);
        }

        // Asked once here and reused for the "?", "!" and ":" passes below.
        bool allowFix = callbacks.AllowFix(p, fixAction);

        // missing space after "?"
        missingSpaces += FixSpaceAfterMark(p, FixMissingSpacesReQuestionMark, allowFix, fixAction, callbacks);

        // missing space after "!"
        missingSpaces += FixSpaceAfterMark(p, FixMissingSpacesReExclamation, allowFix, fixAction, callbacks);

        // missing space after ":"
        match = FixMissingSpacesReColon.Match(p.Text);
        while (match.Success)
        {
            int start = Math.Max(0, match.Index - 4);
            int indexOfStartCodeTag = p.Text.IndexOf('{', start);
            int indexOfEndCodeTag = p.Text.IndexOf('}', start);
            // A '{' shortly before the colon plus a following '}' means we are inside a tag like "{y:i}Is this italic?"
            bool insideCodeTag = indexOfStartCodeTag >= 0 && indexOfEndCodeTag >= 0 && indexOfStartCodeTag < match.Index;
            if (!insideCodeTag && allowFix && !@"""<".Contains(p.Text[match.Index + 2]))
            {
                missingSpaces++;
                string oldText = p.Text;
                p.Text = p.Text.Replace(match.Value, match.Value[0] + ": " + match.Value[match.Value.Length - 1]);
                callbacks.AddFixToListView(p, fixAction, oldText, p.Text);
            }
            match = FixMissingSpacesReColon.Match(p.Text, match.Index + 1);
        }

        // missing space after period "."
        match = FixMissingSpacesRePeriod.Match(p.Text);
        while (match.Success)
        {
            if (!p.Text.Contains("www.", StringComparison.OrdinalIgnoreCase) &&
                !p.Text.Contains("http://", StringComparison.OrdinalIgnoreCase) &&
                !Url.IsMatch(p.Text)) // Skip urls.
            {
                string word = GetWordFromIndex(p.Text, match.Index);

                // More than one period in the surrounding word is treated as an abbreviation ("a.k.a").
                bool isMatchAbbreviation = Utilities.CountTagInText(word, '.') > 1;

                if (!isMatchAbbreviation && word.Contains('@')) // skip emails
                    isMatchAbbreviation = true;

                // The period pattern always yields 4 characters, so the match itself must be
                // compared with "ph.d" (a 3-character "h.d" comparison can never succeed).
                // Also require that the match is not the tail of a longer word such as "graph.data".
                if (match.Value.Equals("ph.d", StringComparison.OrdinalIgnoreCase) &&
                    (match.Index == 0 || !char.IsLetter(p.Text[match.Index - 1])))
                    isMatchAbbreviation = true;

                if (!isMatchAbbreviation && callbacks.AllowFix(p, fixAction))
                {
                    missingSpaces++;
                    string oldText = p.Text;
                    p.Text = p.Text.Replace(match.Value, match.Value.Replace(".", ". "));
                    callbacks.AddFixToListView(p, fixAction, oldText, p.Text);
                }
            }
            // Same reasoning as the comma pass: keep match offsets in sync with the edited text.
            match = FixMissingSpacesRePeriod.Match(p.Text, match.Index + 1);
        }

        // missing space after a leading dialog dash in two-line paragraphs: "-Hi" -> "- Hi"
        if (!p.Text.StartsWith("--", StringComparison.Ordinal))
        {
            var arr = p.Text.SplitToLines();
            if (arr.Length == 2 && arr[0].Length > 1 && arr[1].Length > 1)
            {
                if (arr[0][0] == '-' && arr[0][1] != ' ')
                    arr[0] = arr[0].Insert(1, " ");
                if (arr[0].Length > 6 && arr[0].StartsWith("<i>-", StringComparison.OrdinalIgnoreCase) && arr[0][4] != ' ')
                    arr[0] = arr[0].Insert(4, " ");
                if (arr[1][0] == '-' && arr[1][1] != ' ' && arr[1][1] != '-')
                    arr[1] = arr[1].Insert(1, " ");
                if (arr[1].Length > 6 && arr[1].StartsWith("<i>-", StringComparison.OrdinalIgnoreCase) && arr[1][4] != ' ')
                    arr[1] = arr[1].Insert(4, " ");
                if (ApplyIfChanged(p, arr[0] + Environment.NewLine + arr[1], fixAction, callbacks))
                    missingSpaces++;
            }
        }

        //fix missing spaces before/after quotes - Get a"get out of jail free"card. -> Get a "get out of jail free" card.
        if (Utilities.CountTagInText(p.Text, '"') == 2)
        {
            int start = p.Text.IndexOf('"');
            int end = p.Text.LastIndexOf('"');
            string quote = p.Text.Substring(start, end - start + 1);
            if (!quote.Contains(Environment.NewLine))
            {
                string newText = p.Text;
                int indexOfFontTag = newText.IndexOf("<font ", StringComparison.OrdinalIgnoreCase);
                bool isAfterAssTag = newText.Contains("{\\") && start > 0 && newText[start - 1] == '}';
                if (!isAfterAssTag && start > 0 && !(Environment.NewLine + @" >[(♪♫¿").Contains(p.Text[start - 1]))
                {
                    if (indexOfFontTag < 0 || start > newText.IndexOf('>', indexOfFontTag)) // font tags can contain "
                    {
                        newText = newText.Insert(start, " ");
                        end++; // closing quote moved one position to the right in newText
                    }
                }
                // "end" is an index into newText from here on, so the following character must be
                // read from newText as well (p.Text would be off by one after the insert above).
                if (end < newText.Length - 2 && !(Environment.NewLine + @" <,.!?:;])♪♫¿").Contains(newText[end + 1]))
                {
                    if (indexOfFontTag < 0 || end > newText.IndexOf('>', indexOfFontTag)) // font tags can contain "
                    {
                        newText = newText.Insert(end + 1, " ");
                    }
                }
                if (ApplyIfChanged(p, newText, fixAction, callbacks))
                    missingSpaces++;
            }
        }

        //fix missing spaces before/after music quotes - #He's so happy# -> # He's so happy #
        if (p.Text.Length > 5 && p.Text.Contains(new[] { '#', '♪', '♫' }))
        {
            string newText = p.Text;
            if (@"#♪♫".Contains(newText[0]) && !@" <".Contains(newText[1]) &&
                !newText.Substring(1).StartsWith(Environment.NewLine, StringComparison.Ordinal) &&
                newText[1] != '♪' && newText[1] != '♫')
                newText = newText.Insert(1, " ");
            if (@"#♪♫".Contains(newText[newText.Length - 1]) && !@" >".Contains(newText[newText.Length - 2]) &&
                !newText.Substring(0, newText.Length - 1).EndsWith(Environment.NewLine, StringComparison.Ordinal) &&
                newText[newText.Length - 2] != '♪' && newText[newText.Length - 2] != '♫')
                newText = newText.Insert(newText.Length - 1, " ");
            if (ApplyIfChanged(p, newText, fixAction, callbacks))
                missingSpaces++;
        }

        //fix missing spaces in "Hey...move it!" to "Hey... move it!"
        int index = p.Text.IndexOf("...", StringComparison.Ordinal);
        if (index >= 0 && p.Text.Length > 5)
        {
            string newText = p.Text;
            while (index != -1)
            {
                if (newText.Length > index + 4 && index >= 1)
                {
                    if (Utilities.AllLettersAndNumbers.Contains(newText[index + 3]) &&
                        Utilities.AllLettersAndNumbers.Contains(newText[index - 1]))
                        newText = newText.Insert(index + 3, " ");
                }
                index = newText.IndexOf("...", index + 2, StringComparison.Ordinal);
            }
            if (ApplyIfChanged(p, newText, fixAction, callbacks))
                missingSpaces++;
        }

        //fix missing spaces in "The<i>Bombshell</i> will gone." to "The <i>Bombshell</i> will gone."
        index = p.Text.IndexOf("<i>", StringComparison.OrdinalIgnoreCase);
        if (index >= 0 && p.Text.Length > 5)
        {
            string newText = p.Text;
            while (index != -1)
            {
                if (newText.Length > index + 6 && index > 1)
                {
                    if (Utilities.AllLettersAndNumbers.Contains(newText[index + 3]) &&
                        Utilities.AllLettersAndNumbers.Contains(newText[index - 1]))
                        newText = newText.Insert(index, " ");
                }
                index = newText.IndexOf("<i>", index + 3, StringComparison.OrdinalIgnoreCase);
            }
            if (ApplyIfChanged(p, newText, fixAction, callbacks))
                missingSpaces++;
        }

        //fix missing spaces in "The <i>Bombshell</i>will gone." to "The <i>Bombshell</i> will gone."
        index = p.Text.IndexOf("</i>", StringComparison.OrdinalIgnoreCase);
        if (index > 3 && p.Text.Length > 5)
        {
            string newText = p.Text;
            while (index != -1)
            {
                if (newText.Length > index + 6 && index > 1)
                {
                    if (Utilities.AllLettersAndNumbers.Contains(newText[index + 4]) &&
                        Utilities.AllLettersAndNumbers.Contains(newText[index - 1]))
                        newText = newText.Insert(index + 4, " ");
                }
                index = newText.IndexOf("</i>", index + 4, StringComparison.OrdinalIgnoreCase);
            }
            if (ApplyIfChanged(p, newText, fixAction, callbacks))
                missingSpaces++;
        }

        if (callbacks.Language == "fr") // special rules for French
        {
            // Insert a space BEFORE "!", "?", ":" and ";" when the previous character is a letter.
            string newText = p.Text;
            int j = 1;
            while (j < newText.Length)
            {
                if (@"!?:;".Contains(newText[j]))
                {
                    if (Utilities.AllLetters.Contains(newText[j - 1]))
                    {
                        newText = newText.Insert(j, " ");
                        j++; // step over the inserted space so the same mark is not handled twice
                    }
                }
                j++;
            }
            if (ApplyIfChanged(p, newText, fixAction, callbacks))
                missingSpaces++;
        }
    }
    callbacks.UpdateFixStatus(missingSpaces, language.FixMissingSpaces, string.Format(language.XMissingSpacesAdded, missingSpaces));
}

/// <summary>
/// Inserts a space after the punctuation mark in every hit of <paramref name="regex"/>
/// (a 3-character "x?y" / "x!y" style pattern) and returns how many fixes were recorded.
/// </summary>
private static int FixSpaceAfterMark(Paragraph p, Regex regex, bool allowFix, string fixAction, IFixCallbacks callbacks)
{
    int fixes = 0;
    Match match = regex.Match(p.Text);
    while (match.Success)
    {
        if (allowFix && !@"""<".Contains(p.Text[match.Index + 2]))
        {
            fixes++;
            string oldText = p.Text;
            // "x?y" -> "x? y": the mark sits at position 1 of the match, so the space goes in at 2.
            p.Text = p.Text.Replace(match.Value, match.Value.Insert(2, " "));
            callbacks.AddFixToListView(p, fixAction, oldText, p.Text);
        }
        match = regex.Match(p.Text, match.Index + 1);
    }
    return fixes;
}

/// <summary>
/// Replaces the paragraph text with <paramref name="newText"/> and reports the change,
/// but only when the text actually differs and the callbacks allow the fix.
/// Returns true when the change was applied.
/// </summary>
private static bool ApplyIfChanged(Paragraph p, string newText, string fixAction, IFixCallbacks callbacks)
{
    if (newText == p.Text || !callbacks.AllowFix(p, fixAction))
        return false;

    string oldText = p.Text;
    p.Text = newText;
    callbacks.AddFixToListView(p, fixAction, oldText, p.Text);
    return true;
}
/// <summary>
/// Returns the run of characters around <paramref name="index"/> that is delimited by a space
/// or a character of <see cref="Environment.NewLine"/>. Returns an empty string for null/empty
/// text or an out-of-range index. When the character at <paramref name="index"/> is itself a
/// delimiter, that single character is returned.
/// </summary>
private static string GetWordFromIndex(string text, int index)
{
    if (string.IsNullOrEmpty(text) || index < 0 || index >= text.Length)
        return string.Empty;

    string delimiters = @" " + Environment.NewLine;

    // Neither scan moves when it starts on a delimiter, so the result is just that character.
    if (delimiters.IndexOf(text[index]) >= 0)
        return text.Substring(index, 1);

    // Grow to the right while the next character still belongs to the word.
    int last = index;
    while (last + 1 < text.Length && delimiters.IndexOf(text[last + 1]) < 0)
        last++;

    // Grow to the left in the same way.
    int first = index;
    while (first > 0 && delimiters.IndexOf(text[first - 1]) < 0)
        first--;

    return text.Substring(first, last - first + 1);
}
}
}