SubtitleEdit/libse/Forms/RemoveTextForHI.cs

1132 lines
54 KiB
C#
Raw Normal View History

using System;
using System.Collections.Generic;
using System.Globalization;
using System.Text;
using System.Text.RegularExpressions;
namespace Nikse.SubtitleEdit.Core.Forms
{
public class RemoveTextForHI
{
public RemoveTextForHISettings Settings { get; set; }
public List<int> Warnings;
public int WarningIndex;
private List<string> _interjectionList;
public RemoveTextForHI(RemoveTextForHISettings removeTextForHISettings)
{
Settings = removeTextForHISettings;
}
public void ResetInterjections()
{
_interjectionList = null;
}
public string RemoveHearImpairedtagsInsideLine(string newText)
{
for (int i = 6; i < newText.Length; i++)
{
var s = newText.Substring(i);
if (s.Length > 2 && ".?!".Contains(s[0]))
{
var pre = string.Empty;
s = s.Remove(0, 1);
if (s.StartsWith(' '))
{
if (s.StartsWith(" <i>", StringComparison.Ordinal))
pre = " <i>";
else
pre = " ";
}
else if (s.StartsWith("<i>", StringComparison.Ordinal))
pre = "<i>";
else if (s.StartsWith("</i>", StringComparison.Ordinal))
pre = "</i>";
if (pre.Length > 0)
{
s = s.Remove(0, pre.Length);
if (s.Length > 1 && s[0] == ' ')
{
pre += " ";
s = s.Remove(0, 1);
}
if (HasHearImpariedTagsAtStart(s))
{
s = RemoveStartEndTags(s);
newText = newText.Substring(0, i + 1) + pre + " " + s;
newText = newText.Replace("<i></i>", string.Empty);
newText = newText.Replace("<i> </i>", " ").FixExtraSpaces();
}
}
}
}
return newText;
}
public string RemoveColon(string text)
{
if (!(Settings.RemoveTextBeforeColon && text.Contains(':')))
return text;
string preAssTag = string.Empty;
if (text.StartsWith("{\\", StringComparison.Ordinal))
{
int indexOfEndBracketSuccessor = text.IndexOf('}') + 1;
if (indexOfEndBracketSuccessor > 0)
{
preAssTag = text.Substring(0, indexOfEndBracketSuccessor);
text = text.Remove(0, indexOfEndBracketSuccessor).TrimStart();
}
}
// House 7x01 line 52: and she would like you to do three things:
// Okay or remove???
string noTagText = HtmlUtil.RemoveHtmlTags(text);
if (noTagText.Length > 10 && noTagText.IndexOf(':') == noTagText.Length - 1 && noTagText != noTagText.ToUpper())
return text;
string newText = string.Empty;
var lines = text.Trim().SplitToLines();
int noOfNames = 0;
int count = 0;
bool removedInFirstLine = false;
bool removedInSecondLine = false;
foreach (string line in lines)
{
int indexOfColon = line.IndexOf(':');
if (indexOfColon <= 0 || IsInsideBrackets(line, indexOfColon))
{
newText = (newText + Environment.NewLine + line).Trim();
if (newText.EndsWith("</i>", StringComparison.Ordinal) && text.StartsWith("<i>", StringComparison.Ordinal) && !newText.StartsWith("<i>", StringComparison.Ordinal))
newText = "<i>" + newText;
else if (newText.EndsWith("</b>", StringComparison.Ordinal) && text.StartsWith("<b>", StringComparison.Ordinal) && !newText.StartsWith("<b>", StringComparison.Ordinal))
newText = "<b>" + newText;
}
else
{
var pre = line.Substring(0, indexOfColon);
var noTagPre = HtmlUtil.RemoveHtmlTags(pre, true);
if (Settings.RemoveTextBeforeColonOnlyUppercase && noTagPre != noTagPre.ToUpper())
{
newText = (newText + Environment.NewLine + line).Trim();
}
else
{
if (Utilities.CountTagInText(line, ':') == 1)
{
if (count == 1 && newText.Length > 1 && removedInFirstLine &&
!".?!".Contains(newText[newText.Length - 1]) && newText.LineEndsWithHtmlTag(true) &&
line != line.ToUpper())
{
2015-09-09 22:32:28 +02:00
newText += Environment.NewLine;
if (pre.Contains("<i>") && line.Contains("</i>"))
2015-09-09 22:32:28 +02:00
newText += "<i>" + line;
else if (pre.Contains("<b>") && line.Contains("</b>"))
2015-09-09 22:32:28 +02:00
newText += "<b>" + line;
else if (pre.Contains("<u>") && line.Contains("</u>"))
2015-09-09 22:32:28 +02:00
newText += "<u>" + line;
else if (pre.Contains('[') && line.Contains(']'))
2015-09-09 22:32:28 +02:00
newText += "[" + line;
else if (pre.Contains('(') && line.EndsWith(')'))
2015-09-09 22:32:28 +02:00
newText += "(" + line;
else
2015-09-09 22:32:28 +02:00
newText += line;
}
else if (count == 1 && newText.Length > 1 && indexOfColon > 15 && line.Substring(0, indexOfColon).Contains(' ') &&
!".?!".Contains(newText[newText.Length - 1]) && newText.LineEndsWithHtmlTag(true) &&
line != line.ToUpper())
{
2015-09-09 22:32:28 +02:00
newText += Environment.NewLine;
if (pre.Contains("<i>") && line.Contains("</i>"))
2015-09-09 22:32:28 +02:00
newText += "<i>" + line;
else if (pre.Contains("<b>") && line.Contains("</b>"))
2015-09-09 22:32:28 +02:00
newText += "<b>" + line;
else if (pre.Contains("<u>") && line.Contains("</u>"))
2015-09-09 22:32:28 +02:00
newText += "<u>" + line;
else if (pre.Contains('[') && line.Contains(']'))
2015-09-09 22:32:28 +02:00
newText += "[" + line;
else if (pre.Contains('(') && line.EndsWith(')'))
2015-09-09 22:32:28 +02:00
newText += "(" + line;
else
2015-09-09 22:32:28 +02:00
newText += line;
}
else
{
var preStripable = new StripableText(pre);
var remove = true;
if (indexOfColon < line.Length - 1)
{
if (Settings.ColonSeparateLine && !line.Substring(indexOfColon + 1).StartsWith(Environment.NewLine, StringComparison.Ordinal))
remove = false;
else if (Utilities.IsBetweenNumbers(line, indexOfColon))
remove = false;
}
if (remove && !DoRemove(pre))
remove = false;
if (remove)
{
var content = line.Substring(indexOfColon + 1).Trim();
if (content.Length > 0)
{
2015-09-09 22:32:28 +02:00
newText += Environment.NewLine;
if (pre.Contains("<i>") && content.Contains("</i>"))
2015-09-09 22:32:28 +02:00
newText += "<i>" + content;
else if (pre.Contains("<b>") && content.Contains("</b>"))
2015-09-09 22:32:28 +02:00
newText += "<b>" + content;
else if (pre.Contains('[') && content.Contains(']'))
2015-09-09 22:32:28 +02:00
newText += "[" + content;
else if (pre.Contains('(') && content.EndsWith(')'))
2015-09-09 22:32:28 +02:00
newText += "(" + content;
else
2015-09-09 22:32:28 +02:00
newText += content;
if (count == 0)
removedInFirstLine = true;
else if (count == 1)
removedInSecondLine = true;
}
newText = newText.Trim();
if (text.StartsWith('(') && newText.EndsWith(')') && !newText.Contains('('))
newText = newText.TrimEnd(')');
else if (text.StartsWith('[') && newText.EndsWith(']') && !newText.Contains('['))
newText = newText.TrimEnd(']');
else if (newText.EndsWith("</i>", StringComparison.Ordinal) && text.StartsWith("<i>", StringComparison.Ordinal) && !newText.StartsWith("<i>", StringComparison.Ordinal))
newText = "<i>" + newText;
else if (newText.EndsWith("</b>", StringComparison.Ordinal) && text.StartsWith("<b>", StringComparison.Ordinal) && !newText.StartsWith("<b>", StringComparison.Ordinal))
newText = "<b>" + newText;
else if (newText.EndsWith("</u>", StringComparison.Ordinal) && text.StartsWith("<u>", StringComparison.Ordinal) && !newText.StartsWith("<u>", StringComparison.Ordinal))
newText = "<u>" + newText;
if (!IsHIDescription(preStripable.StrippedText))
noOfNames++;
}
else
{
newText = (newText + Environment.NewLine + line).Trim();
if (newText.EndsWith("</i>", StringComparison.Ordinal) && text.StartsWith("<i>", StringComparison.Ordinal) && !newText.StartsWith("<i>", StringComparison.Ordinal))
newText = "<i>" + newText;
else if (newText.EndsWith("</b>", StringComparison.Ordinal) && text.StartsWith("<b>", StringComparison.Ordinal) && !newText.StartsWith("<b>", StringComparison.Ordinal))
newText = "<b>" + newText;
else if ((newText.EndsWith("</u>", StringComparison.Ordinal) && text.StartsWith("<u>", StringComparison.Ordinal) && !newText.StartsWith("<u>", StringComparison.Ordinal)))
newText = "<u>" + newText;
}
}
}
else
{
2015-09-09 22:32:28 +02:00
char[] endChars = { '.', '?', '!' };
string s2 = line;
for (int k = 0; k < 2; k++)
{
if (s2.Contains(':'))
{
int colonIndex = s2.IndexOf(':');
string start = s2.Substring(0, colonIndex);
if (!Settings.RemoveTextBeforeColonOnlyUppercase || start == start.ToUpper())
{
2015-09-09 22:32:28 +02:00
int endIndex = start.LastIndexOfAny(endChars);
if (colonIndex > 0 && colonIndex < s2.Length - 1)
{
if (char.IsDigit(s2[colonIndex - 1]) && char.IsDigit(s2[colonIndex + 1]))
endIndex = 0;
}
if (endIndex < 0)
s2 = s2.Remove(0, colonIndex - endIndex);
else if (endIndex > 0)
s2 = s2.Remove(endIndex + 1, colonIndex - endIndex);
}
if (count == 0)
removedInFirstLine = true;
else if (count == 1)
removedInSecondLine = true;
}
}
2015-09-09 22:32:28 +02:00
newText = (newText + Environment.NewLine + s2).Trim();
}
}
}
count++;
}
newText = newText.Trim();
if (noOfNames > 0 && Utilities.GetNumberOfLines(newText) == 2)
{
int indexOfDialogChar = newText.IndexOf('-');
bool insertDash = true;
var arr = newText.SplitToLines();
if (arr.Length == 2 && arr[0].Length > 1 && arr[1].Length > 1)
{
string arr0 = new StripableText(arr[0]).StrippedText;
var arr1Stripable = new StripableText(arr[1]);
string arr1 = arr1Stripable.StrippedText;
if (arr0.Length > 0 && arr1.Length > 1)
{
// line continuation?
if (Utilities.LowercaseLetters.Contains(arr1[0])) // second line starts with lower case letter
{
char c = arr0[arr0.Length - 1];
2015-09-09 22:32:28 +02:00
if (Utilities.LowercaseLetters.Contains(c) || c == ',') // first line ends with comma or lower case letter
{
if (!arr1Stripable.Pre.Contains("..."))
{
insertDash = false;
}
}
}
if (insertDash)
{
string arr0QuoteTrimmed = arr[0].TrimEnd('"');
if (arr0QuoteTrimmed.Length > 0 && !".?!".Contains(arr0QuoteTrimmed[arr0QuoteTrimmed.Length - 1]) && !arr0QuoteTrimmed.EndsWith("</i>", StringComparison.Ordinal))
{
if (!arr1Stripable.Pre.Contains('-'))
{
insertDash = false;
}
}
}
}
if (insertDash && removedInFirstLine && !removedInSecondLine && !text.StartsWith('-') && !text.StartsWith("<i>-", StringComparison.Ordinal))
{
if (!arr[1].StartsWith('-') && !arr[1].StartsWith("<i>-", StringComparison.Ordinal))
insertDash = false;
}
}
if (insertDash)
{
if (indexOfDialogChar < 0 || indexOfDialogChar > 4)
{
var st = new StripableText(newText, string.Empty, string.Empty);
newText = st.Pre + "- " + st.StrippedText + st.Post;
}
int indexOfNewLine = newText.IndexOf(Environment.NewLine, StringComparison.Ordinal);
string second = newText.Substring(indexOfNewLine).Trim();
indexOfDialogChar = second.IndexOf('-');
if (indexOfDialogChar < 0 || indexOfDialogChar > 6)
{
var st = new StripableText(second, String.Empty, String.Empty);
second = st.Pre + "- " + st.StrippedText + st.Post;
newText = newText.Remove(indexOfNewLine) + Environment.NewLine + second;
}
}
}
else if (newText.Contains('-') && !newText.Contains(Environment.NewLine))
{
var st = new StripableText(newText);
if (st.Pre.Contains('-'))
newText = st.Pre.Replace("-", string.Empty) + st.StrippedText + st.Post;
}
else if (removedInSecondLine && !removedInFirstLine && Utilities.GetNumberOfLines(newText) == 2)
{
string noTags = HtmlUtil.RemoveHtmlTags(newText, true).Trim();
bool insertDash = noTags.StartsWith('-') && Utilities.CountTagInText(noTags, '-') == 1;
if (insertDash)
{
if (newText.Contains(Environment.NewLine + "<i>"))
newText = newText.Replace(Environment.NewLine + "<i>", Environment.NewLine + "<i>- ");
else
newText = newText.Replace(Environment.NewLine, Environment.NewLine + "- ");
}
}
if (text.Contains("<i>") && !newText.Contains("<i>") && newText.EndsWith("</i>", StringComparison.Ordinal))
newText = "<i>" + newText;
if (string.IsNullOrWhiteSpace(newText))
return string.Empty;
return preAssTag + newText;
}
private bool IsInsideBrackets(string text, int targetIndex)
{
// <i>♪ (THE CAPITOLS: "COOL JERK") ♪</i>
var index = text.LastIndexOf('(', targetIndex - 1) + 1;
if (index > 0 && text.IndexOf(')', index) > targetIndex)
return true;
index = text.LastIndexOf('[', targetIndex - 1) + 1;
if (index > 0 && text.IndexOf(']', index) > targetIndex)
return true;
return false;
}
private bool DoRemove(string pre)
{
// Skip these: Barry, remember: She cannot; http://google.com; Improved by: ...
if (pre.IndexOfAny(new[] { "Previously on", "Improved by", "http", ", " }, StringComparison.OrdinalIgnoreCase) >= 0)
return false;
// Okay! Narrator: Hello!
if (pre.IndexOfAny(new[] { '!', '?' }) > 0)
return false;
return true;
}
public string RemoveTextFromHearImpaired(string text)
{
if (Settings.RemoveWhereContains)
{
foreach (var removeIfTextContain in Settings.RemoveIfTextContains)
{
if (text.Contains(removeIfTextContain))
return string.Empty;
}
}
string oldText = text;
text = RemoveColon(text);
string pre = " >-\"'`´♪¿¡.…—";
string post = " -\"'`´♪.!?:…—";
if (Settings.RemoveTextBetweenCustomTags)
{
pre = pre.Replace(Settings.CustomStart, string.Empty);
post = post.Replace(Settings.CustomEnd, string.Empty);
}
var st = new StripableText(text, pre, post);
var sb = new StringBuilder();
var parts = st.StrippedText.Trim().SplitToLines();
int lineNumber = 0;
bool removedDialogInFirstLine = false;
int noOfNamesRemoved = 0;
int noOfNamesRemovedNotInLineOne = 0;
foreach (string s in parts)
{
var stSub = new StripableText(s, pre, post);
string strippedText = stSub.StrippedText;
if (lineNumber == parts.Length - 1 && st.Post.Contains('?'))
strippedText += "?";
else if (stSub.Post.Contains('?'))
strippedText += "?";
if (!StartsAndEndsWithHearImpariedTags(strippedText))
{
if (removedDialogInFirstLine && stSub.Pre.Contains("- "))
stSub.Pre = stSub.Pre.Replace("- ", string.Empty);
string newText = stSub.StrippedText;
newText = RemoveHearImpairedTags(newText);
if (stSub.StrippedText.Length - newText.Length > 2)
{
string removedText = GetRemovedString(stSub.StrippedText, newText);
if (!IsHIDescription(removedText))
{
noOfNamesRemoved++;
if (lineNumber > 0)
noOfNamesRemovedNotInLineOne++;
}
}
sb.AppendLine(stSub.Pre + newText + stSub.Post);
}
else
{
if (!IsHIDescription(stSub.StrippedText))
{
noOfNamesRemoved++;
if (lineNumber > 0)
noOfNamesRemovedNotInLineOne++;
}
if (lineNumber == 0)
{
if (st.Pre.Contains("- "))
{
st.Pre = st.Pre.Replace("- ", string.Empty);
removedDialogInFirstLine = true;
}
else if (st.Pre == "-")
{
st.Pre = string.Empty;
removedDialogInFirstLine = true;
}
}
if (st.Pre.Contains("<i>") && stSub.Post.Contains("</i>"))
st.Pre = st.Pre.Replace("<i>", string.Empty);
if (s.Contains("<i>") && !s.Contains("</i>") && st.Post.Contains("</i>"))
st.Post = st.Post.Replace("</i>", string.Empty);
}
lineNumber++;
}
text = st.Pre + sb.ToString().Trim() + st.Post;
text = text.Replace(" ", " ").Trim();
text = text.Replace("<i></i>", string.Empty);
text = text.Replace("<i> </i>", " ");
text = text.Replace("<b></b>", string.Empty);
text = text.Replace("<b> </b>", " ");
text = text.Replace("<u></u>", string.Empty);
text = text.Replace("<u> </u>", " ");
text = RemoveEmptyFontTag(text);
text = text.Replace(" ", " ").Trim();
text = RemoveColon(text);
text = RemoveLineIfAllUppercase(text);
text = RemoveHearImpairedtagsInsideLine(text);
if (Settings.RemoveInterjections)
text = RemoveInterjections(text);
st = new StripableText(text, " >-\"'`´♪¿¡.…—", " -\"'`´♪.!?:…—");
text = st.StrippedText;
if (StartsAndEndsWithHearImpariedTags(text))
{
text = RemoveStartEndTags(text);
}
text = RemoveHearImpairedTags(text);
// fix 3 lines to two liners - if only two lines
if (noOfNamesRemoved >= 1 && Utilities.GetNumberOfLines(text) == 3)
{
var splitChars = new[] { '.', '?', '!' };
var splitParts = HtmlUtil.RemoveHtmlTags(text).Replace(" ", string.Empty).Split(splitChars, StringSplitOptions.RemoveEmptyEntries);
if (splitParts.Length == 2)
{
var temp = new StripableText(text);
temp.StrippedText = temp.StrippedText.Replace(Environment.NewLine, " ");
int splitIndex = temp.StrippedText.LastIndexOfAny(splitChars);
if (splitIndex > 0)
{
text = temp.Pre + temp.StrippedText.Insert(splitIndex + 1, Environment.NewLine) + temp.Post;
}
}
}
if (!text.StartsWith('-') && noOfNamesRemoved >= 1 && Utilities.GetNumberOfLines(text) == 2)
{
var lines = text.SplitToLines();
var part0 = lines[0].Trim().Replace("</i>", string.Empty).Trim();
if (!part0.EndsWith(',') && (!part0.EndsWith('-') || noOfNamesRemovedNotInLineOne > 0))
{
if (part0.Length > 0 && ".?!".Contains(part0[part0.Length - 1]))
{
if (noOfNamesRemovedNotInLineOne > 0)
{
if (!st.Pre.Contains('-'))
text = "- " + text.Replace(Environment.NewLine, Environment.NewLine + "- ");
if (!text.Contains(Environment.NewLine + "-") && !text.Contains(Environment.NewLine + "<i>-"))
text = text.Replace(Environment.NewLine, Environment.NewLine + "- ");
}
}
}
}
if (!string.IsNullOrEmpty(text) || (st.Pre.Contains('♪') || st.Post.Contains('♪')))
text = st.Pre + text + st.Post;
if (oldText.TrimStart().StartsWith("- ", StringComparison.Ordinal) &&
text != null && !text.Contains(Environment.NewLine) &&
(oldText.Contains(Environment.NewLine + "- ") ||
oldText.Contains(Environment.NewLine + " - ") ||
oldText.Contains(Environment.NewLine + "<i>- ") ||
oldText.Contains(Environment.NewLine + "<i> - ")))
{
text = text.TrimStart().TrimStart('-').TrimStart();
}
if (oldText.TrimStart().StartsWith('-') && !oldText.TrimStart().StartsWith("--", StringComparison.Ordinal) &&
text != null && !text.Contains(Environment.NewLine) &&
(oldText.Contains(Environment.NewLine + "-") && !oldText.Contains(Environment.NewLine + "--") ||
oldText.Contains(Environment.NewLine + " - ") ||
oldText.Contains(Environment.NewLine + "<i>- ") ||
oldText.Contains(Environment.NewLine + "<i> - ")))
{
text = text.TrimStart().TrimStart('-').TrimStart();
}
if (oldText.TrimStart().StartsWith("<i>- ", StringComparison.Ordinal) &&
text != null && text.StartsWith("<i>- ", StringComparison.Ordinal) && !text.Contains(Environment.NewLine) &&
(oldText.Contains(Environment.NewLine + "- ") ||
oldText.Contains(Environment.NewLine + " - ") ||
oldText.Contains(Environment.NewLine + "<i>- ") ||
oldText.Contains(Environment.NewLine + "<i> - ")))
{
text = text.Remove(3, 2);
}
if (text != null && !text.Contains(Environment.NewLine) &&
(oldText.Contains(':') && !text.Contains(':') ||
oldText.Contains('[') && !text.Contains('[') ||
oldText.Contains('(') && !text.Contains('(') ||
oldText.Contains('{') && !text.Contains('{')) &&
(oldText.Contains(Environment.NewLine + "- ") ||
oldText.Contains(Environment.NewLine + " - ") ||
oldText.Contains(Environment.NewLine + "<i>- ") ||
oldText.Contains(Environment.NewLine + "<i> - ")))
{
text = text.TrimStart().TrimStart('-').TrimStart();
}
if (oldText != text)
{
// insert spaces before "-"
text = text.Replace(Environment.NewLine + "- <i>", Environment.NewLine + "<i>- ");
text = text.Replace(Environment.NewLine + "-<i>", Environment.NewLine + "<i>- ");
if (text.Length > 2 && text[0] == '-' && text[1] != ' ' && text[1] != '-')
text = text.Insert(1, " ");
if (text.Length > 5 && text.StartsWith("<i>-", StringComparison.Ordinal) && text[4] != ' ' && text[4] != '-')
text = text.Insert(4, " ");
int index = text.IndexOf(Environment.NewLine + "-", StringComparison.Ordinal);
if (index >= 0 && text.Length - index > 4)
{
index += Environment.NewLine.Length + 1;
if (text[index] != ' ' && text[index] != '-')
text = text.Insert(index, " ");
}
index = text.IndexOf(Environment.NewLine + "<i>-", StringComparison.Ordinal);
2015-09-09 22:32:28 +02:00
if (index >= 0 && text.Length - index > 5)
{
index += Environment.NewLine.Length + 4;
if (text[index] != ' ' && text[index] != '-')
text = text.Insert(index, " ");
}
}
return text.Trim();
}
private string RemoveEmptyFontTag(string text)
{
int indexOfStartFont = text.IndexOf("<font ", StringComparison.OrdinalIgnoreCase);
if (indexOfStartFont >= 0)
{
int indexOfEndFont = text.IndexOf("</font>", StringComparison.OrdinalIgnoreCase);
if (indexOfEndFont > indexOfStartFont)
{
int startTagBefore = text.Substring(0, indexOfEndFont).LastIndexOf('<');
if (startTagBefore == indexOfStartFont)
{
var lastTwo = text.Substring(indexOfEndFont - 2, 2);
if (lastTwo.TrimEnd().EndsWith('>'))
{
text = text.Remove(indexOfStartFont, indexOfEndFont + "</font>".Length - indexOfStartFont);
if (lastTwo.EndsWith(' '))
text = text.Insert(indexOfStartFont, " ");
text = text.Replace(" ", " ");
}
}
}
}
return text;
}
private void AddWarning()
{
if (Warnings != null && WarningIndex >= 0)
Warnings.Add(WarningIndex);
}
private static readonly HashSet<string> HiDescriptionWords = new HashSet<string>(new[]
{
"cackles",
"cheers",
"chitters",
"chuckles",
"clears throat",
"exclaims",
"exhales",
"explosion",
"gasps",
"groans",
"growls",
"grunts",
"laughs",
"noise",
"roars",
"scoff",
"screeches",
"shouts",
"shrieks",
"sigh",
"sighs",
"snores",
"stutters",
"thuds",
"trumpets",
"whisper",
"whispers",
"whistles",
});
private bool IsHIDescription(string text)
{
text = text.Trim(' ', '(', ')', '[', ']', '?', '{', '}');
text = text.ToLower();
if (text.Trim().Replace("mr. ", string.Empty).Replace("mrs. ", string.Empty).Replace("dr. ", string.Empty).Contains(' '))
AddWarning();
return (HiDescriptionWords.Contains(text) ||
text.StartsWith("engine ", StringComparison.Ordinal) ||
text.EndsWith("on tv", StringComparison.Ordinal) ||
text.EndsWith("shatters", StringComparison.Ordinal) ||
text.EndsWith("ing", StringComparison.Ordinal));
}
private static string GetRemovedString(string oldText, string newText)
{
oldText = oldText.ToLower();
newText = newText.ToLower();
int start = oldText.IndexOf(newText, StringComparison.Ordinal);
string result;
if (start > 0)
result = oldText.Substring(0, oldText.Length - newText.Length);
else
result = oldText.Substring(newText.Length);
return result.Trim(' ', '(', ')', '[', ']', '?', '{', '}');
}
private static int CompareLength(string a, string b)
{
return b.Length.CompareTo(a.Length);
}
public string RemoveInterjections(string text)
{
if (_interjectionList == null)
{
var interjectionList = new HashSet<string>();
foreach (var s in Configuration.Settings.Tools.Interjections.Split(new[] { ';' }, StringSplitOptions.RemoveEmptyEntries))
{
if (s.Length > 0)
{
interjectionList.Add(s);
var upper = s.ToUpper();
interjectionList.Add(upper);
var lower = s.ToLower();
interjectionList.Add(lower);
interjectionList.Add(lower.CapitalizeFirstLetter());
}
}
_interjectionList = new List<string>(interjectionList);
interjectionList.Clear();
interjectionList.TrimExcess();
_interjectionList.Sort(CompareLength);
}
string oldText = text;
bool doRepeat = true;
while (doRepeat)
{
doRepeat = false;
foreach (string s in _interjectionList)
{
if (text.Contains(s))
{
var regex = new Regex("\\b" + Regex.Escape(s) + "\\b");
var match = regex.Match(text);
if (match.Success)
{
int index = match.Index;
string temp = text.Remove(index, s.Length);
if (temp.Remove(0, index) == " —" && temp.EndsWith("— —", StringComparison.Ordinal))
{
temp = temp.Remove(temp.Length - 3);
if (temp.EndsWith(Environment.NewLine + "—", StringComparison.Ordinal))
temp = temp.Remove(temp.Length - 1).TrimEnd();
}
else if (temp.Remove(0, index) == " —" && temp.EndsWith("- —", StringComparison.Ordinal))
{
temp = temp.Remove(temp.Length - 3);
if (temp.EndsWith(Environment.NewLine + "-", StringComparison.Ordinal))
temp = temp.Remove(temp.Length - 1).TrimEnd();
}
else if (index == 2 && temp.StartsWith("- —", StringComparison.Ordinal))
{
temp = temp.Remove(2, 2);
}
else if (index == 2 && temp.StartsWith("- —", StringComparison.Ordinal))
{
temp = temp.Remove(2, 1);
}
else if (index == 0 && temp.StartsWith(" —", StringComparison.Ordinal))
{
temp = temp.Remove(0, 2);
}
else if (index == 0 && temp.StartsWith('—'))
{
temp = temp.Remove(0, 1);
}
string pre = string.Empty;
if (index > 0)
doRepeat = true;
bool removeAfter = true;
if (index > s.Length)
{
if (temp.Length > index - s.Length + 3)
{
int subIndex = index - s.Length + 1;
string subTemp = temp.Substring(subIndex, 3);
if (subTemp == ", !" || subTemp == ", ?" || subTemp == ", .")
{
temp = temp.Remove(subIndex, 2);
removeAfter = false;
}
}
if (removeAfter && temp.Length > index - s.Length + 2)
{
int subIndex = index - s.Length;
string subTemp = temp.Substring(subIndex, 3);
if (subTemp == ", !" || subTemp == ", ?" || subTemp == ", .")
{
temp = temp.Remove(subIndex, 2);
removeAfter = false;
}
else
{
subTemp = temp.Substring(subIndex);
if (subTemp.StartsWith(", -—", StringComparison.Ordinal))
{
temp = temp.Remove(subIndex, 3);
removeAfter = false;
}
else if (subTemp.StartsWith(", --", StringComparison.Ordinal))
{
temp = temp.Remove(subIndex, 2);
removeAfter = false;
}
}
}
if (removeAfter && temp.Length > index - s.Length + 2)
{
int subIndex = index - s.Length + 1;
string subTemp = temp.Substring(subIndex, 2);
if (subTemp == "-!" || subTemp == "-?" || subTemp == "-.")
{
temp = temp.Remove(subIndex, 1);
removeAfter = false;
}
}
}
if (index > 3 && index - 2 < temp.Length)
{
string subTemp = temp.Substring(index - 2);
if (subTemp.StartsWith(", —", StringComparison.Ordinal) || subTemp.StartsWith(", —", StringComparison.Ordinal))
{
temp = temp.Remove(index - 2, 1);
index--;
}
}
if (removeAfter)
{
if (index == 0)
{
if (temp.StartsWith('-'))
temp = temp.Remove(0, 1).Trim();
}
else if (index == 3 && temp.StartsWith("<i>-", StringComparison.Ordinal))
{
temp = temp.Remove(3, 1);
}
else if (index > 0 && temp.Length > index)
{
pre = text.Substring(0, index);
temp = temp.Remove(0, index);
if (temp.StartsWith('-') && pre.EndsWith('-'))
temp = temp.Remove(0, 1);
if (temp.StartsWith('-') && pre.EndsWith("- ", StringComparison.Ordinal))
temp = temp.Remove(0, 1);
}
while (temp.Length > 0 && " ,.?!".Contains(temp[0]))
{
temp = temp.Remove(0, 1);
doRepeat = true;
}
if (temp.Length > 0 && s[0].ToString(CultureInfo.InvariantCulture) != s[0].ToString(CultureInfo.InvariantCulture).ToLower())
{
temp = char.ToUpper(temp[0]) + temp.Substring(1);
doRepeat = true;
}
if (temp.StartsWith('-') && pre.EndsWith(' '))
temp = temp.Remove(0, 1);
if (temp.StartsWith('—') && pre.EndsWith(','))
pre = pre.TrimEnd(',') + " ";
temp = pre + temp;
}
if (temp.EndsWith(Environment.NewLine + "- ", StringComparison.Ordinal))
temp = temp.Remove(temp.Length - 2).TrimEnd();
var st = new StripableText(temp);
if (st.StrippedText.Length == 0)
return string.Empty;
if (temp.StartsWith('-') && !temp.Contains(Environment.NewLine) && text.Contains(Environment.NewLine))
temp = temp.Remove(0, 1).Trim();
text = temp;
}
}
}
}
var lines = text.SplitToLines();
if (lines.Length == 2 && text != oldText)
{
if (lines[0] == "-" && lines[1] == "-")
return string.Empty;
if (lines[0].Length > 1 && lines[0][0] == '-' && lines[1].Trim() == "-")
return lines[0].Remove(0, 1).Trim();
if (lines[1].Length > 1 && lines[1][0] == '-' && lines[0].Trim() == "-")
return lines[1].Remove(0, 1).Trim();
if (lines[1].Length > 4 && lines[1].StartsWith("<i>-", StringComparison.Ordinal) && lines[0].Trim() == "-")
return "<i>" + lines[1].Remove(0, 4).Trim();
if (lines[0].Length > 1 && lines[1] == "-" || lines[1] == "." || lines[1] == "!" || lines[1] == "?")
{
if (lines[0].StartsWith('-') && oldText.Contains(Environment.NewLine + "-"))
lines[0] = lines[0].Remove(0, 1);
return lines[0].Trim();
}
var noTags0 = HtmlUtil.RemoveHtmlTags(lines[0], false).Trim();
var noTags1 = HtmlUtil.RemoveHtmlTags(lines[1], false).Trim();
if (noTags0 == "-")
{
if (noTags1 == noTags0)
return string.Empty;
if (lines[1].Length > 1 && lines[1][0] == '-')
return lines[1].Remove(0, 1).Trim();
if (lines[1].Length > 4 && lines[1].StartsWith("<i>-", StringComparison.Ordinal))
return "<i>" + lines[1].Remove(0, 4).Trim();
return lines[1];
}
if (noTags1 == "-")
{
if (lines[0].Length > 1 && lines[0][0] == '-')
return lines[0].Remove(0, 1).Trim();
if (lines[0].Length > 4 && lines[0].StartsWith("<i>-", StringComparison.Ordinal))
return "<i>" + lines[0].Remove(0, 4).Trim();
return lines[0];
}
}
if (lines.Length == 2)
{
if (string.IsNullOrWhiteSpace(lines[1].Replace(".", string.Empty).Replace("?", string.Empty).Replace("!", string.Empty).Replace("-", string.Empty).Replace("—", string.Empty)))
{
text = lines[0];
lines = text.SplitToLines();
}
else if (string.IsNullOrWhiteSpace(lines[0].Replace(".", string.Empty).Replace("?", string.Empty).Replace("!", string.Empty).Replace("-", string.Empty).Replace("—", string.Empty)))
{
text = lines[1];
lines = text.SplitToLines();
}
}
if (lines.Length == 1 && text != oldText && Utilities.GetNumberOfLines(oldText) == 2)
{
if ((oldText.StartsWith('-') || oldText.StartsWith("<i>-", StringComparison.Ordinal)) &&
(oldText.Contains("." + Environment.NewLine) || oldText.Contains(".</i>" + Environment.NewLine) ||
oldText.Contains("!" + Environment.NewLine) || oldText.Contains("!</i>" + Environment.NewLine) ||
oldText.Contains("?" + Environment.NewLine) || oldText.Contains("?</i>" + Environment.NewLine)))
{
if (text.StartsWith("<i>-", StringComparison.Ordinal))
text = "<i>" + text.Remove(0, 4).TrimStart();
else
text = text.TrimStart('-').TrimStart();
}
else if ((oldText.Contains(Environment.NewLine + "-") || oldText.Contains(Environment.NewLine + "<i>-")) &&
(oldText.Contains("." + Environment.NewLine) || oldText.Contains(".</i>" + Environment.NewLine) ||
oldText.Contains("!" + Environment.NewLine) || oldText.Contains("!</i>" + Environment.NewLine) ||
oldText.Contains("?" + Environment.NewLine) || oldText.Contains("?</i>" + Environment.NewLine)))
{
if (text.StartsWith("<i>-", StringComparison.Ordinal))
text = "<i>" + text.Remove(0, 4).TrimStart();
else
text = text.TrimStart('-').TrimStart();
}
}
return text;
}
private string RemoveStartEndTags(string text)
{
string newText = text;
string s = text;
int index;
if (Settings.RemoveTextBetweenSquares && s.StartsWith('[') && (index = s.IndexOf(']', 1)) > 0)
{
if (++index < s.Length && s[index] == ':')
index++;
newText = s.Remove(0, index);
}
else if (Settings.RemoveTextBetweenBrackets && s.StartsWith('{') && (index = s.IndexOf('}', 1)) > 0)
{
if (++index < s.Length && s[index] == ':')
index++;
newText = s.Remove(0, index);
}
else if (Settings.RemoveTextBetweenParentheses && s.StartsWith('(') && (index = s.IndexOf(')', 1)) > 0)
{
if (++index < s.Length && s[index] == ':')
index++;
newText = s.Remove(0, index);
}
else if (Settings.RemoveTextBetweenQuestionMarks && s.StartsWith('?') && (index = s.IndexOf('?', 1)) > 0)
{
if (++index < s.Length && s[index] == ':')
index++;
newText = s.Remove(0, index);
}
else if (Settings.RemoveTextBetweenCustomTags &&
s.Length > 0 && Settings.CustomEnd.Length > 0 && Settings.CustomStart.Length > 0 &&
s.StartsWith(Settings.CustomStart, StringComparison.Ordinal) && (index = s.LastIndexOf(Settings.CustomEnd, StringComparison.Ordinal)) > 0)
{
newText = s.Remove(0, index + Settings.CustomEnd.Length);
}
if (newText != text)
newText = newText.TrimStart(' ');
return newText;
}
public string RemoveHearImpairedTags(string text)
{
string preAssTag = string.Empty;
if (text.StartsWith("{\\", StringComparison.Ordinal) && text.IndexOf('}', 2) > 0)
{
int indexOfEndBracketSuccessor = text.IndexOf('}', 3) + 1;
preAssTag = text.Substring(0, indexOfEndBracketSuccessor);
text = text.Remove(0, indexOfEndBracketSuccessor).TrimStart();
}
if (Settings.RemoveTextBetweenSquares)
{
text = RemoveTextBetweenTags("[", "]:", text);
text = RemoveTextBetweenTags("[", "]", text);
}
if (Settings.RemoveTextBetweenBrackets)
{
text = RemoveTextBetweenTags("{", "}:", text);
text = RemoveTextBetweenTags("{", "}", text);
}
if (Settings.RemoveTextBetweenParentheses)
{
text = RemoveTextBetweenTags("(", "):", text);
text = RemoveTextBetweenTags("(", ")", text);
}
if (Settings.RemoveTextBetweenQuestionMarks)
{
text = RemoveTextBetweenTags("?", "?:", text);
text = RemoveTextBetweenTags("?", "?", text);
}
if (Settings.RemoveTextBetweenCustomTags && Settings.CustomStart.Length > 0 && Settings.CustomEnd.Length > 0)
{
text = RemoveTextBetweenTags(Settings.CustomStart, Settings.CustomEnd, text);
}
if (string.IsNullOrWhiteSpace(text))
return string.Empty;
return preAssTag + text.TrimStart();
}
private bool HasHearImpairedText(string text)
{
return RemoveHearImpairedTags(text) != text;
}
public bool HasHearImpariedTagsAtStart(string text)
{
if (Settings.OnlyIfInSeparateLine)
return StartsAndEndsWithHearImpariedTags(text);
return HasHearImpairedText(text);
}
public bool HasHearImpariedTagsAtEnd(string text)
{
if (Settings.OnlyIfInSeparateLine)
return StartsAndEndsWithHearImpariedTags(text);
return HasHearImpairedText(text);
}
private bool StartsAndEndsWithHearImpariedTags(string text)
{
return (Settings.RemoveTextBetweenSquares && text.StartsWith('[') && text.EndsWith(']') && !text.Trim('[').Contains('[')) ||
(Settings.RemoveTextBetweenBrackets && text.StartsWith('{') && text.EndsWith('}') && !text.Trim('{').Contains('{')) ||
(Settings.RemoveTextBetweenParentheses && text.StartsWith('(') && text.EndsWith(')') && !text.Trim('(').Contains('(')) ||
(Settings.RemoveTextBetweenQuestionMarks && text.StartsWith('?') && text.EndsWith('?') && !text.Trim('?').Contains('?')) ||
(Settings.RemoveTextBetweenSquares && text.StartsWith('[') && text.EndsWith("]:", StringComparison.Ordinal) && !text.Trim('[').Contains('[')) ||
(Settings.RemoveTextBetweenBrackets && text.StartsWith('{') && text.EndsWith("}:", StringComparison.Ordinal) && !text.Trim('{').Contains('{')) ||
(Settings.RemoveTextBetweenParentheses && text.StartsWith('(') && text.EndsWith("):", StringComparison.Ordinal) && !text.Trim('(').Contains('(')) ||
(Settings.RemoveTextBetweenQuestionMarks && text.StartsWith('?') && text.EndsWith("?:", StringComparison.Ordinal) && !text.Trim('?').Contains('?')) ||
(Settings.RemoveTextBetweenCustomTags &&
Settings.CustomStart.Length > 0 && Settings.CustomEnd.Length > 0 &&
text.StartsWith(Settings.CustomStart, StringComparison.Ordinal) && text.EndsWith(Settings.CustomEnd, StringComparison.Ordinal));
}
private static string RemoveTextBetweenTags(string startTag, string endTag, string text)
{
text = text.Trim();
if (startTag == "?" || endTag == "?")
{
if (text.StartsWith(startTag, StringComparison.Ordinal) && text.EndsWith(endTag, StringComparison.Ordinal))
return string.Empty;
return text;
}
int start = text.IndexOf(startTag, StringComparison.Ordinal);
if (start < 0 || text.Length - start - startTag.Length < endTag.Length)
return text;
do
{
int end = text.IndexOf(endTag, start + startTag.Length, StringComparison.Ordinal);
if (end < 0)
break;
text = text.Remove(start, end - start + 1);
if (start > 3 && text.Length - start > 1 &&
text[start] == ':' && text[start - 1] == ' ' && ".?!".Contains(text[start - 2]))
{
text = text.Remove(start - 1, 2);
}
start = text.IndexOf(startTag, StringComparison.Ordinal);
}
2015-09-09 22:32:28 +02:00
while (start >= 0 && text.Length - start - startTag.Length >= endTag.Length);
return text.FixExtraSpaces().TrimEnd();
}
public string RemoveLineIfAllUppercase(string text)
{
if (!Settings.RemoveIfAllUppercase)
return text;
var lines = text.SplitToLines();
var sb = new StringBuilder();
foreach (var line in lines)
{
var lineNoHtml = HtmlUtil.RemoveHtmlTags(line, true);
if (lineNoHtml == lineNoHtml.ToUpper())
{
var temp = lineNoHtml.TrimEnd('.', '!', '?', ':').Trim().Trim(' ', '-', '—');
if (temp.Length == 1 || temp == "YES" || temp == "NO" || temp == "WHY" || temp == "HI")
{
sb.AppendLine(line);
}
}
else
{
sb.AppendLine(line);
}
}
return sb.ToString().Trim();
}
}
}