using System;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;

namespace Nikse.SubtitleEdit.Core
{
    /// <summary>
    /// HTML specific string manipulations.
    /// </summary>
    public static class HtmlUtil
    {
        public const string TagItalic = "i";
        public const string TagBold = "b";
        public const string TagUnderline = "u";
        public const string TagParagraph = "p";
        public const string TagFont = "font";
        public const string TagCyrillicI = "\u0456"; // Cyrillic Small Letter Byelorussian-Ukrainian i (http://graphemica.com/%D1%96)

        // Matches one opening or closing tag and captures its name in group 1.
        // Optional whitespace is accepted after '<' and around the '/', so these
        // shapes all match (where * stands for any attribute text):
        //   <tag*>    < tag*>
        //   </tag*>   < /tag*>
        //   </ tag*>  < / tag*>
        private static readonly Regex TagOpenRegex = new Regex(@"<\s*(?:/\s*)?(\w+)[^>]*>", RegexOptions.Compiled);

        /// <summary>
        /// Remove all of the specified opening and closing tags from the source HTML string.
        /// </summary>
        /// <param name="source">The source string to search for specified HTML tags.</param>
        /// <param name="tags">The HTML tag names to remove (compared case-insensitively).</param>
        /// <returns>
        /// A new string without the specified opening and closing tags. The source is
        /// returned unchanged when it is null/empty, contains no '&lt;', or when no tag
        /// names were supplied.
        /// </returns>
        public static string RemoveOpenCloseTags(string source, params string[] tags)
        {
            // Nothing to strip: also protects the match callback from a null tag list.
            if (string.IsNullOrEmpty(source) || source.IndexOf('<') < 0 || tags == null || tags.Length == 0)
            {
                return source;
            }

            return TagOpenRegex.Replace(
                source,
                m => tags.Contains(m.Groups[1].Value, StringComparer.OrdinalIgnoreCase) ? string.Empty : m.Value);
        }

        /// <summary>
        /// Converts a string to an HTML-encoded string using named character references.
        /// </summary>
        /// <param name="source">The string to encode.</param>
        /// <returns>An encoded string.</returns>
public static string EncodeNamed(string source)
{
    if (string.IsNullOrEmpty(source))
    {
        return string.Empty;
    }

    var encoded = new StringBuilder(source.Length);
    foreach (var ch in source)
    {
        var entity = GetNamedEntity(ch);
        if (entity != null)
        {
            encoded.Append(entity);
        }
        else if (ch > 127)
        {
            // Non-ASCII character without a named reference in the table:
            // fall back to a decimal numeric character reference.
            encoded.Append("&#").Append((int)ch).Append(';');
        }
        else
        {
            encoded.Append(ch);
        }
    }

    return encoded.ToString();
}

// Returns the HTML named character reference for ch, or null when the table has none.
// NOTE(review): the previous revision appended the raw character in every case, so
// nothing was actually encoded; the reference names below are the standard HTML names
// for the listed characters. The whitespace case is written as an explicit U+00A0
// escape so that ordinary spaces (U+0020) are never turned into &nbsp; - confirm this
// matches the intended behavior.
private static string GetNamedEntity(char ch)
{
    switch (ch)
    {
        case '<': return "&lt;";
        case '>': return "&gt;";
        case '"': return "&quot;";
        case '&': return "&amp;";
        case '\'': return "&apos;";
        case '\u00A0': return "&nbsp;";
        case '–': return "&ndash;";
        case '—': return "&mdash;";
        case '¡': return "&iexcl;";
        case '¿': return "&iquest;";
        case '“': return "&ldquo;";
        case '”': return "&rdquo;";
        case '‘': return "&lsquo;";
        case '’': return "&rsquo;";
        case '«': return "&laquo;";
        case '»': return "&raquo;";
        case '¢': return "&cent;";
        case '©': return "&copy;";
        case '÷': return "&divide;";
        case 'µ': return "&micro;";
        case '·': return "&middot;";
        case '¶': return "&para;";
        case '±': return "&plusmn;";
        case '€': return "&euro;";
        case '£': return "&pound;";
        case '®': return "&reg;";
        case '§': return "&sect;";
        case '™': return "&trade;";
        case '¥': return "&yen;";
        case 'á': return "&aacute;";
        case 'Á': return "&Aacute;";
        case 'à': return "&agrave;";
        case 'À': return "&Agrave;";
        case 'â': return "&acirc;";
        case 'Â': return "&Acirc;";
        case 'å': return "&aring;";
        case 'Å': return "&Aring;";
        case 'ã': return "&atilde;";
        case 'Ã': return "&Atilde;";
        case 'ä': return "&auml;";
        case 'Ä': return "&Auml;";
        case 'æ': return "&aelig;";
        case 'Æ': return "&AElig;";
        case 'ç': return "&ccedil;";
        case 'Ç': return "&Ccedil;";
        case 'é': return "&eacute;";
        case 'É': return "&Eacute;";
        case 'è': return "&egrave;";
        case 'È': return "&Egrave;";
        case 'ê': return "&ecirc;";
        case 'Ê': return "&Ecirc;";
        case 'ë': return "&euml;";
        case 'Ë': return "&Euml;";
        case 'í': return "&iacute;";
        case 'Í': return "&Iacute;";
        case 'ì': return "&igrave;";
        case 'Ì': return "&Igrave;";
        case 'î': return "&icirc;";
        case 'Î': return "&Icirc;";
        case 'ï': return "&iuml;";
        case 'Ï': return "&Iuml;";
        case 'ñ': return "&ntilde;";
        case 'Ñ': return "&Ntilde;";
        case 'ó': return "&oacute;";
        case 'Ó': return "&Oacute;";
        case 'ò': return "&ograve;";
        case 'Ò': return "&Ograve;";
        case 'ô': return "&ocirc;";
        case 'Ô': return "&Ocirc;";
        case 'ø': return "&oslash;";
        case 'Ø': return "&Oslash;";
        case 'õ': return "&otilde;";
        case 'Õ': return "&Otilde;";
        case 'ö': return "&ouml;";
        case 'Ö': return "&Ouml;";
        case 'ß': return "&szlig;";
        case 'ú': return "&uacute;";
        case 'Ú': return "&Uacute;";
        case 'ù': return "&ugrave;";
        case 'Ù': return "&Ugrave;";
        case 'û': return "&ucirc;";
        case 'Û': return "&Ucirc;";
        case 'ü': return "&uuml;";
        case 'Ü': return "&Uuml;";
        case 'ÿ': return "&yuml;";
        default: return null;
    }
}

/// <summary>
/// Converts a string to an HTML-encoded string using numeric character references.
/// </summary>
/// <param name="source">The string to encode.</param>
/// <returns>An encoded string.</returns>
public static string EncodeNumeric(string source)
{
    if (string.IsNullOrEmpty(source))
    {
        return string.Empty;
    }

    var encoded = new StringBuilder(source.Length);
    foreach (var ch in source)
    {
        // Every non-ASCII character and the five markup-significant ASCII characters
        // are written as decimal numeric references; U+00A0 therefore becomes &#160;.
        // NOTE(review): the previous revision had a separate whitespace branch that
        // always emitted 160; it is treated here as the no-break space case so that
        // ordinary spaces (U+0020) pass through unchanged - confirm.
        if (ch > 127 || ch == '<' || ch == '>' || ch == '"' || ch == '&' || ch == '\'')
        {
            encoded.Append("&#").Append((int)ch).Append(';');
        }
        else
        {
            encoded.Append(ch);
        }
    }

    return encoded.ToString();
}

/// <summary>
/// Removes common HTML tags from a text, optionally removing SSA tags first.
/// </summary>
/// <param name="s">Text to clean. Null and texts shorter than three characters are returned as-is.</param>
/// <param name="alsoSsaTags">When true, <c>Utilities.RemoveSsaTags</c> is applied before the HTML pass.</param>
/// <returns>The text without the common HTML tags.</returns>
public static string RemoveHtmlTags(string s, bool alsoSsaTags = false)
{
    if (s == null || s.Length < 3)
    {
        return s;
    }

    if (alsoSsaTags)
    {
        s = Utilities.RemoveSsaTags(s);
    }

    if (!s.Contains('<'))
    {
        return s;
    }

    // A '<' followed by a space hints at malformed italic tags; normalize those first.
    if (s.Contains("< "))
    {
        s = FixInvalidItalicTags(s);
    }

    return RemoveCommonHtmlTags(s);
}

/// <summary>
/// Optimized method to remove common html tags, like &lt;i&gt;, &lt;b&gt;, &lt;u&gt;, and &lt;font&gt;
/// </summary>
/// <param name="s">Text to remove html tags from</param>
/// <returns>Text stripped from common html tags</returns>
private static string RemoveCommonHtmlTags(string s)
{
    var buffer = new char[s.Length];
    int length = 0;
    bool inside = false; // true while skipping the characters of a recognized tag

    for (int i = 0; i < s.Length; i++)
    {
        char ch = s[i];
        if (ch == '<' && i < s.Length - 2)
        {
            var next = s[i + 1];
            var nextNext = s[i + 2];

            // <i>, <b>, <u>
            if (nextNext == '>' && IsSimpleTagLetter(next))
            {
                inside = true;
                continue;
            }

            if (i < s.Length - 3 && s[i + 3] == '>')
            {
                // </i>, </b>, </u>
                if (next == '/' && IsSimpleTagLetter(nextNext))
                {
                    inside = true;
                    continue;
                }

                // <i/>, <b/>, <u/> - some bad end tags sometimes seen
                if (nextNext == '/' && IsSimpleTagLetter(next))
                {
                    inside = true;
                    continue;
                }
            }

            // <font ...> and </font>. The previous revision tested an empty prefix here,
            // which matched every "<f" sequence and never matched the closing tag.
            if ((next == 'f' && HasPrefixAt(s, i, "<font")) ||
                (next == '/' && nextNext == 'f' && HasPrefixAt(s, i, "</font>")))
            {
                inside = true;
                continue;
            }
        }

        if (inside && ch == '>')
        {
            inside = false;
            continue;
        }

        if (!inside)
        {
            buffer[length] = ch;
            length++;
        }
    }

    return new string(buffer, 0, length);
}

// True for the single-letter tag names handled by RemoveCommonHtmlTags (lower case only).
private static bool IsSimpleTagLetter(char ch)
{
    return ch == 'i' || ch == 'b' || ch == 'u';
}

// Case-insensitive ordinal test for prefix at position index of s, without allocating a substring.
private static bool HasPrefixAt(string s, int index, string prefix)
{
    return string.Compare(s, index, prefix, 0, prefix.Length, StringComparison.OrdinalIgnoreCase) == 0;
}

/// <summary>
/// Heuristic check for whether a text looks like a URL: it must contain a dot and no
/// space, and either start with "http://", "https://" or "www.", or contain a
/// ".org", ".com" or ".net" suffix (at the end or followed by '/').
/// </summary>
/// <param name="text">The text to inspect.</param>
/// <returns>True when the text looks like a URL.</returns>
public static bool
IsUrl(string text)
{
    if (string.IsNullOrWhiteSpace(text) || text.Length < 6 || !text.Contains('.') || text.Contains(' '))
    {
        return false;
    }

    // Invariant lower-casing keeps the ASCII comparisons below independent of the current culture.
    var allLower = text.ToLowerInvariant();
    if (allLower.StartsWith("http://", StringComparison.Ordinal) ||
        allLower.StartsWith("https://", StringComparison.Ordinal) ||
        allLower.StartsWith("www.", StringComparison.Ordinal) ||
        allLower.EndsWith(".org", StringComparison.Ordinal) ||
        allLower.EndsWith(".com", StringComparison.Ordinal) ||
        allLower.EndsWith(".net", StringComparison.Ordinal))
    {
        return true;
    }

    return allLower.Contains(".org/") || allLower.Contains(".com/") || allLower.Contains(".net/");
}

/// <summary>
/// Checks whether the first whitespace-separated word of the text (after trimming and
/// removing trailing periods) looks like a URL according to <see cref="IsUrl"/>.
/// </summary>
/// <param name="text">The text to inspect.</param>
/// <returns>True when the first word looks like a URL.</returns>
public static bool StartsWithUrl(string text)
{
    if (string.IsNullOrWhiteSpace(text))
    {
        return false;
    }

    var words = text.Trim().TrimEnd('.').TrimEnd().Split();
    return words.Length > 0 && IsUrl(words[0]);
}

// Upper-case tag starts that FixUpperTags rewrites in lower case.
// NOTE(review): the previous revision contained only empty strings here; the entries
// are reconstructed from the method's purpose - confirm against the project history.
private static readonly string[] UppercaseTags = { "<I>", "<U>", "<B>", "<FONT", "</I>", "</U>", "</B>", "</FONT>" };

/// <summary>
/// Lower-cases upper-case italic, underline, bold and font tags found in the text.
/// Each match is lower-cased from its '&lt;' up to (not including) the next '&gt;'.
/// </summary>
/// <param name="text">The text to fix.</param>
/// <returns>The text with the listed tags in lower case.</returns>
public static string FixUpperTags(string text)
{
    if (string.IsNullOrEmpty(text) || !text.Contains('<'))
    {
        return text;
    }

    var idx = text.IndexOfAny(UppercaseTags, StringComparison.Ordinal);
    while (idx >= 0)
    {
        var endIdx = text.IndexOf('>', idx + 2);
        if (endIdx < idx)
        {
            break; // unterminated tag - leave the rest untouched
        }

        var tag = text.Substring(idx, endIdx - idx).ToLowerInvariant();
        text = text.Remove(idx, endIdx - idx).Insert(idx, tag);
        idx = text.IndexOfAny(UppercaseTags, StringComparison.Ordinal);
    }

    return text;
}

// Badly spaced or upper-case spellings of the italic start tag, normalized to "<i>".
private static readonly string[] MalformedItalicBeginTags =
{
    "< i >", "< i>", "<i >", "<I>", "< I >", "< I>", "<I >"
};

// Badly spaced or upper-case spellings of the italic end tag, normalized to "</i>".
private static readonly string[] MalformedItalicEndTags =
{
    "< / i >", "< /i>", "</ i>", "< /i >", "</i >", "</ i >", "< / i>",
    "</I>", "< / I >", "< /I>", "</ I>", "< /I >", "</I >", "</ I >", "< / I>"
};

/// <summary>
/// Normalizes malformed italic tags and repairs unbalanced italic tag pairs using a
/// set of heuristics keyed on how many start/end tags and lines the text contains.
/// </summary>
/// <remarks>
/// NOTE(review): in the previous revision every literal that contained markup was an
/// empty string (which makes <c>string.Replace</c> throw). The literals below are
/// reconstructed from the surviving variants, the magic offsets (4 and 5) and the
/// method's purpose - confirm against the project history.
/// </remarks>
/// <param name="text">The text to fix. Null or empty input is returned unchanged.</param>
/// <returns>The text with italic tags normalized and balanced where a heuristic applied.</returns>
public static string FixInvalidItalicTags(string text)
{
    if (string.IsNullOrEmpty(text))
    {
        return text;
    }

    const string beginTag = "<i>";
    const string endTag = "</i>";

    foreach (var variant in MalformedItalicBeginTags)
    {
        text = text.Replace(variant, beginTag);
    }

    foreach (var variant in MalformedItalicEndTags)
    {
        text = text.Replace(variant, endTag);
    }

    // Merge adjacent italic runs that are separated by a single space.
    text = text.Replace("</i> <i>", "_@_");
    text = text.Replace(" _@_", "_@_");
    text = text.Replace(" _@_ ", "_@_");
    text = text.Replace("_@_", " ");

    // A self-closing italic tag is read as an end tag when a start tag exists, else dropped.
    if (text.Contains(beginTag))
    {
        text = text.Replace("<i/>", endTag);
        text = text.Replace("<I/>", endTag);
    }
    else
    {
        text = text.Replace("<i/>", string.Empty);
        text = text.Replace("<I/>", string.Empty);
    }

    text = text.Replace(beginTag + beginTag, beginTag);
    text = text.Replace(endTag + endTag, endTag);

    // The counts are taken once; every heuristic below is keyed on these initial values.
    int italicBeginTagCount = Utilities.CountTagInText(text, beginTag);
    int italicEndTagCount = Utilities.CountTagInText(text, endTag);
    int noOfLines = Utilities.GetNumberOfLines(text);
    if (italicBeginTagCount + italicEndTagCount > 0)
    {
        // One of each, but in the wrong order: swap them.
        if (italicBeginTagCount == 1 && italicEndTagCount == 1 &&
            text.IndexOf(beginTag, StringComparison.Ordinal) > text.IndexOf(endTag, StringComparison.Ordinal))
        {
            const string pattern = "___________@";
            text = text.Replace(beginTag, pattern);
            text = text.Replace(endTag, beginTag);
            text = text.Replace(pattern, endTag);
        }

        // Two start tags, no end tag.
        if (italicBeginTagCount == 2 && italicEndTagCount == 0)
        {
            int firstIndex = text.IndexOf(beginTag, StringComparison.Ordinal);
            int lastIndex = text.LastIndexOf(beginTag, StringComparison.Ordinal);
            int lastIndexWithNewLine = text.LastIndexOf(Environment.NewLine + beginTag, StringComparison.Ordinal) + Environment.NewLine.Length;
            if (noOfLines == 2 && lastIndex == lastIndexWithNewLine && firstIndex < 2)
            {
                // Both lines start italic: close each line.
                text = text.Replace(Environment.NewLine, endTag + Environment.NewLine) + endTag;
            }
            else
            {
                // Otherwise the second start tag is taken to be a mistyped end tag.
                text = text.Remove(lastIndex, beginTag.Length).Insert(lastIndex, endTag);
            }
        }

        // One start tag, two end tags: drop the first end tag.
        if (italicBeginTagCount == 1 && italicEndTagCount == 2)
        {
            int firstIndex = text.IndexOf(endTag, StringComparison.Ordinal);
            if (text.StartsWith("</i>-<i>-", StringComparison.Ordinal) ||
                text.StartsWith("</i>- <i>-", StringComparison.Ordinal) ||
                text.StartsWith("</i>- <i> -", StringComparison.Ordinal) ||
                text.StartsWith("</i>-<i> -", StringComparison.Ordinal))
            {
                text = text.Remove(0, 5); // the leading end tag plus the dash after it
            }
            else if (firstIndex == 0)
            {
                text = text.Remove(0, 4); // the leading end tag
            }
            else
            {
                text = text.Substring(0, firstIndex) + text.Substring(firstIndex + endTag.Length);
            }
        }

        // Two start tags, one end tag.
        if (italicBeginTagCount == 2 && italicEndTagCount == 1)
        {
            var lines = text.SplitToLines();
            if (lines.Count == 2 &&
                lines[0].StartsWith(beginTag, StringComparison.Ordinal) &&
                lines[0].EndsWith(endTag, StringComparison.Ordinal) &&
                lines[1].StartsWith(beginTag, StringComparison.Ordinal))
            {
                text = text.TrimEnd() + endTag;
            }
            else
            {
                int lastIndex = text.LastIndexOf(beginTag, StringComparison.Ordinal);
                if (text.Length > lastIndex + endTag.Length)
                {
                    // Cut the last start tag out (lastIndex - 1 + endTag.Length == lastIndex + beginTag.Length).
                    text = text.Substring(0, lastIndex) + text.Substring(lastIndex - 1 + endTag.Length);
                }
                else
                {
                    text = text.Substring(0, lastIndex - 1) + endTag;
                }
            }

            // Fully italic two-liner: collapse into a single italic span.
            if (text.StartsWith(beginTag, StringComparison.Ordinal) &&
                text.EndsWith(endTag, StringComparison.Ordinal) &&
                text.Contains(endTag + Environment.NewLine + beginTag))
            {
                text = text.Replace(endTag + Environment.NewLine + beginTag, Environment.NewLine);
            }
        }

        // A single start tag and no end tag.
        if (italicBeginTagCount == 1 && italicEndTagCount == 0)
        {
            int lastIndexWithNewLine = text.LastIndexOf(Environment.NewLine + beginTag, StringComparison.Ordinal) + Environment.NewLine.Length;
            int lastIndex = text.LastIndexOf(beginTag, StringComparison.Ordinal);

            if (text.StartsWith(beginTag, StringComparison.Ordinal))
            {
                text += endTag;
            }
            else if (noOfLines == 2 && lastIndex == lastIndexWithNewLine)
            {
                text += endTag;
            }
            else
            {
                text = text.Replace(beginTag, string.Empty);
            }
        }

        // A single end tag and no start tag.
        if (italicBeginTagCount == 0 && italicEndTagCount == 1)
        {
            var cleanText = RemoveOpenCloseTags(text, TagItalic, TagBold, TagUnderline, TagCyrillicI);
            bool isFixed = false;

            // Foo.</i>
            if (text.EndsWith(endTag, StringComparison.Ordinal) &&
                !cleanText.StartsWith('-') &&
                !cleanText.Contains(Environment.NewLine + "-"))
            {
                text = beginTag + text;
                isFixed = true;
            }

            // - Foo</i> | - Foo.
            // - Bar.    | - Foo.</i>
            if (!isFixed && Utilities.GetNumberOfLines(cleanText) == 2)
            {
                int newLineIndex = text.IndexOf(Environment.NewLine, StringComparison.Ordinal);
                if (newLineIndex > 0)
                {
                    var firstLine = text.Substring(0, newLineIndex).Trim();
                    // Skip the whole line break, whatever its length is on this platform.
                    var secondLine = text.Substring(newLineIndex + Environment.NewLine.Length).Trim();
                    if (firstLine.EndsWith(endTag, StringComparison.Ordinal))
                    {
                        firstLine = beginTag + firstLine;
                        isFixed = true;
                    }

                    if (secondLine.EndsWith(endTag, StringComparison.Ordinal))
                    {
                        secondLine = beginTag + secondLine;
                        isFixed = true;
                    }

                    text = firstLine + Environment.NewLine + secondLine;
                }
            }

            if (!isFixed)
            {
                text = text.Replace(endTag, string.Empty);
            }
        }

        // - foo.</i>
        // - bar.</i>
        if (italicBeginTagCount == 0 && italicEndTagCount == 2 &&
            text.Contains(endTag + Environment.NewLine, StringComparison.Ordinal) &&
            text.EndsWith(endTag, StringComparison.Ordinal))
        {
            text = text.Replace(endTag, string.Empty);
            text = beginTag + text + endTag;
        }

        // Text both starts and ends with an end tag: the first one becomes a start tag.
        if (italicBeginTagCount == 0 && italicEndTagCount == 2 &&
            text.StartsWith(endTag, StringComparison.Ordinal) &&
            text.EndsWith(endTag, StringComparison.Ordinal))
        {
            int firstIndex = text.IndexOf(endTag, StringComparison.Ordinal);
            text = text.Remove(firstIndex, endTag.Length).Insert(firstIndex, beginTag);
        }

        // <i>Foo</i>
        // <i>Bar</i>
        if (italicBeginTagCount == 2 && italicEndTagCount == 2 && noOfLines == 2)
        {
            int index = text.IndexOf(Environment.NewLine, StringComparison.Ordinal);
            if (index > 0 && text.Length > index + (beginTag.Length + endTag.Length))
            {
                var firstLine = text.Substring(0, index).Trim();
                var secondLine = text.Substring(index + Environment.NewLine.Length).Trim();

                // Move a dialog dash that precedes the start tag inside the italic span.
                if (firstLine.Length > 10 &&
                    firstLine.StartsWith("- <i>", StringComparison.Ordinal) &&
                    firstLine.EndsWith(endTag, StringComparison.Ordinal))
                {
                    text = "<i>- " + firstLine.Remove(0, 5) + Environment.NewLine + secondLine;
                    text = text.Replace("<i>-  ", "<i>- ");
                    index = text.IndexOf(Environment.NewLine, StringComparison.Ordinal);
                    firstLine = text.Substring(0, index).Trim();
                    secondLine = text.Substring(index + Environment.NewLine.Length).Trim();
                }

                if (secondLine.Length > 10 &&
                    secondLine.StartsWith("- <i>", StringComparison.Ordinal) &&
                    secondLine.EndsWith(endTag, StringComparison.Ordinal))
                {
                    text = firstLine + Environment.NewLine + "<i>- " + secondLine.Remove(0, 5);
                    text = text.Replace("<i>-  ", "<i>- ");
                    index = text.IndexOf(Environment.NewLine, StringComparison.Ordinal);
                    firstLine = text.Substring(0, index).Trim();
                    secondLine = text.Substring(index + Environment.NewLine.Length).Trim();
                }

                // Both lines fully italic: use one span around the whole text.
                if (Utilities.StartsAndEndsWithTag(firstLine, beginTag, endTag) &&
                    Utilities.StartsAndEndsWithTag(secondLine, beginTag, endTag))
                {
                    text = text.Replace(beginTag, string.Empty).Replace(endTag, string.Empty).Trim();
                    text = beginTag + text + endTag;
                }
            }

            // <i>FALCONE:</i> <i>I didn't think</i>
            // <i>it was going to be you,</i>
            // A "speaker:" prefix is split off, the remainder is fixed recursively.
            var colIdx = text.IndexOf(':');
            if (index >= 0 && // no line break found: nothing to split (Substring would throw)
                colIdx >= 0 &&
                Utilities.CountTagInText(text, beginTag) + Utilities.CountTagInText(text, endTag) == 4 &&
                text.Length > colIdx + 1 &&
                !char.IsDigit(text[colIdx + 1]))
            {
                var firstLine = text.Substring(0, index);
                var secondLine = text.Substring(index).TrimStart();

                var secIdxCol = secondLine.IndexOf(':');
                if (secIdxCol < 0 || !Utilities.IsBetweenNumbers(secondLine, secIdxCol))
                {
                    var idx = firstLine.IndexOf(':');
                    if (idx > 1)
                    {
                        var pre = text.Substring(0, idx + 1).TrimStart();
                        text = text.Remove(0, idx + 1);
                        text = FixInvalidItalicTags(text).Trim();
                        if (text.StartsWith("<i> ", StringComparison.OrdinalIgnoreCase))
                        {
                            text = Utilities.RemoveSpaceBeforeAfterTag(text, beginTag);
                        }

                        text = pre + " " + text;
                    }
                }
            }
        }

        // <i>- You think they're they gone?<i>
        // <i>- That can't be.</i>
        if (italicBeginTagCount == 3 && italicEndTagCount == 1 && noOfLines == 2)
        {
            var newLineIdx = text.IndexOf(Environment.NewLine, StringComparison.Ordinal);
            if (newLineIdx >= 0) // guard: Substring would throw when no line break is found
            {
                var firstLine = text.Substring(0, newLineIdx).Trim();
                var secondLine = text.Substring(newLineIdx).Trim();

                if ((Utilities.StartsAndEndsWithTag(firstLine, beginTag, beginTag) && Utilities.StartsAndEndsWithTag(secondLine, beginTag, endTag)) ||
                    (Utilities.StartsAndEndsWithTag(secondLine, beginTag, beginTag) && Utilities.StartsAndEndsWithTag(firstLine, beginTag, endTag)))
                {
                    text = text.Replace(beginTag, string.Empty);
                    text = text.Replace(endTag, string.Empty);
                    text = text.Replace("  ", " ").Trim();
                    text = beginTag + text + endTag;
                }
            }
        }

        if (noOfLines == 3)
        {
            var lines = text.SplitToLines();
            if ((italicBeginTagCount == 3 && italicEndTagCount == 2) || (italicBeginTagCount == 2 && italicEndTagCount == 3))
            {
                int numberOfItalics = 0;
                foreach (var line in lines)
                {
                    if (line.StartsWith(beginTag, StringComparison.Ordinal))
                    {
                        numberOfItalics++;
                    }

                    if (line.EndsWith(endTag, StringComparison.Ordinal))
                    {
                        numberOfItalics++;
                    }
                }

                if (numberOfItalics == 5)
                {
                    // fix missing tag: all three lines are italic, use a single span
                    text = "<i>" + text.Replace("<i>", string.Empty).Replace("</i>", string.Empty) + "</i>";
                }
            }
        }

        // Drop italic spans that ended up empty.
        text = text.Replace("<i></i>", string.Empty);
        text = text.Replace("<i> </i>", string.Empty);
        text = text.Replace("<i>  </i>", string.Empty);
    }

    return text;
}

/// <summary>
/// Toggles a tag on a text: when the opening or closing tag is present (in any case)
/// the lower-case and upper-case spellings are removed; otherwise the text is wrapped
/// in the tag. When the text starts with a short "{\...}" block (closing brace within
/// the first six characters), the opening tag is inserted after that block.
/// </summary>
/// <remarks>
/// NOTE(review): the closing-tag literals were empty in the previous revision and are
/// reconstructed here as "&lt;/tag&gt;" - confirm against the project history.
/// </remarks>
/// <param name="text">The text to modify.</param>
/// <param name="tag">The tag name without angle brackets, e.g. "i".</param>
/// <returns>The text with the tag removed or added.</returns>
public static string ToggleTag(string text, string tag)
{
    var openTag = "<" + tag + ">";
    var closeTag = "</" + tag + ">";

    if (text.IndexOf(openTag, StringComparison.OrdinalIgnoreCase) >= 0 ||
        text.IndexOf(closeTag, StringComparison.OrdinalIgnoreCase) >= 0)
    {
        // Invariant upper-casing: a culture-sensitive ToUpper turns "i" into "İ" under Turkish cultures.
        var upperTag = tag.ToUpperInvariant();
        text = text.Replace(openTag, string.Empty);
        text = text.Replace(closeTag, string.Empty);
        text = text.Replace("<" + upperTag + ">", string.Empty);
        text = text.Replace("</" + upperTag + ">", string.Empty);
    }
    else
    {
        int indexOfEndBracket = text.IndexOf('}');
        if (text.StartsWith("{\\", StringComparison.Ordinal) && indexOfEndBracket > 1 && indexOfEndBracket < 6)
        {
            text = $"{text.Substring(0, indexOfEndBracket + 1)}<{tag}>{text.Remove(0, indexOfEndBracket + 1)}</{tag}>";
        }
        else
        {
            text = $"<{tag}>{text}</{tag}>";
        }
    }

    return text;
}
    }
}