using System;
using System.Collections.Generic;
using System.Text;

namespace Nikse.SubtitleEdit.Core
{
    /// <summary>
    /// Splits a text into a leading part (<see cref="Pre"/>), a core part (<see cref="StrippedText"/>)
    /// and a trailing part (<see cref="Post"/>). Leading/trailing strip characters, ASS/SSA override
    /// blocks and tags are moved into <see cref="Pre"/>/<see cref="Post"/> so the core text can be
    /// modified (e.g. by <see cref="FixCasing"/>) and put back together via <see cref="MergedString"/>.
    /// </summary>
    public class StrippableText
    {
        /// <summary>Everything stripped from the start of the text (empty string when nothing was stripped).</summary>
        public string Pre { get; set; }

        /// <summary>Everything stripped from the end of the text (empty string when nothing was stripped).</summary>
        public string Post { get; set; }

        /// <summary>The text that remains after <see cref="Pre"/> and <see cref="Post"/> have been removed.</summary>
        public string StrippedText { get; set; }

        /// <summary>The text exactly as it was passed to the constructor.</summary>
        public string OriginalText { get; }

        /// <summary>Concatenation of <see cref="Pre"/>, <see cref="StrippedText"/> and <see cref="Post"/>.</summary>
        public string MergedString => Pre + StrippedText + Post;

        /// <summary>
        /// Creates a stripped view of <paramref name="text"/> using the built-in sets of start and end strip characters.
        /// </summary>
        /// <param name="text">Text to split; it is dereferenced without a null check.</param>
        public StrippableText(string text)
            : this(text, " >-\"„”“['‘`´¶(♪¿¡.…—", " -\"”“]'`´¶)♪.!?:…—؛،؟")
        {
        }

        /// <summary>
        /// Creates a stripped view of <paramref name="text"/> using caller-supplied strip characters.
        /// </summary>
        /// <param name="text">Text to split; it is dereferenced without a null check.</param>
        /// <param name="stripStartCharacters">Characters that are moved from the start of the text into <see cref="Pre"/>.</param>
        /// <param name="stripEndCharacters">Characters that are moved from the end of the text into <see cref="Post"/>.</param>
        public StrippableText(string text, string stripStartCharacters, string stripEndCharacters)
        {
            OriginalText = text;

            Pre = string.Empty;
            if (text.Length > 0 && ("<{" + stripStartCharacters).Contains(text[0]))
            {
                int beginLength;
                do
                {
                    // Repeat until a full pass removes nothing more from the start.
                    beginLength = text.Length;

                    while (text.Length > 0 && stripStartCharacters.Contains(text[0]))
                    {
                        Pre += text[0];
                        text = text.Remove(0, 1);
                    }

                    // ASS/SSA codes like {\an9}
                    int endIndex = text.IndexOf('}');
                    if (endIndex > 0 && text.StartsWith("{\\", StringComparison.Ordinal))
                    {
                        // Only strip when no other '{' opens before the first '}'.
                        int nextStartIndex = text.IndexOf('{', 2);
                        if (nextStartIndex == -1 || nextStartIndex > endIndex)
                        {
                            endIndex++;
                            Pre += text.Substring(0, endIndex);
                            text = text.Remove(0, endIndex);
                        }
                    }

                    // tags like <i> or <font ...>
                    endIndex = text.IndexOf('>');
                    if (text.StartsWith('<') && endIndex >= 2)
                    {
                        endIndex++;
                        Pre += text.Substring(0, endIndex);
                        text = text.Remove(0, endIndex);
                    }
                }
                while (text.Length < beginLength);
            }

            Post = string.Empty;
            if (text.Length > 0 && (">" + stripEndCharacters).Contains(text[text.Length - 1]))
            {
                int beginLength;
                do
                {
                    // Repeat until a full pass removes nothing more from the end.
                    beginLength = text.Length;

                    while (text.Length > 0 && stripEndCharacters.Contains(text[text.Length - 1]))
                    {
                        Post = text[text.Length - 1] + Post;
                        text = text.Substring(0, text.Length - 1);
                    }

                    if (text.EndsWith('>'))
                    {
                        // closing tags </i> </b> </u> (4 characters each)
                        if (text.EndsWith("</i>", StringComparison.OrdinalIgnoreCase) ||
                            text.EndsWith("</b>", StringComparison.OrdinalIgnoreCase) ||
                            text.EndsWith("</u>", StringComparison.OrdinalIgnoreCase))
                        {
                            Post = text.Substring(text.Length - 4) + Post;
                            text = text.Substring(0, text.Length - 4);
                        }

                        // closing tag </font> (7 characters)
                        if (text.EndsWith("</font>", StringComparison.OrdinalIgnoreCase))
                        {
                            Post = text.Substring(text.Length - 7) + Post;
                            text = text.Substring(0, text.Length - 7);
                        }
                    }
                }
                while (text.Length < beginLength);
            }

            StrippedText = text;
        }

        /// <summary>
        /// Builds the placeholder "_@{idName}_", records it together with <paramref name="name"/>
        /// in the two parallel lists, and returns the placeholder.
        /// </summary>
        private static string GetAndInsertNextId(List<string> replaceIds, List<string> replaceNames, string name, int idName)
        {
            string id = $"_@{idName}_";
            replaceIds.Add(id);
            replaceNames.Add(name);
            return id;
        }

        /// <summary>
        /// Replaces every stand-alone occurrence of a name from <paramref name="nameList"/> in
        /// <see cref="StrippedText"/> with a placeholder. For each replacement the placeholder, the
        /// name as spelled in the list, and the text as it was originally written are appended to
        /// <paramref name="replaceIds"/>, <paramref name="replaceNames"/> and
        /// <paramref name="originalNames"/> (same index in all three lists).
        /// </summary>
        private void ReplaceNames1Remove(List<string> nameList, List<string> replaceIds, List<string> replaceNames, List<string> originalNames)
        {
            // Temporarily pull one leading period of Post back into the text so that
            // names ending with a period can match at the very end of the text.
            if (Post.StartsWith('.'))
            {
                StrippedText += ".";
                Post = Post.Remove(0, 1);
            }

            // NOTE(review): ToLower() is culture-sensitive while the names below use
            // ToLowerInvariant(); left unchanged to avoid altering matching behavior.
            string lower = StrippedText.ToLower();
            int idName = 0;
            foreach (string name in nameList)
            {
                int start = lower.IndexOf(name.ToLowerInvariant(), StringComparison.Ordinal);
                while (start >= 0 && start < lower.Length)
                {
                    // The match must be at the start of the text or follow a separator character.
                    bool startOk = (start == 0) ||
                                   (lower[start - 1] == ' ') ||
                                   (lower[start - 1] == '-') ||
                                   (lower[start - 1] == '"') ||
                                   (lower[start - 1] == '\'') ||
                                   (lower[start - 1] == '>') ||
                                   (lower[start - 1] == '[') ||
                                   (lower[start - 1] == '“') ||
                                   Environment.NewLine.EndsWith(lower[start - 1]);

                    // Do not treat the beginning of "don't" as the name "Don".
                    if (startOk && string.CompareOrdinal(name, "Don") == 0 && lower.Substring(start).StartsWith("don't", StringComparison.Ordinal))
                    {
                        startOk = false;
                    }

                    if (startOk)
                    {
                        // The match must also end at the end of the text or before a separator character.
                        int end = start + name.Length;
                        bool endOk = end <= lower.Length;
                        if (endOk)
                        {
                            endOk = end == lower.Length || (@" ,.!?:;')]- <”""" + Environment.NewLine).Contains(lower[end]);
                        }

                        if (endOk && StrippedText.Length >= start + name.Length)
                        {
                            string originalName = StrippedText.Substring(start, name.Length);
                            originalNames.Add(originalName);
                            StrippedText = StrippedText.Remove(start, name.Length);
                            StrippedText = StrippedText.Insert(start, GetAndInsertNextId(replaceIds, replaceNames, name, idName++));
                            lower = StrippedText.ToLower();
                        }
                    }

                    // Continue searching three characters further on, or stop when that is past the end.
                    if (start + 3 > lower.Length)
                    {
                        start = lower.Length + 1;
                    }
                    else
                    {
                        start = lower.IndexOf(name, start + 3, StringComparison.OrdinalIgnoreCase);
                    }
                }
            }

            // Hand a single trailing period back to Post. Only one character is removed so that
            // any further trailing periods stay in the text instead of being silently dropped
            // (TrimEnd('.') would strip all of them while Post only gains one).
            if (StrippedText.EndsWith('.'))
            {
                Post = "." + Post;
                StrippedText = StrippedText.Substring(0, StrippedText.Length - 1);
            }
        }

        /// <summary>
        /// Replaces each placeholder in <see cref="StrippedText"/> with the entry at the same
        /// index of <paramref name="replaceNames"/>.
        /// </summary>
        private void ReplaceNames2Fix(List<string> replaceIds, List<string> replaceNames)
        {
            for (int i = 0; i < replaceIds.Count; i++)
            {
                StrippedText = StrippedText.Replace(replaceIds[i], replaceNames[i]);
            }
        }

        /// <summary>
        /// If <paramref name="text"/> starts with one of the placeholders "_@0_" .. "_@{idCount-1}_",
        /// uppercases the first character of the matching entry in <paramref name="originalNames"/>.
        /// Only the first matching placeholder is considered.
        /// </summary>
        private static void CapitalizeOriginalNameAtStart(string text, int idCount, List<string> originalNames)
        {
            for (int i = 0; i < idCount; i++)
            {
                string id = $"_@{i}_";
                if (text.StartsWith(id, StringComparison.Ordinal))
                {
                    if (!string.IsNullOrEmpty(originalNames[i]))
                    {
                        originalNames[i] = char.ToUpper(originalNames[i][0]) + originalNames[i].Remove(0, 1);
                    }

                    break;
                }
            }
        }

        // Characters whose presence in StrippedText makes the "uppercase after break" pass run.
        private static readonly char[] ExpectedCharsArray = { '.', '!', '?', ':', ';', ')', ']', '}', '(', '[', '{' };

        /// <summary>
        /// Fixes the casing of <see cref="StrippedText"/> in place.
        /// </summary>
        /// <param name="nameList">Names that are protected by placeholders while the casing is changed.</param>
        /// <param name="changeNameCases">
        /// When true, matched names are written back with the spelling from <paramref name="nameList"/>;
        /// otherwise with the spelling found in the text (first letter uppercased where a sentence starts).
        /// </param>
        /// <param name="makeUppercaseAfterBreak">When true, the first letter after a break inside the text is uppercased.</param>
        /// <param name="checkLastLine">When true, <paramref name="lastLine"/> and <paramref name="millisecondsFromLast"/> decide whether the text itself should start with an uppercase letter.</param>
        /// <param name="lastLine">The preceding line; it is passed to HtmlUtil.RemoveHtmlTags when <paramref name="checkLastLine"/> is true.</param>
        /// <param name="millisecondsFromLast">A value above 5000 forces an uppercase start when <paramref name="checkLastLine"/> is true.</param>
        public void FixCasing(List<string> nameList, bool changeNameCases, bool makeUppercaseAfterBreak, bool checkLastLine, string lastLine, double millisecondsFromLast = 0)
        {
            var replaceIds = new List<string>();
            var replaceNames = new List<string>();
            var originalNames = new List<string>();
            ReplaceNames1Remove(nameList, replaceIds, replaceNames, originalNames);

            if (checkLastLine)
            {
                string s = HtmlUtil.RemoveHtmlTags(lastLine).TrimEnd().TrimEnd('\"').TrimEnd('”').TrimEnd();

                bool startWithUppercase = string.IsNullOrEmpty(s) ||
                                          s.EndsWith('.') ||
                                          s.EndsWith('!') ||
                                          s.EndsWith('?') ||
                                          s.EndsWith(". ♪", StringComparison.Ordinal) ||
                                          s.EndsWith("! ♪", StringComparison.Ordinal) ||
                                          s.EndsWith("? ♪", StringComparison.Ordinal) ||
                                          s.EndsWith(']') ||
                                          s.EndsWith(')') ||
                                          s.EndsWith(':') ||
                                          s.EndsWith('_');

                if (!startWithUppercase && millisecondsFromLast > 5000)
                {
                    startWithUppercase = true;
                }

                // start with uppercase after music symbol - but only if next line does not start with music symbol
                if (!startWithUppercase && (s.EndsWith('♪') || s.EndsWith('♫')))
                {
                    if (!Pre.Contains(new[] { '♪', '♫' }))
                    {
                        startWithUppercase = true;
                    }
                }

                if (startWithUppercase && StrippedText.Length > 0 && !Pre.Contains("..."))
                {
                    // Leave web addresses untouched.
                    if (!StrippedText.StartsWith("www.", StringComparison.OrdinalIgnoreCase) &&
                        !StrippedText.StartsWith("http", StringComparison.OrdinalIgnoreCase))
                    {
                        StrippedText = char.ToUpper(StrippedText[0]) + StrippedText.Substring(1);

                        // The text starts with a name placeholder: capitalize the hidden name instead.
                        if (StrippedText.StartsWith("_@", StringComparison.Ordinal))
                        {
                            CapitalizeOriginalNameAtStart(StrippedText, replaceIds.Count, originalNames);
                        }
                    }
                }
            }

            if (makeUppercaseAfterBreak && StrippedText.Contains(ExpectedCharsArray))
            {
                const string breakAfterChars = @".!?:;)]}([{";
                const string expectedChars = "\"“`´'()<>!?.- \r\n";
                var sb = new StringBuilder(StrippedText.Length);
                bool lastWasBreak = false;
                for (int i = 0; i < StrippedText.Length; i++)
                {
                    var s = StrippedText[i];
                    if (lastWasBreak)
                    {
                        if (expectedChars.Contains(s))
                        {
                            // Quotes, brackets, spaces etc. are copied; still waiting for the first letter.
                            sb.Append(s);
                        }
                        else if ((sb.EndsWith('<') || sb.ToString().EndsWith("</", StringComparison.Ordinal)) && i + 1 < StrippedText.Length && StrippedText[i + 1] == '>')
                        { // single-letter tag name in tags like <i> or </i>
                            sb.Append(s);
                        }
                        else if (sb.EndsWith('<') && s == '/' && i + 2 < StrippedText.Length && StrippedText[i + 2] == '>')
                        { // the slash of a closing tag like </i>
                            sb.Append(s);
                        }
                        else if (sb.ToString().EndsWith("... ", StringComparison.Ordinal))
                        {
                            // No uppercase after "... ".
                            sb.Append(s);
                            lastWasBreak = false;
                        }
                        else
                        {
                            if (breakAfterChars.Contains(s))
                            {
                                sb.Append(s);
                            }
                            else
                            {
                                lastWasBreak = false;
                                sb.Append(char.ToUpper(s));

                                // A name placeholder starts here: capitalize the hidden name instead.
                                var rest = StrippedText.Substring(i);
                                if (rest.StartsWith("_@", StringComparison.Ordinal))
                                {
                                    CapitalizeOriginalNameAtStart(rest, replaceIds.Count, originalNames);
                                }
                            }
                        }
                    }
                    else
                    {
                        sb.Append(s);
                        if (breakAfterChars.Contains(s))
                        {
                            var idx = sb.ToString().IndexOf('[');
                            if (s == ']' && idx > 1)
                            { // I [Motor roaring] love you!
                                string temp = sb.ToString(0, idx - 1).Trim();
                                if (temp.Length > 0 && !char.IsLetterOrDigit(temp[temp.Length - 1]))
                                {
                                    lastWasBreak = true;
                                }
                            }
                            else if (s == ']' && idx == -1 && Pre.Contains('['))
                            { // [ Motor roaring ] Hallo!
                                lastWasBreak = true;
                            }
                            else if (s == ':') // seems to be the rule (in subtitles) to nearly always capitalize the first letter after a colon
                            {
                                lastWasBreak = true;
                            }
                            else
                            {
                                idx = sb.ToString().LastIndexOf(' ');
                                if (idx >= 0 && idx < sb.Length - 2 && !IsInMiddleOfUrl(i - idx, StrippedText.Substring(idx + 1)))
                                {
                                    lastWasBreak = true;
                                }
                                else if (StrippedText.Length > i + 1 && " \r\n".Contains(StrippedText[i + 1]))
                                {
                                    lastWasBreak = true;
                                }
                            }
                        }
                        else if (s == '-' && Pre.Contains("-"))
                        {
                            // A dash opening a new line counts as a break when the line before it ended a sentence.
                            if (sb.ToString().EndsWith(Environment.NewLine + "-", StringComparison.Ordinal))
                            {
                                var prevLine = HtmlUtil.RemoveHtmlTags(sb.ToString().Substring(0, sb.Length - 2).TrimEnd());
                                if (prevLine.EndsWith('.') ||
                                    prevLine.EndsWith('!') ||
                                    prevLine.EndsWith('?') ||
                                    prevLine.EndsWith(". ♪", StringComparison.Ordinal) ||
                                    prevLine.EndsWith("! ♪", StringComparison.Ordinal) ||
                                    prevLine.EndsWith("? ♪", StringComparison.Ordinal) ||
                                    prevLine.EndsWith(']') ||
                                    prevLine.EndsWith(')') ||
                                    prevLine.EndsWith(':'))
                                {
                                    lastWasBreak = true;
                                }
                            }
                        }
                    }
                }

                StrippedText = sb.ToString();
            }

            ReplaceNames2Fix(replaceIds, changeNameCases ? replaceNames : originalNames);
        }

        /// <summary>
        /// Returns false when <paramref name="idx"/> lies before the last index of <paramref name="s"/>
        /// and the character at that index is whitespace or punctuation; otherwise returns whether
        /// <paramref name="s"/> starts with "www." or "http" (ignoring case).
        /// </summary>
        private bool IsInMiddleOfUrl(int idx, string s)
        {
            if (idx < s.Length - 1 && (char.IsWhiteSpace(s[idx]) || char.IsPunctuation(s[idx])))
            {
                return false;
            }

            return s.StartsWith("www.", StringComparison.OrdinalIgnoreCase) || s.StartsWith("http", StringComparison.OrdinalIgnoreCase);
        }

        /// <summary>
        /// Returns <paramref name="text"/> wrapped in the current <see cref="Pre"/> and <see cref="Post"/>.
        /// </summary>
        public string CombineWithPrePost(string text)
        {
            return Pre + text + Post;
        }
    }
}