using System;
using System.Collections.Generic;
using System.Text;
namespace Nikse.SubtitleEdit.Core
{
/// <summary>
/// Splits a text into a leading part (<see cref="Pre"/>), a trailing part (<see cref="Post"/>)
/// and the remaining core text (<see cref="StrippedText"/>). The leading/trailing parts consist of
/// the configured strip characters, ASS/SSA override blocks and HTML-like tags, so that casing
/// fixes can be applied to the core text only and the pieces can be concatenated again afterwards.
/// </summary>
public class StrippableText
{
    /// <summary>Everything stripped from the start of the text (strip characters, override blocks, tags).</summary>
    public string Pre { get; set; }

    /// <summary>Everything stripped from the end of the text (strip characters, closing tags).</summary>
    public string Post { get; set; }

    /// <summary>The text that remains once <see cref="Pre"/> and <see cref="Post"/> have been removed.</summary>
    public string StrippedText { get; set; }

    /// <summary>The text as passed to the constructor (a null argument is stored as an empty string).</summary>
    public string OriginalText { get; }

    /// <summary>Concatenation of <see cref="Pre"/>, <see cref="StrippedText"/> and <see cref="Post"/>.</summary>
    public string MergedString => Pre + StrippedText + Post;

    /// <summary>
    /// Strips <paramref name="text"/> using the built-in sets of start and end characters.
    /// </summary>
    /// <param name="text">Text to split.</param>
    public StrippableText(string text)
        : this(text, " >-\"„”“['‘`´¶(♪¿¡.…—", " -\"”“]'`´¶)♪.!?:…—؛،؟")
    {
    }

    /// <summary>
    /// Strips <paramref name="text"/> using caller supplied character sets.
    /// </summary>
    /// <param name="text">Text to split; null is treated as empty.</param>
    /// <param name="stripStartCharacters">Characters moved to <see cref="Pre"/> while they lead the text; null is treated as empty.</param>
    /// <param name="stripEndCharacters">Characters moved to <see cref="Post"/> while they trail the text; null is treated as empty.</param>
    public StrippableText(string text, string stripStartCharacters, string stripEndCharacters)
    {
        // Missing arguments are handled as empty input instead of failing with a NullReferenceException.
        text = text ?? string.Empty;
        stripStartCharacters = stripStartCharacters ?? string.Empty;
        stripEndCharacters = stripEndCharacters ?? string.Empty;

        OriginalText = text;

        Pre = string.Empty;
        if (text.Length > 0 && ("<{" + stripStartCharacters).Contains(text[0]))
        {
            int beginLength;
            do
            {
                // Repeat as long as one pass managed to remove something.
                beginLength = text.Length;

                while (text.Length > 0 && stripStartCharacters.Contains(text[0]))
                {
                    Pre += text[0];
                    text = text.Remove(0, 1);
                }

                // ASS/SSA codes like {\an9}
                int endIndex = text.IndexOf('}');
                if (endIndex > 0 && text.StartsWith("{\\", StringComparison.Ordinal))
                {
                    // Only strip when the block closes before another block opens.
                    int nextStartIndex = text.IndexOf('{', 2);
                    if (nextStartIndex == -1 || nextStartIndex > endIndex)
                    {
                        endIndex++;
                        Pre += text.Substring(0, endIndex);
                        text = text.Remove(0, endIndex);
                    }
                }

                // A leading tag: '<', at least one character, then '>'.
                endIndex = text.IndexOf('>');
                if (text.StartsWith('<') && endIndex >= 2)
                {
                    endIndex++;
                    Pre += text.Substring(0, endIndex);
                    text = text.Remove(0, endIndex);
                }
            }
            while (text.Length < beginLength);
        }

        Post = string.Empty;
        if (text.Length > 0 && (">" + stripEndCharacters).Contains(text[text.Length - 1]))
        {
            int beginLength;
            do
            {
                beginLength = text.Length;

                while (text.Length > 0 && stripEndCharacters.Contains(text[text.Length - 1]))
                {
                    Post = text[text.Length - 1] + Post;
                    text = text.Substring(0, text.Length - 1);
                }

                if (text.EndsWith('>'))
                {
                    // Four-character closing tags: </i>, </b>, </u>
                    if (text.EndsWith("</i>", StringComparison.OrdinalIgnoreCase) ||
                        text.EndsWith("</b>", StringComparison.OrdinalIgnoreCase) ||
                        text.EndsWith("</u>", StringComparison.OrdinalIgnoreCase))
                    {
                        Post = text.Substring(text.Length - 4) + Post;
                        text = text.Substring(0, text.Length - 4);
                    }

                    // Seven-character closing tag: </font>
                    if (text.EndsWith("</font>", StringComparison.OrdinalIgnoreCase))
                    {
                        Post = text.Substring(text.Length - 7) + Post;
                        text = text.Substring(0, text.Length - 7);
                    }
                }
            }
            while (text.Length < beginLength);
        }

        StrippedText = text;
    }

    /// <summary>
    /// Builds the placeholder "_@{idName}_", records it together with <paramref name="name"/>
    /// in the two parallel lists and returns the placeholder.
    /// </summary>
    private static string GetAndInsertNextId(List<string> replaceIds, List<string> replaceNames, string name, int idName)
    {
        string id = $"_@{idName}_";
        replaceIds.Add(id);
        replaceNames.Add(name);
        return id;
    }

    /// <summary>
    /// Replaces every whole-word, case-insensitive occurrence of a name from <paramref name="nameList"/>
    /// in <see cref="StrippedText"/> with a placeholder id. The three output lists are filled in parallel:
    /// placeholder, name as spelled in the list, and name as spelled in the text.
    /// </summary>
    private void ReplaceNames1Remove(List<string> nameList, List<string> replaceIds, List<string> replaceNames, List<string> originalNames)
    {
        // Temporarily pull one leading dot of Post into the text so a dot directly after a name is visible to the matcher.
        if (Post.StartsWith('.'))
        {
            StrippedText += ".";
            Post = Post.Remove(0, 1);
        }

        // Invariant lower-casing keeps this in line with name.ToLowerInvariant() below, whatever the current culture is.
        string lower = StrippedText.ToLowerInvariant();
        int idName = 0;
        foreach (string name in nameList)
        {
            if (string.IsNullOrEmpty(name))
            {
                continue; // nothing to search for
            }

            int start = lower.IndexOf(name.ToLowerInvariant(), StringComparison.Ordinal);
            while (start >= 0 && start < lower.Length)
            {
                // The match must begin at the start of the text or directly after a separator.
                bool startOk = (start == 0) || (lower[start - 1] == ' ') || (lower[start - 1] == '-') ||
                               (lower[start - 1] == '"') || (lower[start - 1] == '\'') || (lower[start - 1] == '>') || (lower[start - 1] == '[') || (lower[start - 1] == '“') ||
                               Environment.NewLine.EndsWith(lower[start - 1]);

                // The name "Don" must not swallow the beginning of "don't".
                if (startOk && string.CompareOrdinal(name, "Don") == 0 && lower.Substring(start).StartsWith("don't", StringComparison.Ordinal))
                {
                    startOk = false;
                }

                if (startOk)
                {
                    int end = start + name.Length;
                    bool endOk = end <= lower.Length;
                    if (endOk)
                    {
                        // The match must end at the end of the text or directly before a separator.
                        endOk = end == lower.Length || (@" ,.!?:;')]- <”""" + Environment.NewLine).Contains(lower[end]);
                    }

                    if (endOk && StrippedText.Length >= start + name.Length)
                    {
                        string originalName = StrippedText.Substring(start, name.Length);
                        originalNames.Add(originalName);
                        StrippedText = StrippedText.Remove(start, name.Length);
                        StrippedText = StrippedText.Insert(start, GetAndInsertNextId(replaceIds, replaceNames, name, idName++));
                        lower = StrippedText.ToLowerInvariant();
                    }
                }

                if (start + 3 > lower.Length)
                {
                    start = lower.Length + 1; // past the end: terminates the loop
                }
                else
                {
                    start = lower.IndexOf(name, start + 3, StringComparison.OrdinalIgnoreCase);
                }
            }
        }

        // Hand trailing dots back to Post. Every dot that is trimmed is preserved, so no text is lost.
        if (StrippedText.EndsWith('.'))
        {
            string withoutDots = StrippedText.TrimEnd('.');
            Post = StrippedText.Substring(withoutDots.Length) + Post;
            StrippedText = withoutDots;
        }
    }

    /// <summary>
    /// Replaces each placeholder of <paramref name="replaceIds"/> in <see cref="StrippedText"/>
    /// with the entry at the same index of <paramref name="replaceNames"/>.
    /// </summary>
    private void ReplaceNames2Fix(List<string> replaceIds, List<string> replaceNames)
    {
        for (int i = 0; i < replaceIds.Count; i++)
        {
            StrippedText = StrippedText.Replace(replaceIds[i], replaceNames[i]);
        }
    }

    /// <summary>
    /// If <paramref name="text"/> starts with one of the placeholders "_@0_" .. "_@{idCount-1}_",
    /// upper-cases the first letter of the matching entry in <paramref name="originalNames"/>.
    /// </summary>
    private static void CapitalizeOriginalNameBehindId(string text, int idCount, List<string> originalNames)
    {
        if (!text.StartsWith("_@", StringComparison.Ordinal))
        {
            return;
        }

        for (int i = 0; i < idCount; i++)
        {
            string id = $"_@{i}_";
            if (text.StartsWith(id, StringComparison.Ordinal))
            {
                if (!string.IsNullOrEmpty(originalNames[i]))
                {
                    originalNames[i] = char.ToUpper(originalNames[i][0]) + originalNames[i].Remove(0, 1);
                }

                break;
            }
        }
    }

    /// <summary>
    /// Returns true when <paramref name="line"/> ends with '.', '!', '?', one of those followed by
    /// " ♪", or with ']', ')' or ':'.
    /// </summary>
    private static bool EndsWithSentenceBreak(string line)
    {
        return line.EndsWith('.') ||
               line.EndsWith('!') ||
               line.EndsWith('?') ||
               line.EndsWith(". ♪", StringComparison.Ordinal) ||
               line.EndsWith("! ♪", StringComparison.Ordinal) ||
               line.EndsWith("? ♪", StringComparison.Ordinal) ||
               line.EndsWith(']') ||
               line.EndsWith(')') ||
               line.EndsWith(':');
    }

    private static readonly char[] ExpectedCharsArray = { '.', '!', '?', ':', ';', ')', ']', '}', '(', '[', '{' };

    /// <summary>
    /// Fixes the casing of <see cref="StrippedText"/>: optionally capitalises the first letter depending on
    /// how the previous line ended, optionally capitalises the first letter after a break inside the text,
    /// and writes names found in the text back using either the list casing or the casing found in the text.
    /// </summary>
    /// <param name="nameList">Names to look for (whole words, case-insensitive).</param>
    /// <param name="changeNameCases">True to write names back as spelled in <paramref name="nameList"/>; false to keep the spelling found in the text (its first letter may still be capitalised).</param>
    /// <param name="makeUppercaseAfterBreak">True to capitalise the first letter following a break inside the text.</param>
    /// <param name="checkLastLine">True to decide from <paramref name="lastLine"/> whether the text starts a new sentence.</param>
    /// <param name="lastLine">The line preceding this text; only read when <paramref name="checkLastLine"/> is true.</param>
    /// <param name="millisecondsFromLast">When greater than 5000 the text is treated as starting a new sentence (presumably the gap to the previous line; confirm against callers).</param>
    public void FixCasing(List<string> nameList, bool changeNameCases, bool makeUppercaseAfterBreak, bool checkLastLine, string lastLine, double millisecondsFromLast = 0)
    {
        var replaceIds = new List<string>();
        var replaceNames = new List<string>();
        var originalNames = new List<string>();
        ReplaceNames1Remove(nameList, replaceIds, replaceNames, originalNames);

        if (checkLastLine)
        {
            string s = HtmlUtil.RemoveHtmlTags(lastLine).TrimEnd().TrimEnd('\"').TrimEnd('”').TrimEnd();

            bool startWithUppercase = string.IsNullOrEmpty(s) ||
                                      EndsWithSentenceBreak(s) ||
                                      s.EndsWith('_');

            if (!startWithUppercase && millisecondsFromLast > 5000)
            {
                startWithUppercase = true;
            }

            // start with uppercase after music symbol - but only if next line does not start with music symbol
            if (!startWithUppercase && (s.EndsWith('♪') || s.EndsWith('♫')))
            {
                if (!Pre.Contains(new[] { '♪', '♫' }))
                {
                    startWithUppercase = true;
                }
            }

            if (startWithUppercase && StrippedText.Length > 0 && !Pre.Contains("..."))
            {
                // Leave web addresses untouched.
                if (!StrippedText.StartsWith("www.", StringComparison.OrdinalIgnoreCase) &&
                    !StrippedText.StartsWith("http", StringComparison.OrdinalIgnoreCase))
                {
                    StrippedText = char.ToUpper(StrippedText[0]) + StrippedText.Substring(1);

                    // When the text begins with a name placeholder, capitalise the name it stands for.
                    CapitalizeOriginalNameBehindId(StrippedText, replaceIds.Count, originalNames);
                }
            }
        }

        if (makeUppercaseAfterBreak && StrippedText.Contains(ExpectedCharsArray))
        {
            const string breakAfterChars = @".!?:;)]}([{";
            const string expectedChars = "\"“`´'()<>!?.- \r\n";
            var sb = new StringBuilder(StrippedText.Length);
            bool lastWasBreak = false;
            for (int i = 0; i < StrippedText.Length; i++)
            {
                var ch = StrippedText[i];
                if (lastWasBreak)
                {
                    if (expectedChars.Contains(ch))
                    {
                        // Quotes, spaces, line breaks etc. keep the "break" state alive.
                        sb.Append(ch);
                    }
                    else if ((sb.EndsWith('<') || sb.ToString().EndsWith("</", StringComparison.Ordinal)) && i + 1 < StrippedText.Length && StrippedText[i + 1] == '>')
                    {
                        // Single-character tag name inside "<x>" or "</x>" - not a letter to capitalise.
                        sb.Append(ch);
                    }
                    else if (sb.EndsWith('<') && ch == '/' && i + 2 < StrippedText.Length && StrippedText[i + 2] == '>')
                    {
                        // The slash of a closing single-character tag "</x>".
                        sb.Append(ch);
                    }
                    else if (sb.ToString().EndsWith("... ", StringComparison.Ordinal))
                    {
                        // Text continuing after an ellipsis is not capitalised.
                        sb.Append(ch);
                        lastWasBreak = false;
                    }
                    else if (breakAfterChars.Contains(ch))
                    {
                        sb.Append(ch);
                    }
                    else
                    {
                        lastWasBreak = false;
                        sb.Append(char.ToUpper(ch));

                        // When a name placeholder follows the break, capitalise the name it stands for.
                        CapitalizeOriginalNameBehindId(StrippedText.Substring(i), replaceIds.Count, originalNames);
                    }
                }
                else
                {
                    sb.Append(ch);
                    if (breakAfterChars.Contains(ch))
                    {
                        var idx = sb.ToString().IndexOf('[');
                        if (ch == ']' && idx > 1)
                        { // I [Motor roaring] love you!
                            string temp = sb.ToString(0, idx - 1).Trim();
                            if (temp.Length > 0 && !char.IsLetterOrDigit(temp[temp.Length - 1]))
                            {
                                lastWasBreak = true;
                            }
                        }
                        else if (ch == ']' && idx == -1 && Pre.Contains('['))
                        { // [ Motor roaring ] Hallo!
                            lastWasBreak = true;
                        }
                        else if (ch == ':') // seems to be the rule (in subtitles) to nearly always capitalize the first letter after a colon
                        {
                            lastWasBreak = true;
                        }
                        else
                        {
                            idx = sb.ToString().LastIndexOf(' ');
                            if (idx >= 0 && idx < sb.Length - 2 && !IsInMiddleOfUrl(i - idx, StrippedText.Substring(idx + 1)))
                            {
                                lastWasBreak = true;
                            }
                            else if (StrippedText.Length > i + 1 && " \r\n".Contains(StrippedText[i + 1]))
                            {
                                lastWasBreak = true;
                            }
                        }
                    }
                    else if (ch == '-' && Pre.Contains("-"))
                    {
                        // A dash opening a new line: capitalise after it when the previous line ended a sentence.
                        // Ordinal comparison so the newline sequence is matched character by character.
                        if (sb.ToString().EndsWith(Environment.NewLine + "-", StringComparison.Ordinal))
                        {
                            var prevLine = HtmlUtil.RemoveHtmlTags(sb.ToString().Substring(0, sb.Length - 2).TrimEnd());
                            if (EndsWithSentenceBreak(prevLine))
                            {
                                lastWasBreak = true;
                            }
                        }
                    }
                }
            }

            StrippedText = sb.ToString();
        }

        ReplaceNames2Fix(replaceIds, changeNameCases ? replaceNames : originalNames);
    }

    /// <summary>
    /// Returns false when the character at <paramref name="idx"/> of <paramref name="s"/> is white space or
    /// punctuation (and <paramref name="idx"/> is before the last character); otherwise returns whether
    /// <paramref name="s"/> starts with "www." or "http" (case-insensitive).
    /// </summary>
    private static bool IsInMiddleOfUrl(int idx, string s)
    {
        if (idx < s.Length - 1 && (char.IsWhiteSpace(s[idx]) || char.IsPunctuation(s[idx])))
        {
            return false;
        }

        return s.StartsWith("www.", StringComparison.OrdinalIgnoreCase) || s.StartsWith("http", StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Wraps <paramref name="text"/> in the current <see cref="Pre"/> and <see cref="Post"/>.
    /// </summary>
    /// <param name="text">Text to place between <see cref="Pre"/> and <see cref="Post"/>.</param>
    /// <returns><see cref="Pre"/> + <paramref name="text"/> + <see cref="Post"/>.</returns>
    public string CombineWithPrePost(string text)
    {
        return Pre + text + Post;
    }
}
}