More italic detection - thx Zoltan :)

This commit is contained in:
Nikolaj Olsson 2024-09-26 06:35:42 +02:00
parent 95e4c2450b
commit e11977d438
2 changed files with 75 additions and 14 deletions

View File

@ -107,6 +107,37 @@ namespace Test.Logic.Ocr
Assert.AreEqual("Leonard:<i>They're here.</i>", result);
}
[TestMethod]
public void TestItalicAndColon2()
{
var matches = new List<VobSubOcr.CompareMatch>
{
new VobSubOcr.CompareMatch("C", false, 0, Guid.NewGuid().ToString()),
new VobSubOcr.CompareMatch("A", false, 0, Guid.NewGuid().ToString()),
new VobSubOcr.CompareMatch("E", false, 0, Guid.NewGuid().ToString()),
new VobSubOcr.CompareMatch("S", false, 0, Guid.NewGuid().ToString()),
new VobSubOcr.CompareMatch("A", false, 0, Guid.NewGuid().ToString()),
new VobSubOcr.CompareMatch("R", false, 0, Guid.NewGuid().ToString()),
new VobSubOcr.CompareMatch(":", false, 0, Guid.NewGuid().ToString()),
new VobSubOcr.CompareMatch(" ", false, 0, Guid.NewGuid().ToString()),
new VobSubOcr.CompareMatch("I", true, 0, Guid.NewGuid().ToString()),
new VobSubOcr.CompareMatch(" ", true, 0, Guid.NewGuid().ToString()),
new VobSubOcr.CompareMatch("l", true, 0, Guid.NewGuid().ToString()),
new VobSubOcr.CompareMatch("i", true, 0, Guid.NewGuid().ToString()),
new VobSubOcr.CompareMatch("v", true, 0, Guid.NewGuid().ToString()),
new VobSubOcr.CompareMatch("e", true, 0, Guid.NewGuid().ToString()),
new VobSubOcr.CompareMatch(" ", true, 0, Guid.NewGuid().ToString()),
new VobSubOcr.CompareMatch("h", true, 0, Guid.NewGuid().ToString()),
new VobSubOcr.CompareMatch("e", true, 0, Guid.NewGuid().ToString()),
new VobSubOcr.CompareMatch("r", true, 0, Guid.NewGuid().ToString()),
new VobSubOcr.CompareMatch("e", true, 0, Guid.NewGuid().ToString()),
new VobSubOcr.CompareMatch(".", true, 0, Guid.NewGuid().ToString()),
};
var result = MatchesToItalicStringConverter.GetStringWithItalicTags(matches);
Assert.AreEqual("CAESAR: <i>I live here.</i>", result);
}
[TestMethod]
public void TestItalicAndBrackets()
{

View File

@ -6,9 +6,15 @@ using System.Text;
namespace Nikse.SubtitleEdit.Logic.Ocr
{
public class SplitItem
{
public List<VobSubOcr.CompareMatch> Matches { get; set; }
public string Separator { get; set; }
}
public static class MatchesToItalicStringConverter
{
private static readonly string[] Separators = { "-", "—", ".", "'", "\"", " ", "!", "\r", "\n", "\r\n" };
private static readonly string[] Separators = { "-", "—", ".", "'", "\"", " ", "\r", "\n", "\r\n" };
public static string GetStringWithItalicTags(List<VobSubOcr.CompareMatch> matches)
{
@ -18,24 +24,34 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
}
var sb = new StringBuilder();
foreach (var lineMatches in SplitMatchesToLines(matches))
foreach (var lineMatches in SplitMatchesToLineParts(matches))
{
var numberOfLetters = GetNumberOfLetters(lineMatches);
var numberOfItalicLetters = GetNumberOfItalicLetters(lineMatches);
var numberOfLetters = GetNumberOfLetters(lineMatches.Matches);
var numberOfItalicLetters = GetNumberOfItalicLetters(lineMatches.Matches);
if (numberOfItalicLetters == numberOfLetters || numberOfItalicLetters > 3 && numberOfLetters - numberOfItalicLetters < 2 && ItalicIsInsideWord(matches))
{
sb.AppendLine("<i>" + GetRawString(lineMatches) + "</i>");
sb.AppendLine("<i>" + GetRawString(lineMatches.Matches) + "</i>");
}
else if (numberOfItalicLetters == 0 || numberOfLetters > 2 && numberOfItalicLetters < 2)
{
sb.AppendLine(GetRawString(lineMatches));
sb.Append(GetRawString(lineMatches.Matches));
sb.Append(lineMatches.Separator);
}
else
{
sb.AppendLine(GetStringWithItalicTagsMixed(lineMatches));
sb.Append(GetStringWithItalicTagsMixed(lineMatches.Matches));
sb.Append(lineMatches.Separator);
}
}
return sb.ToString().TrimEnd().Replace("</i>" + Environment.NewLine + "<i>", Environment.NewLine);
var text = sb.ToString().TrimEnd().Replace("</i>" + Environment.NewLine + "<i>", Environment.NewLine);
text = text.Replace(" ", " ");
text = text.Replace("<i> ", " <i>");
text = text.Replace(" </i>", "</i> ");
text = text.Replace(" ", " ");
return text.Trim();
}
private static bool ItalicIsInsideWord(List<VobSubOcr.CompareMatch> matches)
@ -72,7 +88,7 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
for (var i = 0; i < lineMatches.Count; i++)
{
var m = lineMatches[i];
if (m.Text == " " || m.Text == "-" || m.Text == "'" || m.Text == ":" || m.Text == "[" || m.Text == "]") // chars that allow change of italic
if (m.Text == " " || m.Text == "-" || m.Text == "'") // chars that allow change of italic
{
if (sbWord.Length > 0)
{
@ -133,6 +149,7 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
.Replace("</i>.<i>", ".")
.Replace("<i>...</i>", "...")
.Replace("</i>...<i>", "...");
return text;
}
@ -173,17 +190,28 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
return italicOn;
}
private static List<List<VobSubOcr.CompareMatch>> SplitMatchesToLines(List<VobSubOcr.CompareMatch> matches)
private static List<SplitItem> SplitMatchesToLineParts(List<VobSubOcr.CompareMatch> matches)
{
var result = new List<List<VobSubOcr.CompareMatch>>();
var result = new List<SplitItem>();
var line = new List<VobSubOcr.CompareMatch>();
foreach (var t in matches)
{
if (t.Text == Environment.NewLine)
{
if (line.Count > 0)
{
result.Add(line);
result.Add(new SplitItem { Matches = line, Separator = Environment.NewLine });
line = new List<VobSubOcr.CompareMatch>();
}
}
else if (t.Text == ":" || t.Text == ")" || t.Text == "]")
{
if (line.Count > 0)
{
line.Add(t);
result.Add(new SplitItem { Matches = line, Separator = string.Empty });
line = new List<VobSubOcr.CompareMatch>();
}
}
@ -192,10 +220,12 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
line.Add(t);
}
}
if (line.Count > 0)
{
result.Add(line);
result.Add(new SplitItem { Matches = line, Separator = string.Empty });
}
return result;
}
@ -211,7 +241,7 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
}
}
return sb.ToString().Trim();
return sb.ToString().Replace(" ", " ").Replace(" ", " ");
}
private static int GetNumberOfLetters(List<VobSubOcr.CompareMatch> matches)