More italic detection - thx Zoltan :)

This commit is contained in:
Nikolaj Olsson 2024-09-26 06:35:42 +02:00
parent 95e4c2450b
commit e11977d438
2 changed files with 75 additions and 14 deletions

View File

@ -107,6 +107,37 @@ namespace Test.Logic.Ocr
Assert.AreEqual("Leonard:<i>They're here.</i>", result); Assert.AreEqual("Leonard:<i>They're here.</i>", result);
} }
/// <summary>
/// A non-italic speaker label ending in ": " followed by a fully italic sentence
/// must yield italic tags around the sentence only.
/// </summary>
[TestMethod]
public void TestItalicAndColon2()
{
    // One CompareMatch per character: the label part is plain, the sentence part is italic.
    var matches = new List<VobSubOcr.CompareMatch>();

    foreach (var plainChar in "CAESAR: ")
    {
        matches.Add(new VobSubOcr.CompareMatch(plainChar.ToString(), false, 0, Guid.NewGuid().ToString()));
    }

    foreach (var italicChar in "I live here.")
    {
        matches.Add(new VobSubOcr.CompareMatch(italicChar.ToString(), true, 0, Guid.NewGuid().ToString()));
    }

    var result = MatchesToItalicStringConverter.GetStringWithItalicTags(matches);

    Assert.AreEqual("CAESAR: <i>I live here.</i>", result);
}
[TestMethod] [TestMethod]
public void TestItalicAndBrackets() public void TestItalicAndBrackets()
{ {

View File

@ -6,9 +6,15 @@ using System.Text;
namespace Nikse.SubtitleEdit.Logic.Ocr namespace Nikse.SubtitleEdit.Logic.Ocr
{ {
/// <summary>
/// One part of a subtitle's OCR matches after splitting, paired with the
/// separator string recorded for that part.
/// </summary>
public class SplitItem
{
    /// <summary>
    /// String recorded alongside the part when it was split off
    /// (the splitter in this file stores either a line break or an empty string).
    /// </summary>
    public string Separator { get; set; }

    /// <summary>
    /// The OCR matches that make up this part, in their original order.
    /// </summary>
    public List<VobSubOcr.CompareMatch> Matches { get; set; }
}
public static class MatchesToItalicStringConverter public static class MatchesToItalicStringConverter
{ {
private static readonly string[] Separators = { "-", "—", ".", "'", "\"", " ", "!", "\r", "\n", "\r\n" }; private static readonly string[] Separators = { "-", "—", ".", "'", "\"", " ", "\r", "\n", "\r\n" };
public static string GetStringWithItalicTags(List<VobSubOcr.CompareMatch> matches) public static string GetStringWithItalicTags(List<VobSubOcr.CompareMatch> matches)
{ {
@ -18,24 +24,34 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
} }
var sb = new StringBuilder(); var sb = new StringBuilder();
foreach (var lineMatches in SplitMatchesToLines(matches)) foreach (var lineMatches in SplitMatchesToLineParts(matches))
{ {
var numberOfLetters = GetNumberOfLetters(lineMatches); var numberOfLetters = GetNumberOfLetters(lineMatches.Matches);
var numberOfItalicLetters = GetNumberOfItalicLetters(lineMatches); var numberOfItalicLetters = GetNumberOfItalicLetters(lineMatches.Matches);
if (numberOfItalicLetters == numberOfLetters || numberOfItalicLetters > 3 && numberOfLetters - numberOfItalicLetters < 2 && ItalicIsInsideWord(matches)) if (numberOfItalicLetters == numberOfLetters || numberOfItalicLetters > 3 && numberOfLetters - numberOfItalicLetters < 2 && ItalicIsInsideWord(matches))
{ {
sb.AppendLine("<i>" + GetRawString(lineMatches) + "</i>"); sb.AppendLine("<i>" + GetRawString(lineMatches.Matches) + "</i>");
} }
else if (numberOfItalicLetters == 0 || numberOfLetters > 2 && numberOfItalicLetters < 2) else if (numberOfItalicLetters == 0 || numberOfLetters > 2 && numberOfItalicLetters < 2)
{ {
sb.AppendLine(GetRawString(lineMatches)); sb.Append(GetRawString(lineMatches.Matches));
sb.Append(lineMatches.Separator);
} }
else else
{ {
sb.AppendLine(GetStringWithItalicTagsMixed(lineMatches)); sb.Append(GetStringWithItalicTagsMixed(lineMatches.Matches));
sb.Append(lineMatches.Separator);
} }
} }
return sb.ToString().TrimEnd().Replace("</i>" + Environment.NewLine + "<i>", Environment.NewLine);
var text = sb.ToString().TrimEnd().Replace("</i>" + Environment.NewLine + "<i>", Environment.NewLine);
text = text.Replace(" ", " ");
text = text.Replace("<i> ", " <i>");
text = text.Replace(" </i>", "</i> ");
text = text.Replace(" ", " ");
return text.Trim();
} }
private static bool ItalicIsInsideWord(List<VobSubOcr.CompareMatch> matches) private static bool ItalicIsInsideWord(List<VobSubOcr.CompareMatch> matches)
@ -72,7 +88,7 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
for (var i = 0; i < lineMatches.Count; i++) for (var i = 0; i < lineMatches.Count; i++)
{ {
var m = lineMatches[i]; var m = lineMatches[i];
if (m.Text == " " || m.Text == "-" || m.Text == "'" || m.Text == ":" || m.Text == "[" || m.Text == "]") // chars that allow change of italic if (m.Text == " " || m.Text == "-" || m.Text == "'") // chars that allow change of italic
{ {
if (sbWord.Length > 0) if (sbWord.Length > 0)
{ {
@ -133,6 +149,7 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
.Replace("</i>.<i>", ".") .Replace("</i>.<i>", ".")
.Replace("<i>...</i>", "...") .Replace("<i>...</i>", "...")
.Replace("</i>...<i>", "..."); .Replace("</i>...<i>", "...");
return text; return text;
} }
@ -173,17 +190,28 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
return italicOn; return italicOn;
} }
private static List<List<VobSubOcr.CompareMatch>> SplitMatchesToLines(List<VobSubOcr.CompareMatch> matches) private static List<SplitItem> SplitMatchesToLineParts(List<VobSubOcr.CompareMatch> matches)
{ {
var result = new List<List<VobSubOcr.CompareMatch>>(); var result = new List<SplitItem>();
var line = new List<VobSubOcr.CompareMatch>(); var line = new List<VobSubOcr.CompareMatch>();
foreach (var t in matches) foreach (var t in matches)
{ {
if (t.Text == Environment.NewLine) if (t.Text == Environment.NewLine)
{ {
if (line.Count > 0) if (line.Count > 0)
{ {
result.Add(line); result.Add(new SplitItem { Matches = line, Separator = Environment.NewLine });
line = new List<VobSubOcr.CompareMatch>();
}
}
else if (t.Text == ":" || t.Text == ")" || t.Text == "]")
{
if (line.Count > 0)
{
line.Add(t);
result.Add(new SplitItem { Matches = line, Separator = string.Empty });
line = new List<VobSubOcr.CompareMatch>(); line = new List<VobSubOcr.CompareMatch>();
} }
} }
@ -192,10 +220,12 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
line.Add(t); line.Add(t);
} }
} }
if (line.Count > 0) if (line.Count > 0)
{ {
result.Add(line); result.Add(new SplitItem { Matches = line, Separator = string.Empty });
} }
return result; return result;
} }
@ -211,7 +241,7 @@ namespace Nikse.SubtitleEdit.Logic.Ocr
} }
} }
return sb.ToString().Trim(); return sb.ToString().Replace(" ", " ").Replace(" ", " ");
} }
private static int GetNumberOfLetters(List<VobSubOcr.CompareMatch> matches) private static int GetNumberOfLetters(List<VobSubOcr.CompareMatch> matches)