mirror of
https://github.com/SubtitleEdit/subtitleedit.git
synced 2024-11-22 03:02:35 +01:00
Fix for Whisper Post-processing - thx Purfview/cvrle77 :)
Somewhat related to #8044
This commit is contained in:
parent
419a884770
commit
001cad505d
@ -115,5 +115,107 @@ Lexington, Massachusetts.";
|
|||||||
Assert.IsTrue(fixedSubtitle.Paragraphs[4].Text.EndsWith("camp.", StringComparison.Ordinal));
|
Assert.IsTrue(fixedSubtitle.Paragraphs[4].Text.EndsWith("camp.", StringComparison.Ordinal));
|
||||||
Assert.IsTrue(fixedSubtitle.Paragraphs[5].Text.StartsWith("Hey there", StringComparison.Ordinal));
|
Assert.IsTrue(fixedSubtitle.Paragraphs[5].Text.StartsWith("Hey there", StringComparison.Ordinal));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Feeds a 16-paragraph SubRip sample (numbered 12-27) through
/// <c>AudioToTextPostProcessor.TryForWholeSentences</c> and checks that the
/// dangling sentence endings ("here?" and "get.") end up in the paragraph
/// before them, leaving 14 paragraphs, while the already complete sentences
/// "Hmm." and "What's the about that, Bob's?" keep their text.
/// </summary>
[TestMethod]
public void TryForWholeSentences1()
{
    // Verbatim string: the SubRip lines must stay at column 0.
    var raw = @"12
00:00:25,500 --> 00:00:27,060
Oh, my... Bob, right?

13
00:00:28,560 --> 00:00:29,220
Could be fun.

14
00:00:29,660 --> 00:00:32,580
Well, we could get to know each other a
little, maybe loosen things up around

15
00:00:32,580 --> 00:00:33,060
here?

16
00:00:33,680 --> 00:00:39,160
Well, I've worked with this lot before,
and, erm... Yeah, this is as loose as they

17
00:00:39,160 --> 00:00:40,300
get.

18
00:00:46,120 --> 00:00:46,340
Hmm.

19
00:00:48,160 --> 00:00:49,120
What's the about that, Bob's?

20
00:00:49,120 --> 00:00:49,860
Oh, no.

21
00:00:50,580 --> 00:00:50,700
Yep.

22
00:00:51,240 --> 00:00:52,600
I felt that soon as I said it.

23
00:00:54,860 --> 00:00:56,340
Right, I'm headed out.

24
00:00:57,460 --> 00:00:58,780
Everyone have a great day, yeah?

25
00:00:58,780 --> 00:00:59,600
Yeah.

26
00:01:00,600 --> 00:01:01,880
Wait.

27
00:01:01,880 --> 00:01:02,480
Wait.";

    var subtitle = new Subtitle();
    new SubRip().LoadSubtitle(subtitle, raw.SplitToLines(), null);

    var fixedSubtitle = AudioToTextPostProcessor.TryForWholeSentences(subtitle, "en", 42);

    // Two of the 16 input paragraphs are expected to be absorbed by their predecessor.
    Assert.AreEqual(14, fixedSubtitle.Paragraphs.Count);

    // AreEqual (instead of IsTrue(a == b)) so a failure prints the expected and actual text.
    Assert.AreEqual("Well, we could get to know each other a" + Environment.NewLine + "little, maybe loosen things up around here?", fixedSubtitle.Paragraphs[2].Text);
    Assert.AreEqual("Well, I've worked with this lot before, and," + Environment.NewLine + "erm... Yeah, this is as loose as they get.", fixedSubtitle.Paragraphs[3].Text);
    Assert.AreEqual("Hmm.", fixedSubtitle.Paragraphs[4].Text);
    Assert.AreEqual("What's the about that, Bob's?", fixedSubtitle.Paragraphs[5].Text);
}
|
||||||
|
|
||||||
|
/// <summary>
/// Checks that <c>AudioToTextPostProcessor.TryForWholeSentences</c> moves a
/// sentence start left hanging at the end of a paragraph ("But also") to the
/// beginning of the following paragraph, keeping both paragraphs and
/// producing the two-line texts asserted below.
/// </summary>
[TestMethod]
public void TryForWholeSentences2()
{
    // Verbatim string: the SubRip lines must stay at column 0.
    var raw = @"1
00:00:26,500 --> 00:00:27,060
Yes, I think this could indeed be very good. But also

2
00:00:28,560 --> 00:00:29,220
that could be fun indeed my friend.";

    var subtitle = new Subtitle();
    new SubRip().LoadSubtitle(subtitle, raw.SplitToLines(), null);

    var fixedSubtitle = AudioToTextPostProcessor.TryForWholeSentences(subtitle, "en", 42);

    // Text is moved between the paragraphs; neither of them is removed.
    Assert.AreEqual(2, fixedSubtitle.Paragraphs.Count);

    // AreEqual (instead of IsTrue(a == b)) so a failure prints the expected and actual text.
    Assert.AreEqual("Yes, I think this could" + Environment.NewLine + "indeed be very good.", fixedSubtitle.Paragraphs[0].Text);
    Assert.AreEqual("But also that could be" + Environment.NewLine + "fun indeed my friend.", fixedSubtitle.Paragraphs[1].Text);
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -172,15 +172,22 @@ namespace Test.Logic.AutoTranslate
|
|||||||
Assert.IsTrue(splitResult[0].Length > 5);
|
Assert.IsTrue(splitResult[0].Length > 5);
|
||||||
Assert.IsTrue(splitResult[1].Length > 5);
|
Assert.IsTrue(splitResult[1].Length > 5);
|
||||||
Assert.IsTrue(splitResult[2].Length > 5);
|
Assert.IsTrue(splitResult[2].Length > 5);
|
||||||
Assert.AreEqual(string.Join(" ", subtitle.Paragraphs.Select(p => p.Text)), string.Join(" ", splitResult));
|
|
||||||
|
var subtitleText = string.Join("", subtitle.Paragraphs.Select(p => p.Text)).RemoveChar('\n', '\r', ' ');
|
||||||
|
var splitText = string.Join("", splitResult).RemoveChar('\n', '\r', ' ');
|
||||||
|
Assert.AreEqual(subtitleText, splitText);
|
||||||
|
|
||||||
|
Assert.AreEqual("Hallo there. In the" + Environment.NewLine + "garden today are we?", splitResult[0]);
|
||||||
|
Assert.AreEqual("So I will very", splitResult[1]);
|
||||||
|
Assert.AreEqual("soon be going home to Sweden.", splitResult[2]);
|
||||||
Assert.AreEqual("My name is Peter!", splitResult[3]);
|
Assert.AreEqual("My name is Peter!", splitResult[3]);
|
||||||
Assert.AreEqual("My name is Peter! And Jones.", splitResult[4]);
|
Assert.AreEqual("My name is Peter! And Jones.", splitResult[4]);
|
||||||
Assert.AreEqual("My name is Peter. And Jones.", splitResult[5]);
|
Assert.AreEqual("My name is Peter. And Jones.", splitResult[5]);
|
||||||
Assert.AreEqual("", splitResult[6]);
|
Assert.AreEqual("", splitResult[6]);
|
||||||
Assert.AreEqual("Hallo there.", splitResult[7]);
|
Assert.AreEqual("Hallo there.", splitResult[7]);
|
||||||
|
|
||||||
var inputText = string.Join(" ", subtitle.Paragraphs.Select(p => p.Text)).Replace(Environment.NewLine, " ");
|
var inputText = string.Join("", subtitle.Paragraphs.Select(p => p.Text)).RemoveChar('\n', '\r', ' ');
|
||||||
var splitResultText = string.Join(" ", splitResult);
|
var splitResultText = string.Join("", splitResult).RemoveChar('\n', '\r', ' ');
|
||||||
Assert.AreEqual(inputText, splitResultText);
|
Assert.AreEqual(inputText, splitResultText);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -64,11 +64,7 @@ namespace Nikse.SubtitleEdit.Core.AudioToText
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (TwoLetterLanguageCode == "en")
|
if (TwoLetterLanguageCode == "da")
|
||||||
{
|
|
||||||
// anything?
|
|
||||||
}
|
|
||||||
else if (TwoLetterLanguageCode == "da")
|
|
||||||
{
|
{
|
||||||
if (paragraph.Text.Contains("Danske tekster af nicolai winther", StringComparison.OrdinalIgnoreCase))
|
if (paragraph.Text.Contains("Danske tekster af nicolai winther", StringComparison.OrdinalIgnoreCase))
|
||||||
{
|
{
|
||||||
@ -144,10 +140,12 @@ namespace Nikse.SubtitleEdit.Core.AudioToText
|
|||||||
return subtitle;
|
return subtitle;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Subtitle TryForWholeSentences(Subtitle inputSubtitle, string language, int lineMxLength)
|
public static Subtitle TryForWholeSentences(Subtitle inputSubtitle, string language, int lineMaxLength)
|
||||||
{
|
{
|
||||||
var s = new Subtitle(inputSubtitle);
|
var s = new Subtitle(inputSubtitle);
|
||||||
const int maxMoveChunkSize = 15;
|
const int maxMoveChunkSize = 15;
|
||||||
|
var deleteIndices = new List<int>();
|
||||||
|
|
||||||
for (var i = 0; i < s.Paragraphs.Count - 1; i++)
|
for (var i = 0; i < s.Paragraphs.Count - 1; i++)
|
||||||
{
|
{
|
||||||
var p = s.Paragraphs[i];
|
var p = s.Paragraphs[i];
|
||||||
@ -158,14 +156,21 @@ namespace Nikse.SubtitleEdit.Core.AudioToText
|
|||||||
p.EndTime.TotalMilliseconds - next.StartTime.TotalMilliseconds > 100 ||
|
p.EndTime.TotalMilliseconds - next.StartTime.TotalMilliseconds > 100 ||
|
||||||
p.Text.Contains('<') ||
|
p.Text.Contains('<') ||
|
||||||
next.Text.Contains('<') ||
|
next.Text.Contains('<') ||
|
||||||
!(p.Text.Contains('.') || next.Text.Contains('.')) ||
|
!(p.Text.Contains('.') || p.Text.Contains('?') || p.Text.Contains('!') || next.Text.Contains('.') || next.Text.Contains('?') || next.Text.Contains('!')) ||
|
||||||
p.Text.EndsWith('.'))
|
p.Text.EndsWith('.') ||
|
||||||
|
p.Text.EndsWith('?') ||
|
||||||
|
p.Text.EndsWith('!'))
|
||||||
|
{
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (deleteIndices.Contains(i))
|
||||||
{
|
{
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// check for period in last part of p
|
// check for period in last part of p
|
||||||
var lastPeriodIdx = p.Text.LastIndexOf('.');
|
var lastPeriodIdx = p.Text.LastIndexOfAny(new char[] { '.', '?', '!' });
|
||||||
if (lastPeriodIdx > 3 && lastPeriodIdx > p.Text.Length - maxMoveChunkSize)
|
if (lastPeriodIdx > 3 && lastPeriodIdx > p.Text.Length - maxMoveChunkSize)
|
||||||
{
|
{
|
||||||
var newCurrentText = p.Text.Substring(0, lastPeriodIdx + 1).Trim();
|
var newCurrentText = p.Text.Substring(0, lastPeriodIdx + 1).Trim();
|
||||||
@ -177,23 +182,31 @@ namespace Nikse.SubtitleEdit.Core.AudioToText
|
|||||||
var arrayCurrent = newCurrentText.SplitToLines();
|
var arrayCurrent = newCurrentText.SplitToLines();
|
||||||
var arrayNext = newNextText.SplitToLines();
|
var arrayNext = newNextText.SplitToLines();
|
||||||
|
|
||||||
var currentOk = arrayCurrent.Count == 1 || (arrayCurrent.Count == 2 && arrayCurrent[0].Length <= lineMxLength);
|
var currentOk = arrayCurrent.Count == 1 || (arrayCurrent.Count == 2 && arrayCurrent[0].Length < lineMaxLength * 2);
|
||||||
var nextOk = arrayNext.Count == 1 || (arrayNext.Count == 2 && arrayNext[0].Length <= lineMxLength);
|
var nextOk = arrayNext.Count == 1 || (arrayNext.Count == 2 && arrayNext[0].Length < lineMaxLength * 2);
|
||||||
|
|
||||||
if (currentOk && nextOk)
|
if (currentOk && nextOk)
|
||||||
{
|
{
|
||||||
p.Text = newCurrentText;
|
p.Text = newCurrentText;
|
||||||
next.Text = newNextText;
|
next.Text = newNextText;
|
||||||
|
|
||||||
//TODO: calc time
|
if (string.IsNullOrWhiteSpace(newCurrentText))
|
||||||
|
{
|
||||||
|
deleteIndices.Add(i);
|
||||||
|
next.StartTime.TotalMilliseconds = p.StartTime.TotalMilliseconds;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
//TODO: calc time
|
||||||
|
}
|
||||||
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// check for period in beginning of next
|
// check for period in beginning of next
|
||||||
var firstPeriodIdx = next.Text.IndexOf('.');
|
var firstPeriodIdx = next.Text.IndexOfAny(new char[] { '.', '?', '!' });
|
||||||
if (firstPeriodIdx > 3 && firstPeriodIdx < maxMoveChunkSize)
|
if (firstPeriodIdx >= 3 && firstPeriodIdx < maxMoveChunkSize)
|
||||||
{
|
{
|
||||||
var newCurrentText = next.Text.Substring(0, firstPeriodIdx + 1).Trim();
|
var newCurrentText = next.Text.Substring(0, firstPeriodIdx + 1).Trim();
|
||||||
var newNextText = next.Text.Remove(0, firstPeriodIdx + 1).Trim();
|
var newNextText = next.Text.Remove(0, firstPeriodIdx + 1).Trim();
|
||||||
@ -204,19 +217,29 @@ namespace Nikse.SubtitleEdit.Core.AudioToText
|
|||||||
var arrayCurrent = newCurrentText.SplitToLines();
|
var arrayCurrent = newCurrentText.SplitToLines();
|
||||||
var arrayNext = newNextText.SplitToLines();
|
var arrayNext = newNextText.SplitToLines();
|
||||||
|
|
||||||
var currentOk = arrayCurrent.Count == 1 || (arrayCurrent.Count == 2 && arrayCurrent[0].Length <= lineMxLength);
|
var currentOk = arrayCurrent.Count == 1 || (arrayCurrent.Count == 2 && arrayCurrent[0].Length < lineMaxLength * 2);
|
||||||
var nextOk = arrayNext.Count == 1 || (arrayNext.Count == 2 && arrayNext[0].Length <= lineMxLength);
|
var nextOk = arrayNext.Count == 1 || (arrayNext.Count == 2 && arrayNext[0].Length < lineMaxLength * 2);
|
||||||
|
|
||||||
if (currentOk && nextOk)
|
if (currentOk && nextOk)
|
||||||
{
|
{
|
||||||
p.Text = newCurrentText;
|
p.Text = newCurrentText;
|
||||||
next.Text = newNextText;
|
next.Text = newNextText;
|
||||||
|
|
||||||
//TODO: calc time
|
if (string.IsNullOrWhiteSpace(newNextText))
|
||||||
|
{
|
||||||
|
deleteIndices.Add(i + 1);
|
||||||
|
p.EndTime.TotalMilliseconds = next.EndTime.TotalMilliseconds;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
//TODO: calc time
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
s.RemoveParagraphsByIndices(deleteIndices);
|
||||||
|
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -756,7 +756,7 @@ namespace Nikse.SubtitleEdit.Core.Common
|
|||||||
/// <returns>Number of lines deleted</returns>
|
/// <returns>Number of lines deleted</returns>
|
||||||
public int RemoveParagraphsByIndices(IEnumerable<int> indices)
|
public int RemoveParagraphsByIndices(IEnumerable<int> indices)
|
||||||
{
|
{
|
||||||
int count = 0;
|
var count = 0;
|
||||||
foreach (var index in indices.OrderByDescending(p => p))
|
foreach (var index in indices.OrderByDescending(p => p))
|
||||||
{
|
{
|
||||||
if (index >= 0 && index < Paragraphs.Count)
|
if (index >= 0 && index < Paragraphs.Count)
|
||||||
@ -765,6 +765,7 @@ namespace Nikse.SubtitleEdit.Core.Common
|
|||||||
count++;
|
count++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return count;
|
return count;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user