mirror of
https://github.com/SubtitleEdit/subtitleedit.git
synced 2024-11-25 04:33:04 +01:00
Fix for Whisper Post-processing - thx Purfview/cvrle77 :)
Somewhat related to #8044
This commit is contained in:
parent
419a884770
commit
001cad505d
@ -115,5 +115,107 @@ Lexington, Massachusetts.";
|
||||
Assert.IsTrue(fixedSubtitle.Paragraphs[4].Text.EndsWith("camp.", StringComparison.Ordinal));
|
||||
Assert.IsTrue(fixedSubtitle.Paragraphs[5].Text.StartsWith("Hey there", StringComparison.Ordinal));
|
||||
}
|
||||
|
||||
[TestMethod]
|
||||
public void TryForWholeSentences1()
|
||||
{
|
||||
var raw = @"12
|
||||
00:00:25,500 --> 00:00:27,060
|
||||
Oh, my... Bob, right?
|
||||
|
||||
13
|
||||
00:00:28,560 --> 00:00:29,220
|
||||
Could be fun.
|
||||
|
||||
14
|
||||
00:00:29,660 --> 00:00:32,580
|
||||
Well, we could get to know each other a
|
||||
little, maybe loosen things up around
|
||||
|
||||
15
|
||||
00:00:32,580 --> 00:00:33,060
|
||||
here?
|
||||
|
||||
16
|
||||
00:00:33,680 --> 00:00:39,160
|
||||
Well, I've worked with this lot before,
|
||||
and, erm... Yeah, this is as loose as they
|
||||
|
||||
17
|
||||
00:00:39,160 --> 00:00:40,300
|
||||
get.
|
||||
|
||||
18
|
||||
00:00:46,120 --> 00:00:46,340
|
||||
Hmm.
|
||||
|
||||
19
|
||||
00:00:48,160 --> 00:00:49,120
|
||||
What's the about that, Bob's?
|
||||
|
||||
20
|
||||
00:00:49,120 --> 00:00:49,860
|
||||
Oh, no.
|
||||
|
||||
21
|
||||
00:00:50,580 --> 00:00:50,700
|
||||
Yep.
|
||||
|
||||
22
|
||||
00:00:51,240 --> 00:00:52,600
|
||||
I felt that soon as I said it.
|
||||
|
||||
23
|
||||
00:00:54,860 --> 00:00:56,340
|
||||
Right, I'm headed out.
|
||||
|
||||
24
|
||||
00:00:57,460 --> 00:00:58,780
|
||||
Everyone have a great day, yeah?
|
||||
|
||||
25
|
||||
00:00:58,780 --> 00:00:59,600
|
||||
Yeah.
|
||||
|
||||
26
|
||||
00:01:00,600 --> 00:01:01,880
|
||||
Wait.
|
||||
|
||||
27
|
||||
00:01:01,880 --> 00:01:02,480
|
||||
Wait.";
|
||||
|
||||
var subtitle = new Subtitle();
|
||||
new SubRip().LoadSubtitle(subtitle, raw.SplitToLines(), null);
|
||||
|
||||
var fixedSubtitle = AudioToTextPostProcessor.TryForWholeSentences(subtitle, "en", 42);
|
||||
|
||||
Assert.AreEqual(14, fixedSubtitle.Paragraphs.Count);
|
||||
Assert.IsTrue(fixedSubtitle.Paragraphs[2].Text == "Well, we could get to know each other a" + Environment.NewLine + "little, maybe loosen things up around here?");
|
||||
Assert.IsTrue(fixedSubtitle.Paragraphs[3].Text == "Well, I've worked with this lot before, and," + Environment.NewLine + "erm... Yeah, this is as loose as they get.");
|
||||
Assert.IsTrue(fixedSubtitle.Paragraphs[4].Text == "Hmm.");
|
||||
Assert.IsTrue(fixedSubtitle.Paragraphs[5].Text == "What's the about that, Bob's?");
|
||||
}
|
||||
|
||||
[TestMethod]
|
||||
public void TryForWholeSentences2()
|
||||
{
|
||||
var raw = @"1
|
||||
00:00:26,500 --> 00:00:27,060
|
||||
Yes, I think this could indeed be very good. But also
|
||||
|
||||
2
|
||||
00:00:28,560 --> 00:00:29,220
|
||||
that could be fun indeed my friend.";
|
||||
|
||||
var subtitle = new Subtitle();
|
||||
new SubRip().LoadSubtitle(subtitle, raw.SplitToLines(), null);
|
||||
|
||||
var fixedSubtitle = AudioToTextPostProcessor.TryForWholeSentences(subtitle, "en", 42);
|
||||
|
||||
Assert.AreEqual(2, fixedSubtitle.Paragraphs.Count);
|
||||
Assert.IsTrue(fixedSubtitle.Paragraphs[0].Text == "Yes, I think this could" + Environment.NewLine + "indeed be very good.");
|
||||
Assert.IsTrue(fixedSubtitle.Paragraphs[1].Text == "But also that could be" + Environment.NewLine + "fun indeed my friend.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -172,15 +172,22 @@ namespace Test.Logic.AutoTranslate
|
||||
Assert.IsTrue(splitResult[0].Length > 5);
|
||||
Assert.IsTrue(splitResult[1].Length > 5);
|
||||
Assert.IsTrue(splitResult[2].Length > 5);
|
||||
Assert.AreEqual(string.Join(" ", subtitle.Paragraphs.Select(p => p.Text)), string.Join(" ", splitResult));
|
||||
|
||||
var subtitleText = string.Join("", subtitle.Paragraphs.Select(p => p.Text)).RemoveChar('\n', '\r', ' ');
|
||||
var splitText = string.Join("", splitResult).RemoveChar('\n', '\r', ' ');
|
||||
Assert.AreEqual(subtitleText, splitText);
|
||||
|
||||
Assert.AreEqual("Hallo there. In the" + Environment.NewLine + "garden today are we?", splitResult[0]);
|
||||
Assert.AreEqual("So I will very", splitResult[1]);
|
||||
Assert.AreEqual("soon be going home to Sweden.", splitResult[2]);
|
||||
Assert.AreEqual("My name is Peter!", splitResult[3]);
|
||||
Assert.AreEqual("My name is Peter! And Jones.", splitResult[4]);
|
||||
Assert.AreEqual("My name is Peter. And Jones.", splitResult[5]);
|
||||
Assert.AreEqual("", splitResult[6]);
|
||||
Assert.AreEqual("Hallo there.", splitResult[7]);
|
||||
|
||||
var inputText = string.Join(" ", subtitle.Paragraphs.Select(p => p.Text)).Replace(Environment.NewLine, " ");
|
||||
var splitResultText = string.Join(" ", splitResult);
|
||||
var inputText = string.Join("", subtitle.Paragraphs.Select(p => p.Text)).RemoveChar('\n', '\r', ' ');
|
||||
var splitResultText = string.Join("", splitResult).RemoveChar('\n', '\r', ' ');
|
||||
Assert.AreEqual(inputText, splitResultText);
|
||||
}
|
||||
}
|
||||
|
@ -64,11 +64,7 @@ namespace Nikse.SubtitleEdit.Core.AudioToText
|
||||
continue;
|
||||
}
|
||||
|
||||
if (TwoLetterLanguageCode == "en")
|
||||
{
|
||||
// anything?
|
||||
}
|
||||
else if (TwoLetterLanguageCode == "da")
|
||||
if (TwoLetterLanguageCode == "da")
|
||||
{
|
||||
if (paragraph.Text.Contains("Danske tekster af nicolai winther", StringComparison.OrdinalIgnoreCase))
|
||||
{
|
||||
@ -144,10 +140,12 @@ namespace Nikse.SubtitleEdit.Core.AudioToText
|
||||
return subtitle;
|
||||
}
|
||||
|
||||
public static Subtitle TryForWholeSentences(Subtitle inputSubtitle, string language, int lineMxLength)
|
||||
public static Subtitle TryForWholeSentences(Subtitle inputSubtitle, string language, int lineMaxLength)
|
||||
{
|
||||
var s = new Subtitle(inputSubtitle);
|
||||
const int maxMoveChunkSize = 15;
|
||||
var deleteIndices = new List<int>();
|
||||
|
||||
for (var i = 0; i < s.Paragraphs.Count - 1; i++)
|
||||
{
|
||||
var p = s.Paragraphs[i];
|
||||
@ -158,14 +156,21 @@ namespace Nikse.SubtitleEdit.Core.AudioToText
|
||||
p.EndTime.TotalMilliseconds - next.StartTime.TotalMilliseconds > 100 ||
|
||||
p.Text.Contains('<') ||
|
||||
next.Text.Contains('<') ||
|
||||
!(p.Text.Contains('.') || next.Text.Contains('.')) ||
|
||||
p.Text.EndsWith('.'))
|
||||
!(p.Text.Contains('.') || p.Text.Contains('?') || p.Text.Contains('!') || next.Text.Contains('.') || next.Text.Contains('?') || next.Text.Contains('!')) ||
|
||||
p.Text.EndsWith('.') ||
|
||||
p.Text.EndsWith('?') ||
|
||||
p.Text.EndsWith('!'))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
if (deleteIndices.Contains(i))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
// check for period in last part of p
|
||||
var lastPeriodIdx = p.Text.LastIndexOf('.');
|
||||
var lastPeriodIdx = p.Text.LastIndexOfAny(new char[] { '.', '?', '!' });
|
||||
if (lastPeriodIdx > 3 && lastPeriodIdx > p.Text.Length - maxMoveChunkSize)
|
||||
{
|
||||
var newCurrentText = p.Text.Substring(0, lastPeriodIdx + 1).Trim();
|
||||
@ -177,23 +182,31 @@ namespace Nikse.SubtitleEdit.Core.AudioToText
|
||||
var arrayCurrent = newCurrentText.SplitToLines();
|
||||
var arrayNext = newNextText.SplitToLines();
|
||||
|
||||
var currentOk = arrayCurrent.Count == 1 || (arrayCurrent.Count == 2 && arrayCurrent[0].Length <= lineMxLength);
|
||||
var nextOk = arrayNext.Count == 1 || (arrayNext.Count == 2 && arrayNext[0].Length <= lineMxLength);
|
||||
var currentOk = arrayCurrent.Count == 1 || (arrayCurrent.Count == 2 && arrayCurrent[0].Length < lineMaxLength * 2);
|
||||
var nextOk = arrayNext.Count == 1 || (arrayNext.Count == 2 && arrayNext[0].Length < lineMaxLength * 2);
|
||||
|
||||
if (currentOk && nextOk)
|
||||
{
|
||||
p.Text = newCurrentText;
|
||||
next.Text = newNextText;
|
||||
|
||||
if (string.IsNullOrWhiteSpace(newCurrentText))
|
||||
{
|
||||
deleteIndices.Add(i);
|
||||
next.StartTime.TotalMilliseconds = p.StartTime.TotalMilliseconds;
|
||||
}
|
||||
else
|
||||
{
|
||||
//TODO: calc time
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// check for period in beginning of next
|
||||
var firstPeriodIdx = next.Text.IndexOf('.');
|
||||
if (firstPeriodIdx > 3 && firstPeriodIdx < maxMoveChunkSize)
|
||||
var firstPeriodIdx = next.Text.IndexOfAny(new char[] { '.', '?', '!' });
|
||||
if (firstPeriodIdx >= 3 && firstPeriodIdx < maxMoveChunkSize)
|
||||
{
|
||||
var newCurrentText = next.Text.Substring(0, firstPeriodIdx + 1).Trim();
|
||||
var newNextText = next.Text.Remove(0, firstPeriodIdx + 1).Trim();
|
||||
@ -204,18 +217,28 @@ namespace Nikse.SubtitleEdit.Core.AudioToText
|
||||
var arrayCurrent = newCurrentText.SplitToLines();
|
||||
var arrayNext = newNextText.SplitToLines();
|
||||
|
||||
var currentOk = arrayCurrent.Count == 1 || (arrayCurrent.Count == 2 && arrayCurrent[0].Length <= lineMxLength);
|
||||
var nextOk = arrayNext.Count == 1 || (arrayNext.Count == 2 && arrayNext[0].Length <= lineMxLength);
|
||||
var currentOk = arrayCurrent.Count == 1 || (arrayCurrent.Count == 2 && arrayCurrent[0].Length < lineMaxLength * 2);
|
||||
var nextOk = arrayNext.Count == 1 || (arrayNext.Count == 2 && arrayNext[0].Length < lineMaxLength * 2);
|
||||
|
||||
if (currentOk && nextOk)
|
||||
{
|
||||
p.Text = newCurrentText;
|
||||
next.Text = newNextText;
|
||||
|
||||
if (string.IsNullOrWhiteSpace(newNextText))
|
||||
{
|
||||
deleteIndices.Add(i + 1);
|
||||
p.EndTime.TotalMilliseconds = next.EndTime.TotalMilliseconds;
|
||||
}
|
||||
else
|
||||
{
|
||||
//TODO: calc time
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
s.RemoveParagraphsByIndices(deleteIndices);
|
||||
|
||||
return s;
|
||||
}
|
||||
|
@ -756,7 +756,7 @@ namespace Nikse.SubtitleEdit.Core.Common
|
||||
/// <returns>Number of lines deleted</returns>
|
||||
public int RemoveParagraphsByIndices(IEnumerable<int> indices)
|
||||
{
|
||||
int count = 0;
|
||||
var count = 0;
|
||||
foreach (var index in indices.OrderByDescending(p => p))
|
||||
{
|
||||
if (index >= 0 && index < Paragraphs.Count)
|
||||
@ -765,6 +765,7 @@ namespace Nikse.SubtitleEdit.Core.Common
|
||||
count++;
|
||||
}
|
||||
}
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user