Add "Normalize" to FCE

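Normalizes Unicode characters (normalization form C), replaces non-breaking spaces, colon variants and dash variants with their plain equivalents, and removes zero-width characters.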
This commit is contained in:
Nikolaj Olsson 2020-05-27 11:21:16 +02:00
parent 3a88fdc0d6
commit 34d541198b
7 changed files with 65 additions and 2 deletions

View File

@ -615,6 +615,7 @@ Note: Do check free disk space.</WaveFileMalformed>
<CommonOcrErrorsFixed>Common OCR errors fixed (OcrReplaceList file used): {0}</CommonOcrErrorsFixed>
<RemoveSpaceBetweenNumber>Remove space between numbers</RemoveSpaceBetweenNumber>
<FixDialogsOnOneLine>Fix dialogs on one line</FixDialogsOnOneLine>
<NormalizeStrings>Normalize strings</NormalizeStrings>
<RemoveSpaceBetweenNumbersFixed>Remove space between numbers fixed: {0}</RemoveSpaceBetweenNumbersFixed>
<FixTurkishAnsi>Fix Turkish ANSI (Icelandic) letters to Unicode</FixTurkishAnsi>
<FixDanishLetterI>Fix Danish letter 'i'</FixDanishLetterI>

View File

@ -0,0 +1,45 @@
using Nikse.SubtitleEdit.Core.Interfaces;
namespace Nikse.SubtitleEdit.Core.Forms.FixCommonErrors
{
    /// <summary>
    /// "Fix common errors" rule that normalizes the text of every paragraph:
    /// Unicode normalization (form C, the default of <see cref="string.Normalize()"/>),
    /// non-breaking space to normal space, removal of zero-width characters, and
    /// replacement of colon/dash look-alikes with plain ':' and '-'.
    /// </summary>
    public class NormalizeStrings : IFixCommonError
    {
        /// <summary>
        /// Normalizes the text of each paragraph in <paramref name="subtitle"/>.
        /// A paragraph is only changed when normalization alters its text and
        /// <paramref name="callbacks"/> allows the fix; each applied fix is reported
        /// to the callbacks, followed by one final status update.
        /// </summary>
        /// <param name="subtitle">Subtitle whose paragraphs are processed in place.</param>
        /// <param name="callbacks">Host callbacks used to approve and report fixes.</param>
        public void Fix(Subtitle subtitle, IFixCallbacks callbacks)
        {
            var language = Configuration.Settings.Language.FixCommonErrors;
            string fixAction = language.NormalizeStrings;
            int noOfFixes = 0;
            for (int i = 0; i < subtitle.Paragraphs.Count; i++)
            {
                var p = subtitle.Paragraphs[i];
                var oldText = p.Text;
                var text = NormalizeText(oldText);
                if (oldText != text && callbacks.AllowFix(p, fixAction))
                {
                    p.Text = text;
                    noOfFixes++;
                    callbacks.AddFixToListView(p, fixAction, oldText, p.Text);
                }
            }

            // Report under this rule's own name; the previous code passed the
            // "common OCR errors" title and the "dialogs on one line" example text.
            // NOTE(review): no dedicated "... fixed: {0}" text for this rule is visible here,
            // so the action name is used for both text arguments - confirm how
            // UpdateFixStatus uses its third argument.
            callbacks.UpdateFixStatus(noOfFixes, fixAction, fixAction);
        }

        /// <summary>
        /// Returns <paramref name="text"/> normalized as described on the class.
        /// Null or empty input is returned unchanged, so it never counts as a fix.
        /// </summary>
        private static string NormalizeText(string text)
        {
            if (string.IsNullOrEmpty(text))
            {
                return text;
            }

            return text
                .Normalize()
                .Replace('\u00a0', ' ') // No-break space (U+00A0, 160 decimal) -> normal space
                .Replace("\u200B", string.Empty) // Zero width space (U+200B)
                .Replace("\uFEFF", string.Empty) // Zero width no-break space (U+FEFF)
                .Replace('\u02F8', ':') // Modifier letter raised colon (U+02F8)
                .Replace('\uFF1A', ':') // Fullwidth colon (U+FF1A)
                .Replace('\uFE13', ':') // Presentation form for vertical colon (U+FE13)
                .Replace('\u2043', '-') // Hyphen bullet (U+2043)
                .Replace('\u2010', '-') // Hyphen (U+2010)
                .Replace('\u2012', '-') // Figure dash (U+2012)
                .Replace('\u2013', '-') // En dash (U+2013)
                .Replace('\u2014', '-') // Em dash (U+2014)
                .Replace('\u2015', '-'); // Horizontal bar (U+2015)
        }
    }
}

View File

@ -811,6 +811,7 @@ namespace Nikse.SubtitleEdit.Core
RemoveSpaceBetweenNumber = "Remove space between numbers",
FixDialogsOnOneLine = "Fix dialogs on one line",
RemoveSpaceBetweenNumbersFixed = "Remove space between numbers fixed: {0}",
NormalizeStrings = "Normalize strings",
FixLowercaseIToUppercaseI = "Fix alone lowercase 'i' to 'I' (English)",
FixTurkishAnsi = "Fix Turkish ANSI (Icelandic) letters to Unicode",
FixDanishLetterI = "Fix Danish letter 'i'",

View File

@ -1612,6 +1612,9 @@ namespace Nikse.SubtitleEdit.Core
case "FixCommonErrors/FixDialogsOnOneLine":
language.FixCommonErrors.FixDialogsOnOneLine = reader.Value;
break;
case "FixCommonErrors/NormalizeStrings":
language.FixCommonErrors.NormalizeStrings = reader.Value;
break;
case "FixCommonErrors/RemoveSpaceBetweenNumbersFixed":
language.FixCommonErrors.RemoveSpaceBetweenNumbersFixed = reader.Value;
break;

View File

@ -683,6 +683,7 @@
public string CommonOcrErrorsFixed { get; set; }
public string RemoveSpaceBetweenNumber { get; set; }
public string FixDialogsOnOneLine { get; set; }
public string NormalizeStrings { get; set; }
public string RemoveSpaceBetweenNumbersFixed { get; set; }
public string FixTurkishAnsi { get; set; }
public string FixDanishLetterI { get; set; }

View File

@ -689,6 +689,7 @@ $HorzAlign = Center
public bool FixMusicNotationTicked { get; set; }
public bool FixContinuationStyleTicked { get; set; }
public bool FixUnnecessaryLeadingDotsTicked { get; set; }
public bool NormalizeStringsTicked { get; set; }
public FixCommonErrorsSettings()
{
@ -725,6 +726,7 @@ $HorzAlign = Center
FixMusicNotationTicked = true;
FixContinuationStyleTicked = false;
FixUnnecessaryLeadingDotsTicked = true;
NormalizeStringsTicked = false;
}
}
@ -4842,6 +4844,12 @@ $HorzAlign = Center
settings.CommonErrors.FixUnnecessaryLeadingDotsTicked = Convert.ToBoolean(subNode.InnerText);
}
subNode = node.SelectSingleNode("NormalizeStringsTicked");
if (subNode != null)
{
settings.CommonErrors.NormalizeStringsTicked = Convert.ToBoolean(subNode.InnerText);
}
// Video Controls
node = doc.DocumentElement.SelectSingleNode("VideoControls");
subNode = node.SelectSingleNode("CustomSearchText1");
@ -7411,6 +7419,7 @@ $HorzAlign = Center
textWriter.WriteElementString("FixMusicNotationTicked", settings.CommonErrors.FixMusicNotationTicked.ToString(CultureInfo.InvariantCulture));
textWriter.WriteElementString("FixContinuationStyleTicked", settings.CommonErrors.FixContinuationStyleTicked.ToString(CultureInfo.InvariantCulture));
textWriter.WriteElementString("FixUnnecessaryLeadingDotsTicked", settings.CommonErrors.FixUnnecessaryLeadingDotsTicked.ToString(CultureInfo.InvariantCulture));
textWriter.WriteElementString("NormalizeStringsTicked", settings.CommonErrors.NormalizeStringsTicked.ToString(CultureInfo.InvariantCulture));
textWriter.WriteEndElement();
textWriter.WriteStartElement("VideoControls", string.Empty);

View File

@ -51,7 +51,8 @@ namespace Nikse.SubtitleEdit.Forms
private const int IndexUppercaseIInsideLowercaseWord = 28;
private const int IndexRemoveSpaceBetweenNumbers = 29;
private const int IndexDialogsOnOneLine = 30;
private const int IndexFixEllipsesStart = 31;
private const int IndexNormalizeStrings = 31;
private const int IndexFixEllipsesStart = 32;
private int _indexAloneLowercaseIToUppercaseIEnglish = -1;
private int _turkishAnsiIndex = -1;
private int _danishLetterIIndex = -1;
@ -403,7 +404,8 @@ namespace Nikse.SubtitleEdit.Forms
new FixItem(_language.FixCommonOcrErrors, _language.FixOcrErrorExample, () => FixOcrErrorsViaReplaceList(threeLetterIsoLanguageName), ce.FixOcrErrorsViaReplaceListTicked),
new FixItem(_language.FixUppercaseIInsindeLowercaseWords, _language.FixUppercaseIInsindeLowercaseWordsExample, () => new FixUppercaseIInsideWords().Fix(Subtitle, this), ce.UppercaseIInsideLowercaseWordTicked),
new FixItem(_language.RemoveSpaceBetweenNumber, _language.FixSpaceBetweenNumbersExample, () => new RemoveSpaceBetweenNumbers().Fix(Subtitle, this), ce.RemoveSpaceBetweenNumberTicked),
new FixItem(_language.FixDialogsOnOneLine, _language.FixDialogsOneLineExample, () => new FixDialogsOnOneLine().Fix(Subtitle, this), ce.FixDialogsOnOneLineTicked)
new FixItem(_language.FixDialogsOnOneLine, _language.FixDialogsOneLineExample, () => new FixDialogsOnOneLine().Fix(Subtitle, this), ce.FixDialogsOnOneLineTicked),
new FixItem(_language.NormalizeStrings, string.Empty, () => new NormalizeStrings().Fix(Subtitle, this), ce.NormalizeStringsTicked),
};
if (Configuration.Settings.General.ContinuationStyle == ContinuationStyle.None)
@ -1099,6 +1101,7 @@ namespace Nikse.SubtitleEdit.Forms
ce.FixOcrErrorsViaReplaceListTicked = listView1.Items[IndexFixOcrErrorsViaReplaceList].Checked;
ce.RemoveSpaceBetweenNumberTicked = listView1.Items[IndexRemoveSpaceBetweenNumbers].Checked;
ce.FixDialogsOnOneLineTicked = listView1.Items[IndexDialogsOnOneLine].Checked;
ce.NormalizeStringsTicked = listView1.Items[IndexNormalizeStrings].Checked;
if (_danishLetterIIndex >= 0)
{
ce.DanishLetterITicked = listView1.Items[_danishLetterIIndex].Checked;