A few more RegEx optimizations

git-svn-id: https://subtitleedit.googlecode.com/svn/trunk@909 99eadd0c-20b8-1223-b5c4-2a2b2df33de2
This commit is contained in:
niksedk 2012-01-08 14:25:39 +00:00
parent 95f8554544
commit 1cbae56538
22 changed files with 60 additions and 55 deletions

View File

@ -50,6 +50,23 @@ namespace Nikse.SubtitleEdit.Forms
readonly LanguageStructure.General _languageGeneral;
private bool _hasFixesBeenMade;
// Pre-compiled patterns shared by the fix routines. They are static readonly so that
// each pattern is constructed once and the shared reference cannot be reassigned.

// Comma preceded by a non-whitespace, non-digit character and followed by a non-whitespace character.
static readonly Regex fixMissingSpacesReComma = new Regex(@"[^\s\d],[^\s]", RegexOptions.Compiled);
// Period between a lowercase ASCII letter and an ASCII letter.
static readonly Regex fixMissingSpacesRePeriod = new Regex(@"[a-z][.][a-zA-Z]", RegexOptions.Compiled);
// "?" preceded by a non-whitespace, non-digit character and followed by an ASCII letter.
static readonly Regex fixMissingSpacesReQuestionMark = new Regex(@"[^\s\d]\?[a-zA-Z]", RegexOptions.Compiled);
// "!" preceded by a non-whitespace, non-digit character and followed by an ASCII letter.
static readonly Regex fixMissingSpacesReExclamation = new Regex(@"[^\s\d]\![a-zA-Z]", RegexOptions.Compiled);
// ":" preceded by a non-whitespace, non-digit character and followed by an ASCII letter.
static readonly Regex fixMissingSpacesReColon = new Regex(@"[^\s\d]\:[a-zA-Z]", RegexOptions.Compiled);
// A word character followed by ".com" / ".net" / ".org" ending at a word boundary.
static readonly Regex urlCom = new Regex(@"\w\.com\b", RegexOptions.Compiled);
static readonly Regex urlNet = new Regex(@"\w\.net\b", RegexOptions.Compiled);
static readonly Regex urlOrg = new Regex(@"\w\.org\b", RegexOptions.Compiled);
// Uppercase "I" directly after / directly before a lowercase letter (a-z plus æøåäöé).
static readonly Regex reAfterLowercaseLetter = new Regex(@"[a-zæøåäöé]I", RegexOptions.Compiled);
static readonly Regex reBeforeLowercaseLetter = new Regex(@"I[a-zæøåäöé]", RegexOptions.Compiled);
// Two digits separated by exactly one space.
static readonly Regex removeSpaceBetweenNumbersRegEx = new Regex(@"\d \d", RegexOptions.Compiled);
// A lowercase "i" standing alone between word boundaries.
static readonly Regex fixAloneLowercaseIToUppercaseIRE = new Regex(@"\bi\b", RegexOptions.Compiled);
class FixItem
{
public string Name { get; set; }
@ -998,23 +1015,13 @@ namespace Nikse.SubtitleEdit.Forms
public void FixMissingSpaces()
{
string fixAction = _language.FixMissingSpace;
Regex reComma = new Regex(@"[^\s\d],[^\s]", RegexOptions.Compiled);
Regex rePeriod = new Regex(@"[a-z][.][a-zA-Z]", RegexOptions.Compiled);
Regex reQuestionMark = new Regex(@"[^\s\d]\?[a-zA-Z]", RegexOptions.Compiled);
Regex reExclamation = new Regex(@"[^\s\d]\![a-zA-Z]", RegexOptions.Compiled);
Regex reColon = new Regex(@"[^\s\d]\:[a-zA-Z]", RegexOptions.Compiled);
Regex urlCom = new Regex(@"\w\.com\b", RegexOptions.Compiled);
Regex urlNet = new Regex(@"\w\.net\b", RegexOptions.Compiled);
Regex urlOrg = new Regex(@"\w\.org\b", RegexOptions.Compiled);
int missingSpaces = 0;
for (int i = 0; i < _subtitle.Paragraphs.Count; i++)
{
Paragraph p = _subtitle.Paragraphs[i];
// missing space after comma ","
Match match = reComma.Match(p.Text);
Match match = fixMissingSpacesReComma.Match(p.Text);
if (match.Success)
{
while (match.Success)
@ -1036,7 +1043,7 @@ namespace Nikse.SubtitleEdit.Forms
}
// missing space after "?"
match = reQuestionMark.Match(p.Text);
match = fixMissingSpacesReQuestionMark.Match(p.Text);
if (match.Success)
{
while (match.Success)
@ -1053,12 +1060,12 @@ namespace Nikse.SubtitleEdit.Forms
AddFixToListView(p, i + 1, fixAction, oldText, p.Text);
}
}
match = reQuestionMark.Match(p.Text, match.Index + 1);
match = fixMissingSpacesReQuestionMark.Match(p.Text, match.Index + 1);
}
}
// missing space after "!"
match = reExclamation.Match(p.Text);
match = fixMissingSpacesReExclamation.Match(p.Text);
if (match.Success)
{
while (match.Success)
@ -1075,12 +1082,12 @@ namespace Nikse.SubtitleEdit.Forms
AddFixToListView(p, i + 1, fixAction, oldText, p.Text);
}
}
match = reExclamation.Match(p.Text, match.Index + 1);
match = fixMissingSpacesReExclamation.Match(p.Text, match.Index + 1);
}
}
// missing space after ":"
match = reColon.Match(p.Text);
match = fixMissingSpacesReColon.Match(p.Text);
if (match.Success)
{
while (match.Success)
@ -1107,12 +1114,12 @@ namespace Nikse.SubtitleEdit.Forms
AddFixToListView(p, i + 1, fixAction, oldText, p.Text);
}
}
match = reColon.Match(p.Text, match.Index + 1);
match = fixMissingSpacesReColon.Match(p.Text, match.Index + 1);
}
}
// missing space after period "."
match = rePeriod.Match(p.Text);
match = fixMissingSpacesRePeriod.Match(p.Text);
if (match.Success)
{
while (match.Success)
@ -1435,8 +1442,6 @@ namespace Nikse.SubtitleEdit.Forms
{
string fixAction = _language.FixUppercaseIInsideLowercaseWord;
int uppercaseIsInsideLowercaseWords = 0;
Regex reAfterLowercaseLetter = new Regex(@"[a-zæøåäöé]I", RegexOptions.Compiled);
Regex reBeforeLowercaseLetter = new Regex(@"I[a-zæøåäöé]", RegexOptions.Compiled);
// bool isLineContinuation = false;
for (int i = 0; i < _subtitle.Paragraphs.Count; i++)
{
@ -2057,17 +2062,16 @@ namespace Nikse.SubtitleEdit.Forms
private void RemoveSpaceBetweenNumbers()
{
string fixAction = _language.FixCommonOcrErrors;
int noOfFixes = 0;
Regex regex = new Regex(@"\d \d", RegexOptions.Compiled);
int noOfFixes = 0;
for (int i = 0; i < _subtitle.Paragraphs.Count; i++)
{
Paragraph p = _subtitle.Paragraphs[i];
string text = p.Text;
Match match = regex.Match(text);
Match match = removeSpaceBetweenNumbersRegEx.Match(text);
while (match.Success)
{
text = text.Remove(match.Index + 1, 1);
match = regex.Match(text);
match = removeSpaceBetweenNumbersRegEx.Match(text);
}
if (p.Text != text)
{
@ -2137,7 +2141,6 @@ namespace Nikse.SubtitleEdit.Forms
{
string fixAction = _language.FixLowercaseIToUppercaseI;
int iFixes = 0;
var re = new Regex(@"\bi\b", RegexOptions.Compiled);
for (int i = 0; i < _subtitle.Paragraphs.Count; i++)
{
Paragraph p = _subtitle.Paragraphs[i];
@ -2146,7 +2149,7 @@ namespace Nikse.SubtitleEdit.Forms
string s = p.Text;
if (s.Contains("i"))
{
s = FixAloneLowercaseIToUppercaseLine(re, oldText, s, 'i');
s = FixAloneLowercaseIToUppercaseLine(fixAloneLowercaseIToUppercaseIRE, oldText, s, 'i');
if (s != oldText && AllowFix(i + 1, fixAction))
{

View File

@ -37,9 +37,9 @@ namespace Nikse.SubtitleEdit.Logic.OCR
readonly Form _parentForm;
private string _spellCheckDictionaryName;
Regex regexAloneI = new Regex(@"\bi\b", RegexOptions.Compiled);
Regex regexAloneIAsL = new Regex(@"\bl\b", RegexOptions.Compiled);
Regex regexSpaceBetweenNumbers = new Regex(@"\d \d", RegexOptions.Compiled);
// Pre-compiled OCR-fix patterns; static readonly so they are built once and never reassigned.

// A lowercase "i" standing alone between word boundaries.
static readonly Regex regexAloneI = new Regex(@"\bi\b", RegexOptions.Compiled);
// A lowercase "l" standing alone between word boundaries.
static readonly Regex regexAloneIAsL = new Regex(@"\bl\b", RegexOptions.Compiled);
// Two digits separated by exactly one space.
static readonly Regex regexSpaceBetweenNumbers = new Regex(@"\d \d", RegexOptions.Compiled);
// A lowercase "l" between two uppercase letters (A-Z plus ÆØÅÄÖÉÁ).
static readonly Regex regExLowercaseL = new Regex("[A-ZÆØÅÄÖÉÁ]l[A-ZÆØÅÄÖÉÁ]", RegexOptions.Compiled);
// An uppercase "I" after a lowercase letter (a-z plus æøåöäé), followed by any one character.
static readonly Regex regExUppercaseI = new Regex("[a-zæøåöäé]I.", RegexOptions.Compiled);
// A digit, a space, then the digit "1".
static readonly Regex regExNumber1 = new Regex(@"\d\ 1", RegexOptions.Compiled);
@ -47,6 +47,8 @@ namespace Nikse.SubtitleEdit.Logic.OCR
// Pre-compiled OCR-fix patterns; static readonly so they are built once and never reassigned.

// A lowercase letter (a-z plus æøåäöé) followed by "I" or "1".
static readonly Regex regExIandZero = new Regex(@"[a-zæøåäöé][I1]", RegexOptions.Compiled);
// A lowercase letter followed by "0", and "0" followed by a lowercase letter.
static readonly Regex regExTime1 = new Regex(@"[a-zæøåäöé][0]", RegexOptions.Compiled);
static readonly Regex regExTime2 = new Regex(@"0[a-zæøåäöé]", RegexOptions.Compiled);
// Whole string is an optional "#" followed by hex digits. The previous character class
// listed "ABDEF" and so rejected an uppercase "C" (e.g. "#C0C0C0"); A-F/a-f covers all six letters.
static readonly Regex hexNumber = new Regex(@"^#?[\dA-Fa-f]+$", RegexOptions.Compiled);
// Whole string starts with one or more digits and ends with a digit, with at least one character between.
static readonly Regex startEndEndsWithNumber = new Regex(@"^\d+.+\d$", RegexOptions.Compiled);
public bool Abort { get; set; }
public List<string> AutoGuessesUsed { get; set; }
@ -55,9 +57,6 @@ namespace Nikse.SubtitleEdit.Logic.OCR
public CultureInfo DictionaryCulture { get; private set; }
static Regex hexNumber = new Regex(@"^#?[\dABDEFabcdef]+$", RegexOptions.Compiled);
static Regex startEndEndsWithNumber = new Regex(@"^\d+.+\d$", RegexOptions.Compiled);
/// <summary>
/// Advanced ocr fixing via replace/spelling dictionaries + some hardcoded rules
/// </summary>

View File

@ -7,7 +7,7 @@ namespace Nikse.SubtitleEdit.Logic.SubtitleFormats
{
class AdobeEncore : SubtitleFormat
{
Regex regexTimeCodes = new Regex(@"^\d\d:\d\d:\d\d:\d\d \d\d:\d\d:\d\d:\d\d ", RegexOptions.Compiled);
// Line start: two frame-style time codes (dd:dd:dd:dd), each followed by a single space.
// static readonly: one shared compiled instance that cannot be reassigned.
static readonly Regex regexTimeCodes = new Regex(@"^\d\d:\d\d:\d\d:\d\d \d\d:\d\d:\d\d:\d\d ", RegexOptions.Compiled);
public override string Extension
{

View File

@ -7,7 +7,7 @@ namespace Nikse.SubtitleEdit.Logic.SubtitleFormats
{
class AdobeEncoreLineTabs : SubtitleFormat
{
Regex regexTimeCodes = new Regex(@"^\d\d\d\d\t\d\d:\d\d:\d\d:\d\d\t\d\d:\d\d:\d\d:\d\d\t", RegexOptions.Compiled);
// Line start: four digits, tab, then two time codes (dd:dd:dd:dd) each followed by a tab.
// static readonly: one shared compiled instance that cannot be reassigned.
static readonly Regex regexTimeCodes = new Regex(@"^\d\d\d\d\t\d\d:\d\d:\d\d:\d\d\t\d\d:\d\d:\d\d:\d\d\t", RegexOptions.Compiled);
public override string Extension
{

View File

@ -7,7 +7,7 @@ namespace Nikse.SubtitleEdit.Logic.SubtitleFormats
{
class AdobeEncoreTabs : SubtitleFormat
{
Regex regexTimeCodes = new Regex(@"^\d\d:\d\d:\d\d:\d\d\t\d\d:\d\d:\d\d:\d\d\t", RegexOptions.Compiled);
// Line start: two time codes (dd:dd:dd:dd), each followed by a tab.
// static readonly: one shared compiled instance that cannot be reassigned.
static readonly Regex regexTimeCodes = new Regex(@"^\d\d:\d\d:\d\d:\d\d\t\d\d:\d\d:\d\d:\d\d\t", RegexOptions.Compiled);
public override string Extension
{

View File

@ -7,7 +7,7 @@ namespace Nikse.SubtitleEdit.Logic.SubtitleFormats
{
class AdobeEncoreWithLineNumbers : SubtitleFormat
{
Regex regexTimeCodes = new Regex(@"^\d+ \d\d:\d\d:\d\d:\d\d \d\d:\d\d:\d\d:\d\d ", RegexOptions.Compiled);
// Line start: one or more digits, a space, then two time codes (dd:dd:dd:dd) each followed by a space.
// static readonly: one shared compiled instance that cannot be reassigned.
static readonly Regex regexTimeCodes = new Regex(@"^\d+ \d\d:\d\d:\d\d:\d\d \d\d:\d\d:\d\d:\d\d ", RegexOptions.Compiled);
public override string Extension
{

View File

@ -7,7 +7,7 @@ namespace Nikse.SubtitleEdit.Logic.SubtitleFormats
{
public class AvidCaption : SubtitleFormat
{
Regex regexTimeCodes = new Regex(@"^\d\d:\d\d:\d\d:\d\d \d\d:\d\d:\d\d:\d\d$", RegexOptions.Compiled);
// Whole line: two time codes (dd:dd:dd:dd) separated by a single space.
// static readonly: one shared compiled instance that cannot be reassigned.
static readonly Regex regexTimeCodes = new Regex(@"^\d\d:\d\d:\d\d:\d\d \d\d:\d\d:\d\d:\d\d$", RegexOptions.Compiled);
public override string Extension
{

View File

@ -8,7 +8,7 @@ namespace Nikse.SubtitleEdit.Logic.SubtitleFormats
public class Csv : SubtitleFormat
{
private const string _seperator = ";";
Regex csvLine = new Regex(@"^""?\d+""?" + _seperator + @"""?\d+""?" + _seperator + @"""?\d+""?" + _seperator + @"""?[^""]*""?$", RegexOptions.Compiled);
// Whole line: three optionally double-quoted integer columns and one optionally quoted
// column without double quotes inside, joined by _seperator.
// static readonly: one shared compiled instance that cannot be reassigned.
static readonly Regex csvLine = new Regex(@"^""?\d+""?" + _seperator + @"""?\d+""?" + _seperator + @"""?\d+""?" + _seperator + @"""?[^""]*""?$", RegexOptions.Compiled);
public override string Extension
{

View File

@ -7,7 +7,7 @@ namespace Nikse.SubtitleEdit.Logic.SubtitleFormats
{
class DigiBeta : SubtitleFormat
{
Regex regexTimeCode = new Regex(@"^\d\d \d\d \d\d \d\d\t\d\d \d\d \d\d \d\d\t", RegexOptions.Compiled);
// Line start: two space-separated time codes (dd dd dd dd), each followed by a tab.
// static readonly: one shared compiled instance that cannot be reassigned.
static readonly Regex regexTimeCode = new Regex(@"^\d\d \d\d \d\d \d\d\t\d\d \d\d \d\d \d\d\t", RegexOptions.Compiled);
public override string Extension
{

View File

@ -7,7 +7,7 @@ namespace Nikse.SubtitleEdit.Logic.SubtitleFormats
{
public class DvdStudioPro : SubtitleFormat
{
readonly Regex _regexTimeCodes = new Regex(@"^\d+:\d+:\d+:\d+\t,\t\d+:\d+:\d+:\d+\t,\t.*$", RegexOptions.Compiled);
// Whole line: two time codes (d+:d+:d+:d+), each followed by tab-comma-tab, then any remaining text.
// The field was readonly before it became static; keep it readonly so the shared instance cannot be reassigned.
static readonly Regex _regexTimeCodes = new Regex(@"^\d+:\d+:\d+:\d+\t,\t\d+:\d+:\d+:\d+\t,\t.*$", RegexOptions.Compiled);
public override string Extension
{

View File

@ -7,7 +7,7 @@ namespace Nikse.SubtitleEdit.Logic.SubtitleFormats
{
public class DvdStudioProSpace : SubtitleFormat
{
readonly Regex _regexTimeCodes = new Regex(@"^\d+:\d+:\d+:\d+ , \d+:\d+:\d+:\d+ , .*$", RegexOptions.Compiled);
// Whole line: two time codes (d+:d+:d+:d+), each followed by " , ", then any remaining text.
// The field was readonly before it became static; keep it readonly so the shared instance cannot be reassigned.
static readonly Regex _regexTimeCodes = new Regex(@"^\d+:\d+:\d+:\d+ , \d+:\d+:\d+:\d+ , .*$", RegexOptions.Compiled);
public override string Extension
{

View File

@ -8,7 +8,7 @@ namespace Nikse.SubtitleEdit.Logic.SubtitleFormats
public class DvdSubtitle : SubtitleFormat
{
Regex regexTimeCodes = new Regex(@"^\{T\ \d+:\d+:\d+:\d+$", RegexOptions.Compiled);
// Whole line: "{T " followed by a time code (d+:d+:d+:d+).
// static readonly: one shared compiled instance that cannot be reassigned.
static readonly Regex regexTimeCodes = new Regex(@"^\{T\ \d+:\d+:\d+:\d+$", RegexOptions.Compiled);
public override string Extension
{

View File

@ -11,7 +11,7 @@ namespace Nikse.SubtitleEdit.Logic.SubtitleFormats
/// </summary>
public class F4Text : SubtitleFormat
{
Regex regexTimeCodes = new Regex(@"^\d\d:\d\d:\d\d-\d$", RegexOptions.Compiled);
// Whole line: dd:dd:dd, a hyphen, then a single digit.
// static readonly: one shared compiled instance that cannot be reassigned.
static readonly Regex regexTimeCodes = new Regex(@"^\d\d:\d\d:\d\d-\d$", RegexOptions.Compiled);
public override string Extension
{

View File

@ -7,7 +7,7 @@ namespace Nikse.SubtitleEdit.Logic.SubtitleFormats
{
class FabSubtitler : SubtitleFormat
{
Regex regexTimeCodes = new Regex(@"^\d\d:\d\d:\d\d:\d\d \d\d:\d\d:\d\d:\d\d$", RegexOptions.Compiled);
// Whole line: two time codes (dd:dd:dd:dd) separated by a single space.
// static readonly: one shared compiled instance that cannot be reassigned.
static readonly Regex regexTimeCodes = new Regex(@"^\d\d:\d\d:\d\d:\d\d \d\d:\d\d:\d\d:\d\d$", RegexOptions.Compiled);
public override string Extension
{

View File

@ -10,7 +10,7 @@ namespace Nikse.SubtitleEdit.Logic.SubtitleFormats
public class Idx : SubtitleFormat
{
// timestamp: 00:00:01:401, filepos: 000000000
readonly Regex _regexTimeCodes = new Regex(@"^timestamp: \d+:\d+:\d+:\d+, filepos: [\dabcdefABCDEF]+$", RegexOptions.Compiled);
// Whole line: "timestamp: d+:d+:d+:d+, filepos: " followed by hex digits.
// The field was readonly before it became static; keep it readonly so the shared instance cannot be reassigned.
static readonly Regex _regexTimeCodes = new Regex(@"^timestamp: \d+:\d+:\d+:\d+, filepos: [\dabcdefABCDEF]+$", RegexOptions.Compiled);
public Hashtable NonTimeCodes = new Hashtable();

View File

@ -7,7 +7,7 @@ namespace Nikse.SubtitleEdit.Logic.SubtitleFormats
{
public class MPlayer2 : SubtitleFormat
{
readonly Regex _regexMPlayer2Line = new Regex(@"^\[-?\d+]\[-?\d+].*$", RegexOptions.Compiled);
// Whole line: two bracketed, optionally negative integers ("[n][n]") followed by any text.
// The field was readonly before it became static; keep it readonly so the shared instance cannot be reassigned.
static readonly Regex _regexMPlayer2Line = new Regex(@"^\[-?\d+]\[-?\d+].*$", RegexOptions.Compiled);
public override string Extension
{

View File

@ -7,7 +7,7 @@ namespace Nikse.SubtitleEdit.Logic.SubtitleFormats
{
public class MicroDvd : SubtitleFormat
{
readonly Regex _regexMicroDvdLine = new Regex(@"^\{-?\d+}\{-?\d+}.*$", RegexOptions.Compiled);
// Whole line: two brace-wrapped, optionally negative integers ("{n}{n}") followed by any text.
// The field was readonly before it became static; keep it readonly so the shared instance cannot be reassigned.
static readonly Regex _regexMicroDvdLine = new Regex(@"^\{-?\d+}\{-?\d+}.*$", RegexOptions.Compiled);
public override string Extension
{

View File

@ -7,7 +7,7 @@ namespace Nikse.SubtitleEdit.Logic.SubtitleFormats
{
class PinnacleImpression : SubtitleFormat
{
Regex regexTimeCodes = new Regex(@"^\d\d:\d\d:\d\d:\d\d \d\d:\d\d:\d\d:\d\d ", RegexOptions.Compiled);
// Line start: two time codes (dd:dd:dd:dd), each followed by a single space.
// static readonly: one shared compiled instance that cannot be reassigned.
static readonly Regex regexTimeCodes = new Regex(@"^\d\d:\d\d:\d\d:\d\d \d\d:\d\d:\d\d:\d\d ", RegexOptions.Compiled);
public override string Extension
{

View File

@ -7,6 +7,8 @@ namespace Nikse.SubtitleEdit.Logic.SubtitleFormats
{
class QuickTimeText : SubtitleFormat
{
// Line start: a bracketed time code such as "[00:00:35.08]". The separator before the last
// two digits is an unescaped ".", so any single character is accepted in that position.
// static readonly: one shared compiled instance that cannot be reassigned.
static readonly Regex regexTimeCodes = new Regex(@"^\[\d\d:\d\d:\d\d.\d\d\]", RegexOptions.Compiled);
public override string Extension
{
get { return ".txt"; }
@ -77,8 +79,7 @@ namespace Nikse.SubtitleEdit.Logic.SubtitleFormats
//tout le temps,
//[00:00:35.08]
Paragraph p = null;
subtitle.Paragraphs.Clear();
var regexTimeCodes = new Regex(@"^\[\d\d:\d\d:\d\d.\d\d\]", RegexOptions.Compiled);
subtitle.Paragraphs.Clear();
foreach (string line in lines)
{
if (regexTimeCodes.IsMatch(line))

View File

@ -7,6 +7,8 @@ namespace Nikse.SubtitleEdit.Logic.SubtitleFormats
{
class Scenarist : SubtitleFormat
{
// Line start: four digits, tab, then two time codes (dd:dd:dd:dd) each followed by a tab.
// static readonly: one shared compiled instance that cannot be reassigned.
static readonly Regex regexTimeCodes = new Regex(@"^\d\d\d\d\t\d\d:\d\d:\d\d:\d\d\t\d\d:\d\d:\d\d:\d\d\t", RegexOptions.Compiled);
public override string Extension
{
get { return ".txt"; }
@ -64,7 +66,6 @@ namespace Nikse.SubtitleEdit.Logic.SubtitleFormats
//This is line two.
Paragraph p = null;
subtitle.Paragraphs.Clear();
var regexTimeCodes = new Regex(@"^\d\d\d\d\t\d\d:\d\d:\d\d:\d\d\t\d\d:\d\d:\d\d:\d\d\t", RegexOptions.Compiled);
foreach (string line in lines)
{
if (regexTimeCodes.IsMatch(line))

View File

@ -16,9 +16,9 @@ namespace Nikse.SubtitleEdit.Logic.SubtitleFormats
Paragraph _paragraph;
ExpectingLine _expecting = ExpectingLine.Number;
readonly Regex _regexTimeCodes = new Regex(@"^-?\d+:-?\d+:-?\d+[:,]-?\d+\s*-->\s*-?\d+:-?\d+:-?\d+[:,]-?\d+$", RegexOptions.Compiled);
readonly Regex _regexTimeCodes2 = new Regex(@"^\d+:\d+:\d+,\d+\s*-->\s*\d+:\d+:\d+,\d+$", RegexOptions.Compiled);
readonly Regex _buggyTimeCodes = new Regex(@"^-?\d+:-?\d+:-?\d+[¡،]-?\d+\s*-->\s*-?\d+:-?\d+:-?\d+[¡،]-?\d+$", RegexOptions.Compiled);
// These fields were readonly before they became static; keep them readonly so the shared
// instances cannot be reassigned.

// Whole line: "h:m:s<sep>ms --> h:m:s<sep>ms" where <sep> is ":" or "," and every number may carry a leading "-".
static readonly Regex _regexTimeCodes = new Regex(@"^-?\d+:-?\d+:-?\d+[:,]-?\d+\s*-->\s*-?\d+:-?\d+:-?\d+[:,]-?\d+$", RegexOptions.Compiled);
// Stricter form: only "," before the last number and no negative numbers.
static readonly Regex _regexTimeCodes2 = new Regex(@"^\d+:\d+:\d+,\d+\s*-->\s*\d+:\d+:\d+,\d+$", RegexOptions.Compiled);
// Same shape as _regexTimeCodes but with "¡" or "،" in place of the ":" / "," separator.
static readonly Regex _buggyTimeCodes = new Regex(@"^-?\d+:-?\d+:-?\d+[¡،]-?\d+\s*-->\s*-?\d+:-?\d+:-?\d+[¡،]-?\d+$", RegexOptions.Compiled);
public override string Extension
{

View File

@ -13,13 +13,14 @@ namespace Nikse.SubtitleEdit.Logic.VobSub
public readonly List<Color> Palette = new List<Color>();
public readonly List<string> Languages = new List<string>();
// Whole line: "timestamp: d+:d+:d+:d+, filepos: " followed by hex digits.
// static readonly: one shared compiled instance that cannot be reassigned.
static readonly Regex timeCodeLinePattern = new Regex(@"^timestamp: \d+:\d+:\d+:\d+, filepos: [\dabcdefABCDEF]+$", RegexOptions.Compiled);
public Idx(string fileName):this(File.ReadAllLines(fileName))
{
}
public Idx(string[] lines)
{
var timeCodeLinePattern = new Regex(@"^timestamp: \d+:\d+:\d+:\d+, filepos: [\dabcdefABCDEF]+$", RegexOptions.Compiled);
foreach (string line in lines)
{
if (timeCodeLinePattern.IsMatch(line))