Minor refactor: more efficient LanguageAutoDetect.GetCount() (one combined alternation regex instead of a separate regex pass per word)

This commit is contained in:
Waldi Ravens 2015-10-19 14:39:30 +02:00
parent be1e4f0d49
commit c4beb2bac5

View File

@ -11,12 +11,9 @@ namespace Nikse.SubtitleEdit.Core
private static int GetCount(string text, params string[] words) private static int GetCount(string text, params string[] words)
{ {
int count = 0; var options = RegexOptions.CultureInvariant | RegexOptions.ExplicitCapture;
for (int i = 0; i < words.Length; i++) var pattern = "\\b(" + string.Join("|", words) + ")\\b";
{ return Regex.Matches(text, pattern, options).Count;
count += Regex.Matches(text, "\\b" + words[i] + "\\b", (RegexOptions.CultureInvariant | RegexOptions.ExplicitCapture)).Count;
}
return count;
} }
private static int GetCountContains(string text, params string[] words) private static int GetCountContains(string text, params string[] words)
@ -103,7 +100,7 @@ namespace Nikse.SubtitleEdit.Core
private static readonly string[] AutoDetectWordsHebrew = { "אתה", "אולי", "הוא", "בסדר", "יודע", "טוב" }; private static readonly string[] AutoDetectWordsHebrew = { "אתה", "אולי", "הוא", "בסדר", "יודע", "טוב" };
private static readonly string[] AutoDetectWordsVietnamese = { "không", "[Tt]ôi", "anh", "đó", "ông" }; private static readonly string[] AutoDetectWordsVietnamese = { "không", "[Tt]ôi", "anh", "đó", "ông" };
private static readonly string[] AutoDetectWordsHungarian = { "hogy", "lesz", "tudom", "vagy", "mondtam", "még" }; private static readonly string[] AutoDetectWordsHungarian = { "hogy", "lesz", "tudom", "vagy", "mondtam", "még" };
private static readonly string[] AutoDetectWordsTurkish = { "için", "Tamam", "Hayır", "benim", "daha", "deðil", "önce", "lazým", "benim", "çalýþýyor", "burada", "efendim" }; private static readonly string[] AutoDetectWordsTurkish = { "için", "Tamam", "Hayır", "benim", "daha", "deðil", "önce", "lazým", "çalýþýyor", "burada", "efendim" };
private static readonly string[] AutoDetectWordsCroatianAndSerbian = { "sam", "ali", "nije", "samo", "ovo", "kako", "dobro", "sve", "tako", "će", "mogu", "ću", "zašto", "nešto", "za" }; private static readonly string[] AutoDetectWordsCroatianAndSerbian = { "sam", "ali", "nije", "samo", "ovo", "kako", "dobro", "sve", "tako", "će", "mogu", "ću", "zašto", "nešto", "za" };
private static readonly string[] AutoDetectWordsCroatian = { "što", "ovdje", "gdje", "kamo", "tko", "prije", "uvijek", "vrijeme", "vidjeti", "netko", private static readonly string[] AutoDetectWordsCroatian = { "što", "ovdje", "gdje", "kamo", "tko", "prije", "uvijek", "vrijeme", "vidjeti", "netko",
"vidio", "nitko", "bok", "lijepo", "oprosti", "htio", "mjesto", "oprostite", "čovjek", "dolje", "vidio", "nitko", "bok", "lijepo", "oprosti", "htio", "mjesto", "oprostite", "čovjek", "dolje",
@ -548,14 +545,14 @@ namespace Nikse.SubtitleEdit.Core
return russianEncoding; return russianEncoding;
Encoding thaiEncoding = Encoding.GetEncoding(874); // Thai Encoding thaiEncoding = Encoding.GetEncoding(874); // Thai
if (GetCount(thaiEncoding.GetString(buffer), "โอ", "โรเบิร์ต", "วิตตอเรีย", "ดร", "คุณตำรวจ", "ราเชล") + GetCount(thaiEncoding.GetString(buffer), "ไม่", "เลดดิส", "พระเจ้า", "เท็ดดี้", "หัวหน้า", "แอนดรูว์") > 5) if (GetCount(thaiEncoding.GetString(buffer), "โอ", "โรเบิร์ต", "วิตตอเรีย", "ดร", "คุณตำรวจ", "ราเชล", "ไม่", "เลดดิส", "พระเจ้า", "เท็ดดี้", "หัวหน้า", "แอนดรูว์") > 5)
return thaiEncoding; return thaiEncoding;
Encoding arabicEncoding = Encoding.GetEncoding(28596); // Arabic Encoding arabicEncoding = Encoding.GetEncoding(28596); // Arabic
Encoding hewbrewEncoding = Encoding.GetEncoding(28598); // Hebrew Encoding hewbrewEncoding = Encoding.GetEncoding(28598); // Hebrew
if (GetCount(arabicEncoding.GetString(buffer), "من", "هل", "لا", "فى", "لقد", "ما") > 5) if (GetCount(arabicEncoding.GetString(buffer), AutoDetectWordsArabic) > 5)
{ {
if (GetCount(hewbrewEncoding.GetString(buffer), "אולי", "אולי", "אולי", "אולי", "טוב", "טוב") > 10) if (GetCount(hewbrewEncoding.GetString(buffer), AutoDetectWordsHebrew) > 10)
return hewbrewEncoding; return hewbrewEncoding;
return arabicEncoding; return arabicEncoding;
} }
@ -643,14 +640,14 @@ namespace Nikse.SubtitleEdit.Core
return russianEncoding; return russianEncoding;
Encoding thaiEncoding = Encoding.GetEncoding(874); // Thai Encoding thaiEncoding = Encoding.GetEncoding(874); // Thai
if (GetCount(thaiEncoding.GetString(buffer), "โอ", "โรเบิร์ต", "วิตตอเรีย", "ดร", "คุณตำรวจ", "ราเชล") + GetCount(thaiEncoding.GetString(buffer), "ไม่", "เลดดิส", "พระเจ้า", "เท็ดดี้", "หัวหน้า", "แอนดรูว์") > 5) if (GetCount(thaiEncoding.GetString(buffer), "โอ", "โรเบิร์ต", "วิตตอเรีย", "ดร", "คุณตำรวจ", "ราเชล", "ไม่", "เลดดิส", "พระเจ้า", "เท็ดดี้", "หัวหน้า", "แอนดรูว์") > 5)
return thaiEncoding; return thaiEncoding;
Encoding arabicEncoding = Encoding.GetEncoding(28596); // Arabic Encoding arabicEncoding = Encoding.GetEncoding(28596); // Arabic
Encoding hewbrewEncoding = Encoding.GetEncoding(28598); // Hebrew Encoding hewbrewEncoding = Encoding.GetEncoding(28598); // Hebrew
if (GetCount(arabicEncoding.GetString(buffer), "من", "هل", "لا", "فى", "لقد", "ما") > 5) if (GetCount(arabicEncoding.GetString(buffer), AutoDetectWordsArabic) > 5)
{ {
if (GetCount(hewbrewEncoding.GetString(buffer), "אולי", "אולי", "אולי", "אולי", "טוב", "טוב") > 10) if (GetCount(hewbrewEncoding.GetString(buffer), AutoDetectWordsHebrew) > 10)
return hewbrewEncoding; return hewbrewEncoding;
return arabicEncoding; return arabicEncoding;
} }