Extracted language auto detection to "LanguageAutoDetect" + added two simple unit tests

This commit is contained in:
niksedk 2015-10-07 21:58:57 +02:00
parent 66672b7c03
commit 075de1b239
32 changed files with 3090 additions and 767 deletions

View File

@ -41,7 +41,7 @@ namespace Nikse.SubtitleEdit.Core.Forms
var splittedIndexes = new List<int>();
var autoBreakedIndexes = new List<int>();
var splittedSubtitle = new Subtitle();
string language = Utilities.AutoDetectGoogleLanguage(subtitle);
string language = LanguageAutoDetect.AutoDetectGoogleLanguage(subtitle);
for (int i = 0; i < subtitle.Paragraphs.Count; i++)
{
bool added = false;

716
libse/LanguageAutoDetect.cs Normal file
View File

@ -0,0 +1,716 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
namespace Nikse.SubtitleEdit.Core
{
public static class LanguageAutoDetect
{
private static int GetCount(string text, params string[] words)
{
int count = 0;
for (int i = 0; i < words.Length; i++)
{
count += Regex.Matches(text, "\\b" + words[i] + "\\b", (RegexOptions.CultureInvariant | RegexOptions.ExplicitCapture)).Count;
}
return count;
}
private static int GetCountContains(string text, params string[] words)
{
int count = 0;
for (int i = 0; i < words.Length; i++)
{
var regEx = new Regex(words[i]);
count += regEx.Matches(text).Count;
}
return count;
}
public static string AutoDetectGoogleLanguage(Encoding encoding)
{
switch (encoding.CodePage)
{
case 860:
return "pt"; // Portuguese
case 28599:
case 1254:
return "tr"; // Turkish
case 28598:
case 1255:
return "he"; // Hebrew
case 28596:
case 1256:
return "ar"; // Arabic
case 1258:
return "vi"; // Vietnamese
case 949:
case 1361:
case 20949:
case 51949:
case 50225:
return "ko"; // Korean
case 1253:
case 28597:
return "el"; // Greek
case 50220:
case 50221:
case 50222:
case 51932:
case 20932:
case 10001:
return "ja"; // Japanese
case 20000:
case 20002:
case 20936:
case 950:
case 52936:
case 54936:
case 51936:
return "zh"; // Chinese
default:
return null;
}
}
public static readonly string[] AutoDetectWordsEnglish = { "we", "are", "and", "you", "your", "what" };
public static readonly string[] AutoDetectWordsDanish = { "vi", "han", "og", "jeg", "var", "men", "gider", "bliver", "virkelig", "kommer", "tilbage", "Hej" };
public static readonly string[] AutoDetectWordsNorwegian = { "vi", "er", "og", "jeg", "var", "men" };
public static readonly string[] AutoDetectWordsSwedish = { "vi", "är", "och", "Jag", "inte", "för" };
public static readonly string[] AutoDetectWordsSpanish = { "el", "bien", "Vamos", "Hola", "casa", "con" };
public static readonly string[] AutoDetectWordsFrench = { "un", "vous", "avec", "pas", "ce", "une" };
public static readonly string[] AutoDetectWordsGerman = { "und", "auch", "sich", "bin", "hast", "möchte" };
public static readonly string[] AutoDetectWordsDutch = { "van", "een", "[Hh]et", "m(ij|ij)", "z(ij|ij)n" };
public static readonly string[] AutoDetectWordsPolish = { "Czy", "ale", "ty", "siê", "jest", "mnie" };
public static readonly string[] AutoDetectWordsItalian = { "Cosa", "sono", "Grazie", "Buongiorno", "bene", "questo", "ragazzi", "propriamente", "numero", "hanno", "giorno", "faccio", "davvero", "negativo", "essere", "vuole", "sensitivo", "venire" };
public static readonly string[] AutoDetectWordsPortuguese = { "[Nn]ão", "Então", "Estás", "isso", "com" };
public static readonly string[] AutoDetectWordsGreek = { "μου", "είναι", "Είναι", "αυτό", "Τόμπυ", "καλά", "Ενταξει", "Ενταξει", "πρεπει", "Λοιπον", "τιποτα", "ξερεις" };
public static readonly string[] AutoDetectWordsRussian = { "Это", "не", "ты", "что", "это", "Мы", "Да", "Нет", "Ты", "нет", "Он", "его", "тебя", "как", "Не", "вы", "меня", "Но", "то", "всё", "бы", "мы", "мне", "вас", "знаю", "ещё", "за", "нас", "чтобы", "был" };
public static readonly string[] AutoDetectWordsBulgarian = { "Какво", "тук", "може", "Как", "Ваше", "какво" };
public static readonly string[] AutoDetectWordsRomanian = { "Какво", "тук", "може", "Как", "Ваше", "какво" };
public static readonly string[] AutoDetectWordsArabic = { "Какво", "тук", "може", "Как", "Ваше", "какво" };
public static readonly string[] AutoDetectWordsHebrew = { "אתה", "אולי", "הוא", "בסדר", "יודע", "טוב" };
public static readonly string[] AutoDetectWordsVietnamese = { "không", "tôi", "anh", "đó", "Tôi", "ông" };
public static readonly string[] AutoDetectWordsHungarian = { "hogy", "lesz", "tudom", "vagy", "mondtam", "még" };
public static readonly string[] AutoDetectWordsTurkish = { "için", "Tamam", "Hayır", "benim", "daha", "deðil", "önce", "lazým", "benim", "çalýþýyor", "burada", "efendim" };
public static readonly string[] AutoDetectWordsCroatianAndSerbian = { "sam", "ali", "nije", "samo", "ovo", "kako", "dobro", "sve", "tako", "će", "mogu", "ću", "zašto", "nešto", "za" };
public static readonly string[] AutoDetectWordsCroatian = { "što", "ovdje", "gdje", "kamo", "tko", "prije", "uvijek", "vrijeme", "vidjeti", "netko",
"vidio", "nitko", "bok", "lijepo", "oprosti", "htio", "mjesto", "oprostite", "čovjek", "dolje",
"čovječe", "dvije", "dijete", "dio", "poslije", "događa", "vjerovati", "vjerojatno", "vjerujem", "točno",
"razumijem", "vidjela", "cijeli", "svijet", "obitelj", "volio", "sretan", "dovraga", "svijetu", "htjela",
"vidjeli", "negdje", "želio", "ponovno", "djevojka", "umrijeti", "čovjeka", "mjesta", "djeca", "osjećam",
"uopće", "djecu", "naprijed", "obitelji", "doista", "mjestu", "lijepa", "također", "riječ", "tijelo" };
public static readonly string[] AutoDetectWordsSerbian = { "šta", "ovde", "gde", "ko", "pre", "uvek", "vreme", "videti", "neko",
"video", "niko", "ćao", "lepo", "izvini", "hteo", "mesto", "izvinite", "čovek", "dole",
"čoveče", "dve", "dete", "deo", "posle", "dešava", "verovati", "verovatno", "verujem", "tačno",
"razumem", "videla", "ceo", "svet", "porodica", "voleo", "srećan", "dođavola", "svetu", "htela",
"videli", "negde", "želeo", "ponovo", "devojka", "umreti", "čoveka", "mesta", "deca", "osećam",
"uopšte", "decu", "napred", "porodicu", "zaista", "mestu", "lepa", "takođe", "reč", "telo" };
public static string AutoDetectGoogleLanguage(string text, int bestCount)
{
int count = GetCount(text, AutoDetectWordsEnglish);
if (count > bestCount)
return "en";
count = GetCount(text, AutoDetectWordsDanish);
if (count > bestCount)
{
int norwegianCount = GetCount(text, "ut", "deg", "meg", "merkelig", "mye", "spørre");
int dutchCount = GetCount(text, "van", "een", "[Hh]et", "m(ij|ij)", "z(ij|ij)n");
if (norwegianCount < 2 && dutchCount < count)
return "da";
}
count = GetCount(text, AutoDetectWordsNorwegian);
if (count > bestCount)
{
int danishCount = GetCount(text, "siger", "dig", "mig", "mærkelig", "tilbage", "spørge");
int dutchCount = GetCount(text, "van", "een", "[Hh]et", "m(ij|ij)", "z(ij|ij)n");
if (danishCount < 2 && dutchCount < count)
return "no";
}
count = GetCount(text, AutoDetectWordsSwedish);
if (count > bestCount)
return "sv";
count = GetCount(text, AutoDetectWordsSpanish);
if (count > bestCount)
{
int frenchCount = GetCount(text, "[Cc]'est", "pas", "vous", "pour", "suis", "Pourquoi", "maison", "souviens", "quelque"); // not spanish words
int portugueseCount = GetCount(text, "[NnCc]ão", "Então", "h?ouve", "pessoal", "rapariga", "tivesse", "fizeste",
"jantar", "conheço", "atenção", "foste", "milhões", "devias", "ganhar", "raios"); // not spanish words
if (frenchCount < 2 && portugueseCount < 2)
return "es";
}
count = GetCount(text, AutoDetectWordsItalian);
if (count > bestCount)
{
int frenchCount = GetCount(text, "[Cc]'est", "pas", "vous", "pour", "suis", "Pourquoi", "maison", "souviens", "quelque"); // not italian words
if (frenchCount < 2)
return "it";
}
count = GetCount(text, AutoDetectWordsFrench);
if (count > bestCount)
{
int romanianCount = GetCount(text, "[Ss]înt", "aici", "domnule", "pentru", "Vreau");
if (romanianCount < 5)
return "fr";
}
count = GetCount(text, AutoDetectWordsPortuguese);
if (count > bestCount)
return "pt"; // Portuguese
count = GetCount(text, AutoDetectWordsGerman);
if (count > bestCount)
return "de";
count = GetCount(text, AutoDetectWordsDutch);
if (count > bestCount)
return "nl";
count = GetCount(text, AutoDetectWordsPolish);
if (count > bestCount)
return "pl";
count = GetCount(text, AutoDetectWordsGreek);
if (count > bestCount)
return "el"; // Greek
count = GetCount(text, AutoDetectWordsRussian);
if (count > bestCount)
return "ru"; // Russian
count = GetCount(text, AutoDetectWordsBulgarian);
if (count > bestCount)
return "bg"; // Bulgarian
count = GetCount(text, AutoDetectWordsArabic);
if (count > bestCount)
{
if (GetCount(text, "אולי", "אולי", "אולי", "אולי", "טוב", "טוב") > 10)
return "he";
int romanianCount = GetCount(text, "sînt", "aici", "Sînt", "domnule", "pentru", "Vreau", "trãiascã", "niciodatã", "înseamnã",
"vorbesti", "oamenii", "Asteaptã", "fãcut", "Fãrã", "spune", "decât", "pentru", "vreau");
if (romanianCount > count)
return "ro"; // Romanian
romanianCount = GetCount(text, "daca", "pentru", "acum", "soare", "trebuie", "Trebuie", "nevoie", "decat", "echilibrul",
"vorbesti", "oamenii", "zeului", "vrea", "atunci", "Poate", "Acum", "memoria", "soarele");
if (romanianCount > count)
return "ro"; // Romanian
return "ar"; // Arabic
}
count = GetCount(text, AutoDetectWordsHebrew);
if (count > bestCount)
return "he"; // Hebrew
count = GetCount(text, AutoDetectWordsCroatianAndSerbian);
if (count > bestCount)
{
int croatianCount = GetCount(text, AutoDetectWordsCroatian);
int serbianCount = GetCount(text, AutoDetectWordsSerbian);
if (croatianCount > serbianCount)
return "hr"; // Croatian
return "sr"; // Serbian
}
count = GetCount(text, AutoDetectWordsVietnamese);
if (count > bestCount)
return "vi"; // Vietnamese
count = GetCount(text, AutoDetectWordsHungarian);
if (count > bestCount)
return "hu"; // Hungarian
count = GetCount(text, AutoDetectWordsTurkish);
if (count > bestCount)
return "tr"; // Turkish
count = GetCount(text, "yang", "tahu", "bisa", "akan", "tahun", "tapi", "dengan", "untuk", "rumah", "dalam", "sudah", "bertemu");
if (count > bestCount)
return "id"; // Indonesian
count = GetCount(text, "โอ", "โรเบิร์ต", "วิตตอเรีย", "ดร", "คุณตำรวจ", "ราเชล", "ไม่", "เลดดิส", "พระเจ้า", "เท็ดดี้", "หัวหน้า", "แอนดรูว์");
if (count > 10 || count > bestCount)
return "th"; // Thai
count = GetCount(text, "그리고", "아니야", "하지만", "말이야", "그들은", "우리가");
if (count > 10 || count > bestCount)
return "ko"; // Korean
count = GetCount(text, "että", "kuin", "minä", "mitään", "Mutta", "siitä", "täällä", "poika", "Kiitos", "enää", "vielä", "tässä");
if (count > bestCount)
return "fi"; // Finnish
count = GetCount(text, "sînt", "aici", "Sînt", "domnule", "pentru", "Vreau", "trãiascã", "niciodatã", "înseamnã", "vorbesti", "oamenii",
"Asteaptã", "fãcut", "Fãrã", "spune", "decât", "pentru", "vreau");
if (count > bestCount)
return "ro"; // Romanian
count = GetCount(text, "daca", "pentru", "acum", "soare", "trebuie", "Trebuie", "nevoie", "decat", "echilibrul", "vorbesti", "oamenii",
"zeului", "vrea", "atunci", "Poate", "Acum", "memoria", "soarele");
if (count > bestCount)
return "ro"; // Romanian
count = GetCountContains(text, "シ", "ュ", "シン", "シ", "ン", "ユ");
count += GetCountContains(text, "イ", "ン", "チ", "ェ", "ク", "ハ");
count += GetCountContains(text, "シ", "ュ", "う", "シ", "ン", "サ");
count += GetCountContains(text, "シ", "ュ", "シ", "ン", "だ", "う");
if (count > bestCount * 2)
return "ja"; // Japanese - not tested...
count = GetCountContains(text, "是", "是早", "吧", "的", "爱", "上好");
count += GetCountContains(text, "的", "啊", "好", "好", "亲", "的");
count += GetCountContains(text, "谢", "走", "吧", "晚", "上", "好");
count += GetCountContains(text, "来", "卡", "拉", "吐", "滚", "他");
if (count > bestCount * 2)
return "zh"; // Chinese (simplified) - not tested...
return string.Empty;
}
public static string AutoDetectGoogleLanguage(Subtitle subtitle)
{
string languageId = AutoDetectGoogleLanguageOrNull(subtitle);
if (languageId == null)
languageId = "en";
return languageId;
}
public static string AutoDetectGoogleLanguageOrNull(Subtitle subtitle)
{
var sb = new StringBuilder();
foreach (Paragraph p in subtitle.Paragraphs)
sb.AppendLine(p.Text);
string languageId = AutoDetectGoogleLanguage(sb.ToString(), subtitle.Paragraphs.Count / 14);
if (string.IsNullOrEmpty(languageId))
languageId = null;
return languageId;
}
public static string AutoDetectLanguageName(string languageName, Subtitle subtitle)
{
if (string.IsNullOrEmpty(languageName))
languageName = "en_US";
int bestCount = subtitle.Paragraphs.Count / 14;
var sb = new StringBuilder();
foreach (Paragraph p in subtitle.Paragraphs)
sb.AppendLine(p.Text);
string text = sb.ToString();
List<string> dictionaryNames = Utilities.GetDictionaryLanguages();
bool containsEnGb = false;
bool containsEnUs = false;
bool containsHrHr = false;
bool containsSrLatn = false;
foreach (string name in dictionaryNames)
{
if (name.Contains("[en_GB]"))
containsEnGb = true;
if (name.Contains("[en_US]"))
containsEnUs = true;
if (name.Contains("[hr_HR]"))
containsHrHr = true;
if (name.Contains("[sr-Latn]"))
containsSrLatn = true;
}
foreach (string name in dictionaryNames)
{
string shortName = string.Empty;
int start = name.IndexOf('[');
int end = name.IndexOf(']');
if (start > 0 && end > start)
{
start++;
shortName = name.Substring(start, end - start);
}
int count;
switch (shortName)
{
case "da_DK":
count = GetCount(text, "vi", "hun", "og", "jeg", "var", "men", "bliver", "meget", "spørger", "Hej", "utrolig", "dejligt");
if (count > bestCount)
{
int norweigianCount = GetCount(text, "ut", "deg", "meg", "merkelig", "mye", "spørre");
if (norweigianCount < 2)
languageName = shortName;
}
break;
case "nb_NO":
count = GetCount(text, AutoDetectWordsNorwegian);
if (count > bestCount)
{
int danishCount = GetCount(text, "siger", "dig", "mig", "mærkelig", "tilbage", "spørge");
int dutchCount = GetCount(text, "van", "een", "[Hh]et", "m(ij|ij)", "z(ij|ij)n");
if (danishCount < 2 && dutchCount < count)
languageName = shortName;
}
break;
case "en_US":
count = GetCount(text, AutoDetectWordsEnglish);
if (count > bestCount)
{
languageName = shortName;
if (containsEnGb)
{
int usCount = GetCount(text, "color", "flavor", "honor", "humor", "neighbor", "honor");
int gbCount = GetCount(text, "colour", "flavour", "honour", "humour", "neighbour", "honour");
if (gbCount > usCount)
languageName = "en_GB";
}
}
break;
case "en_GB":
count = GetCount(text, "we", "are", "and", "you", "your", "what");
if (count > bestCount)
{
languageName = shortName;
if (containsEnUs)
{
int usCount = GetCount(text, "color", "flavor", "honor", "humor", "neighbor", "honor");
int gbCount = GetCount(text, "colour", "flavour", "honour", "humour", "neighbour", "honour");
if (gbCount < usCount)
languageName = "en_US";
}
}
break;
case "sv_SE":
count = GetCount(text, "vi", "är", "och", "Jag", "inte", "för");
if (count > bestCount)
languageName = shortName;
break;
case "es_ES":
count = GetCount(text, AutoDetectWordsSpanish);
if (count > bestCount)
{
int frenchWords = GetCount(text, "[Cc]'est", "pas", "vous", "pour", "suis", "Pourquoi", "maison", "souviens", "quelque"); // not spanish words
if (frenchWords < 2)
languageName = shortName;
}
break;
case "fr_FR":
count = GetCount(text, AutoDetectWordsFrench);
if (count > bestCount)
{
int spanishWords = GetCount(text, "Hola", "nada", "Vamos", "pasa", "los", "como"); // not french words
int italianWords = GetCount(text, AutoDetectWordsItalian); // not italian words
if (spanishWords < 2 && italianWords < 2)
languageName = shortName;
}
break;
case "it_IT":
count = GetCount(text, AutoDetectWordsItalian);
if (count > bestCount)
{
int frenchWords = GetCount(text, "[Cc]'est", "pas", "vous", "pour", "suis", "Pourquoi", "maison", "souviens", "quelque"); // not spanish words
int spanishWords = GetCount(text, "Hola", "nada", "Vamos", "pasa", "los", "como"); // not french words
if (frenchWords < 2 && spanishWords < 2)
languageName = shortName;
}
break;
case "de_DE":
count = GetCount(text, "und", "auch", "sich", "bin", "hast", "möchte");
if (count > bestCount)
languageName = shortName;
break;
case "nl_NL":
count = GetCount(text, "van", "een", "[Hh]et", "m(ij|ij)", "z(ij|ij)n");
if (count > bestCount)
languageName = shortName;
break;
case "pl_PL":
count = GetCount(text, "Czy", "ale", "ty", "siê", "jest", "mnie");
if (count > bestCount)
languageName = shortName;
break;
case "el_GR":
count = GetCount(text, AutoDetectWordsGreek);
if (count > bestCount)
languageName = shortName;
break;
case "ru_RU":
count = GetCount(text, AutoDetectWordsRussian);
if (count > bestCount)
languageName = shortName;
break;
case "ro_RO":
count = GetCount(text, "sînt", "aici", "Sînt", "domnule", "pentru", "Vreau", "trãiascã", "niciodatã", "înseamnã", "vorbesti", "oamenii", "Asteaptã",
"fãcut", "Fãrã", "spune", "decât", "pentru", "vreau");
if (count > bestCount)
{
languageName = shortName;
}
else
{
count = GetCount(text, "daca", "pentru", "acum", "soare", "trebuie", "Trebuie", "nevoie", "decat", "echilibrul", "vorbesti", "oamenii", "zeului",
"vrea", "atunci", "Poate", "Acum", "memoria", "soarele");
if (count > bestCount)
languageName = shortName;
}
break;
case "hr_HR": // Croatian
count = GetCount(text, AutoDetectWordsCroatianAndSerbian);
if (count > bestCount)
{
languageName = shortName;
if (containsSrLatn)
{
int croatianCount = GetCount(text, AutoDetectWordsCroatian);
int serbianCount = GetCount(text, AutoDetectWordsSerbian);
if (serbianCount > croatianCount)
languageName = "sr-Latn";
}
}
break;
case "sr-Latn": // Serbian (Latin)
count = GetCount(text, AutoDetectWordsCroatianAndSerbian);
if (count > bestCount)
{
languageName = shortName;
if (containsHrHr)
{
int croatianCount = GetCount(text, AutoDetectWordsCroatian);
int serbianCount = GetCount(text, AutoDetectWordsSerbian);
if (serbianCount < croatianCount)
languageName = "hr_HR";
}
}
break;
case "pt_PT": // Portuguese
count = GetCount(text, AutoDetectWordsPortuguese);
if (count > bestCount)
languageName = shortName;
break;
case "pt_BR": // Portuguese (Brasil)
count = GetCount(text, AutoDetectWordsPortuguese);
if (count > bestCount)
languageName = shortName;
break;
case "hu_HU": // Hungarian
count = GetCount(text, AutoDetectWordsHungarian);
if (count > bestCount)
languageName = shortName;
break;
}
}
return languageName;
}
public static Encoding DetectAnsiEncoding(byte[] buffer)
{
if (Utilities.IsRunningOnMono())
return Encoding.Default;
try
{
Encoding encoding = DetectEncoding.EncodingTools.DetectInputCodepage(buffer);
Encoding greekEncoding = Encoding.GetEncoding(1253); // Greek
if (GetCount(greekEncoding.GetString(buffer), AutoDetectWordsGreek) > 5)
return greekEncoding;
Encoding russianEncoding = Encoding.GetEncoding(1251); // Cyrillic
if (GetCount(russianEncoding.GetString(buffer), "что", "быть", "весь", "этот", "один", "такой") > 5) // Russian
return russianEncoding;
if (GetCount(russianEncoding.GetString(buffer), "Какво", "тук", "може", "Как", "Ваше", "какво") > 5) // Bulgarian
return russianEncoding;
russianEncoding = Encoding.GetEncoding(28595); // Russian
if (GetCount(russianEncoding.GetString(buffer), "что", "быть", "весь", "этот", "один", "такой") > 5) // Russian
return russianEncoding;
Encoding thaiEncoding = Encoding.GetEncoding(874); // Thai
if (GetCount(thaiEncoding.GetString(buffer), "โอ", "โรเบิร์ต", "วิตตอเรีย", "ดร", "คุณตำรวจ", "ราเชล") + GetCount(thaiEncoding.GetString(buffer), "ไม่", "เลดดิส", "พระเจ้า", "เท็ดดี้", "หัวหน้า", "แอนดรูว์") > 5)
return thaiEncoding;
Encoding arabicEncoding = Encoding.GetEncoding(28596); // Arabic
Encoding hewbrewEncoding = Encoding.GetEncoding(28598); // Hebrew
if (GetCount(arabicEncoding.GetString(buffer), "من", "هل", "لا", "فى", "لقد", "ما") > 5)
{
if (GetCount(hewbrewEncoding.GetString(buffer), "אולי", "אולי", "אולי", "אולי", "טוב", "טוב") > 10)
return hewbrewEncoding;
return arabicEncoding;
}
if (GetCount(hewbrewEncoding.GetString(buffer), AutoDetectWordsHebrew) > 5)
return hewbrewEncoding;
return encoding;
}
catch
{
return Encoding.Default;
}
}
public static Encoding GetEncodingFromFile(string fileName)
{
var encoding = Encoding.Default;
try
{
foreach (EncodingInfo ei in Encoding.GetEncodings())
{
if (ei.CodePage + ": " + ei.DisplayName == Configuration.Settings.General.DefaultEncoding &&
ei.Name != Encoding.UTF8.BodyName &&
ei.Name != Encoding.Unicode.BodyName)
{
encoding = ei.GetEncoding();
break;
}
}
using (var file = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
{
var bom = new byte[12]; // Get the byte-order mark, if there is one
file.Position = 0;
file.Read(bom, 0, 12);
if (bom[0] == 0xef && bom[1] == 0xbb && bom[2] == 0xbf)
encoding = Encoding.UTF8;
else if (bom[0] == 0xff && bom[1] == 0xfe)
encoding = Encoding.Unicode;
else if (bom[0] == 0xfe && bom[1] == 0xff) // utf-16 and ucs-2
encoding = Encoding.BigEndianUnicode;
else if (bom[0] == 0 && bom[1] == 0 && bom[2] == 0xfe && bom[3] == 0xff) // ucs-4
encoding = Encoding.UTF32;
else if (bom[0] == 0x2b && bom[1] == 0x2f && bom[2] == 0x76 && (bom[3] == 0x38 || bom[3] == 0x39 || bom[3] == 0x2b || bom[3] == 0x2f)) // utf-7
encoding = Encoding.UTF7;
else if (file.Length > 12)
{
long length = file.Length;
if (length > 500000)
length = 500000;
file.Position = 0;
var buffer = new byte[length];
file.Read(buffer, 0, (int)length);
bool couldBeUtf8;
if (IsUtf8(buffer, out couldBeUtf8))
{
encoding = Encoding.UTF8;
}
else if (couldBeUtf8 && Configuration.Settings.General.DefaultEncoding == Encoding.UTF8.BodyName)
{ // keep utf-8 encoding if it's default
encoding = Encoding.UTF8;
}
else if (couldBeUtf8 && fileName.EndsWith(".xml", StringComparison.OrdinalIgnoreCase) && Encoding.Default.GetString(buffer).ToLower().Replace('\'', '"').Contains("encoding=\"utf-8\""))
{ // keep utf-8 encoding for xml files with utf-8 in header (without any utf-8 encoded characters, but with only allowed utf-8 characters)
encoding = Encoding.UTF8;
}
else if (Configuration.Settings.General.AutoGuessAnsiEncoding)
{
encoding = DetectAnsiEncoding(buffer);
Encoding greekEncoding = Encoding.GetEncoding(1253); // Greek
if (GetCount(greekEncoding.GetString(buffer), AutoDetectWordsGreek) > 5)
return greekEncoding;
Encoding russianEncoding = Encoding.GetEncoding(1251); // Cyrillic
if (GetCount(russianEncoding.GetString(buffer), "что", "быть", "весь", "этот", "один", "такой") > 5) // Russian
return russianEncoding;
if (GetCount(russianEncoding.GetString(buffer), "Какво", "тук", "може", "Как", "Ваше", "какво") > 5) // Bulgarian
return russianEncoding;
russianEncoding = Encoding.GetEncoding(28595); // Russian
if (GetCount(russianEncoding.GetString(buffer), "что", "быть", "весь", "этот", "один", "такой") > 5)
return russianEncoding;
Encoding thaiEncoding = Encoding.GetEncoding(874); // Thai
if (GetCount(thaiEncoding.GetString(buffer), "โอ", "โรเบิร์ต", "วิตตอเรีย", "ดร", "คุณตำรวจ", "ราเชล") + GetCount(thaiEncoding.GetString(buffer), "ไม่", "เลดดิส", "พระเจ้า", "เท็ดดี้", "หัวหน้า", "แอนดรูว์") > 5)
return thaiEncoding;
Encoding arabicEncoding = Encoding.GetEncoding(28596); // Arabic
Encoding hewbrewEncoding = Encoding.GetEncoding(28598); // Hebrew
if (GetCount(arabicEncoding.GetString(buffer), "من", "هل", "لا", "فى", "لقد", "ما") > 5)
{
if (GetCount(hewbrewEncoding.GetString(buffer), "אולי", "אולי", "אולי", "אולי", "טוב", "טוב") > 10)
return hewbrewEncoding;
return arabicEncoding;
}
if (GetCount(hewbrewEncoding.GetString(buffer), AutoDetectWordsHebrew) > 5)
return hewbrewEncoding;
Encoding romanianEncoding = Encoding.GetEncoding(1250); // Romanian
if (GetCount(romanianEncoding.GetString(buffer), "să", "şi", "văzut", "regulă", "găsit", "viaţă") > 99)
return romanianEncoding;
Encoding koreanEncoding = Encoding.GetEncoding(949); // Korean
if (GetCount(koreanEncoding.GetString(buffer), "그리고", "아니야", "하지만", "말이야", "그들은", "우리가") > 5)
return koreanEncoding;
}
}
}
}
catch
{
}
return encoding;
}
/// <summary>
/// Will try to determine if buffer is utf-8 encoded or not.
/// If any non-utf8 sequences are found then false is returned, if no utf8 multibytes sequences are found then false is returned.
/// </summary>
private static bool IsUtf8(byte[] buffer, out bool couldBeUtf8)
{
couldBeUtf8 = false;
int utf8Count = 0;
int i = 0;
while (i < buffer.Length - 3)
{
byte b = buffer[i];
if (b > 127)
{
if (b >= 194 && b <= 223 && buffer[i + 1] >= 128 && buffer[i + 1] <= 191)
{ // 2-byte sequence
utf8Count++;
i++;
}
else if (b >= 224 && b <= 239 && buffer[i + 1] >= 128 && buffer[i + 1] <= 191 &&
buffer[i + 2] >= 128 && buffer[i + 2] <= 191)
{ // 3-byte sequence
utf8Count++;
i += 2;
}
else if (b >= 240 && b <= 244 && buffer[i + 1] >= 128 && buffer[i + 1] <= 191 &&
buffer[i + 2] >= 128 && buffer[i + 2] <= 191 &&
buffer[i + 3] >= 128 && buffer[i + 3] <= 191)
{ // 4-byte sequence
utf8Count++;
i += 3;
}
else
{
return false;
}
}
i++;
}
couldBeUtf8 = true;
if (utf8Count == 0)
return false; // not utf-8 (no characters utf-8 encoded...)
return true;
}
}
}

View File

@ -173,6 +173,7 @@
<Compile Include="ImageSplitter.cs" />
<Compile Include="ImageSplitterItem.cs" />
<Compile Include="Language.cs" />
<Compile Include="LanguageAutoDetect.cs" />
<Compile Include="LanguageDeserializer.cs" />
<Compile Include="LanguageStructure.cs" />
<Compile Include="ManagedBitmap.cs" />

View File

@ -139,7 +139,7 @@ namespace Nikse.SubtitleEdit.Core
{
try
{
sr = new StreamReader(fileName, Utilities.GetEncodingFromFile(fileName), true);
sr = new StreamReader(fileName, LanguageAutoDetect.GetEncodingFromFile(fileName), true);
}
catch
{

View File

@ -215,7 +215,7 @@ namespace Nikse.SubtitleEdit.Core.SubtitleFormats
}
}
var language = Utilities.AutoDetectGoogleLanguage(subtitle);
var language = LanguageAutoDetect.AutoDetectGoogleLanguage(subtitle);
if (language == "he") // Hebrew
{
_languageIdLine1 = LanguageIdHebrew;

View File

@ -82,7 +82,7 @@ namespace Nikse.SubtitleEdit.Core.SubtitleFormats
string languageEnglishName;
try
{
string languageShortName = Utilities.AutoDetectGoogleLanguage(subtitle);
string languageShortName = LanguageAutoDetect.AutoDetectGoogleLanguage(subtitle);
var ci = CultureInfo.CreateSpecificCulture(languageShortName);
languageEnglishName = ci.EnglishName;
int indexOfStartP = languageEnglishName.IndexOf('(');

View File

@ -39,7 +39,7 @@ namespace Nikse.SubtitleEdit.Core.SubtitleFormats
public override string ToText(Subtitle subtitle, string title)
{
string language = Utilities.AutoDetectLanguageName("en_US", subtitle);
string language = LanguageAutoDetect.AutoDetectLanguageName("en_US", subtitle);
var ci = CultureInfo.GetCultureInfo(language.Replace("_", "-"));
string languageTag = string.Format("{0}CC", language.Replace("_", string.Empty).ToUpper());
string languageName = ci.Parent.EnglishName;

View File

@ -699,7 +699,7 @@ namespace Nikse.SubtitleEdit.Core.SubtitleFormats
var sb = new StringBuilder();
sb.AppendLine("Scenarist_SCC V1.0");
sb.AppendLine();
string language = Utilities.AutoDetectGoogleLanguage(subtitle);
string language = LanguageAutoDetect.AutoDetectGoogleLanguage(subtitle);
for (int i = 0; i < subtitle.Paragraphs.Count; i++)
{
Paragraph p = subtitle.Paragraphs[i];

View File

@ -40,7 +40,7 @@ namespace Nikse.SubtitleEdit.Core.SubtitleFormats
" <body />" + Environment.NewLine +
"</tmx>";
string lang = Utilities.AutoDetectLanguageName("en_US", subtitle);
string lang = LanguageAutoDetect.AutoDetectLanguageName("en_US", subtitle);
if (lang.StartsWith("en_"))
lang = "EN";
else if (lang.Length == 5)

View File

@ -744,202 +744,6 @@ namespace Nikse.SubtitleEdit.Core
return s;
}
public static Encoding GetEncodingFromFile(string fileName)
{
Encoding encoding = Encoding.Default;
try
{
foreach (EncodingInfo ei in Encoding.GetEncodings())
{
if (ei.CodePage + ": " + ei.DisplayName == Configuration.Settings.General.DefaultEncoding &&
ei.Name != Encoding.UTF8.BodyName &&
ei.Name != Encoding.Unicode.BodyName)
{
encoding = ei.GetEncoding();
break;
}
}
using (var file = new FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.ReadWrite))
{
var bom = new byte[12]; // Get the byte-order mark, if there is one
file.Position = 0;
file.Read(bom, 0, 12);
if (bom[0] == 0xef && bom[1] == 0xbb && bom[2] == 0xbf)
encoding = Encoding.UTF8;
else if (bom[0] == 0xff && bom[1] == 0xfe)
encoding = Encoding.Unicode;
else if (bom[0] == 0xfe && bom[1] == 0xff) // utf-16 and ucs-2
encoding = Encoding.BigEndianUnicode;
else if (bom[0] == 0 && bom[1] == 0 && bom[2] == 0xfe && bom[3] == 0xff) // ucs-4
encoding = Encoding.UTF32;
else if (bom[0] == 0x2b && bom[1] == 0x2f && bom[2] == 0x76 && (bom[3] == 0x38 || bom[3] == 0x39 || bom[3] == 0x2b || bom[3] == 0x2f)) // utf-7
encoding = Encoding.UTF7;
else if (file.Length > 12)
{
long length = file.Length;
if (length > 500000)
length = 500000;
file.Position = 0;
var buffer = new byte[length];
file.Read(buffer, 0, (int)length);
bool couldBeUtf8;
if (IsUtf8(buffer, out couldBeUtf8))
{
encoding = Encoding.UTF8;
}
else if (couldBeUtf8 && Configuration.Settings.General.DefaultEncoding == Encoding.UTF8.BodyName)
{ // keep utf-8 encoding if it's default
encoding = Encoding.UTF8;
}
else if (couldBeUtf8 && fileName.EndsWith(".xml", StringComparison.OrdinalIgnoreCase) && Encoding.Default.GetString(buffer).ToLower().Replace('\'', '"').Contains("encoding=\"utf-8\""))
{ // keep utf-8 encoding for xml files with utf-8 in header (without any utf-8 encoded characters, but with only allowed utf-8 characters)
encoding = Encoding.UTF8;
}
else if (Configuration.Settings.General.AutoGuessAnsiEncoding)
{
encoding = DetectAnsiEncoding(buffer);
Encoding greekEncoding = Encoding.GetEncoding(1253); // Greek
if (GetCount(greekEncoding.GetString(buffer), AutoDetectWordsGreek) > 5)
return greekEncoding;
Encoding russianEncoding = Encoding.GetEncoding(1251); // Cyrillic
if (GetCount(russianEncoding.GetString(buffer), "что", "быть", "весь", "этот", "один", "такой") > 5) // Russian
return russianEncoding;
if (GetCount(russianEncoding.GetString(buffer), "Какво", "тук", "може", "Как", "Ваше", "какво") > 5) // Bulgarian
return russianEncoding;
russianEncoding = Encoding.GetEncoding(28595); // Russian
if (GetCount(russianEncoding.GetString(buffer), "что", "быть", "весь", "этот", "один", "такой") > 5)
return russianEncoding;
Encoding thaiEncoding = Encoding.GetEncoding(874); // Thai
if (GetCount(thaiEncoding.GetString(buffer), "โอ", "โรเบิร์ต", "วิตตอเรีย", "ดร", "คุณตำรวจ", "ราเชล") + GetCount(thaiEncoding.GetString(buffer), "ไม่", "เลดดิส", "พระเจ้า", "เท็ดดี้", "หัวหน้า", "แอนดรูว์") > 5)
return thaiEncoding;
Encoding arabicEncoding = Encoding.GetEncoding(28596); // Arabic
Encoding hewbrewEncoding = Encoding.GetEncoding(28598); // Hebrew
if (GetCount(arabicEncoding.GetString(buffer), "من", "هل", "لا", "فى", "لقد", "ما") > 5)
{
if (GetCount(hewbrewEncoding.GetString(buffer), "אולי", "אולי", "אולי", "אולי", "טוב", "טוב") > 10)
return hewbrewEncoding;
return arabicEncoding;
}
if (GetCount(hewbrewEncoding.GetString(buffer), AutoDetectWordsHebrew) > 5)
return hewbrewEncoding;
Encoding romanianEncoding = Encoding.GetEncoding(1250); // Romanian
if (GetCount(romanianEncoding.GetString(buffer), "să", "şi", "văzut", "regulă", "găsit", "viaţă") > 99)
return romanianEncoding;
Encoding koreanEncoding = Encoding.GetEncoding(949); // Korean
if (GetCount(koreanEncoding.GetString(buffer), "그리고", "아니야", "하지만", "말이야", "그들은", "우리가") > 5)
return koreanEncoding;
}
}
}
}
catch
{
}
return encoding;
}
/// <summary>
/// Will try to determine if buffer is utf-8 encoded or not.
/// If any non-utf8 sequences are found then false is returned, if no utf8 multibytes sequences are found then false is returned.
/// </summary>
private static bool IsUtf8(byte[] buffer, out bool couldBeUtf8)
{
couldBeUtf8 = false;
int utf8Count = 0;
int i = 0;
while (i < buffer.Length - 3)
{
byte b = buffer[i];
if (b > 127)
{
if (b >= 194 && b <= 223 && buffer[i + 1] >= 128 && buffer[i + 1] <= 191)
{ // 2-byte sequence
utf8Count++;
i++;
}
else if (b >= 224 && b <= 239 && buffer[i + 1] >= 128 && buffer[i + 1] <= 191 &&
buffer[i + 2] >= 128 && buffer[i + 2] <= 191)
{ // 3-byte sequence
utf8Count++;
i += 2;
}
else if (b >= 240 && b <= 244 && buffer[i + 1] >= 128 && buffer[i + 1] <= 191 &&
buffer[i + 2] >= 128 && buffer[i + 2] <= 191 &&
buffer[i + 3] >= 128 && buffer[i + 3] <= 191)
{ // 4-byte sequence
utf8Count++;
i += 3;
}
else
{
return false;
}
}
i++;
}
couldBeUtf8 = true;
if (utf8Count == 0)
return false; // not utf-8 (no characters utf-8 encoded...)
return true;
}
public static Encoding DetectAnsiEncoding(byte[] buffer)
{
if (IsRunningOnMono())
return Encoding.Default;
try
{
Encoding encoding = DetectEncoding.EncodingTools.DetectInputCodepage(buffer);
Encoding greekEncoding = Encoding.GetEncoding(1253); // Greek
if (GetCount(greekEncoding.GetString(buffer), AutoDetectWordsGreek) > 5)
return greekEncoding;
Encoding russianEncoding = Encoding.GetEncoding(1251); // Cyrillic
if (GetCount(russianEncoding.GetString(buffer), "что", "быть", "весь", "этот", "один", "такой") > 5) // Russian
return russianEncoding;
if (GetCount(russianEncoding.GetString(buffer), "Какво", "тук", "може", "Как", "Ваше", "какво") > 5) // Bulgarian
return russianEncoding;
russianEncoding = Encoding.GetEncoding(28595); // Russian
if (GetCount(russianEncoding.GetString(buffer), "что", "быть", "весь", "этот", "один", "такой") > 5) // Russian
return russianEncoding;
Encoding thaiEncoding = Encoding.GetEncoding(874); // Thai
if (GetCount(thaiEncoding.GetString(buffer), "โอ", "โรเบิร์ต", "วิตตอเรีย", "ดร", "คุณตำรวจ", "ราเชล") + GetCount(thaiEncoding.GetString(buffer), "ไม่", "เลดดิส", "พระเจ้า", "เท็ดดี้", "หัวหน้า", "แอนดรูว์") > 5)
return thaiEncoding;
Encoding arabicEncoding = Encoding.GetEncoding(28596); // Arabic
Encoding hewbrewEncoding = Encoding.GetEncoding(28598); // Hebrew
if (GetCount(arabicEncoding.GetString(buffer), "من", "هل", "لا", "فى", "لقد", "ما") > 5)
{
if (GetCount(hewbrewEncoding.GetString(buffer), "אולי", "אולי", "אולי", "אולי", "טוב", "טוב") > 10)
return hewbrewEncoding;
return arabicEncoding;
}
if (GetCount(hewbrewEncoding.GetString(buffer), AutoDetectWordsHebrew) > 5)
return hewbrewEncoding;
return encoding;
}
catch
{
return Encoding.Default;
}
}
public static string DictionaryFolder
{
get
@ -995,513 +799,6 @@ namespace Nikse.SubtitleEdit.Core
return duration;
}
private static int GetCount(string text, params string[] words)
{
int count = 0;
for (int i = 0; i < words.Length; i++)
{
count += Regex.Matches(text, "\\b" + words[i] + "\\b", (RegexOptions.CultureInvariant | RegexOptions.ExplicitCapture)).Count;
}
return count;
}
private static int GetCountContains(string text, params string[] words)
{
int count = 0;
for (int i = 0; i < words.Length; i++)
{
var regEx = new Regex(words[i]);
count += regEx.Matches(text).Count;
}
return count;
}
public static string AutoDetectGoogleLanguage(Encoding encoding)
{
switch (encoding.CodePage)
{
case 860:
return "pt"; // Portuguese
case 28599:
case 1254:
return "tr"; // Turkish
case 28598:
case 1255:
return "he"; // Hebrew
case 28596:
case 1256:
return "ar"; // Arabic
case 1258:
return "vi"; // Vietnamese
case 949:
case 1361:
case 20949:
case 51949:
case 50225:
return "ko"; // Korean
case 1253:
case 28597:
return "el"; // Greek
case 50220:
case 50221:
case 50222:
case 51932:
case 20932:
case 10001:
return "ja"; // Japanese
case 20000:
case 20002:
case 20936:
case 950:
case 52936:
case 54936:
case 51936:
return "zh"; // Chinese
default:
return null;
}
}
public static readonly string[] AutoDetectWordsEnglish = { "we", "are", "and", "you", "your", "what" };
public static readonly string[] AutoDetectWordsDanish = { "vi", "han", "og", "jeg", "var", "men", "gider", "bliver", "virkelig", "kommer", "tilbage", "Hej" };
public static readonly string[] AutoDetectWordsNorwegian = { "vi", "er", "og", "jeg", "var", "men" };
public static readonly string[] AutoDetectWordsSwedish = { "vi", "är", "och", "Jag", "inte", "för" };
public static readonly string[] AutoDetectWordsSpanish = { "el", "bien", "Vamos", "Hola", "casa", "con" };
public static readonly string[] AutoDetectWordsFrench = { "un", "vous", "avec", "pas", "ce", "une" };
public static readonly string[] AutoDetectWordsGerman = { "und", "auch", "sich", "bin", "hast", "möchte" };
public static readonly string[] AutoDetectWordsDutch = { "van", "een", "[Hh]et", "m(ij|ij)", "z(ij|ij)n" };
public static readonly string[] AutoDetectWordsPolish = { "Czy", "ale", "ty", "siê", "jest", "mnie" };
public static readonly string[] AutoDetectWordsItalian = { "Cosa", "sono", "Grazie", "Buongiorno", "bene", "questo", "ragazzi", "propriamente", "numero", "hanno", "giorno", "faccio", "davvero", "negativo", "essere", "vuole", "sensitivo", "venire" };
public static readonly string[] AutoDetectWordsPortuguese = { "[Nn]ão", "Então", "Estás", "isso", "com" };
public static readonly string[] AutoDetectWordsGreek = { "μου", "είναι", "Είναι", "αυτό", "Τόμπυ", "καλά", "Ενταξει", "Ενταξει", "πρεπει", "Λοιπον", "τιποτα", "ξερεις" };
public static readonly string[] AutoDetectWordsRussian = { "Это", "не", "ты", "что", "это", "Мы", "Да", "Нет", "Ты", "нет", "Он", "его", "тебя", "как", "Не", "вы", "меня", "Но", "то", "всё", "бы", "мы", "мне", "вас", "знаю", "ещё", "за", "нас", "чтобы", "был" };
public static readonly string[] AutoDetectWordsBulgarian = { "Какво", "тук", "може", "Как", "Ваше", "какво" };
public static readonly string[] AutoDetectWordsRomanian = { "Какво", "тук", "може", "Как", "Ваше", "какво" };
public static readonly string[] AutoDetectWordsArabic = { "Какво", "тук", "може", "Как", "Ваше", "какво" };
public static readonly string[] AutoDetectWordsHebrew = { "אתה", "אולי", "הוא", "בסדר", "יודע", "טוב" };
public static readonly string[] AutoDetectWordsVietnamese = { "không", "tôi", "anh", "đó", "Tôi", "ông" };
public static readonly string[] AutoDetectWordsHungarian = { "hogy", "lesz", "tudom", "vagy", "mondtam", "még" };
public static readonly string[] AutoDetectWordsTurkish = { "için", "Tamam", "Hayır", "benim", "daha", "deðil", "önce", "lazým", "benim", "çalýþýyor", "burada", "efendim" };
public static readonly string[] AutoDetectWordsCroatianAndSerbian = { "sam", "ali", "nije", "samo", "ovo", "kako", "dobro", "sve", "tako", "će", "mogu", "ću", "zašto", "nešto", "za" };
public static readonly string[] AutoDetectWordsCroatian = { "što", "ovdje", "gdje", "kamo", "tko", "prije", "uvijek", "vrijeme", "vidjeti", "netko",
"vidio", "nitko", "bok", "lijepo", "oprosti", "htio", "mjesto", "oprostite", "čovjek", "dolje",
"čovječe", "dvije", "dijete", "dio", "poslije", "događa", "vjerovati", "vjerojatno", "vjerujem", "točno",
"razumijem", "vidjela", "cijeli", "svijet", "obitelj", "volio", "sretan", "dovraga", "svijetu", "htjela",
"vidjeli", "negdje", "želio", "ponovno", "djevojka", "umrijeti", "čovjeka", "mjesta", "djeca", "osjećam",
"uopće", "djecu", "naprijed", "obitelji", "doista", "mjestu", "lijepa", "također", "riječ", "tijelo" };
public static readonly string[] AutoDetectWordsSerbian = { "šta", "ovde", "gde", "ko", "pre", "uvek", "vreme", "videti", "neko",
"video", "niko", "ćao", "lepo", "izvini", "hteo", "mesto", "izvinite", "čovek", "dole",
"čoveče", "dve", "dete", "deo", "posle", "dešava", "verovati", "verovatno", "verujem", "tačno",
"razumem", "videla", "ceo", "svet", "porodica", "voleo", "srećan", "dođavola", "svetu", "htela",
"videli", "negde", "želeo", "ponovo", "devojka", "umreti", "čoveka", "mesta", "deca", "osećam",
"uopšte", "decu", "napred", "porodicu", "zaista", "mestu", "lepa", "takođe", "reč", "telo" };
public static string AutoDetectGoogleLanguage(string text, int bestCount)
{
int count = GetCount(text, AutoDetectWordsEnglish);
if (count > bestCount)
return "en";
count = GetCount(text, AutoDetectWordsDanish);
if (count > bestCount)
{
int norwegianCount = GetCount(text, "ut", "deg", "meg", "merkelig", "mye", "spørre");
int dutchCount = GetCount(text, "van", "een", "[Hh]et", "m(ij|ij)", "z(ij|ij)n");
if (norwegianCount < 2 && dutchCount < count)
return "da";
}
count = GetCount(text, AutoDetectWordsNorwegian);
if (count > bestCount)
{
int danishCount = GetCount(text, "siger", "dig", "mig", "mærkelig", "tilbage", "spørge");
int dutchCount = GetCount(text, "van", "een", "[Hh]et", "m(ij|ij)", "z(ij|ij)n");
if (danishCount < 2 && dutchCount < count)
return "no";
}
count = GetCount(text, AutoDetectWordsSwedish);
if (count > bestCount)
return "sv";
count = GetCount(text, AutoDetectWordsSpanish);
if (count > bestCount)
{
int frenchCount = GetCount(text, "[Cc]'est", "pas", "vous", "pour", "suis", "Pourquoi", "maison", "souviens", "quelque"); // not spanish words
int portugueseCount = GetCount(text, "[NnCc]ão", "Então", "h?ouve", "pessoal", "rapariga", "tivesse", "fizeste",
"jantar", "conheço", "atenção", "foste", "milhões", "devias", "ganhar", "raios"); // not spanish words
if (frenchCount < 2 && portugueseCount < 2)
return "es";
}
count = GetCount(text, AutoDetectWordsItalian);
if (count > bestCount)
{
int frenchCount = GetCount(text, "[Cc]'est", "pas", "vous", "pour", "suis", "Pourquoi", "maison", "souviens", "quelque"); // not italian words
if (frenchCount < 2)
return "it";
}
count = GetCount(text, AutoDetectWordsFrench);
if (count > bestCount)
{
int romanianCount = GetCount(text, "[Ss]înt", "aici", "domnule", "pentru", "Vreau");
if (romanianCount < 5)
return "fr";
}
count = GetCount(text, AutoDetectWordsPortuguese);
if (count > bestCount)
return "pt"; // Portuguese
count = GetCount(text, AutoDetectWordsGerman);
if (count > bestCount)
return "de";
count = GetCount(text, AutoDetectWordsDutch);
if (count > bestCount)
return "nl";
count = GetCount(text, AutoDetectWordsPolish);
if (count > bestCount)
return "pl";
count = GetCount(text, AutoDetectWordsGreek);
if (count > bestCount)
return "el"; // Greek
count = GetCount(text, AutoDetectWordsRussian);
if (count > bestCount)
return "ru"; // Russian
count = GetCount(text, AutoDetectWordsBulgarian);
if (count > bestCount)
return "bg"; // Bulgarian
count = GetCount(text, AutoDetectWordsArabic);
if (count > bestCount)
{
if (GetCount(text, "אולי", "אולי", "אולי", "אולי", "טוב", "טוב") > 10)
return "he";
int romanianCount = GetCount(text, "sînt", "aici", "Sînt", "domnule", "pentru", "Vreau", "trãiascã", "niciodatã", "înseamnã",
"vorbesti", "oamenii", "Asteaptã", "fãcut", "Fãrã", "spune", "decât", "pentru", "vreau");
if (romanianCount > count)
return "ro"; // Romanian
romanianCount = GetCount(text, "daca", "pentru", "acum", "soare", "trebuie", "Trebuie", "nevoie", "decat", "echilibrul",
"vorbesti", "oamenii", "zeului", "vrea", "atunci", "Poate", "Acum", "memoria", "soarele");
if (romanianCount > count)
return "ro"; // Romanian
return "ar"; // Arabic
}
count = GetCount(text, AutoDetectWordsHebrew);
if (count > bestCount)
return "he"; // Hebrew
count = GetCount(text, AutoDetectWordsCroatianAndSerbian);
if (count > bestCount)
{
int croatianCount = GetCount(text, AutoDetectWordsCroatian);
int serbianCount = GetCount(text, AutoDetectWordsSerbian);
if (croatianCount > serbianCount)
return "hr"; // Croatian
return "sr"; // Serbian
}
count = GetCount(text, AutoDetectWordsVietnamese);
if (count > bestCount)
return "vi"; // Vietnamese
count = GetCount(text, AutoDetectWordsHungarian);
if (count > bestCount)
return "hu"; // Hungarian
count = GetCount(text, AutoDetectWordsTurkish);
if (count > bestCount)
return "tr"; // Turkish
count = GetCount(text, "yang", "tahu", "bisa", "akan", "tahun", "tapi", "dengan", "untuk", "rumah", "dalam", "sudah", "bertemu");
if (count > bestCount)
return "id"; // Indonesian
count = GetCount(text, "โอ", "โรเบิร์ต", "วิตตอเรีย", "ดร", "คุณตำรวจ", "ราเชล", "ไม่", "เลดดิส", "พระเจ้า", "เท็ดดี้", "หัวหน้า", "แอนดรูว์");
if (count > 10 || count > bestCount)
return "th"; // Thai
count = GetCount(text, "그리고", "아니야", "하지만", "말이야", "그들은", "우리가");
if (count > 10 || count > bestCount)
return "ko"; // Korean
count = GetCount(text, "että", "kuin", "minä", "mitään", "Mutta", "siitä", "täällä", "poika", "Kiitos", "enää", "vielä", "tässä");
if (count > bestCount)
return "fi"; // Finnish
count = GetCount(text, "sînt", "aici", "Sînt", "domnule", "pentru", "Vreau", "trãiascã", "niciodatã", "înseamnã", "vorbesti", "oamenii",
"Asteaptã", "fãcut", "Fãrã", "spune", "decât", "pentru", "vreau");
if (count > bestCount)
return "ro"; // Romanian
count = GetCount(text, "daca", "pentru", "acum", "soare", "trebuie", "Trebuie", "nevoie", "decat", "echilibrul", "vorbesti", "oamenii",
"zeului", "vrea", "atunci", "Poate", "Acum", "memoria", "soarele");
if (count > bestCount)
return "ro"; // Romanian
count = GetCountContains(text, "シ", "ュ", "シン", "シ", "ン", "ユ");
count += GetCountContains(text, "イ", "ン", "チ", "ェ", "ク", "ハ");
count += GetCountContains(text, "シ", "ュ", "う", "シ", "ン", "サ");
count += GetCountContains(text, "シ", "ュ", "シ", "ン", "だ", "う");
if (count > bestCount * 2)
return "ja"; // Japanese - not tested...
count = GetCountContains(text, "是", "是早", "吧", "的", "爱", "上好");
count += GetCountContains(text, "的", "啊", "好", "好", "亲", "的");
count += GetCountContains(text, "谢", "走", "吧", "晚", "上", "好");
count += GetCountContains(text, "来", "卡", "拉", "吐", "滚", "他");
if (count > bestCount * 2)
return "zh"; // Chinese (simplified) - not tested...
return string.Empty;
}
public static string AutoDetectGoogleLanguage(Subtitle subtitle)
{
string languageId = AutoDetectGoogleLanguageOrNull(subtitle);
if (languageId == null)
languageId = "en";
return languageId;
}
public static string AutoDetectGoogleLanguageOrNull(Subtitle subtitle)
{
var sb = new StringBuilder();
foreach (Paragraph p in subtitle.Paragraphs)
sb.AppendLine(p.Text);
string languageId = AutoDetectGoogleLanguage(sb.ToString(), subtitle.Paragraphs.Count / 14);
if (string.IsNullOrEmpty(languageId))
languageId = null;
return languageId;
}
public static string AutoDetectLanguageName(string languageName, Subtitle subtitle)
{
if (string.IsNullOrEmpty(languageName))
languageName = "en_US";
int bestCount = subtitle.Paragraphs.Count / 14;
var sb = new StringBuilder();
foreach (Paragraph p in subtitle.Paragraphs)
sb.AppendLine(p.Text);
string text = sb.ToString();
List<string> dictionaryNames = GetDictionaryLanguages();
bool containsEnGb = false;
bool containsEnUs = false;
bool containsHrHr = false;
bool containsSrLatn = false;
foreach (string name in dictionaryNames)
{
if (name.Contains("[en_GB]"))
containsEnGb = true;
if (name.Contains("[en_US]"))
containsEnUs = true;
if (name.Contains("[hr_HR]"))
containsHrHr = true;
if (name.Contains("[sr-Latn]"))
containsSrLatn = true;
}
foreach (string name in dictionaryNames)
{
string shortName = string.Empty;
int start = name.IndexOf('[');
int end = name.IndexOf(']');
if (start > 0 && end > start)
{
start++;
shortName = name.Substring(start, end - start);
}
int count;
switch (shortName)
{
case "da_DK":
count = GetCount(text, "vi", "hun", "og", "jeg", "var", "men", "bliver", "meget", "spørger", "Hej", "utrolig", "dejligt");
if (count > bestCount)
{
int norweigianCount = GetCount(text, "ut", "deg", "meg", "merkelig", "mye", "spørre");
if (norweigianCount < 2)
languageName = shortName;
}
break;
case "nb_NO":
count = GetCount(text, AutoDetectWordsNorwegian);
if (count > bestCount)
{
int danishCount = GetCount(text, "siger", "dig", "mig", "mærkelig", "tilbage", "spørge");
int dutchCount = GetCount(text, "van", "een", "[Hh]et", "m(ij|ij)", "z(ij|ij)n");
if (danishCount < 2 && dutchCount < count)
languageName = shortName;
}
break;
case "en_US":
count = GetCount(text, AutoDetectWordsEnglish);
if (count > bestCount)
{
languageName = shortName;
if (containsEnGb)
{
int usCount = GetCount(text, "color", "flavor", "honor", "humor", "neighbor", "honor");
int gbCount = GetCount(text, "colour", "flavour", "honour", "humour", "neighbour", "honour");
if (gbCount > usCount)
languageName = "en_GB";
}
}
break;
case "en_GB":
count = GetCount(text, "we", "are", "and", "you", "your", "what");
if (count > bestCount)
{
languageName = shortName;
if (containsEnUs)
{
int usCount = GetCount(text, "color", "flavor", "honor", "humor", "neighbor", "honor");
int gbCount = GetCount(text, "colour", "flavour", "honour", "humour", "neighbour", "honour");
if (gbCount < usCount)
languageName = "en_US";
}
}
break;
case "sv_SE":
count = GetCount(text, "vi", "är", "och", "Jag", "inte", "för");
if (count > bestCount)
languageName = shortName;
break;
case "es_ES":
count = GetCount(text, AutoDetectWordsSpanish);
if (count > bestCount)
{
int frenchWords = GetCount(text, "[Cc]'est", "pas", "vous", "pour", "suis", "Pourquoi", "maison", "souviens", "quelque"); // not spanish words
if (frenchWords < 2)
languageName = shortName;
}
break;
case "fr_FR":
count = GetCount(text, AutoDetectWordsFrench);
if (count > bestCount)
{
int spanishWords = GetCount(text, "Hola", "nada", "Vamos", "pasa", "los", "como"); // not french words
int italianWords = GetCount(text, AutoDetectWordsItalian); // not italian words
if (spanishWords < 2 && italianWords < 2)
languageName = shortName;
}
break;
case "it_IT":
count = GetCount(text, AutoDetectWordsItalian);
if (count > bestCount)
{
int frenchWords = GetCount(text, "[Cc]'est", "pas", "vous", "pour", "suis", "Pourquoi", "maison", "souviens", "quelque"); // not spanish words
int spanishWords = GetCount(text, "Hola", "nada", "Vamos", "pasa", "los", "como"); // not french words
if (frenchWords < 2 && spanishWords < 2)
languageName = shortName;
}
break;
case "de_DE":
count = GetCount(text, "und", "auch", "sich", "bin", "hast", "möchte");
if (count > bestCount)
languageName = shortName;
break;
case "nl_NL":
count = GetCount(text, "van", "een", "[Hh]et", "m(ij|ij)", "z(ij|ij)n");
if (count > bestCount)
languageName = shortName;
break;
case "pl_PL":
count = GetCount(text, "Czy", "ale", "ty", "siê", "jest", "mnie");
if (count > bestCount)
languageName = shortName;
break;
case "el_GR":
count = GetCount(text, AutoDetectWordsGreek);
if (count > bestCount)
languageName = shortName;
break;
case "ru_RU":
count = GetCount(text, AutoDetectWordsRussian);
if (count > bestCount)
languageName = shortName;
break;
case "ro_RO":
count = GetCount(text, "sînt", "aici", "Sînt", "domnule", "pentru", "Vreau", "trãiascã", "niciodatã", "înseamnã", "vorbesti", "oamenii", "Asteaptã",
"fãcut", "Fãrã", "spune", "decât", "pentru", "vreau");
if (count > bestCount)
{
languageName = shortName;
}
else
{
count = GetCount(text, "daca", "pentru", "acum", "soare", "trebuie", "Trebuie", "nevoie", "decat", "echilibrul", "vorbesti", "oamenii", "zeului",
"vrea", "atunci", "Poate", "Acum", "memoria", "soarele");
if (count > bestCount)
languageName = shortName;
}
break;
case "hr_HR": // Croatian
count = GetCount(text, AutoDetectWordsCroatianAndSerbian);
if (count > bestCount)
{
languageName = shortName;
if (containsSrLatn)
{
int croatianCount = GetCount(text, AutoDetectWordsCroatian);
int serbianCount = GetCount(text, AutoDetectWordsSerbian);
if (serbianCount > croatianCount)
languageName = "sr-Latn";
}
}
break;
case "sr-Latn": // Serbian (Latin)
count = GetCount(text, AutoDetectWordsCroatianAndSerbian);
if (count > bestCount)
{
languageName = shortName;
if (containsHrHr)
{
int croatianCount = GetCount(text, AutoDetectWordsCroatian);
int serbianCount = GetCount(text, AutoDetectWordsSerbian);
if (serbianCount < croatianCount)
languageName = "hr_HR";
}
}
break;
case "pt_PT": // Portuguese
count = GetCount(text, AutoDetectWordsPortuguese);
if (count > bestCount)
languageName = shortName;
break;
case "pt_BR": // Portuguese (Brasil)
count = GetCount(text, AutoDetectWordsPortuguese);
if (count > bestCount)
languageName = shortName;
break;
case "hu_HU": // Hungarian
count = GetCount(text, AutoDetectWordsHungarian);
if (count > bestCount)
languageName = shortName;
break;
}
}
return languageName;
}
public static string ColorToHex(Color c)
{
return string.Format("#{0:x2}{1:x2}{2:x2}", c.R, c.G, c.B);

View File

@ -43,7 +43,7 @@ namespace Nikse.SubtitleEdit.Forms
}
comboBoxDictionaries.Items.Clear();
string languageName = Utilities.AutoDetectLanguageName(Configuration.Settings.General.SpellCheckLanguage, _subtitle);
string languageName = LanguageAutoDetect.AutoDetectLanguageName(Configuration.Settings.General.SpellCheckLanguage, _subtitle);
foreach (string name in Utilities.GetDictionaryLanguages())
{
comboBoxDictionaries.Items.Add(name);
@ -109,7 +109,7 @@ namespace Nikse.SubtitleEdit.Forms
}
}
languageName = Utilities.AutoDetectLanguageName(languageName, _subtitle);
languageName = LanguageAutoDetect.AutoDetectLanguageName(languageName, _subtitle);
if (comboBoxDictionaries.Items.Count > 0)
{
string name = comboBoxDictionaries.SelectedItem.ToString();

View File

@ -105,7 +105,7 @@ namespace Nikse.SubtitleEdit.Forms
var sub = new Subtitle();
foreach (Paragraph p in _paragraphs)
sub.Paragraphs.Add(p);
var language = Utilities.AutoDetectGoogleLanguage(sub);
var language = LanguageAutoDetect.AutoDetectGoogleLanguage(sub);
listViewFixes.BeginUpdate();
listViewFixes.Items.Clear();

View File

@ -773,7 +773,7 @@ namespace Nikse.SubtitleEdit.Forms
sub.RemoveEmptyLines();
if (checkBoxFixCasing.Checked)
{
_changeCasing.FixCasing(sub, Utilities.AutoDetectGoogleLanguage(sub));
_changeCasing.FixCasing(sub, LanguageAutoDetect.AutoDetectGoogleLanguage(sub));
_changeCasingNames.Initialize(sub);
_changeCasingNames.FixCasing();
}

View File

@ -110,7 +110,7 @@ namespace Nikse.SubtitleEdit.Forms
private void FindAllNames()
{
string language = Utilities.AutoDetectLanguageName("en_US", _subtitle);
string language = LanguageAutoDetect.AutoDetectLanguageName("en_US", _subtitle);
if (string.IsNullOrEmpty(language))
language = "en_US";

View File

@ -54,7 +54,7 @@ namespace Nikse.SubtitleEdit.Forms
_fileBuffer = new byte[0];
}
Encoding encoding = Utilities.DetectAnsiEncoding(_fileBuffer);
Encoding encoding = LanguageAutoDetect.DetectAnsiEncoding(_fileBuffer);
foreach (EncodingInfo ei in Encoding.GetEncodings())
{
var item = new ListViewItem(new[] { ei.CodePage.ToString(), ei.Name, ei.DisplayName });

View File

@ -68,7 +68,7 @@ namespace Nikse.SubtitleEdit.Forms
openFileDialog1.Filter = Utilities.GetOpenDialogFilter();
subtitleListView1.SelectIndexAndEnsureVisible(0);
_language1 = Utilities.AutoDetectGoogleLanguage(_subtitle1);
_language1 = LanguageAutoDetect.AutoDetectGoogleLanguage(_subtitle1);
}
public void Initialize(Subtitle subtitle1, string subtitleFileName1, Subtitle subtitle2, string subtitleFileName2)
@ -81,7 +81,7 @@ namespace Nikse.SubtitleEdit.Forms
_subtitle2 = subtitle2;
labelSubtitle2.Text = subtitleFileName2;
_language1 = Utilities.AutoDetectGoogleLanguage(_subtitle1);
_language1 = LanguageAutoDetect.AutoDetectGoogleLanguage(_subtitle1);
CompareSubtitles();
if (string.IsNullOrEmpty(subtitleFileName1))
@ -167,7 +167,7 @@ namespace Nikse.SubtitleEdit.Forms
subtitleListView1.SelectIndexAndEnsureVisible(0);
subtitleListView2.SelectIndexAndEnsureVisible(0);
labelSubtitle1.Text = openFileDialog1.FileName;
_language1 = Utilities.AutoDetectGoogleLanguage(_subtitle1);
_language1 = LanguageAutoDetect.AutoDetectGoogleLanguage(_subtitle1);
if (_subtitle1.Paragraphs.Count > 0)
CompareSubtitles();
}
@ -908,7 +908,7 @@ namespace Nikse.SubtitleEdit.Forms
subtitleListView1.SelectIndexAndEnsureVisible(0);
subtitleListView2.SelectIndexAndEnsureVisible(0);
labelSubtitle1.Text = filePath;
_language1 = Utilities.AutoDetectGoogleLanguage(_subtitle1);
_language1 = LanguageAutoDetect.AutoDetectGoogleLanguage(_subtitle1);
if (_subtitle1.Paragraphs.Count > 0)
CompareSubtitles();
}

View File

@ -2854,7 +2854,7 @@ $DROP=[DROPVALUE]" + Environment.NewLine + Environment.NewLine +
labelLanguage.Visible = true;
comboBoxLanguage.Visible = true;
comboBoxLanguage.Items.Clear();
string languageCode = Utilities.AutoDetectGoogleLanguageOrNull(subtitle);
string languageCode = LanguageAutoDetect.AutoDetectGoogleLanguageOrNull(subtitle);
if (languageCode == null)
languageCode = Configuration.Settings.Tools.ExportVobSubLanguage;
for (int i = 0; i < IfoParser.ArrayOfLanguage.Count; i++)

View File

@ -222,9 +222,9 @@ namespace Nikse.SubtitleEdit.Forms
public void Initialize(Subtitle subtitle, SubtitleFormat format, Encoding encoding)
{
_autoDetectGoogleLanguage = Utilities.AutoDetectGoogleLanguage(encoding); // Guess language via encoding
_autoDetectGoogleLanguage = LanguageAutoDetect.AutoDetectGoogleLanguage(encoding); // Guess language via encoding
if (string.IsNullOrEmpty(_autoDetectGoogleLanguage))
_autoDetectGoogleLanguage = Utilities.AutoDetectGoogleLanguage(subtitle); // Guess language based on subtitle contents
_autoDetectGoogleLanguage = LanguageAutoDetect.AutoDetectGoogleLanguage(subtitle); // Guess language based on subtitle contents
if (_autoDetectGoogleLanguage.Equals("zh", StringComparison.OrdinalIgnoreCase))
_autoDetectGoogleLanguage = "zh-CHS"; // Note that "zh-CHS" (Simplified Chinese) and "zh-CHT" (Traditional Chinese) are neutral cultures
CultureInfo ci = CultureInfo.GetCultureInfo(_autoDetectGoogleLanguage);
@ -515,7 +515,7 @@ namespace Nikse.SubtitleEdit.Forms
if (_namesEtcList == null)
{
_namesEtcList = new List<string>();
string languageTwoLetterCode = Utilities.AutoDetectGoogleLanguage(Subtitle);
string languageTwoLetterCode = LanguageAutoDetect.AutoDetectGoogleLanguage(Subtitle);
// Will contains both one word names and multi names
var namesList = new NamesList(Configuration.DictionariesFolder, languageTwoLetterCode, Configuration.Settings.WordLists.UseOnlineNamesEtc, Configuration.Settings.WordLists.NamesEtcUrl);

View File

@ -101,9 +101,9 @@ namespace Nikse.SubtitleEdit.Forms
_subtitle = subtitle;
_translatedSubtitle = new Subtitle(subtitle);
string defaultFromLanguage = Utilities.AutoDetectGoogleLanguage(encoding); // Guess language via encoding
string defaultFromLanguage = LanguageAutoDetect.AutoDetectGoogleLanguage(encoding); // Guess language via encoding
if (string.IsNullOrEmpty(defaultFromLanguage))
defaultFromLanguage = Utilities.AutoDetectGoogleLanguage(subtitle); // Guess language based on subtitle contents
defaultFromLanguage = LanguageAutoDetect.AutoDetectGoogleLanguage(subtitle); // Guess language based on subtitle contents
FillComboWithLanguages(comboBoxFrom);
int i = 0;

View File

@ -46,7 +46,7 @@ namespace Nikse.SubtitleEdit.Forms
{
try
{
Encoding encoding = Utilities.GetEncodingFromFile(fileName);
Encoding encoding = LanguageAutoDetect.GetEncodingFromFile(fileName);
string s = File.ReadAllText(fileName, encoding).Trim();
if (s.Contains('.'))
radioButtonSeconds.Checked = true;

View File

@ -607,7 +607,7 @@ namespace Nikse.SubtitleEdit.Forms
{
try
{
Encoding encoding = Utilities.GetEncodingFromFile(fileName);
Encoding encoding = LanguageAutoDetect.GetEncodingFromFile(fileName);
textBoxText.Text = File.ReadAllText(fileName, encoding);
SetVideoFileName(fileName);
}

View File

@ -66,7 +66,7 @@ namespace Nikse.SubtitleEdit.Forms
try
{
SubtitleListview1.Items.Clear();
Encoding encoding = Utilities.GetEncodingFromFile(fileName);
Encoding encoding = LanguageAutoDetect.GetEncodingFromFile(fileName);
textBoxText.Text = File.ReadAllText(fileName, encoding);
// check for RTF file

View File

@ -1934,7 +1934,7 @@ namespace Nikse.SubtitleEdit.Forms
if (format == null && ext == ".wsb")
{
var wsb = new Wsb();
var list = new List<string>(File.ReadAllLines(fileName, Utilities.GetEncodingFromFile(fileName)));
var list = new List<string>(File.ReadAllLines(fileName, LanguageAutoDetect.GetEncodingFromFile(fileName)));
if (wsb.IsMine(list, fileName))
{
wsb.LoadSubtitle(_subtitle, list, fileName);
@ -2102,7 +2102,7 @@ namespace Nikse.SubtitleEdit.Forms
try
{
var bdnXml = new BdnXml();
var list = new List<string>(File.ReadAllLines(fileName, Utilities.GetEncodingFromFile(fileName)));
var list = new List<string>(File.ReadAllLines(fileName, LanguageAutoDetect.GetEncodingFromFile(fileName)));
if (bdnXml.IsMine(list, fileName))
{
if (ContinueNewOrExit())
@ -2123,7 +2123,7 @@ namespace Nikse.SubtitleEdit.Forms
try
{
var fcpImage = new FinalCutProImage();
var list = new List<string>(File.ReadAllLines(fileName, Utilities.GetEncodingFromFile(fileName)));
var list = new List<string>(File.ReadAllLines(fileName, LanguageAutoDetect.GetEncodingFromFile(fileName)));
if (fcpImage.IsMine(list, fileName))
{
if (ContinueNewOrExit())
@ -2204,7 +2204,7 @@ namespace Nikse.SubtitleEdit.Forms
try
{
var dost = new Dost();
var list = new List<string>(File.ReadAllLines(fileName, Utilities.GetEncodingFromFile(fileName)));
var list = new List<string>(File.ReadAllLines(fileName, LanguageAutoDetect.GetEncodingFromFile(fileName)));
if (dost.IsMine(list, fileName))
{
if (ContinueNewOrExit())
@ -2223,7 +2223,7 @@ namespace Nikse.SubtitleEdit.Forms
try
{
var son = new Son();
var list = new List<string>(File.ReadAllLines(fileName, Utilities.GetEncodingFromFile(fileName)));
var list = new List<string>(File.ReadAllLines(fileName, LanguageAutoDetect.GetEncodingFromFile(fileName)));
if (son.IsMine(list, fileName))
{
if (ContinueNewOrExit())
@ -2264,7 +2264,7 @@ namespace Nikse.SubtitleEdit.Forms
try
{
var satBoxPng = new SatBoxPng();
var list = new List<string>(File.ReadAllLines(fileName, Utilities.GetEncodingFromFile(fileName)));
var list = new List<string>(File.ReadAllLines(fileName, LanguageAutoDetect.GetEncodingFromFile(fileName)));
if (satBoxPng.IsMine(list, fileName))
{
var subtitle = new Subtitle();
@ -2285,7 +2285,7 @@ namespace Nikse.SubtitleEdit.Forms
try
{
var sst = new SonicScenaristBitmaps();
var list = new List<string>(File.ReadAllLines(fileName, Utilities.GetEncodingFromFile(fileName)));
var list = new List<string>(File.ReadAllLines(fileName, LanguageAutoDetect.GetEncodingFromFile(fileName)));
if (sst.IsMine(list, fileName))
{
if (ContinueNewOrExit())
@ -2304,7 +2304,7 @@ namespace Nikse.SubtitleEdit.Forms
try
{
var htmlSamiArray = new HtmlSamiArray();
var list = new List<string>(File.ReadAllLines(fileName, Utilities.GetEncodingFromFile(fileName)));
var list = new List<string>(File.ReadAllLines(fileName, LanguageAutoDetect.GetEncodingFromFile(fileName)));
if (htmlSamiArray.IsMine(list, fileName))
{
htmlSamiArray.LoadSubtitle(_subtitle, list, fileName);
@ -2429,7 +2429,7 @@ namespace Nikse.SubtitleEdit.Forms
if (ext == ".xml" || ext == ".dfxp")
{
var sb = new StringBuilder();
foreach (var line in File.ReadAllLines(fileName, Utilities.GetEncodingFromFile(fileName)))
foreach (var line in File.ReadAllLines(fileName, LanguageAutoDetect.GetEncodingFromFile(fileName)))
sb.AppendLine(line);
var xmlAsString = sb.ToString().Trim();
@ -2453,7 +2453,7 @@ namespace Nikse.SubtitleEdit.Forms
// Try to use a generic subtitle format parser (guessing subtitle format)
try
{
var enc = Utilities.GetEncodingFromFile(fileName);
var enc = LanguageAutoDetect.GetEncodingFromFile(fileName);
var s = File.ReadAllText(fileName, enc);
// check for RTF file
@ -5119,7 +5119,7 @@ namespace Nikse.SubtitleEdit.Forms
return;
}
bool isSwedish = Utilities.AutoDetectGoogleLanguage(_subtitle) == "sv";
bool isSwedish = LanguageAutoDetect.AutoDetectGoogleLanguage(_subtitle) == "sv";
string promptText = _language.TranslateSwedishToDanish;
if (!isSwedish)
promptText = _language.TranslateSwedishToDanishWarning;
@ -5424,7 +5424,7 @@ namespace Nikse.SubtitleEdit.Forms
int totalLinesChanged = 0;
try
{
wordSpellChecker = new WordSpellChecker(this, Utilities.AutoDetectGoogleLanguage(_subtitle));
wordSpellChecker = new WordSpellChecker(this, LanguageAutoDetect.AutoDetectGoogleLanguage(_subtitle));
wordSpellChecker.NewDocument();
Application.DoEvents();
}
@ -6694,10 +6694,10 @@ namespace Nikse.SubtitleEdit.Forms
private void ButtonAutoBreakClick(object sender, EventArgs e)
{
string language = Utilities.AutoDetectGoogleLanguage(_subtitle);
string language = LanguageAutoDetect.AutoDetectGoogleLanguage(_subtitle);
string languageOriginal = string.Empty;
if (_subtitleAlternate != null)
languageOriginal = Utilities.AutoDetectGoogleLanguage(_subtitleAlternate);
languageOriginal = LanguageAutoDetect.AutoDetectGoogleLanguage(_subtitleAlternate);
if (SubtitleListview1.SelectedItems.Count > 1)
{
@ -7201,7 +7201,7 @@ namespace Nikse.SubtitleEdit.Forms
private void SplitSelectedParagraph(double? splitSeconds, int? textIndex)
{
string language = Utilities.AutoDetectGoogleLanguage(_subtitle);
string language = LanguageAutoDetect.AutoDetectGoogleLanguage(_subtitle);
int? alternateTextIndex = null;
if (textBoxListViewTextAlternate.Focused)
@ -7454,7 +7454,7 @@ namespace Nikse.SubtitleEdit.Forms
var originalCurrent = Utilities.GetOriginalParagraph(firstSelectedIndex, currentParagraph, _subtitleAlternate.Paragraphs);
if (originalCurrent != null)
{
string languageOriginal = Utilities.AutoDetectGoogleLanguage(_subtitleAlternate);
string languageOriginal = LanguageAutoDetect.AutoDetectGoogleLanguage(_subtitleAlternate);
originalCurrent.EndTime.TotalMilliseconds = currentParagraph.EndTime.TotalMilliseconds;
var originalNew = new Paragraph(newParagraph);
@ -7639,7 +7639,7 @@ namespace Nikse.SubtitleEdit.Forms
private void MergeBeforeToolStripMenuItemClick(object sender, EventArgs e)
{
string language = Utilities.AutoDetectGoogleLanguage(_subtitle);
string language = LanguageAutoDetect.AutoDetectGoogleLanguage(_subtitle);
if (_subtitle.Paragraphs.Count > 0 && SubtitleListview1.SelectedItems.Count > 0)
{
int firstSelectedIndex = SubtitleListview1.SelectedItems[0].Index;
@ -7746,7 +7746,7 @@ namespace Nikse.SubtitleEdit.Forms
string text = sb.ToString();
text = HtmlUtil.FixInvalidItalicTags(text);
text = ChangeAllLinesItalictoSingleItalic(text);
text = Utilities.AutoBreakLine(text, Utilities.AutoDetectGoogleLanguage(_subtitle));
text = Utilities.AutoBreakLine(text, LanguageAutoDetect.AutoDetectGoogleLanguage(_subtitle));
currentParagraph.Text = text;
//display time
@ -7894,7 +7894,7 @@ namespace Nikse.SubtitleEdit.Forms
if (old1.Contains(Environment.NewLine) || old2.Contains(Environment.NewLine) ||
old1.Length > Configuration.Settings.General.SubtitleLineMaximumLength || old2.Length > Configuration.Settings.General.SubtitleLineMaximumLength)
original.Text = Utilities.AutoBreakLine(original.Text, Utilities.AutoDetectGoogleLanguage(_subtitleAlternate));
original.Text = Utilities.AutoBreakLine(original.Text, LanguageAutoDetect.AutoDetectGoogleLanguage(_subtitleAlternate));
if (string.IsNullOrWhiteSpace(old1))
original.Text = original.Text.TrimStart();
@ -7939,7 +7939,7 @@ namespace Nikse.SubtitleEdit.Forms
if (old1.Contains(Environment.NewLine) || old2.Contains(Environment.NewLine) ||
old1.Length > Configuration.Settings.General.SubtitleLineMaximumLength || old2.Length > Configuration.Settings.General.SubtitleLineMaximumLength)
currentParagraph.Text = Utilities.AutoBreakLine(currentParagraph.Text, Utilities.AutoDetectGoogleLanguage(_subtitle));
currentParagraph.Text = Utilities.AutoBreakLine(currentParagraph.Text, LanguageAutoDetect.AutoDetectGoogleLanguage(_subtitle));
if (string.IsNullOrWhiteSpace(old1))
currentParagraph.Text = currentParagraph.Text.TrimStart();
@ -9803,7 +9803,7 @@ namespace Nikse.SubtitleEdit.Forms
bool saveChangeCaseChanges = true;
var casingNamesLinesChanged = 0;
changeCasing.FixCasing(selectedLines, Utilities.AutoDetectLanguageName(Configuration.Settings.General.SpellCheckLanguage, _subtitle));
changeCasing.FixCasing(selectedLines, LanguageAutoDetect.AutoDetectLanguageName(Configuration.Settings.General.SpellCheckLanguage, _subtitle));
if (changeCasing.ChangeNamesToo)
{
using (var changeCasingNames = new ChangeCasingNames())
@ -11572,10 +11572,10 @@ namespace Nikse.SubtitleEdit.Forms
if (_subtitle.Paragraphs.Count > 0 && SubtitleListview1.SelectedItems.Count > 0)
{
MakeHistoryForUndo(_language.BeforeAutoBalanceSelectedLines);
string language = Utilities.AutoDetectGoogleLanguage(_subtitle);
string language = LanguageAutoDetect.AutoDetectGoogleLanguage(_subtitle);
string languageOriginal = string.Empty;
if (_subtitleAlternate != null)
Utilities.AutoDetectGoogleLanguage(_subtitleAlternate);
LanguageAutoDetect.AutoDetectGoogleLanguage(_subtitleAlternate);
foreach (ListViewItem item in SubtitleListview1.SelectedItems)
{
var p = _subtitle.GetParagraphOrDefault(item.Index);
@ -11779,7 +11779,7 @@ namespace Nikse.SubtitleEdit.Forms
if (autoBreakUnbreakLines.ShowDialog() == DialogResult.OK && autoBreakUnbreakLines.FixedText.Count > 0)
{
MakeHistoryForUndo(_language.BeforeAutoBalanceSelectedLines);
var language = Utilities.AutoDetectGoogleLanguage(_subtitle);
var language = LanguageAutoDetect.AutoDetectGoogleLanguage(_subtitle);
SubtitleListview1.BeginUpdate();
foreach (int index in SubtitleListview1.SelectedIndices)
{
@ -13744,7 +13744,7 @@ namespace Nikse.SubtitleEdit.Forms
private void buttonGoogleTranslateIt_Click(object sender, EventArgs e)
{
string languageId = Utilities.AutoDetectGoogleLanguage(_subtitle);
string languageId = LanguageAutoDetect.AutoDetectGoogleLanguage(_subtitle);
System.Diagnostics.Process.Start("https://translate.google.com/#auto|" + languageId + "|" + Utilities.UrlEncode(textBoxSearchWord.Text));
}
@ -17087,7 +17087,7 @@ namespace Nikse.SubtitleEdit.Forms
var p = _subtitle.GetParagraphOrDefault(firstSelectedIndex);
if (p != null)
{
string defaultFromLanguage = Utilities.AutoDetectGoogleLanguage(_subtitle);
string defaultFromLanguage = LanguageAutoDetect.AutoDetectGoogleLanguage(_subtitle);
string defaultToLanguage = defaultFromLanguage;
if (_subtitleAlternate != null)
{
@ -17095,7 +17095,7 @@ namespace Nikse.SubtitleEdit.Forms
if (o != null)
{
p = o;
defaultFromLanguage = Utilities.AutoDetectGoogleLanguage(_subtitleAlternate);
defaultFromLanguage = LanguageAutoDetect.AutoDetectGoogleLanguage(_subtitleAlternate);
}
}
Cursor = Cursors.WaitCursor;

View File

@ -110,7 +110,7 @@ namespace Nikse.SubtitleEdit.Forms
if (clearFixes)
listViewFixes.Items.Clear();
numberOfMerges = 0;
string language = Utilities.AutoDetectGoogleLanguage(subtitle);
string language = LanguageAutoDetect.AutoDetectGoogleLanguage(subtitle);
var mergedSubtitle = new Subtitle();
bool lastMerged = false;
Paragraph p = null;

View File

@ -52,7 +52,7 @@ namespace Nikse.SubtitleEdit.Forms
NumberOfMerges = 0;
_subtitle = subtitle;
MergeTextWithSameTimeCodes_ResizeEnd(null, null);
_language = Utilities.AutoDetectGoogleLanguage(subtitle);
_language = LanguageAutoDetect.AutoDetectGoogleLanguage(subtitle);
}
private void previewTimer_Tick(object sender, EventArgs e)

View File

@ -967,7 +967,7 @@ namespace Nikse.SubtitleEdit.Forms
}
}
if (autoDetect || string.IsNullOrEmpty(_languageName))
_languageName = Utilities.AutoDetectLanguageName(_languageName, subtitle);
_languageName = LanguageAutoDetect.AutoDetectLanguageName(_languageName, subtitle);
string dictionary = Utilities.DictionaryFolder + _languageName;
LoadDictionaries(dictionaryFolder, dictionary);
@ -1128,7 +1128,7 @@ namespace Nikse.SubtitleEdit.Forms
{
gd.ShowDialog(this);
}
FillSpellCheckDictionaries(Utilities.AutoDetectLanguageName(null, _subtitle));
FillSpellCheckDictionaries(LanguageAutoDetect.AutoDetectLanguageName(null, _subtitle));
if (comboBoxDictionaries.Items.Count > 0 && comboBoxDictionaries.SelectedIndex == -1)
comboBoxDictionaries.SelectedIndex = 0;
ComboBoxDictionariesSelectedIndexChanged(null, null);

View File

@ -145,7 +145,7 @@ namespace Nikse.SubtitleEdit.Forms
if (clearFixes)
listViewFixes.Items.Clear();
numberOfSplits = 0;
string language = Utilities.AutoDetectGoogleLanguage(subtitle);
string language = LanguageAutoDetect.AutoDetectGoogleLanguage(subtitle);
var splittedSubtitle = new Subtitle();
string[] expectedPunctuations = { ". -", "! -", "? -" };
for (int i = 0; i < subtitle.Paragraphs.Count; i++)

View File

@ -0,0 +1,39 @@
using System.IO;
using System.Text;
using Microsoft.VisualStudio.TestTools.UnitTesting;
using Nikse.SubtitleEdit.Core;
namespace Test.Core
{
[DeploymentItem("Files")]
[TestClass]
public class LanguageAutoDetectTest
{
private static string GetLanguageCode(string fileName)
{
fileName = Path.Combine(Directory.GetCurrentDirectory(), fileName);
var sub = new Subtitle();
Encoding encoding;
sub.LoadSubtitle(fileName, out encoding, null);
return LanguageAutoDetect.AutoDetectGoogleLanguage(sub);
}
[TestMethod]
public void AutoDetectRussian()
{
var languageCode = GetLanguageCode("auto_detect_Russian.srt");
Assert.AreEqual(languageCode, "ru");
}
[TestMethod]
public void AutoDetectDanish()
{
var languageCode = GetLanguageCode("auto_detect_Danish.srt");
Assert.AreEqual(languageCode, "da");
}
}
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,860 @@
1
00:00:51,397 --> 00:00:56,603
Ричмонде, штат Вирджиния 1865
BORGERKRIGENS оформление
2
00:01:05,211 --> 00:01:09,515
Последний броненосец
блокады DER BRYDER союз
3
00:01:22,929 --> 00:01:25,303
Простите, г-н Kaptajn.
4
00:01:25,315 --> 00:01:27,700
Просто дальше. Вы принимаете
на себя в будущем!
5
00:01:32,105 --> 00:01:34,908
Готовы к вылету, г-н капитан.
6
00:01:43,817 --> 00:01:48,421
Готовьтесь войти kanalen.
Hastighed пяти узлов.
7
00:01:54,127 --> 00:01:59,199
- Направление nord. 115 градусов,
115 градусов на север.
8
00:02:01,401 --> 00:02:03,403
Огонь!
9
00:02:11,211 --> 00:02:13,113
Огонь!
10
00:02:18,318 --> 00:02:22,322
- Полный вперед - Да,
господин капитан!
11
00:02:53,019 --> 00:02:57,223
Стоп motorerne. Jeg хотим мира!
12
00:02:57,823 --> 00:03:02,628
Подожгли порты Лук пистолет!
13
00:03:49,609 --> 00:03:53,913
Датский перевод LHB и
Fields Synkroniseret Inside
14
00:04:15,801 --> 00:04:19,905
FLÅDEHISTORIKERS фанатичный SØGEN
EFTER GHOST военный корабль
15
00:06:17,421 --> 00:06:19,924
DEN не Батин.
16
00:07:44,008 --> 00:07:48,412
Г-жа Nwokolo. Я Ева Рохас, WHO.
Dette является д-р. Хоппер.
17
00:07:48,612 --> 00:07:50,614
Пожалуйста, следуйте с.
18
00:07:55,219 --> 00:07:58,623
Извините mørket. Hans
глаза не терпят света.
19
00:07:58,723 --> 00:08:02,627
- Как его зовут - Азикиве?
Большинство людей называют его Kiwe.
20
00:08:02,827 --> 00:08:07,398
Здравствуйте, Kiwe, меня зовут Eva.
Vi должны смотреть на тебя, ладно?
21
00:08:07,598 --> 00:08:10,401
- Как долго он болел
- два дня?
22
00:08:10,601 --> 00:08:15,205
- Неужели он ездил недавно - Он был
с отцом в Мали на прошлой неделе.
23
00:08:15,406 --> 00:08:19,410
- Где его отец теперь - Он в маяк.
Там он работает.
24
00:08:19,611 --> 00:08:21,801
Артериальное давление 80
более 50 Kredsløbssvigt.
25
00:08:21,813 --> 00:08:24,015
Мы даем ему кровь.
26
00:08:30,721 --> 00:08:33,824
- Сколько транквилизаторов, он должен иметь
- Дайте ему 2 мл?
27
00:08:38,897 --> 00:08:41,799
Это хорошо, Kiwe. Alt хорошо.
28
00:08:49,607 --> 00:08:53,310
- Ты в порядке -
Мали, как и другие?
29
00:08:55,613 --> 00:08:59,417
- Это epidemi.
- Есть шесть случаев. Это не достаточно.
30
00:08:59,517 --> 00:09:02,120
Сколько длится Tres?
31
00:09:02,220 --> 00:09:04,522
Шесть тысяч?
32
00:09:05,323 --> 00:09:09,226
Когда начинать det
at значит ничего?
33
00:09:15,900 --> 00:09:18,802
Нам нужно найти источник, Фрэнк.
34
00:09:20,204 --> 00:09:23,907
Вы хотите Мали, мужчин это
происходит у вас нет.
35
00:09:24,007 --> 00:09:27,811
ВОЗ не потеряет flere
medarbejdere в гражданскую войну.
36
00:09:28,411 --> 00:09:30,413
Сделайте свой доклад закончил.
37
00:09:30,513 --> 00:09:32,904
Прекрасно. Так что может быть
использовано для ligsynet.
38
00:09:32,916 --> 00:09:35,319
Ева?
39
00:09:38,021 --> 00:09:42,826
Я делаю то, что kan. Jeg передать
его на рассмотрение Совета.
40
00:09:43,026 --> 00:09:46,797
- Может быть, они слушают это gang.
- Tak.
41
00:09:47,297 --> 00:09:51,702
- Нам нужна кровь из faderen.
, я могу его найти.
42
00:10:14,124 --> 00:10:16,126
Алло?
43
00:10:18,496 --> 00:10:20,698
Г-н Nwokolo?
44
00:10:30,608 --> 00:10:32,810
Г-н Nwokolo?
45
00:11:17,921 --> 00:11:20,823
Вам нечего здесь делать.
46
00:11:27,798 --> 00:11:30,500
Быстрый Пометьте ее кошелек.
47
00:11:55,925 --> 00:11:58,495
Ты в порядке?
48
00:12:31,394 --> 00:12:34,697
Возьмите его Hvad ты делаешь?
49
00:12:40,302 --> 00:12:43,205
Остановить его с дерьмом!
50
00:12:46,109 --> 00:12:48,811
Stop держите спокойно.
51
00:12:54,416 --> 00:12:58,420
Постой тегов прямо сейчас!!
52
00:13:01,923 --> 00:13:04,894
Дайте мне skiftenøglen.
Jeg нужно сейчас.
53
00:13:05,094 --> 00:13:07,096
Есть ли?
54
00:13:09,198 --> 00:13:13,803
Поэтому я должен использовать olien.
Mange спасибо!
55
00:13:16,906 --> 00:13:20,709
Хорошо. Так же как и klaret.
Vi закончены.
56
00:13:21,310 --> 00:13:26,716
Извините. Позвольте мне воспользоваться
det. Jeg имени Аль Giordino.
57
00:13:27,016 --> 00:13:30,319
- Ева Rojas.
- Хорошая работа.
58
00:13:30,419 --> 00:13:32,822
Добро пожаловать на борт.
59
00:13:33,622 --> 00:13:36,725
Включите den. Få его.
60
00:13:41,497 --> 00:13:45,502
Привет. Чувствовать себя лучше?
СГЭ Вызывается Руди.
61
00:13:45,602 --> 00:13:49,605
- Простите, а где мы - Мы находимся
на Martha Ann. Это NUMA-офф судна.
62
00:13:49,806 --> 00:13:54,310
Мы не знаем, кто вы, и мы valgte
selv пропатчить себя в руки
63
00:13:54,510 --> 00:13:59,815
я ждал два месяца her. Ødelæg
речь идет не о дать ему утонуть.
64
00:14:00,015 --> 00:14:01,818
Да, адмирал.
65
00:14:02,318 --> 00:14:07,924
Пятый .. Четвёртое .. Третий ..
Второе .. 1!
66
00:14:20,302 --> 00:14:22,503
Дамы и господа, -
67
00:14:22,504 --> 00:14:27,709
- позвольте мне, после 772 лет пребывания
på havbunden представить вам -
68
00:14:28,010 --> 00:14:32,614
- король Батин!
69
00:14:34,115 --> 00:14:36,818
Молодцы, все.
70
00:14:36,918 --> 00:14:41,523
Кроме вас, Ал. Какого черта ты делаешь?
Det 10 тонн игры, а не дверь гаража!
71
00:14:41,723 --> 00:14:46,895
Вы думаете о ваших Томпсон 1291.
Men это 1293rd
72
00:14:47,196 --> 00:14:49,086
Но вы не можете
использовать любой из dem.
73
00:14:49,098 --> 00:14:50,999
Boys?
74
00:14:51,099 --> 00:14:54,503
Короля должна возвышаться над
folket på музей около пяти часов.
75
00:14:54,803 --> 00:14:58,808
- Это будет fremme.
, я надеюсь, тоже.
76
00:15:00,309 --> 00:15:04,713
- Вы на ноги
- Спасибо вам?
77
00:15:04,913 --> 00:15:08,116
К счастью, вы только потеряли taske.
Det это не место, чтобы пойти.
78
00:15:08,216 --> 00:15:12,320
- Черт, я потерял ход taske.
- Это было надеяться не стоит умирать?
79
00:15:12,521 --> 00:15:16,424
- Это жесткий spørgsmål.
- Нет ничего более ценного, чем ваша жизнь.
80
00:15:16,624 --> 00:15:20,196
Получить, что осел здесь и hjælp
med, чтобы очистить его отсюда.
81
00:15:20,296 --> 00:15:24,099
- Простите, "жена" вызов -
Получить его снимают с крючка!
82
00:15:24,300 --> 00:15:26,402
Получить промывают, мы
собираемся на вечеринку.
83
00:15:26,502 --> 00:15:28,704
--Адмирал отставке.
84
00:15:28,804 --> 00:15:32,808
- Джим Sandecker.
- Ева Рохас, я работаю в ВОЗ.
85
00:15:34,009 --> 00:15:37,013
Ты похож на тех, кто
нуждается кофе.
86
00:15:38,314 --> 00:15:42,818
Мне нужна ваша hjælp. Er его
слева направо или наоборот
87
00:15:43,018 --> 00:15:46,822
Это сводит меня с vanvid. Du должна
научить меня, чтобы связать его.
88
00:15:47,022 --> 00:15:50,992
- Это то, что делать с "вокруг дерева."
- I'll быть там через час.
89
00:15:51,292 --> 00:15:55,396
- Что? Дирк?
- Это было Oshodi.
90
00:15:55,696 --> 00:15:58,700
Он считает, что он что-то нашел.
91
00:15:58,900 --> 00:16:03,104
- Есть также. Это здорово!
- Спасибо.
92
00:16:03,304 --> 00:16:05,807
Нет, нет Det велик для меня!
93
00:16:05,907 --> 00:16:10,211
Я рад сказать правду Экер, у вас
не приходят на сегодня музей -
94
00:16:10,411 --> 00:16:13,914
- потому что одна из nigerianske
underverden нашли доказательства -
95
00:16:14,014 --> 00:16:18,420
- о том, что корабль затонул от
borgerkrigen er во время шторма в Африка.
96
00:16:18,620 --> 00:16:22,724
Это то, что вы говорите,
право Sandecker Freak Out!
97
00:16:22,924 --> 00:16:25,748
Я там. Он получает
все красные дюйма ..
98
00:16:25,760 --> 00:16:28,596
лиса в погоне за кроликом
вокруг дерева, -
99
00:16:28,796 --> 00:16:33,601
- в яму. Как завязать пн det.
Tag легко. Я буду там.
100
00:16:34,001 --> 00:16:37,606
Спасибо.
101
00:16:38,707 --> 00:16:42,510
Во-первых, я благодарю Лагос museum
for этот удивительный прием.
102
00:16:42,710 --> 00:16:48,816
Я также хочу поблагодарить наших
hovedsponsor på этого проекта, Ив Massarde.
103
00:16:54,022 --> 00:16:59,394
Мы NUMA. Это Nationale
Undervands Морское Агентство -
104
00:16:59,694 --> 00:17:03,999
- и это, дамы и господа,
hvad что мы делаем.
105
00:17:05,600 --> 00:17:08,002
Kong Батин.
106
00:17:10,505 --> 00:17:13,408
- Он не в буфете
- Черт.!
107
00:17:13,508 --> 00:17:15,410
- Хочешь Кебаб
- Нет, спасибо.
108
00:17:15,510 --> 00:17:19,614
Правительства и private
organisationer, как наша -
109
00:17:19,814 --> 00:17:25,820
- могут совместно содействовать at
historie, которые были потеряны по пути -
110
00:17:26,120 --> 00:17:31,592
- снова вернулся в свою благодарность folk.
Mange. Хорошего вечера.
111
00:17:37,798 --> 00:17:40,689
- Если у вас есть
компакт-дисков на вашем
112
00:17:40,701 --> 00:17:43,604
корабле - Да, я купил у
вас в прошлом месяце?
113
00:17:44,405 --> 00:17:46,795
У меня есть чудесное
произведение искусства
114
00:17:46,807 --> 00:17:49,209
her. Direkte из Иракского
национального музея.
115
00:17:49,409 --> 00:17:52,814
Не показывать мне эти ting. Så
хорошие друзья, это не так.
116
00:17:54,315 --> 00:17:57,218
Вот курса.
117
00:17:57,318 --> 00:18:02,022
Особый stykke. Det
я вам говорил.
118
00:18:03,323 --> 00:18:07,194
Это давит мое hjerte bare,
чтобы показать вам.
119
00:18:14,202 --> 00:18:18,606
- Где ты это
- Не касаясь?
120
00:18:19,006 --> 00:18:22,509
Таким образом, хорошие
друзья, мы бы и нет?
121
00:18:37,992 --> 00:18:40,494
Это большая партия.
122
00:18:42,096 --> 00:18:45,299
- Спасибо за приглашение, admiral.
- удовольствие на моей стороне.
123
00:18:45,399 --> 00:18:49,404
Ив, это женщина jeg fortalte вы, доктор.
Ева Рохас.
124
00:18:49,604 --> 00:18:52,707
Мне очень приятно встретиться Dem.
Mit зовут Ив Massarde.
125
00:18:52,807 --> 00:18:55,510
Это доктор. Фрэнк Хоппер.
126
00:18:55,610 --> 00:18:58,512
Ив делает masse
forretninger в Африке.
127
00:18:58,612 --> 00:19:01,115
Даже некоторые в Мали.
128
00:19:01,715 --> 00:19:03,717
Вы извините меня?
129
00:19:03,817 --> 00:19:08,623
Я понимаю, что я считаю, - дер-это
эпидемия на пути из Мали?
130
00:19:08,823 --> 00:19:11,592
- Мы не будем называть его epidemi.
- Что вы это называете?
131
00:19:11,692 --> 00:19:14,595
Epidemi. Så вы делаете
бизнес в Мали?
132
00:19:14,695 --> 00:19:19,299
Знаете кого-то, кто может помочь нам med at
нажмите ВОЗ направить группу там, внизу?
133
00:19:19,499 --> 00:19:23,203
- Мали находится под контролем
полевых командиров... генерал-Казим?
134
00:19:23,403 --> 00:19:25,304
Ты его знаешь?
135
00:19:25,305 --> 00:19:27,333
Он был лейтенантом в
армии, но дал себя selv en
136
00:19:27,345 --> 00:19:29,385
продвижение по службе, когда
он выстрелил в президента.
137
00:19:29,510 --> 00:19:33,313
- Он дает слово «военачальник» betydning.
И он управляет страной?
138
00:19:33,513 --> 00:19:37,117
Половина. Другие kontrollerer ingen
половины. Я не знаю, что хуже.
139
00:19:37,217 --> 00:19:41,221
Я предупреждаю вас. Это meget farligt
для иностранцев прямо сейчас.
140
00:19:41,421 --> 00:19:44,046
Но, вероятно, более
опасными для indfødte.
141
00:19:44,058 --> 00:19:46,694
Но ваши мертвые будут плохо
выглядеть в газетах.
142
00:19:46,894 --> 00:19:49,697
Это делает эпидемию тоже.
143
00:19:51,498 --> 00:19:57,604
Хорошо. Я пытаюсь позвонить вокруг
немного, мужчин я просто бизнесмен -
144
00:19:57,904 --> 00:20:01,708
- так было tålmodighed.
- Да, это ее сильная сторона.
145
00:20:01,808 --> 00:20:05,011
Он не поможет os. Det была
пустая трата времени.
146
00:20:05,211 --> 00:20:11,418
Вы можете не только баржи в en borgerkrig.
Вы знаете, это слишком опасно.
147
00:20:11,718 --> 00:20:13,620
Я думаю, мы должны вернуться...
148
00:20:14,221 --> 00:20:17,391
Адмирал, вы когда-нибудь видели
en gulddollar от Конфедерации?
149
00:20:17,591 --> 00:20:21,294
- Не начинайте снова - Нет, потому
что они никогда не делали один!
150
00:20:21,394 --> 00:20:23,797
Импринтинг машина
ødelagt ved войны.
151
00:20:23,997 --> 00:20:28,602
- Я молюсь dig., но не раньше,
чем Джефферсон получил пять лет.
152
00:20:28,802 --> 00:20:31,905
Четыре из них он дал
til sine генералов.
153
00:20:32,106 --> 00:20:34,496
Ли Джексон...
154
00:20:34,508 --> 00:20:36,910
Каждый раз, когда мы находимся
в Африке, есть корабль.
155
00:20:37,110 --> 00:20:40,714
А старые havnejournaler. Vi
едет в Австралию завтра.
156
00:20:40,814 --> 00:20:45,119
Четыре из них fundet.
Men пятый не является.
157
00:20:45,319 --> 00:20:50,790
Это был дан друг familien. En умелым
капитаном имени Мейсон гробниц.
158
00:20:50,990 --> 00:20:55,094
Капитан броненосец, CSS Техас.
159
00:20:56,796 --> 00:21:00,800
- Кто это у тебя - Oshodi,
и у него от Endigue?
160
00:21:00,900 --> 00:21:04,704
Важно то, что в Endigue
fandt Labbezanga в Мали.
161
00:21:04,904 --> 00:21:09,909
- Мой отец собирает mønter.
- монета отплыл в Нигере с Техас.
162
00:21:10,109 --> 00:21:14,914
- Невозможно. Она не может с strøm.
- отпусти меня к Labbezanga и нюхать мало.
163
00:21:15,114 --> 00:21:18,617
- Вы не получите моей båd.
- три дня. Всего за три дня!
164
00:21:19,418 --> 00:21:21,620
Представьте себе, что.
165
00:21:24,591 --> 00:21:29,796
Хорошо, если это не удается, snakker Я
никогда не говорить об этом больше!
166
00:21:31,698 --> 00:21:34,100
У вас есть 72 часа.
167
00:21:34,200 --> 00:21:38,204
Не наносекундных længere. I мальчика
только что купили вы на лодке.
168
00:21:38,404 --> 00:21:41,408
Вы джентльмен, uanset, что
говорят другие о тебе!
169
00:21:41,508 --> 00:21:44,711
Бьюсь об заклад, бутылку на
, мы никогда не найти его.
170
00:21:44,811 --> 00:21:47,414
Скажем, целый ящик.
171
00:22:17,910 --> 00:22:20,213
Привет.
172
00:22:20,314 --> 00:22:24,718
Я пришел sent. Jeg сказали, я
хотел бы получить в. Восьмой
173
00:22:24,918 --> 00:22:28,421
- Сказал я в. 9.
- это более 10!
174
00:22:30,489 --> 00:22:34,493
Правда Экер сказал, что мы должны
называть его hvis возникли проблемы.
175
00:22:34,693 --> 00:22:36,595
С чем?
176
00:22:36,795 --> 00:22:40,500
- Я должен взять нас вверх по реке к Mali.
- Что?
177
00:22:40,600 --> 00:22:45,704
- Нет, нет. Мы вас не Mali.
- Подожди!
178
00:22:45,905 --> 00:22:49,608
Существует вспышки в Mali.
Det может начаться эпидемия.
179
00:22:49,808 --> 00:22:52,311
И вы хотите, лифт, доктор?
180
00:22:52,411 --> 00:22:56,415
- ВОЗ сократит свой бюджет
- Это очень важно.
181
00:22:56,615 --> 00:23:00,720
- Иметь достаточно оборудования
- Наверное, нет?
182
00:23:05,891 --> 00:23:08,332
Это небольшой лодке. Нет
конфиденциальность!
183
00:23:08,344 --> 00:23:10,796
Я не стесняюсь.

View File

@ -1056,7 +1056,6 @@ namespace Test.Logic.Forms
}
[TestMethod]
[DeploymentItem("SubtitleEdit.exe")]
public void RemoveTextKeepMusicSymbolsButRemoveHI()
{
RemoveTextForHI target = GetRemoveTextForHiLib();
@ -1070,7 +1069,6 @@ namespace Test.Logic.Forms
}
[TestMethod]
[DeploymentItem("SubtitleEdit.exe")]
public void RemoveTextRemoveEmdash()
{
RemoveTextForHI target = GetRemoveTextForHiLib();
@ -1085,7 +1083,6 @@ namespace Test.Logic.Forms
}
[TestMethod]
[DeploymentItem("SubtitleEdit.exe")]
public void RemoveTextIfUppercaseEmdashRemoveInDialogue()
{
RemoveTextForHI target = GetRemoveTextForHiLib();
@ -1098,7 +1095,6 @@ namespace Test.Logic.Forms
}
[TestMethod]
[DeploymentItem("SubtitleEdit.exe")]
public void RemoveTextIfUppercaseEmdashRemoveInDialogueWithSpaces()
{
RemoveTextForHI target = GetRemoveTextForHiLib();

View File

@ -43,6 +43,7 @@
<Reference Include="System.Xml.Linq" />
</ItemGroup>
<ItemGroup>
<Compile Include="Core\LanguageAutoDetectTest.cs" />
<Compile Include="Core\StringExtensionsTest.cs" />
<Compile Include="Core\SubtitleTest.cs" />
<Compile Include="Logic\Ocr\BinaryOcrTest.cs" />
@ -114,6 +115,16 @@
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</Content>
</ItemGroup>
<ItemGroup>
<Content Include="Files\auto_detect_Danish.srt">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</Content>
</ItemGroup>
<ItemGroup>
<Content Include="Files\auto_detect_Russian.srt">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</Content>
</ItemGroup>
<Import Project="$(MSBuildBinPath)\Microsoft.CSharp.targets" />
<PropertyGroup>
<PreBuildEvent>