Merge pull request #1370 from xylographe/uadgl

Updated LanguageAutoDetect
This commit is contained in:
Nikolaj Olsson 2015-10-09 14:55:58 +02:00
commit 0d63370c10

View File

@ -113,6 +113,13 @@ namespace Nikse.SubtitleEdit.Core
"razumem", "videla", "ceo", "svet", "porodica", "voleo", "srećan", "dođavola", "svetu", "htela", "razumem", "videla", "ceo", "svet", "porodica", "voleo", "srećan", "dođavola", "svetu", "htela",
"videli", "negde", "želeo", "ponovo", "devojka", "umreti", "čoveka", "mesta", "deca", "osećam", "videli", "negde", "želeo", "ponovo", "devojka", "umreti", "čoveka", "mesta", "deca", "osećam",
"uopšte", "decu", "napred", "porodicu", "zaista", "mestu", "lepa", "takođe", "reč", "telo" }; "uopšte", "decu", "napred", "porodicu", "zaista", "mestu", "lepa", "takođe", "reč", "telo" };
private static readonly string[] AutoDetectWordsSerbianCyrillic = { "сам", "али", "није", "само", "ово", "како", "добро", "све", "тако", "ће", "могу", "ћу", "зашто", "нешто", "за", "шта", "овде" };
private static readonly string[] AutoDetectWordsIndonesian = { "yang", "tahu", "bisa", "akan", "tahun", "tapi", "dengan", "untuk", "rumah", "dalam", "sudah", "bertemu" };
private static readonly string[] AutoDetectWordsThai = { "โอ", "โรเบิร์ต", "วิตตอเรีย", "ดร", "คุณตำรวจ", "ราเชล", "ไม่", "เลดดิส", "พระเจ้า", "เท็ดดี้", "หัวหน้า", "แอนดรูว์" };
private static readonly string[] AutoDetectWordsKorean = { "그리고", "아니야", "하지만", "말이야", "그들은", "우리가" };
private static readonly string[] AutoDetectWordsFinnish = { "että", "kuin", "minä", "mitään", "Mutta", "siitä", "täällä", "poika", "Kiitos", "enää", "vielä", "tässä" };
private static readonly string[] AutoDetectWordsRomanian1 = { "pentru", "oamenii", "decât", "[Vv]reau", "[Ss]înt", "Asteaptã", "Fãrã", "aici", "domnule", "trãiascã", "niciodatã", "înseamnã", "vorbesti", "fãcut", "spune" };
private static readonly string[] AutoDetectWordsRomanian2 = { "pentru", "oamenii", "decat", "[Tt]rebuie", "[Aa]cum", "Poate", "vrea", "soare", "nevoie", "daca", "echilibrul", "vorbesti", "zeului", "atunci", "memoria", "soarele" };
private static string AutoDetectGoogleLanguage(string text, int bestCount) private static string AutoDetectGoogleLanguage(string text, int bestCount)
{ {
@ -163,7 +170,7 @@ namespace Nikse.SubtitleEdit.Core
count = GetCount(text, AutoDetectWordsFrench); count = GetCount(text, AutoDetectWordsFrench);
if (count > bestCount) if (count > bestCount)
{ {
int romanianCount = GetCount(text, "[Ss]înt", "aici", "domnule", "pentru", "Vreau"); int romanianCount = GetCount(text, "[Vv]reau", "[Ss]înt", "[Aa]cum", "pentru", "domnule", "aici");
if (romanianCount < 5) if (romanianCount < 5)
return "fr"; return "fr";
} }
@ -203,20 +210,9 @@ namespace Nikse.SubtitleEdit.Core
count = GetCount(text, AutoDetectWordsArabic); count = GetCount(text, AutoDetectWordsArabic);
if (count > bestCount) if (count > bestCount)
{ {
if (GetCount(text, "אולי", "אולי", "אולי", "אולי", "טוב", "טוב") > 10) int hebrewCount = GetCount(text, AutoDetectWordsHebrew);
return "he"; if (hebrewCount < count)
return "ar"; // Arabic
int romanianCount = GetCount(text, "sînt", "aici", "Sînt", "domnule", "pentru", "Vreau", "trãiascã", "niciodatã", "înseamnã",
"vorbesti", "oamenii", "Asteaptã", "fãcut", "Fãrã", "spune", "decât", "pentru", "vreau");
if (romanianCount > count)
return "ro"; // Romanian
romanianCount = GetCount(text, "daca", "pentru", "acum", "soare", "trebuie", "Trebuie", "nevoie", "decat", "echilibrul",
"vorbesti", "oamenii", "zeului", "vrea", "atunci", "Poate", "Acum", "memoria", "soarele");
if (romanianCount > count)
return "ro"; // Romanian
return "ar"; // Arabic
} }
count = GetCount(text, AutoDetectWordsHebrew); count = GetCount(text, AutoDetectWordsHebrew);
@ -246,29 +242,27 @@ namespace Nikse.SubtitleEdit.Core
if (count > bestCount) if (count > bestCount)
return "tr"; // Turkish return "tr"; // Turkish
count = GetCount(text, "yang", "tahu", "bisa", "akan", "tahun", "tapi", "dengan", "untuk", "rumah", "dalam", "sudah", "bertemu"); count = GetCount(text, AutoDetectWordsIndonesian);
if (count > bestCount) if (count > bestCount)
return "id"; // Indonesian return "id"; // Indonesian
count = GetCount(text, "โอ", "โรเบิร์ต", "วิตตอเรีย", "ดร", "คุณตำรวจ", "ราเชล", "ไม่", "เลดดิส", "พระเจ้า", "เท็ดดี้", "หัวหน้า", "แอนดรูว์"); count = GetCount(text, AutoDetectWordsThai);
if (count > 10 || count > bestCount) if (count > 10 || count > bestCount)
return "th"; // Thai return "th"; // Thai
count = GetCount(text, "그리고", "아니야", "하지만", "말이야", "그들은", "우리가"); count = GetCount(text, AutoDetectWordsKorean);
if (count > 10 || count > bestCount) if (count > 10 || count > bestCount)
return "ko"; // Korean return "ko"; // Korean
count = GetCount(text, "että", "kuin", "minä", "mitään", "Mutta", "siitä", "täällä", "poika", "Kiitos", "enää", "vielä", "tässä"); count = GetCount(text, AutoDetectWordsFinnish);
if (count > bestCount) if (count > bestCount)
return "fi"; // Finnish return "fi"; // Finnish
count = GetCount(text, "sînt", "aici", "Sînt", "domnule", "pentru", "Vreau", "trãiascã", "niciodatã", "înseamnã", "vorbesti", "oamenii", count = GetCount(text, AutoDetectWordsRomanian1);
"Asteaptã", "fãcut", "Fãrã", "spune", "decât", "pentru", "vreau");
if (count > bestCount) if (count > bestCount)
return "ro"; // Romanian return "ro"; // Romanian
count = GetCount(text, "daca", "pentru", "acum", "soare", "trebuie", "Trebuie", "nevoie", "decat", "echilibrul", "vorbesti", "oamenii", count = GetCount(text, AutoDetectWordsRomanian2);
"zeului", "vrea", "atunci", "Poate", "Acum", "memoria", "soarele");
if (count > bestCount) if (count > bestCount)
return "ro"; // Romanian return "ro"; // Romanian
@ -355,11 +349,12 @@ namespace Nikse.SubtitleEdit.Core
switch (shortName) switch (shortName)
{ {
case "da_DK": case "da_DK":
count = GetCount(text, "vi", "hun", "og", "jeg", "var", "men", "bliver", "meget", "spørger", "Hej", "utrolig", "dejligt"); count = GetCount(text, AutoDetectWordsDanish);
if (count > bestCount) if (count > bestCount)
{ {
int norweigianCount = GetCount(text, "ut", "deg", "meg", "merkelig", "mye", "spørre"); int norwegianCount = GetCount(text, "ut", "deg", "meg", "merkelig", "mye", "spørre");
if (norweigianCount < 2) int dutchCount = GetCount(text, "van", "een", "[Hh]et", "m(ij|ij)", "z(ij|ij)n");
if (norwegianCount < 2 && dutchCount < count)
languageName = shortName; languageName = shortName;
} }
break; break;
@ -373,6 +368,11 @@ namespace Nikse.SubtitleEdit.Core
languageName = shortName; languageName = shortName;
} }
break; break;
case "sv_SE":
count = GetCount(text, AutoDetectWordsSwedish);
if (count > bestCount)
languageName = shortName;
break;
case "en_US": case "en_US":
count = GetCount(text, AutoDetectWordsEnglish); count = GetCount(text, AutoDetectWordsEnglish);
if (count > bestCount) if (count > bestCount)
@ -388,7 +388,7 @@ namespace Nikse.SubtitleEdit.Core
} }
break; break;
case "en_GB": case "en_GB":
count = GetCount(text, "we", "are", "and", "you", "your", "what"); count = GetCount(text, AutoDetectWordsEnglish);
if (count > bestCount) if (count > bestCount)
{ {
languageName = shortName; languageName = shortName;
@ -401,27 +401,14 @@ namespace Nikse.SubtitleEdit.Core
} }
} }
break; break;
case "sv_SE":
count = GetCount(text, "vi", "är", "och", "Jag", "inte", "för");
if (count > bestCount)
languageName = shortName;
break;
case "es_ES": case "es_ES":
count = GetCount(text, AutoDetectWordsSpanish); count = GetCount(text, AutoDetectWordsSpanish);
if (count > bestCount) if (count > bestCount)
{ {
int frenchWords = GetCount(text, "[Cc]'est", "pas", "vous", "pour", "suis", "Pourquoi", "maison", "souviens", "quelque"); // not spanish words int frenchCount = GetCount(text, "[Cc]'est", "pas", "vous", "pour", "suis", "Pourquoi", "maison", "souviens", "quelque"); // not spanish words
if (frenchWords < 2) int portugueseCount = GetCount(text, "[NnCc]ão", "Então", "h?ouve", "pessoal", "rapariga", "tivesse", "fizeste",
languageName = shortName; "jantar", "conheço", "atenção", "foste", "milhões", "devias", "ganhar", "raios"); // not spanish words
} if (frenchCount < 2 && portugueseCount < 2)
break;
case "fr_FR":
count = GetCount(text, AutoDetectWordsFrench);
if (count > bestCount)
{
int spanishWords = GetCount(text, "Hola", "nada", "Vamos", "pasa", "los", "como"); // not french words
int italianWords = GetCount(text, AutoDetectWordsItalian); // not italian words
if (spanishWords < 2 && italianWords < 2)
languageName = shortName; languageName = shortName;
} }
break; break;
@ -429,24 +416,35 @@ namespace Nikse.SubtitleEdit.Core
count = GetCount(text, AutoDetectWordsItalian); count = GetCount(text, AutoDetectWordsItalian);
if (count > bestCount) if (count > bestCount)
{ {
int frenchWords = GetCount(text, "[Cc]'est", "pas", "vous", "pour", "suis", "Pourquoi", "maison", "souviens", "quelque"); // not spanish words int frenchCount = GetCount(text, "[Cc]'est", "pas", "vous", "pour", "suis", "Pourquoi", "maison", "souviens", "quelque"); // not italian words
int spanishWords = GetCount(text, "Hola", "nada", "Vamos", "pasa", "los", "como"); // not french words int spanishCount = GetCount(text, "Hola", "nada", "Vamos", "pasa", "los", "como"); // not italian words
if (frenchWords < 2 && spanishWords < 2) if (frenchCount < 2 && spanishCount < 2)
languageName = shortName;
}
break;
case "fr_FR":
count = GetCount(text, AutoDetectWordsFrench);
if (count > bestCount)
{
int romanianCount = GetCount(text, "[Vv]reau", "[Ss]înt", "[Aa]cum", "pentru", "domnule", "aici");
int spanishCount = GetCount(text, "Hola", "nada", "Vamos", "pasa", "los", "como"); // not french words
int italianCount = GetCount(text, AutoDetectWordsItalian);
if (romanianCount < 5 && spanishCount < 2 && italianCount < 2)
languageName = shortName; languageName = shortName;
} }
break; break;
case "de_DE": case "de_DE":
count = GetCount(text, "und", "auch", "sich", "bin", "hast", "möchte"); count = GetCount(text, AutoDetectWordsGerman);
if (count > bestCount) if (count > bestCount)
languageName = shortName; languageName = shortName;
break; break;
case "nl_NL": case "nl_NL":
count = GetCount(text, "van", "een", "[Hh]et", "m(ij|ij)", "z(ij|ij)n"); count = GetCount(text, AutoDetectWordsDutch);
if (count > bestCount) if (count > bestCount)
languageName = shortName; languageName = shortName;
break; break;
case "pl_PL": case "pl_PL":
count = GetCount(text, "Czy", "ale", "ty", "siê", "jest", "mnie"); count = GetCount(text, AutoDetectWordsPolish);
if (count > bestCount) if (count > bestCount)
languageName = shortName; languageName = shortName;
break; break;
@ -460,21 +458,17 @@ namespace Nikse.SubtitleEdit.Core
if (count > bestCount) if (count > bestCount)
languageName = shortName; languageName = shortName;
break; break;
case "ro_RO": case "uk_UA":
count = GetCount(text, "sînt", "aici", "Sînt", "domnule", "pentru", "Vreau", "trãiascã", "niciodatã", "înseamnã", "vorbesti", "oamenii", "Asteaptã", count = GetCount(text, AutoDetectWordsUkrainian);
"fãcut", "Fãrã", "spune", "decât", "pentru", "vreau"); if (count > bestCount)
languageName = shortName;
break;
case "ro_RO":
count = GetCount(text, AutoDetectWordsRomanian1);
if (count <= bestCount)
count = GetCount(text, AutoDetectWordsRomanian2);
if (count > bestCount) if (count > bestCount)
{
languageName = shortName; languageName = shortName;
}
else
{
count = GetCount(text, "daca", "pentru", "acum", "soare", "trebuie", "Trebuie", "nevoie", "decat", "echilibrul", "vorbesti", "oamenii", "zeului",
"vrea", "atunci", "Poate", "Acum", "memoria", "soarele");
if (count > bestCount)
languageName = shortName;
}
break; break;
case "hr_HR": // Croatian case "hr_HR": // Croatian
count = GetCount(text, AutoDetectWordsCroatianAndSerbian); count = GetCount(text, AutoDetectWordsCroatianAndSerbian);
@ -504,6 +498,11 @@ namespace Nikse.SubtitleEdit.Core
} }
} }
break; break;
case "sr": // Serbian (Cyrillic)
count = GetCount(text, AutoDetectWordsSerbianCyrillic);
if (count > bestCount)
languageName = shortName;
break;
case "pt_PT": // Portuguese case "pt_PT": // Portuguese
count = GetCount(text, AutoDetectWordsPortuguese); count = GetCount(text, AutoDetectWordsPortuguese);
if (count > bestCount) if (count > bestCount)