mirror of
https://github.com/SubtitleEdit/subtitleedit.git
synced 2024-10-27 22:42:38 +01:00
Added some static readonly language auto detect words - aaaxx :)
This commit is contained in:
parent
44f5d5672b
commit
ca17ec6e8a
@ -918,7 +918,7 @@ namespace Nikse.SubtitleEdit.Logic
|
||||
encoding = DetectAnsiEncoding(buffer);
|
||||
|
||||
Encoding greekEncoding = Encoding.GetEncoding(1253); // Greek
|
||||
if (GetCount(greekEncoding.GetString(buffer), "μου", "είναι", "Είναι", "αυτό", "Τόμπυ", "καλά") > 5)
|
||||
if (GetCount(greekEncoding.GetString(buffer), AutoDetectWordsGreek) > 5)
|
||||
return greekEncoding;
|
||||
|
||||
Encoding russianEncoding = Encoding.GetEncoding(1251); // Cyrillic
|
||||
@ -942,7 +942,7 @@ namespace Nikse.SubtitleEdit.Logic
|
||||
return hewbrewEncoding;
|
||||
return arabicEncoding;
|
||||
}
|
||||
if (GetCount(hewbrewEncoding.GetString(buffer), "אתה", "אולי", "הוא", "בסדר", "יודע", "טוב") > 5)
|
||||
if (GetCount(hewbrewEncoding.GetString(buffer), AutoDetectWordsHebrew) > 5)
|
||||
return hewbrewEncoding;
|
||||
|
||||
Encoding romanianEncoding = Encoding.GetEncoding(1250); // Romanian
|
||||
@ -1018,7 +1018,7 @@ namespace Nikse.SubtitleEdit.Logic
|
||||
Encoding encoding = DetectEncoding.EncodingTools.DetectInputCodepage(buffer);
|
||||
|
||||
Encoding greekEncoding = Encoding.GetEncoding(1253); // Greek
|
||||
if (GetCount(greekEncoding.GetString(buffer), "μου", "είναι", "Είναι", "αυτό", "Τόμπυ", "καλά") > 5)
|
||||
if (GetCount(greekEncoding.GetString(buffer), AutoDetectWordsGreek) > 5)
|
||||
return greekEncoding;
|
||||
|
||||
Encoding russianEncoding = Encoding.GetEncoding(1251); // Cyrillic
|
||||
@ -1043,7 +1043,7 @@ namespace Nikse.SubtitleEdit.Logic
|
||||
return hewbrewEncoding;
|
||||
return arabicEncoding;
|
||||
}
|
||||
if (GetCount(hewbrewEncoding.GetString(buffer), "אתה", "אולי", "הוא", "בסדר", "יודע", "טוב") > 5)
|
||||
if (GetCount(hewbrewEncoding.GetString(buffer), AutoDetectWordsHebrew) > 5)
|
||||
return hewbrewEncoding;
|
||||
|
||||
return encoding;
|
||||
@ -1176,13 +1176,37 @@ namespace Nikse.SubtitleEdit.Logic
|
||||
}
|
||||
}
|
||||
|
||||
public static readonly string[] AutoDetectWordsEnglish = { "we", "are", "and", "you", "your", "what" };
|
||||
public static readonly string[] AutoDetectWordsDanish = { "vi", "han", "og", "jeg", "var", "men", "gider", "bliver", "virkelig", "kommer", "tilbage", "Hej" };
|
||||
public static readonly string[] AutoDetectWordsNorwegian = { "vi", "er", "og", "jeg", "var", "men" };
|
||||
public static readonly string[] AutoDetectWordsSwedish = { "vi", "är", "och", "Jag", "inte", "för" };
|
||||
public static readonly string[] AutoDetectWordsSpanish = { "el", "bien", "Vamos", "Hola", "casa", "con" };
|
||||
public static readonly string[] AutoDetectWordsFrench = { "un", "vous", "avec", "pas", "ce", "une" };
|
||||
public static readonly string[] AutoDetectWordsGerman = { "und", "auch", "sich", "bin", "hast", "möchte" };
|
||||
public static readonly string[] AutoDetectWordsDutch = { "van", "een", "[Hh]et", "m(ij|ij)", "z(ij|ij)n" };
|
||||
public static readonly string[] AutoDetectWordsPolish = { "Czy", "ale", "ty", "siê", "jest", "mnie" };
|
||||
public static readonly string[] AutoDetectWordsItalian = { "Cosa", "sono", "Grazie", "Buongiorno", "bene", "questo", "ragazzi", "propriamente", "numero", "hanno", "giorno", "faccio", "davvero", "negativo", "essere", "vuole", "sensitivo", "venire" };
|
||||
public static readonly string[] AutoDetectWordsPortuguese = { "não", "Não", "Estás", "Então", "isso", "com" };
|
||||
public static readonly string[] AutoDetectWordsGreek = { "μου", "είναι", "Είναι", "αυτό", "Τόμπυ", "καλά", "Ενταξει", "Ενταξει", "πρεπει", "Λοιπον", "τιποτα", "ξερεις" };
|
||||
public static readonly string[] AutoDetectWordsRussian = { "все", "это", "как", "Воробей", "сюда", "Давай" };
|
||||
public static readonly string[] AutoDetectWordsBulgarian = { "Какво", "тук", "може", "Как", "Ваше", "какво" };
|
||||
public static readonly string[] AutoDetectWordsRomanian = { "Какво", "тук", "може", "Как", "Ваше", "какво" };
|
||||
public static readonly string[] AutoDetectWordsArabic = { "Какво", "тук", "може", "Как", "Ваше", "какво" };
|
||||
public static readonly string[] AutoDetectWordsHebrew = { "אתה", "אולי", "הוא", "בסדר", "יודע", "טוב" };
|
||||
public static readonly string[] AutoDetectWordsSerbian = { "sam", "što", "öto", "äto", "ovo", "vas", "nije", "Šta", "ovde", "za" };
|
||||
public static readonly string[] AutoDetectWordsVietnamese = { "không", "tôi", "anh", "đó", "Tôi", "ông" };
|
||||
public static readonly string[] AutoDetectWordsHungarian = { "hogy", "lesz", "tudom", "vagy", "mondtam", "még" };
|
||||
public static readonly string[] AutoDetectWordsCroatian = { "sam", "öto", "äto", "ovo", "vas", "što" };
|
||||
public static readonly string[] AutoDetectWordsTurkish = { "için", "Tamam", "Hayır", "benim", "daha", "deðil", "önce", "lazým", "benim", "çalýþýyor", "burada", "efendim" };
|
||||
|
||||
|
||||
public static string AutoDetectGoogleLanguage(string text, int bestCount)
|
||||
{
|
||||
int count = GetCount(text, "we", "are", "and", "you", "your", "what");
|
||||
int count = GetCount(text, AutoDetectWordsEnglish);
|
||||
if (count > bestCount)
|
||||
return "en";
|
||||
|
||||
count = GetCount(text, "vi", "han", "og", "jeg", "var", "men", "gider", "bliver", "virkelig", "kommer", "tilbage", "Hej");
|
||||
count = GetCount(text, AutoDetectWordsDanish);
|
||||
if (count > bestCount)
|
||||
{
|
||||
int norwegianCount = GetCount(text, "ut", "deg", "meg", "merkelig", "mye", "spørre");
|
||||
@ -1191,7 +1215,7 @@ namespace Nikse.SubtitleEdit.Logic
|
||||
return "da";
|
||||
}
|
||||
|
||||
count = GetCount(text, "vi", "er", "og", "jeg", "var", "men");
|
||||
count = GetCount(text, AutoDetectWordsNorwegian);
|
||||
if (count > bestCount)
|
||||
{
|
||||
int danishCount = GetCount(text, "siger", "dig", "mig", "mærkelig", "tilbage", "spørge");
|
||||
@ -1200,11 +1224,11 @@ namespace Nikse.SubtitleEdit.Logic
|
||||
return "no";
|
||||
}
|
||||
|
||||
count = GetCount(text, "vi", "är", "och", "Jag", "inte", "för");
|
||||
count = GetCount(text, AutoDetectWordsSwedish);
|
||||
if (count > bestCount)
|
||||
return "sv";
|
||||
|
||||
count = GetCount(text, "el", "bien", "Vamos", "Hola", "casa", "con");
|
||||
count = GetCount(text, AutoDetectWordsSpanish);
|
||||
if (count > bestCount)
|
||||
{
|
||||
int frenchCount = GetCount(text, "[Cc]'est", "pas", "vous", "pour", "suis", "Pourquoi", "maison", "souviens", "quelque"); // not spanish words
|
||||
@ -1212,30 +1236,29 @@ namespace Nikse.SubtitleEdit.Logic
|
||||
return "es";
|
||||
}
|
||||
|
||||
count = GetCount(text, "un", "vous", "avec", "pas", "ce", "une");
|
||||
count = GetCount(text, AutoDetectWordsFrench);
|
||||
if (count > bestCount)
|
||||
{
|
||||
int spanishCount = GetCount(text, "Hola", "nada", "Vamos", "pasa", "los", "como"); // not french words
|
||||
int italianCount = GetCount(text, "Cosa", "sono", "Grazie", "Buongiorno", "bene", "questo");
|
||||
int italianCount = GetCount(text, AutoDetectWordsItalian);
|
||||
int romanianCount = GetCount(text, "sînt", "aici", "Sînt", "domnule", "pentru", "Vreau");
|
||||
if (spanishCount < 2 && italianCount < 2 && romanianCount < 5)
|
||||
return "fr";
|
||||
}
|
||||
|
||||
count = GetCount(text, "und", "auch", "sich", "bin", "hast", "möchte");
|
||||
count = GetCount(text, AutoDetectWordsGerman);
|
||||
if (count > bestCount)
|
||||
return "de";
|
||||
|
||||
count = GetCount(text, "van", "een", "[Hh]et", "m(ij|ij)", "z(ij|ij)n");
|
||||
count = GetCount(text, AutoDetectWordsDutch);
|
||||
if (count > bestCount)
|
||||
return "nl";
|
||||
|
||||
count = GetCount(text, "Czy", "ale", "ty", "siê", "jest", "mnie");
|
||||
count = GetCount(text, AutoDetectWordsPolish);
|
||||
if (count > bestCount)
|
||||
return "pl";
|
||||
|
||||
count = GetCount(text, "Cosa", "sono", "Grazie", "Buongiorno", "bene", "questo", "ragazzi", "propriamente", "numero", "hanno",
|
||||
"giorno", "faccio", "davvero", "negativo", "essere", "vuole", "sensitivo", "venire");
|
||||
count = GetCount(text, AutoDetectWordsItalian);
|
||||
if (count > bestCount)
|
||||
{
|
||||
int frenchCount = GetCount(text, "[Cc]'est", "pas", "vous", "pour", "suis", "Pourquoi", "maison", "souviens", "quelque"); // not spanish words
|
||||
@ -1244,23 +1267,23 @@ namespace Nikse.SubtitleEdit.Logic
|
||||
return "it";
|
||||
}
|
||||
|
||||
count = GetCount(text, "não", "Não", "Estás", "Então", "isso", "com");
|
||||
count = GetCount(text, AutoDetectWordsPortuguese);
|
||||
if (count > bestCount)
|
||||
return "pt"; // Portuguese
|
||||
|
||||
count = GetCount(text, "μου", "είναι", "Είναι", "αυτό", "Τόμπυ", "καλά", "Ενταξει", "Ενταξει", "πρεπει", "Λοιπον", "τιποτα", "ξερεις");
|
||||
count = GetCount(text, AutoDetectWordsGreek);
|
||||
if (count > bestCount)
|
||||
return "el"; // Greek
|
||||
|
||||
count = GetCount(text, "все", "это", "как", "Воробей", "сюда", "Давай");
|
||||
count = GetCount(text, AutoDetectWordsRussian);
|
||||
if (count > bestCount)
|
||||
return "ru"; // Russian
|
||||
|
||||
count = GetCount(text, "Какво", "тук", "може", "Как", "Ваше", "какво");
|
||||
count = GetCount(text, AutoDetectWordsBulgarian);
|
||||
if (count > bestCount)
|
||||
return "bg"; // Bulgarian
|
||||
|
||||
count = GetCount(text, "من", "هل", "لا", "فى", "لقد", "ما");
|
||||
count = GetCount(text, AutoDetectWordsArabic);
|
||||
if (count > bestCount)
|
||||
{
|
||||
if (GetCount(text, "אולי", "אולי", "אולי", "אולי", "טוב", "טוב") > 10)
|
||||
@ -1279,11 +1302,11 @@ namespace Nikse.SubtitleEdit.Logic
|
||||
return "ar"; // Arabic
|
||||
}
|
||||
|
||||
count = GetCount(text, "אתה", "אולי", "הוא", "בסדר", "יודע", "טוב");
|
||||
count = GetCount(text, AutoDetectWordsHebrew);
|
||||
if (count > bestCount)
|
||||
return "he"; // Hebrew
|
||||
|
||||
count = GetCount(text, "sam", "što", "öto", "äto", "ovo", "vas", "nije", "Šta", "ovde", "za");
|
||||
count = GetCount(text, AutoDetectWordsSerbian);
|
||||
if (count > bestCount)
|
||||
{
|
||||
int croatianCount = GetCount(text, "sigurnošću", "ubojstvo", "službeni", "nedjelja", "izražava", "dogodilo", "svjetlo", "sigurno", "shvaćam",
|
||||
@ -1298,15 +1321,15 @@ namespace Nikse.SubtitleEdit.Logic
|
||||
return "sr"; // Serbian
|
||||
}
|
||||
|
||||
count = GetCount(text, "không", "tôi", "anh", "đó", "Tôi", "ông");
|
||||
count = GetCount(text, AutoDetectWordsVietnamese);
|
||||
if (count > bestCount)
|
||||
return "vi"; // Vietnamese
|
||||
|
||||
count = GetCount(text, "hogy", "lesz", "tudom", "vagy", "mondtam", "még");
|
||||
count = GetCount(text, AutoDetectWordsHungarian);
|
||||
if (count > bestCount)
|
||||
return "hu"; // Hungarian
|
||||
|
||||
count = GetCount(text, "için", "Tamam", "Hayır", "benim", "daha", "deðil", "önce", "lazým", "benim", "çalýþýyor", "burada", "efendim");
|
||||
count = GetCount(text, AutoDetectWordsTurkish);
|
||||
if (count > bestCount)
|
||||
return "tr"; // Turkish
|
||||
|
||||
@ -1422,7 +1445,7 @@ namespace Nikse.SubtitleEdit.Logic
|
||||
}
|
||||
break;
|
||||
case "nb_NO":
|
||||
count = GetCount(text, "vi", "er", "og", "jeg", "var", "men");
|
||||
count = GetCount(text, AutoDetectWordsNorwegian);
|
||||
if (count > bestCount)
|
||||
{
|
||||
int danishCount = GetCount(text, "siger", "dig", "mig", "mærkelig", "tilbage", "spørge");
|
||||
@ -1432,7 +1455,7 @@ namespace Nikse.SubtitleEdit.Logic
|
||||
}
|
||||
break;
|
||||
case "en_US":
|
||||
count = GetCount(text, "we", "are", "and", "you", "your", "what");
|
||||
count = GetCount(text, AutoDetectWordsEnglish);
|
||||
if (count > bestCount)
|
||||
{
|
||||
if (containsEnGb)
|
||||
@ -1471,7 +1494,7 @@ namespace Nikse.SubtitleEdit.Logic
|
||||
languageName = shortName;
|
||||
break;
|
||||
case "es_ES":
|
||||
count = GetCount(text, "el", "bien", "Vamos", "Hola", "casa", "con");
|
||||
count = GetCount(text, AutoDetectWordsSpanish);
|
||||
if (count > bestCount)
|
||||
{
|
||||
int frenchWords = GetCount(text, "[Cc]'est", "pas", "vous", "pour", "suis", "Pourquoi", "maison", "souviens", "quelque"); // not spanish words
|
||||
@ -1480,17 +1503,17 @@ namespace Nikse.SubtitleEdit.Logic
|
||||
}
|
||||
break;
|
||||
case "fr_FR":
|
||||
count = GetCount(text, "un", "vous", "avec", "pas", "ce", "une");
|
||||
count = GetCount(text, AutoDetectWordsFrench);
|
||||
if (count > bestCount)
|
||||
{
|
||||
int spanishWords = GetCount(text, "Hola", "nada", "Vamos", "pasa", "los", "como"); // not french words
|
||||
int italianWords = GetCount(text, "Cosa", "sono", "Grazie", "Buongiorno", "bene", "questo"); // not italian words
|
||||
int italianWords = GetCount(text, AutoDetectWordsItalian); // not italian words
|
||||
if (spanishWords < 2 && italianWords < 2)
|
||||
languageName = shortName;
|
||||
}
|
||||
break;
|
||||
case "it_IT":
|
||||
count = GetCount(text, "Cosa", "sono", "Grazie", "Buongiorno", "bene", "questo");
|
||||
count = GetCount(text, AutoDetectWordsItalian);
|
||||
if (count > bestCount)
|
||||
{
|
||||
int frenchWords = GetCount(text, "[Cc]'est", "pas", "vous", "pour", "suis", "Pourquoi", "maison", "souviens", "quelque"); // not spanish words
|
||||
@ -1515,12 +1538,12 @@ namespace Nikse.SubtitleEdit.Logic
|
||||
languageName = shortName;
|
||||
break;
|
||||
case "el_GR":
|
||||
count = GetCount(text, "μου", "είναι", "Είναι", "αυτό", "Τόμπυ", "καλά");
|
||||
count = GetCount(text, AutoDetectWordsGreek);
|
||||
if (count > bestCount)
|
||||
languageName = shortName;
|
||||
break;
|
||||
case "ru_RU":
|
||||
count = GetCount(text, "все", "это", "как", "Воробей", "сюда", "Давай");
|
||||
count = GetCount(text, AutoDetectWordsRussian);
|
||||
if (count > bestCount)
|
||||
languageName = shortName;
|
||||
break;
|
||||
@ -1541,22 +1564,22 @@ namespace Nikse.SubtitleEdit.Logic
|
||||
}
|
||||
break;
|
||||
case "hr_HR": // Croatian
|
||||
count = GetCount(text, "sam", "öto", "äto", "ovo", "vas", "što");
|
||||
count = GetCount(text, AutoDetectWordsCroatian);
|
||||
if (count > bestCount)
|
||||
languageName = shortName;
|
||||
break;
|
||||
case "pt_PT": // Portuguese
|
||||
count = GetCount(text, "não", "Não", "Estás", "Então", "isso", "com");
|
||||
count = GetCount(text, AutoDetectWordsPortuguese);
|
||||
if (count > bestCount)
|
||||
languageName = shortName;
|
||||
break;
|
||||
case "pt_BR": // Portuguese (Brasil)
|
||||
count = GetCount(text, "não", "Não", "Estás", "Então", "isso", "com");
|
||||
count = GetCount(text, AutoDetectWordsPortuguese);
|
||||
if (count > bestCount)
|
||||
languageName = shortName;
|
||||
break;
|
||||
case "hu_HU": // Hungarian
|
||||
count = GetCount(text, "hogy", "lesz", "tudom", "vagy", "mondtam", "még");
|
||||
count = GetCount(text, AutoDetectWordsHungarian);
|
||||
if (count > bestCount)
|
||||
languageName = shortName;
|
||||
break;
|
||||
|
Loading…
Reference in New Issue
Block a user