// SubtitleEdit/libse/Translate/GoogleTranslator1.cs
using System;
using System.Collections.Generic;
using System.Linq;
2018-11-30 15:43:46 +01:00
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
2018-12-01 21:56:07 +01:00
using Nikse.SubtitleEdit.Core.SubtitleFormats;
2018-11-30 15:43:46 +01:00
namespace Nikse.SubtitleEdit.Core.Translate
{
/// <summary>
/// Google translate via Google V1 API - see https://cloud.google.com/translate/
/// </summary>
public class GoogleTranslator1 : ITranslator
{
2018-12-09 19:35:27 +01:00
// Separator written (surrounded by single spaces) between paragraphs when they are
// joined into one request string in Translate; any occurrence of this character
// inside a paragraph's own text is removed before joining.
private const char SplitChar = '\n';
2018-11-30 15:43:46 +01:00
/// <summary>
/// Gets the language pairs offered by this translator.
/// </summary>
/// <returns>
/// The list returned by <c>GetTranslationPairs()</c> on a <c>GoogleTranslator2</c>
/// instance constructed with an empty string.
/// </returns>
public List<TranslationPair> GetTranslationPairs()
{
    // The pair list is not maintained here; it is borrowed from GoogleTranslator2.
    // NOTE(review): the empty-string constructor argument is presumably an API key
    // that is irrelevant for listing pairs - confirm against GoogleTranslator2.
    return new GoogleTranslator2(string.Empty).GetTranslationPairs();
}
/// <summary>
/// Gets the display name of this translator.
/// </summary>
/// <returns>The fixed text "Google translate (old)".</returns>
public string GetName() => "Google translate (old)";
/// <summary>
/// Gets the web address associated with this translator.
/// </summary>
/// <returns>The fixed text "https://translate.google.com/".</returns>
public string GetUrl() => "https://translate.google.com/";
2018-12-01 21:56:07 +01:00
/// <summary>
/// Translates all paragraphs with a single HTTP GET to the
/// translate.googleapis.com "translate_a/single" endpoint.
/// </summary>
/// <param name="sourceLanguage">Language code sent as the <c>sl</c> query parameter and passed to the pre-translate helpers.</param>
/// <param name="targetLanguage">Language code sent as the <c>tl</c> query parameter and passed to the post-translate helper.</param>
/// <param name="paragraphs">Paragraphs whose <c>Text</c> is translated; their texts are joined with <c>SplitChar</c> into one request.</param>
/// <param name="log">Not used by this implementation.</param>
/// <returns>
/// The translated lines. When the service returns more lines than there are paragraphs,
/// empty lines are dropped if that makes the counts match; when it returns fewer, the
/// result of <c>SplitMergedLines</c> is used if that makes the counts match. Otherwise the
/// lines are returned as received, so the count may differ from <c>paragraphs.Count</c>.
/// </returns>
public List<string> Translate(string sourceLanguage, string targetLanguage, List<Paragraph> paragraphs, StringBuilder log)
{
    // Nothing to translate: avoid a pointless HTTP round trip.
    if (paragraphs.Count == 0)
    {
        return new List<string>();
    }

    // One Formatting per paragraph remembers tags and line breaks removed before
    // translation so they can be re-applied to the translated text afterwards.
    var formattings = new Formatting[paragraphs.Count];
    var input = new StringBuilder();
    for (var index = 0; index < paragraphs.Count; index++)
    {
        var p = paragraphs[index];
        var f = new Formatting();
        formattings[index] = f;
        if (input.Length > 0)
        {
            input.Append(" " + SplitChar + " ");
        }

        // The separator character must not occur inside a paragraph, or the response
        // could no longer be split back into one line per paragraph.
        var text = f.SetTagsAndReturnTrimmed(TranslationHelper.PreTranslate(p.Text.Replace(SplitChar.ToString(), string.Empty), sourceLanguage), sourceLanguage);
        text = f.Unbreak(text, p.Text);
        input.Append(text);
    }

    string result;
    using (var wc = new WebClient())
    {
        string url = $"https://translate.googleapis.com/translate_a/single?client=gtx&sl={sourceLanguage}&tl={targetLanguage}&dt=t&q={Utilities.UrlEncode(input.ToString())}";
        wc.Proxy = Utilities.GetProxy();
        wc.Encoding = Encoding.UTF8;
        wc.Headers.Add("user-agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36");
        result = wc.DownloadString(url).Trim();
    }

    var res = Regex.Unescape(ExtractTranslatedText(result));
    var lines = res.SplitToLines().ToList();
    var resultList = new List<string>();
    for (var index = 0; index < lines.Count; index++)
    {
        var s = Json.DecodeJsonText(lines[index]);
        s = string.Join(Environment.NewLine, s.SplitToLines());
        s = TranslationHelper.PostTranslate(s, targetLanguage);
        s = s.Replace(Environment.NewLine + Environment.NewLine, Environment.NewLine);

        // Each whitespace clean-up runs twice so that up to two stray spaces
        // next to a line break are removed.
        s = s.Replace(Environment.NewLine + " ", Environment.NewLine);
        s = s.Replace(Environment.NewLine + " ", Environment.NewLine);
        s = s.Replace(" " + Environment.NewLine, Environment.NewLine);
        s = s.Replace(" " + Environment.NewLine, Environment.NewLine).Trim();

        // Lines beyond the number of paragraphs have no stored formatting to restore.
        if (formattings.Length > index)
        {
            s = formattings[index].ReAddFormatting(s);
            s = formattings[index].Rebreak(s);
        }

        resultList.Add(s);
    }

    if (resultList.Count > paragraphs.Count)
    {
        // Too many lines: see whether dropping the empty ones restores a 1:1 mapping.
        var trimmedList = resultList.Where(x => !string.IsNullOrEmpty(x)).ToList();
        if (trimmedList.Count == paragraphs.Count)
        {
            return trimmedList;
        }
    }

    if (resultList.Count < paragraphs.Count)
    {
        // Too few lines: the service may have merged neighbours; try to split them again.
        var splitList = SplitMergedLines(resultList, paragraphs);
        if (splitList.Count == paragraphs.Count)
        {
            return splitList;
        }
    }

    return resultList;
}

/// <summary>
/// Scans the raw response text and collects the quoted strings that hold translated text.
/// </summary>
/// <param name="result">Trimmed response body as downloaded from the service.</param>
/// <returns>
/// The selected strings, each prefixed with a space and concatenated, with the overall
/// result trimmed. Escape sequences are left as they appear in the response.
/// </returns>
private static string ExtractTranslatedText(string result)
{
    var sbAll = new StringBuilder();
    var count = 0;

    // Scanning starts at index 1, so an opening '[' at index 0 is accounted for here.
    var level = result.StartsWith('[') ? 1 : 0;
    var i = 1;
    while (i < result.Length - 1)
    {
        var sb = new StringBuilder();
        var inString = false;
        var escaped = false;
        for (; i < result.Length - 1; i++)
        {
            var c = result[i];
            if (!inString)
            {
                if (c == '"')
                {
                    inString = true;
                }
                else if (c == '[')
                {
                    level++;
                }
                else if (c == ']')
                {
                    level--;
                }

                continue;
            }

            if (escaped)
            {
                // Character following a backslash: always part of the string, even if it is '"'.
                sb.Append(c);
                escaped = false;
                continue;
            }

            if (c == '\\')
            {
                // Track escape state explicitly so that a string ending in an escaped
                // backslash (\\") is still recognised as terminated by the quote.
                sb.Append(c);
                escaped = true;
                continue;
            }

            if (c != '"')
            {
                sb.Append(c);
                continue;
            }

            // Unescaped closing quote: one complete string has been read.
            count++;
            if (count % 2 == 1 && level > 2 && level < 5) // even numbers are original text, level 3 is translation
            {
                sbAll.Append(" " + sb);
            }

            i++;
            break;
        }
    }

    return sbAll.ToString().Trim();
}
// Markers compared at the start / end of a paragraph and of its translated line.
private static readonly string[] LeadingMarkers = { "[", "-" };
private static readonly string[] TrailingMarkers = { ".", "!", "?", ",", ":" };

/// <summary>
/// Tries to recover from translated lines having been merged: a line that looks unlike
/// its source paragraph and is more than 150 percent of its length is auto-broken, and if
/// that yields exactly two lines they replace the single merged line.
/// </summary>
/// <param name="input">Translated lines; expected to be fewer than <paramref name="paragraphs"/>.</param>
/// <param name="paragraphs">Source paragraphs the lines should map onto.</param>
/// <returns>
/// A new list with one entry per paragraph when splitting produced exactly
/// <c>paragraphs.Count</c> lines; otherwise <paramref name="input"/> unchanged.
/// </returns>
private static List<string> SplitMergedLines(List<string> input, List<Paragraph> paragraphs)
{
    // Splitting can only add lines, so there is nothing to do unless lines are missing.
    if (input.Count >= paragraphs.Count)
    {
        return input;
    }

    var hits = 0;
    var results = new List<string>(paragraphs.Count);
    for (var index = 0; index < input.Count; index++)
    {
        var line = input[index];

        // Every earlier split consumed one extra paragraph, so the paragraph matching
        // this line is shifted by the number of splits made so far. The index stays in
        // range because hits never exceeds paragraphs.Count - input.Count.
        var text = paragraphs[index + hits].Text;

        var added = false;
        if (hits + input.Count < paragraphs.Count && CountMismatches(text, line) > 0)
        {
            // A mismatch implies text is non-empty, so the division is well defined.
            var percent = line.Length * 100.0 / text.Length;
            if (percent > 150)
            {
                var temp = Utilities.AutoBreakLine(line).SplitToLines();
                if (temp.Count == 2)
                {
                    hits++;
                    results.Add(temp[0]);
                    results.Add(temp[1]);
                    added = true;
                }
            }
        }

        if (!added)
        {
            results.Add(line);
        }
    }

    return results.Count == paragraphs.Count ? results : input;
}

/// <summary>
/// Counts how many surface features of <paramref name="text"/> (leading "[" or "-",
/// leading upper-case letter, trailing punctuation) are absent from <paramref name="line"/>.
/// </summary>
private static int CountMismatches(string text, string line)
{
    var badPoints = 0;
    foreach (var marker in LeadingMarkers)
    {
        if (text.StartsWith(marker, StringComparison.Ordinal) && !line.StartsWith(marker, StringComparison.Ordinal))
        {
            badPoints++;
        }
    }

    if (text.Length > 0 && char.IsUpper(text[0]) && line.Length > 0 && !char.IsUpper(line[0]))
    {
        badPoints++;
    }

    foreach (var marker in TrailingMarkers)
    {
        if (text.EndsWith(marker, StringComparison.Ordinal) && !line.EndsWith(marker, StringComparison.Ordinal))
        {
            badPoints++;
        }
    }

    return badPoints;
}
2018-11-30 15:43:46 +01:00
}
}