From 905c3a28f6565e8f190c5ae0406883ef4683b6d5 Mon Sep 17 00:00:00 2001 From: Nick Zatkovich Date: Sun, 23 Feb 2020 00:51:09 -0800 Subject: [PATCH] Add UTF.Unknown to detect encodings, replacing Win32-only dependency --- build.bat | 2 +- libse/DetectEncoding/EncodingTools.cs | 57 +++++---------------------- libse/LanguageAutoDetect.cs | 7 +--- libse/LibSE.csproj | 3 +- 4 files changed, 14 insertions(+), 55 deletions(-) diff --git a/build.bat b/build.bat index 1a9304349..1740cf75f 100644 --- a/build.bat +++ b/build.bat @@ -104,7 +104,7 @@ ECHO Merging assemblies with ILRepack... FOR /D %%A IN (packages\ILRepack.*) DO (SET "ILREPACKDIR=%%A") ECHO. "%ILREPACKDIR%\tools\ILRepack.exe" /parallel /internalize /targetplatform:v4 /out:"bin\Release\SubtitleEdit.exe" "bin\Release\SubtitleEdit.exe"^ - "bin\Release\libse.dll" "bin\Release\zlib.net.dll" "bin\Release\NHunspell.dll" "DLLs\Interop.QuartzTypeLib.dll" + "bin\Release\libse.dll" "bin\Release\zlib.net.dll" "bin\Release\NHunspell.dll" "bin\Release\UtfUnknown.dll" "DLLs\Interop.QuartzTypeLib.dll" IF %ERRORLEVEL% NEQ 0 GOTO EndWithError POPD diff --git a/libse/DetectEncoding/EncodingTools.cs b/libse/DetectEncoding/EncodingTools.cs index d311b85ee..0e911eb65 100644 --- a/libse/DetectEncoding/EncodingTools.cs +++ b/libse/DetectEncoding/EncodingTools.cs @@ -1,6 +1,7 @@ // Ripped from http://www.codeproject.com/KB/recipes/DetectEncoding.aspx using Nikse.SubtitleEdit.Core.DetectEncoding.Multilang; +using UtfUnknown; using System; using System.Collections.Generic; using System.IO; @@ -385,59 +386,21 @@ namespace Nikse.SubtitleEdit.Core.DetectEncoding { return new[] { Encoding.ASCII }; } - - // expand the string to be at least 256 bytes - if (input.Length < 256) - { - byte[] newInput = new byte[256]; - int steps = 256 / input.Length; - for (int i = 0; i < steps; i++) - { - Array.Copy(input, 0, newInput, input.Length * i, input.Length); - } - - int rest = 256 % input.Length; - if (rest > 0) - { - Array.Copy(input, 0, newInput, steps * input.Length, rest); - } - - input = newInput; - } - + + // Use UTF.Unknown to detect from input byte string + DetectionResult detectionResult = CharsetDetector.DetectFromBytes(input); List result = new List(); - // get the IMultiLanguage" interface - IMultiLanguage2 multilang2 = new CMultiLanguageClass(); - try + //add in order (results ordered from most likely to least likely) + foreach (var detected in detectionResult.Details) { - DetectEncodingInfo[] detectedEncdings = new DetectEncodingInfo[maxEncodings]; - - int scores = detectedEncdings.Length; - int srcLen = input.Length; - - // setup options (none) - const MLDETECTCP options = MLDETECTCP.MLDETECTCP_NONE; - - // finally... call to DetectInputCodepage - multilang2.DetectInputCodepage(options, 0, - ref input[0], ref srcLen, ref detectedEncdings[0], ref scores); - - // get result - if (scores > 0) + result.Add(detected.Encoding); + if (result.Count == maxEncodings) { - for (int i = 0; i < scores; i++) - { - // add the result - result.Add(Encoding.GetEncoding((int)detectedEncdings[i].nCodePage)); - } + break; } } - finally - { - Marshal.FinalReleaseComObject(multilang2); - } - // nothing found + return result.ToArray(); } diff --git a/libse/LanguageAutoDetect.cs b/libse/LanguageAutoDetect.cs index 0dcd50691..f208ed660 100644 --- a/libse/LanguageAutoDetect.cs +++ b/libse/LanguageAutoDetect.cs @@ -1,4 +1,4 @@ -using System; +using System; using System.Collections.Generic; using System.IO; using System.Linq; @@ -1053,11 +1053,6 @@ namespace Nikse.SubtitleEdit.Core public static Encoding DetectAnsiEncoding(byte[] buffer) { - if (Utilities.IsRunningOnMono()) - { - return Encoding.Default; - } - try { var greekEncoding = Encoding.GetEncoding(1253); // Greek diff --git a/libse/LibSE.csproj b/libse/LibSE.csproj index acb412898..033a482e8 100644 --- a/libse/LibSE.csproj +++ b/libse/LibSE.csproj @@ -38,10 +38,11 @@ - + +