From a758a68a430eec558318bd4c7cb73e6da293b6a9 Mon Sep 17 00:00:00 2001 From: niksedk Date: Tue, 12 Oct 2010 11:23:33 +0000 Subject: [PATCH] Initial version git-svn-id: https://subtitleedit.googlecode.com/svn/trunk@16 99eadd0c-20b8-1223-b5c4-2a2b2df33de2 --- src/Logic/DetectEncoding/EncodingTools.cs | 529 ++++++++++++++++++++++ 1 file changed, 529 insertions(+) create mode 100644 src/Logic/DetectEncoding/EncodingTools.cs diff --git a/src/Logic/DetectEncoding/EncodingTools.cs b/src/Logic/DetectEncoding/EncodingTools.cs new file mode 100644 index 000000000..54fb96fe1 --- /dev/null +++ b/src/Logic/DetectEncoding/EncodingTools.cs @@ -0,0 +1,529 @@ +// Ripped from http://www.codeproject.com/KB/recipes/DetectEncoding.aspx + +using System; +using System.Collections.Generic; +using System.Text; +using System.Runtime.InteropServices; +using System.IO; + +namespace Nikse.SubtitleEdit.Logic.DetectEncoding +{ + public static class EncodingTools + { + // this only contains ascii, default windows code page and unicode + public static int[] PreferedEncodingsForStream; + + // this contains all codepages, sorted by preference and byte usage + public static int[] PreferedEncodings; + + // this contains all codepages, sorted by preference and byte usage + public static int[] AllEncodings; + + + + /// + /// Static constructor that fills the default preferred codepages + /// + static EncodingTools() + { + + List streamEcodings= new List(); + List allEncodings = new List(); + List mimeEcodings = new List(); + + // asscii - most simple so put it in first place... + streamEcodings.Add(Encoding.ASCII.CodePage); + mimeEcodings.Add(Encoding.ASCII.CodePage); + allEncodings.Add(Encoding.ASCII.CodePage); + + + // add default 2nd for all encodings + allEncodings.Add(Encoding.Default.CodePage); + // default is single byte? + if (Encoding.Default.IsSingleByte) + { + // put it in second place + streamEcodings.Add(Encoding.Default.CodePage); + mimeEcodings.Add(Encoding.Default.CodePage); + } + + + + // prefer JIS over JIS-SHIFT (JIS is detected better than JIS-SHIFT) + // this one does include cyrilic (strange but true) + allEncodings.Add(50220); + mimeEcodings.Add(50220); + + + // always allow unicode flavours for streams (they all have a preamble) + streamEcodings.Add(Encoding.Unicode.CodePage); + foreach (EncodingInfo enc in Encoding.GetEncodings()) + { + if (!streamEcodings.Contains(enc.CodePage)) + { + Encoding encoding = Encoding.GetEncoding(enc.CodePage); + if (encoding.GetPreamble().Length > 0) + streamEcodings.Add(enc.CodePage); + } + } + + + // stream is done here + PreferedEncodingsForStream = streamEcodings.ToArray(); + + + // all singlebyte encodings + foreach (EncodingInfo enc in Encoding.GetEncodings()) + { + + + if (!enc.GetEncoding().IsSingleByte) + continue; + + if (!allEncodings.Contains(enc.CodePage)) + allEncodings.Add(enc.CodePage); + + // only add iso and IBM encodings to mime encodings + if (enc.CodePage <= 1258) + { + mimeEcodings.Add(enc.CodePage); + } + } + + // add the rest (multibyte) + foreach (EncodingInfo enc in Encoding.GetEncodings()) + { + if (!enc.GetEncoding().IsSingleByte) + { + if (!allEncodings.Contains(enc.CodePage)) + allEncodings.Add(enc.CodePage); + + // only add iso and IBM encodings to mime encodings + if (enc.CodePage <= 1258) + { + mimeEcodings.Add(enc.CodePage); + } + } + } + + // add unicodes + mimeEcodings.Add(Encoding.Unicode.CodePage); + + + PreferedEncodings = mimeEcodings.ToArray(); + AllEncodings = allEncodings.ToArray(); + } + + + /// + /// Checks if specified string data is acii data. + /// + /// + /// + public static bool IsAscii(string data) + { + // assume empty string to be ascii + if ((data == null) || (data.Length == 0)) + return true; + foreach (char c in data) + { + if ((int)c > 127) + { + return false; + } + } + + return true; + } + + /// + /// Gets the best Encoding for usage in mime encodings + /// + /// text to detect + /// the suggested encoding + public static Encoding GetMostEfficientEncoding(string input) + { + return GetMostEfficientEncoding(input, PreferedEncodings); + } + + /// + /// Gets the best ISO Encoding for usage in a stream + /// + /// text to detect + /// the suggested encoding + public static Encoding GetMostEfficientEncodingForStream(string input) + { + return GetMostEfficientEncoding(input, PreferedEncodingsForStream); + } + + /// + /// Gets the best fitting encoding from a list of possible encodings + /// + /// text to detect + /// an array of codepages + /// the suggested encoding + public static Encoding GetMostEfficientEncoding(string input, int[] preferedEncodings) + { + Encoding enc = DetectOutgoingEncoding(input,preferedEncodings,true); + // unicode.. hmmm... check for smallest encoding + if (enc.CodePage == Encoding.Unicode.CodePage) + { + int byteCount = Encoding.UTF7.GetByteCount(input); + enc = Encoding.UTF7; + int bestByteCount = byteCount; + + // utf8 smaller? + byteCount = Encoding.UTF8.GetByteCount(input); + if (byteCount < bestByteCount) + { + enc = Encoding.UTF8; + bestByteCount = byteCount; + } + + // unicode smaller? + byteCount = Encoding.Unicode.GetByteCount(input); + if (byteCount < bestByteCount) + { + enc = Encoding.Unicode; + bestByteCount = byteCount; + } + } + else + { + + } + return enc; + } + + public static Encoding DetectOutgoingEncoding(string input) + { + return DetectOutgoingEncoding(input, PreferedEncodings, true); + } + + public static Encoding DetectOutgoingStreamEncoding(string input) + { + return DetectOutgoingEncoding(input, PreferedEncodingsForStream, true); + } + + public static Encoding[] DetectOutgoingEncodings(string input) + { + return DetectOutgoingEncodings(input, PreferedEncodings, true); + } + + public static Encoding[] DetectOutgoingStreamEncodings(string input) + { + return DetectOutgoingEncodings(input, PreferedEncodingsForStream, true); + } + + private static Encoding DetectOutgoingEncoding(string input, int[] preferedEncodings, bool preserveOrder) + { + + if (input == null) + throw new ArgumentNullException("input"); + + // empty strings can always be encoded as ASCII + if (input.Length == 0) + return Encoding.ASCII; + + Encoding result = Encoding.ASCII; + + // get the IMultiLanguage3 interface + MultiLanguage.IMultiLanguage3 multilang3 = new MultiLanguage.CMultiLanguageClass(); + if (multilang3 == null) + throw new System.Runtime.InteropServices.COMException("Failed to get IMultilang3"); + try + { + int[] resultCodePages = new int[preferedEncodings != null ? preferedEncodings.Length : Encoding.GetEncodings().Length]; + uint detectedCodepages = (uint)resultCodePages.Length; + ushort specialChar = (ushort)'?'; + + + // get unmanaged arrays + IntPtr pPrefEncs = preferedEncodings == null ? IntPtr.Zero : Marshal.AllocCoTaskMem(sizeof(uint) * preferedEncodings.Length); + IntPtr pDetectedEncs = Marshal.AllocCoTaskMem(sizeof(uint) * resultCodePages.Length); + + try + { + if (preferedEncodings != null) + Marshal.Copy(preferedEncodings, 0, pPrefEncs, preferedEncodings.Length); + + Marshal.Copy(resultCodePages, 0, pDetectedEncs, resultCodePages.Length); + + MultiLanguage.MLCPF options = MultiLanguage.MLCPF.MLDETECTF_VALID_NLS; + if (preserveOrder) + options |= MultiLanguage.MLCPF.MLDETECTF_PRESERVE_ORDER; + + if (preferedEncodings != null) + options |= MultiLanguage.MLCPF.MLDETECTF_PREFERRED_ONLY; + + multilang3.DetectOutboundCodePage(options, + input, (uint)input.Length, + pPrefEncs, (uint) (preferedEncodings==null ? 0 : preferedEncodings.Length), + + pDetectedEncs, ref detectedCodepages, + ref specialChar); + + // get result + if (detectedCodepages > 0) + { + int[] theResult = new int[detectedCodepages]; + Marshal.Copy(pDetectedEncs, theResult, 0, theResult.Length); + result = Encoding.GetEncoding(theResult[0]); + } + + } + finally + { + if (pPrefEncs != IntPtr.Zero) + Marshal.FreeCoTaskMem(pPrefEncs); + Marshal.FreeCoTaskMem(pDetectedEncs); + } + } + finally + { + Marshal.FinalReleaseComObject(multilang3); + } + return result; + } + + public static Encoding[] DetectOutgoingEncodings(string input, int[] preferedEncodings, bool preserveOrder) + { + + if (input == null) + throw new ArgumentNullException("input"); + + // empty strings can always be encoded as ASCII + if (input.Length == 0) + return new Encoding[] { Encoding.ASCII }; + + List result = new List(); + + // get the IMultiLanguage3 interface + MultiLanguage.IMultiLanguage3 multilang3 = new MultiLanguage.CMultiLanguageClass(); + if (multilang3 == null) + throw new System.Runtime.InteropServices.COMException("Failed to get IMultilang3"); + try + { + int[] resultCodePages = new int[preferedEncodings.Length]; + uint detectedCodepages = (uint)resultCodePages.Length; + ushort specialChar = (ushort)'?'; + + + // get unmanaged arrays + IntPtr pPrefEncs = Marshal.AllocCoTaskMem(sizeof(uint) * preferedEncodings.Length); + IntPtr pDetectedEncs = preferedEncodings == null ? IntPtr.Zero : Marshal.AllocCoTaskMem(sizeof(uint) * resultCodePages.Length); + + try + { + if (preferedEncodings != null) + Marshal.Copy(preferedEncodings, 0, pPrefEncs, preferedEncodings.Length); + + Marshal.Copy(resultCodePages, 0, pDetectedEncs, resultCodePages.Length); + + MultiLanguage.MLCPF options = MultiLanguage.MLCPF.MLDETECTF_VALID_NLS | MultiLanguage.MLCPF.MLDETECTF_PREFERRED_ONLY; + if (preserveOrder) + options |= MultiLanguage.MLCPF.MLDETECTF_PRESERVE_ORDER; + + if (preferedEncodings != null) + options |= MultiLanguage.MLCPF.MLDETECTF_PREFERRED_ONLY; + + // finally... call to DetectOutboundCodePage + multilang3.DetectOutboundCodePage(options, + input, (uint)input.Length, + pPrefEncs, (uint) (preferedEncodings==null ? 0 : preferedEncodings.Length), + pDetectedEncs, ref detectedCodepages, + ref specialChar); + + // get result + if (detectedCodepages > 0) + { + int[] theResult = new int[detectedCodepages]; + Marshal.Copy(pDetectedEncs, theResult, 0, theResult.Length); + + + // get the encodings for the codepages + for (int i = 0; i < detectedCodepages; i++) + result.Add(Encoding.GetEncoding(theResult[i])); + + } + + } + finally + { + if (pPrefEncs != IntPtr.Zero) + Marshal.FreeCoTaskMem(pPrefEncs); + Marshal.FreeCoTaskMem(pDetectedEncs); + } + } + finally + { + Marshal.FinalReleaseComObject(multilang3); + } + // nothing found + return result.ToArray(); + } + + + /// + /// Detect the most probable codepage from an byte array + /// + /// array containing the raw data + /// the detected encoding or the default encoding if the detection failed + public static Encoding DetectInputCodepage(byte[] input) + { + try + { + Encoding[] detected = DetectInputCodepages(input, 1); + + if (detected.Length > 0) + return detected[0]; + return Encoding.Default; + } + catch(COMException) + { + // return default codepage on error + return Encoding.Default; + } + } + + /// + /// Rerurns up to maxEncodings codpages that are assumed to be apropriate + /// + /// array containing the raw data + /// maxiumum number of encodings to detect + /// an array of Encoding with assumed encodings + public static Encoding[] DetectInputCodepages(byte[] input, int maxEncodings) + { + + if (maxEncodings < 1) + throw new ArgumentOutOfRangeException("at least one encoding must be returend", "maxEncodings"); + + if (input == null) + throw new ArgumentNullException("input"); + + // empty strings can always be encoded as ASCII + if (input.Length == 0) + return new Encoding[] { Encoding.ASCII }; + + // expand the string to be at least 256 bytes + if (input.Length < 256) + { + byte[] newInput = new byte[256]; + int steps = 256 / input.Length; + for (int i = 0; i < steps; i++) + Array.Copy(input, 0, newInput, input.Length * i, input.Length); + + int rest = 256 % input.Length; + if (rest > 0) + Array.Copy(input, 0, newInput, steps * input.Length, rest); + input = newInput; + } + + List result = new List(); + + // get the IMultiLanguage" interface + MultiLanguage.IMultiLanguage2 multilang2 = new MultiLanguage.CMultiLanguageClass(); + if (multilang2 == null) + throw new System.Runtime.InteropServices.COMException("Failed to get IMultilang2"); + try + { + MultiLanguage.DetectEncodingInfo[] detectedEncdings = new MultiLanguage.DetectEncodingInfo[maxEncodings]; + + int scores = detectedEncdings.Length; + int srcLen = input.Length; + + // setup options (none) + MultiLanguage.MLDETECTCP options = MultiLanguage.MLDETECTCP.MLDETECTCP_NONE; + + // finally... call to DetectInputCodepage + multilang2.DetectInputCodepage(options,0, + ref input[0], ref srcLen, ref detectedEncdings[0], ref scores); + + // get result + if (scores > 0) + { + for (int i = 0; i < scores; i++) + { + // add the result + result.Add(Encoding.GetEncoding((int)detectedEncdings[i].nCodePage)); + } + } + } + finally + { + Marshal.FinalReleaseComObject(multilang2); + } + // nothing found + return result.ToArray(); + } + + + /// + /// Opens a text file and returns the content + /// encoded in the most probable encoding + /// + /// path to the souce file + /// the text content of the file + public static string ReadTextFile(string path) + { + if (path == null) + throw new ArgumentNullException("path"); + + using (Stream fs = File.Open(path, FileMode.Open)) + { + byte[] rawData = new byte[fs.Length]; + Encoding enc = DetectInputCodepage(rawData); + return enc.GetString(rawData); + } + } + + /// + /// Returns a stream reader for the given + /// text file with the best encoding applied + /// + /// path to the file + /// a StreamReader for the file + public static StreamReader OpenTextFile(string path) + { + if (path == null) + throw new ArgumentNullException("path"); + return OpenTextStream(File.Open(path, FileMode.Open)); + } + + /// + /// Creates a stream reader from a stream and detects + /// the encoding form the first bytes in the stream + /// + /// a stream to wrap + /// the newly created StreamReader + public static StreamReader OpenTextStream(Stream stream) + { + // check stream parameter + if (stream == null) + throw new ArgumentNullException("stream"); + if (!stream.CanSeek) + throw new ArgumentException("the stream must support seek operations","stream"); + + // assume default encoding at first place + Encoding detectedEncoding = Encoding.Default; + + // seek to stream start + stream.Seek(0, SeekOrigin.Begin); + + // buffer for preamble and up to 512b sample text for dection + byte[] buf = new byte[System.Math.Min(stream.Length, 512)]; + + stream.Read(buf, 0, buf.Length); + detectedEncoding = DetectInputCodepage(buf); + // seek back to stream start + stream.Seek(0, SeekOrigin.Begin); + + + return new StreamReader(stream,detectedEncoding); + + } + + } + + +}