1
0
mirror of synced 2024-11-24 06:10:11 +01:00
TakoTako/TJAConvert/FileEncoding/FileEncoding.cs
2023-01-05 19:56:08 -07:00

344 lines
13 KiB
C#

#region * License *
/*
SimpleHelpers - FileEncoding
Copyright © 2014 Khalid Salomão
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the “Software”), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
License: http://www.opensource.org/licenses/mit-license.php
Website: https://github.com/khalidsalomao/SimpleHelpers.Net
*/
#endregion
using System;
using System.IO;
using System.Collections.Generic;
using System.Linq;
using System.Text;
namespace SimpleHelpers
{
public class FileEncoding
{
const int DEFAULT_BUFFER_SIZE = 128 * 1024;
/// <summary>
/// Tries to detect the file encoding.
/// </summary>
/// <param name="inputFilename">The input filename.</param>
/// <param name="defaultIfNotDetected">The default encoding if none was detected.</param>
/// <returns></returns>
public static Encoding DetectFileEncoding (string inputFilename, Encoding defaultIfNotDetected = null)
{
using (var stream = new System.IO.FileStream (inputFilename, System.IO.FileMode.Open, System.IO.FileAccess.Read, System.IO.FileShare.ReadWrite | System.IO.FileShare.Delete, DEFAULT_BUFFER_SIZE))
{
return DetectFileEncoding (stream) ?? defaultIfNotDetected;
}
}
/// <summary>
/// Tries to detect the file encoding.
/// </summary>
/// <param name="inputStream">The input stream.</param>
/// <param name="defaultIfNotDetected">The default encoding if none was detected.</param>
/// <returns></returns>
public static Encoding DetectFileEncoding (Stream inputStream, Encoding defaultIfNotDetected = null)
{
var det = new FileEncoding ();
det.Detect (inputStream);
return det.Complete () ?? defaultIfNotDetected;
}
/// <summary>
/// Tries to detect the file encoding.
/// </summary>
/// <param name="inputData">The input data.</param>
/// <param name="start">The start.</param>
/// <param name="count">The count.</param>
/// <param name="defaultIfNotDetected">The default encoding if none was detected.</param>
/// <returns></returns>
public static Encoding DetectFileEncoding (byte[] inputData, int start, int count, Encoding defaultIfNotDetected = null)
{
var det = new FileEncoding ();
det.Detect (inputData, start, count);
return det.Complete () ?? defaultIfNotDetected;
}
/// <summary>
/// Tries to load file content with the correct encoding.
/// </summary>
/// <param name="filename">The filename.</param>
/// <param name="defaultValue">The default value if unable to load file content.</param>
/// <returns>File content</returns>
public static string TryLoadFile (string filename, string defaultValue = "")
{
try
{
if (System.IO.File.Exists (filename))
{
// enable file encoding detection
var encoding = SimpleHelpers.FileEncoding.DetectFileEncoding (filename);
// Load data based on parameters
return System.IO.File.ReadAllText (filename, encoding);
}
}
catch { }
return defaultValue;
}
/// <summary>
/// Detects if contains textual data.
/// </summary>
/// <param name="rawData">The raw data.</param>
public static bool CheckForTextualData (byte[] rawData)
{
return CheckForTextualData (rawData, 0, rawData.Length);
}
/// <summary>
/// Detects if contains textual data.
/// </summary>
/// <param name="rawData">The raw data.</param>
/// <param name="start">The start.</param>
/// <param name="count">The count.</param>
public static bool CheckForTextualData (byte[] rawData, int start, int count)
{
if (rawData.Length < count || count < 4 || start + 1 >= count)
return true;
if (CheckForByteOrderMark (rawData, start))
{
return true;
}
// http://stackoverflow.com/questions/910873/how-can-i-determine-if-a-file-is-binary-or-text-in-c
// http://www.gnu.org/software/diffutils/manual/html_node/Binary.html
// count the number od null bytes sequences
// considering only sequeces of 2 0s: "\0\0" or control characters below 10
int nullSequences = 0;
int controlSequences = 0;
for (var i = start + 1; i < count; i++)
{
if (rawData[i - 1] == 0 && rawData[i] == 0)
{
if (++nullSequences > 1)
break;
}
else if (rawData[i - 1] == 0 && rawData[i] < 10)
{
++controlSequences;
}
}
// is text if there is no null byte sequences or less than 10% of the buffer has control caracteres
return nullSequences == 0 && (controlSequences <= (rawData.Length / 10));
}
/// <summary>
/// Detects if data has bytes order mark to indicate its encoding for textual data.
/// </summary>
/// <param name="rawData">The raw data.</param>
/// <param name="start">The start.</param>
/// <returns></returns>
private static bool CheckForByteOrderMark (byte[] rawData, int start = 0)
{
if (rawData.Length - start < 4)
return false;
// Detect encoding correctly (from Rick Strahl's blog)
// http://www.west-wind.com/weblog/posts/2007/Nov/28/Detecting-Text-Encoding-for-StreamReader
if (rawData[start] == 0xef && rawData[start + 1] == 0xbb && rawData[start + 2] == 0xbf)
{
// Encoding.UTF8;
return true;
}
else if (rawData[start] == 0xfe && rawData[start + 1] == 0xff)
{
// Encoding.Unicode;
return true;
}
else if (rawData[start] == 0 && rawData[start + 1] == 0 && rawData[start + 2] == 0xfe && rawData[start + 3] == 0xff)
{
// Encoding.UTF32;
return true;
}
else if (rawData[start] == 0x2b && rawData[start + 1] == 0x2f && rawData[start + 2] == 0x76)
{
// Encoding.UTF7;
return true;
}
return false;
}
Ude.CharsetDetector ude = new Ude.CharsetDetector ();
bool _started = false;
/// <summary>
/// If the detection has reached a decision.
/// </summary>
/// <value>The done.</value>
public bool Done { get; set; }
/// <summary>
/// Detected encoding name.
/// </summary>
public string EncodingName { get; set; }
/// <summary>
/// If the data contains textual data.
/// </summary>
public bool IsText { get; set; }
/// <summary>
/// If the file or data has any mark indicating encoding information (byte order mark).
/// </summary>
public bool HasByteOrderMark { get; set; }
List<string> singleEncodings = new List<string> ();
/// <summary>
/// Resets this instance.
/// </summary>
public void Reset ()
{
_started = false;
Done = false;
HasByteOrderMark = false;
singleEncodings.Clear ();
ude.Reset ();
EncodingName = null;
}
/// <summary>
/// Detects the encoding of textual data of the specified input data.
/// </summary>
/// <param name="inputData">The input data.</param>
/// <returns>Detected encoding name</returns>
public string Detect (Stream inputData)
{
const int bufferSize = 16 * 1024;
const int maxIterations = (20 * 1024 * 1024) / bufferSize;
int i = 0;
byte[] buffer = new byte[bufferSize];
while (i++ < maxIterations)
{
int sz = inputData.Read (buffer, 0, (int)buffer.Length);
if (sz <= 0)
{
break;
}
Detect (buffer, 0, sz);
if (Done)
break;
}
Complete ();
return EncodingName;
}
/// <summary>
/// Detects the encoding of textual data of the specified input data.
/// </summary>
/// <param name="inputData">The input data.</param>
/// <param name="start">The start.</param>
/// <param name="count">The count.</param>
/// <returns>Detected encoding name</returns>
public string Detect (byte[] inputData, int start, int count)
{
if (Done)
return EncodingName;
if (!_started)
{
Reset ();
_started = true;
if (!CheckForTextualData (inputData, start, count))
{
IsText = false;
Done = true;
return EncodingName;
}
HasByteOrderMark = CheckForByteOrderMark (inputData, start);
IsText = true;
}
// execute charset detector
ude.Feed (inputData, start, count);
ude.DataEnd ();
if (ude.IsDone () && !String.IsNullOrEmpty (ude.Charset))
{
Done = true;
return EncodingName;
}
const int bufferSize = 4 * 1024;
// singular buffer detection
if (singleEncodings.Count < 2000)
{
var u = new Ude.CharsetDetector ();
int step = (count - start) < bufferSize ? (count - start) : bufferSize;
for (var i = start; i < count; i += step)
{
u.Reset ();
if (i + step > count)
u.Feed (inputData, i, count - i);
else
u.Feed (inputData, i, step);
u.DataEnd ();
if (u.Confidence > 0.3 && !String.IsNullOrEmpty (u.Charset))
singleEncodings.Add (u.Charset);
}
}
return EncodingName;
}
/// <summary>
/// Finalize detection phase and gets detected encoding name.
/// </summary>
/// <returns></returns>
public Encoding Complete ()
{
Done = true;
ude.DataEnd ();
if (ude.IsDone () && !String.IsNullOrEmpty (ude.Charset))
{
EncodingName = ude.Charset;
}
else if (singleEncodings.Count > 0)
{
// vote for best encoding
EncodingName = singleEncodings.GroupBy (i => i)
.OrderByDescending (i => i.Count () *
(i.Key.StartsWith ("UTF-32") ? 2 :
i.Key.StartsWith ("UTF-16") ? 1.8 :
i.Key.StartsWith ("UTF-8") ? 1.5 :
i.Key.StartsWith ("UTF-7") ? 1.3 :
i.Key != ("ASCII") ? 1 : 0.2))
.Select (i => i.Key).FirstOrDefault ();
}
if (!String.IsNullOrEmpty (EncodingName))
return Encoding.GetEncoding (EncodingName);
return null;
}
}
}