C#:确定任意文件编码的有效方法
是的,这是一个非常常见的问题,但我对它了解不多,所以一直觉得很模糊。我想要一种非常精确的方法来确定文件的编码。C#:确定任意文件编码的有效方法,c#,encoding,C#,Encoding。是的,这是一个非常常见的问题,但我对它了解不多,所以一直觉得很模糊。我想要一种非常精确的方法来确定文件的编码,就像 Notepad++ 一样精确。我会尝试以下步骤: 1) 检查是否有字节顺序标记(BOM) 2) 检查文件是否为有效的 UTF-8 3) 使用本地“ANSI”代码页(即微软所定义的 ANSI)。步骤 2 之所以有效,是因为除 UTF-8 之外的代码页中,大多数非 ASCII 字节序列都不是有效的 UTF-8。StreamReader.CurrentEncoding 属性很少能为我返回正确的文本文件编码。通过分析文件……
就像 Notepad++ 一样精确。我会尝试以下步骤: 1) 检查是否有字节顺序标记(BOM) 2) 检查文件是否为有效的 UTF-8 3) 使用本地“ANSI”代码页(即微软所定义的 ANSI)
步骤 2 之所以有效,是因为除 UTF-8 之外的代码页中,大多数非 ASCII 字节序列都不是有效的 UTF-8。StreamReader.CurrentEncoding 属性很少能为我返回正确的文本文件编码。通过分析文件的字节顺序标记(BOM)来确定文件的字节序,我得到了更好的结果。如果文件没有 BOM,这种方法就无法确定文件的编码。*更新于 2020 年 8 月 4 日:增加了 UTF-32LE 检测,并为 UTF-32BE 返回正确的编码。
/// <summary>
/// Determines a text file's encoding by analyzing its byte order mark (BOM).
/// Defaults to ASCII when no BOM is recognized.
/// </summary>
/// <param name="filename">The text file to analyze.</param>
/// <returns>
/// The encoding announced by the BOM, or <see cref="Encoding.ASCII"/> when the file
/// does not start with a recognized BOM.
/// </returns>
public static Encoding GetEncoding(string filename)
{
    // Read up to four bytes and remember how many were actually available.
    // The buffer is zero-initialized, so without the count a file that contains
    // only "FF FE" would look like the UTF-32LE BOM "FF FE 00 00".
    var bom = new byte[4];
    int available = 0;
    using (var file = new FileStream(filename, FileMode.Open, FileAccess.Read))
    {
        int chunk;
        // Read may return fewer bytes than requested, so loop until full or EOF.
        while (available < bom.Length
               && (chunk = file.Read(bom, available, bom.Length - available)) > 0)
        {
            available += chunk;
        }
    }

    // Analyze the BOM. Longer signatures are tested before their shorter prefixes.
    if (available >= 3)
    {
        // Encoding.UTF7 is obsolete on .NET 5 and later; kept so existing callers get the same result.
        if (bom[0] == 0x2b && bom[1] == 0x2f && bom[2] == 0x76) return Encoding.UTF7;
        if (bom[0] == 0xef && bom[1] == 0xbb && bom[2] == 0xbf) return Encoding.UTF8;
    }
    if (available >= 4 && bom[0] == 0xff && bom[1] == 0xfe && bom[2] == 0 && bom[3] == 0)
    {
        return Encoding.UTF32; // UTF-32LE
    }
    if (available >= 2)
    {
        if (bom[0] == 0xff && bom[1] == 0xfe) return Encoding.Unicode; // UTF-16LE
        if (bom[0] == 0xfe && bom[1] == 0xff) return Encoding.BigEndianUnicode; // UTF-16BE
    }
    if (available >= 4 && bom[0] == 0 && bom[1] == 0 && bom[2] == 0xfe && bom[3] == 0xff)
    {
        return new UTF32Encoding(true, true); // UTF-32BE
    }

    // We actually have no idea what the encoding is if we reach this point, so
    // you may wish to return null instead of defaulting to ASCII.
    return Encoding.ASCII;
}
/// <summary>
/// 通过分析文本文件的字节顺序标记(BOM)来确定文本文件的编码。
/// 当无法检测文本文件的字节序时,默认为 ASCII。
/// </summary>
/// <param name="filename">要分析的文本文件。</param>
/// <returns>检测到的编码。</returns>
public static Encoding GetEncoding(string filename)
{
// 读取 BOM
var bom = new byte[4];
using (var file = new FileStream(filename, FileMode.Open, FileAccess.Read))
{
file.Read(bom, 0, 4);
}
// 分析 BOM
if (bom[0] == 0x2b && bom[1] == 0x2f && bom[2] == 0x76) return Encoding.UTF7;
if (bom[0] == 0xef && bom[1] == 0xbb && bom[2] == 0xbf) return Encoding.UTF8;
if (bom[0] == 0xff && bom[1] == 0xfe && bom[2] == 0 && bom[3] == 0) return Encoding.UTF32; //UTF-32LE
if (bom[0] == 0xff && bom[1] == 0xfe) return Encoding.Unicode; //UTF-16LE
if (bom[0] == 0xfe && bom[1] == 0xff) return Encoding.BigEndianUnicode; //UTF-16BE
if (bom[0] == 0 && bom[1] == 0 && bom[2] == 0xfe && bom[3] == 0xff) return new UTF32Encoding(true, true); //UTF-32BE
// 如果执行到这里,我们实际上并不知道编码是什么,
// 所以您也可以返回 null,而不是默认为 ASCII
return Encoding.ASCII;
}
使用StreamReader
类,以下代码对我来说很好:
// Open the file with BOM detection switched on; defaultEncodingIfNoBom is the
// encoding the reader is given to start with, before any BOM has been examined.
using (StreamReader bomAwareReader = new StreamReader(fileName, defaultEncodingIfNoBom, detectEncodingFromByteOrderMarks: true))
{
    // A read operation is required before CurrentEncoding is inspected: the
    // reader does not look at the preamble (BOM) until something is read.
    bomAwareReader.Peek();
    Encoding encoding = bomAwareReader.CurrentEncoding;
}
诀窍是使用Peek
调用,否则,.NET没有做任何事情(并且它没有读取序言,BOM)。当然,如果您在检查编码之前使用任何其他ReadXXX
调用,它也可以工作
如果文件没有 BOM,则将使用 defaultEncodingIfNoBom
编码。StreamReader 还有一个不带该参数的构造函数重载(在这种情况下,默认(ANSI)编码将被用作 defaultEncodingIfNoBom),但我建议您根据自己的使用场景明确指定默认编码。
我已经成功地用BOM为UTF8、UTF16/Unicode(LE&BE)和UTF32(LE&BE)的文件测试了这一点。它不适用于UTF7。请在此处查找c#
以下是我的 PowerShell 代码,用于判断某些 cpp、h 或 ml 文件是采用 ISO-8859-1(Latin-1)编码还是不带 BOM 的 UTF-8 编码;如果两者都不是,则假定其为 GB18030。我是一名在法国工作的中国人,MSVC 在法国的计算机上以 Latin-1 保存文件,在中国的计算机上以 GB 保存文件,因此这有助于我在不同系统和同事之间交换源文件时避免编码问题。方法很简单:如果所有字符都在 x00-x7E 之间,那么 ASCII、UTF-8 和 Latin-1 的结果完全相同;但如果按 UTF-8 读取一个非 ASCII 文件,就会出现特殊字符 �,这时再尝试按 Latin-1 读取。在 Latin-1 中,\x7F 和 \xAF 之间是空的,而 GB 则使用 x00 到 xFF 之间的全部范围,因此只要在这个区间内发现任何值,它就不是 Latin-1。代码是用 PowerShell 编写的,但使用的是 .NET,因此很容易翻译成 C# 或 F#。 看看这个:这是 Mozilla 通用字符集检测器(Universal Charset Detector)的一个移植版本,您可以像这样使用它:
/// <summary>
/// Runs the Ude charset detector over the file named by the first command-line
/// argument and prints the detected charset with its confidence, or a failure
/// message when the detector reports no charset.
/// </summary>
/// <param name="args">Command-line arguments; <c>args[0]</c> is the path of the file to inspect.</param>
public static void Main(String[] args)
{
    // Without a file name there is nothing to inspect; report usage instead of
    // failing with an index-out-of-range error on args[0].
    if (args == null || args.Length == 0)
    {
        Console.Error.WriteLine("Usage: <program> <filename>");
        return;
    }

    string filename = args[0];
    using (FileStream fs = File.OpenRead(filename))
    {
        Ude.CharsetDetector cdet = new Ude.CharsetDetector();
        cdet.Feed(fs);
        cdet.DataEnd();

        // Charset is null when the detector could not settle on a result.
        if (cdet.Charset != null)
        {
            Console.WriteLine("Charset: {0}, confidence: {1}",
                cdet.Charset, cdet.Confidence);
        }
        else
        {
            Console.WriteLine("Detection failed.");
        }
    }
}
提供@CodesInChaos提出的步骤的实施细节: 1) 检查是否有字节顺序标记 2) 检查文件是否为有效的UTF8 3) 使用本地“ANSI”代码页(由Microsoft定义的ANSI) 步骤2之所以有效,是因为除UTF8之外的代码页中的大多数非ASCII序列都不是有效的UTF8。更详细地解释这个策略
using System; using System.IO; using System.Text;
/// <summary>
/// Detects the encoding of a text stream. The stream is decoded with the
/// encoding announced by its BOM, or with strict UTF-8 when there is no BOM;
/// if the strict UTF-8 decoding hits an invalid byte sequence, the content is
/// assumed to be a single-byte encoding and "ISO-8859-1" is reported.
/// </summary>
/// <param name="fileStream">
/// A readable, seekable stream. It is left open and rewound to position 0
/// before this method returns.
/// </param>
/// <returns>
/// The <see cref="Encoding.BodyName"/> of the detected encoding, or
/// "ISO-8859-1" when decoding failed.
/// </returns>
public string DetectFileEncoding(Stream fileStream)
{
    // The exception fallbacks make the decoder throw on invalid input instead
    // of silently substituting replacement characters.
    var Utf8EncodingVerifier = Encoding.GetEncoding("utf-8", new EncoderExceptionFallback(), new DecoderExceptionFallback());
    using (var reader = new StreamReader(fileStream, Utf8EncodingVerifier,
        detectEncodingFromByteOrderMarks: true, leaveOpen: true, bufferSize: 1024))
    {
        string detectedEncoding;
        try
        {
            // Decode the whole stream; only the success or failure matters.
            while (!reader.EndOfStream)
            {
                reader.ReadLine();
            }
            detectedEncoding = reader.CurrentEncoding.BodyName;
        }
        catch (DecoderFallbackException)
        {
            // Failed to decode the content using the BOM/UTF-8.
            // Only decoding failures are treated as "not UTF-8"; I/O errors
            // propagate to the caller instead of being reported as an encoding.
            // ISO-8859-1 is used as the stand-in for the legacy single-byte code page.
            detectedEncoding = "ISO-8859-1";
        }
        // Rewind the stream so the caller can read it from the start.
        fileStream.Seek(0, SeekOrigin.Begin);
        return detectedEncoding;
    }
}
// Usage example: detect the encoding of a sample file, then read the same
// stream again with a reader configured for the detected encoding.
[Test]
public void Test1()
{
Stream fs = File.OpenRead(@".\TestData\TextFile_ansi.csv");
// DetectFileEncoding rewinds the stream before returning, so the reader
// created below starts at the beginning of the file.
var detectedEncoding = DetectFileEncoding(fs);
// The detected name is turned back into an Encoding instance for the reader.
using (var reader = new StreamReader(fs, Encoding.GetEncoding(detectedEncoding)))
{
// Consume your file
var line = reader.ReadLine();
...
这可能有用
string path = @"address/to/the/file.extension";
using (StreamReader sr = new StreamReader(path))
{
    // Encoding detection happens on the first read, so CurrentEncoding only
    // reflects the file's BOM after a read operation. Peek triggers that read
    // without consuming any character.
    sr.Peek();
    Console.WriteLine(sr.CurrentEncoding);
}
.NET不是很有帮助,但您可以尝试以下算法:
// Ask the helper for the file's encoding; a null result means none of the
// candidate encodings matched.
var encoding = FileHelper.GetEncoding(filePath);
if (encoding == null)
{
    // NotSupportedException is more specific than the base Exception type and
    // is still caught by existing catch (Exception) handlers.
    throw new NotSupportedException("The file encoding is not supported. Please choose one of the following encodings: UTF8/UTF7/iso-8859-1");
}
代码如下:
/// <summary>
/// Helpers for working out which encoding a text file uses.
/// </summary>
public class FileHelper
{
    /// <summary>
    /// Determines a text file's encoding by analyzing its byte order mark (BOM) and,
    /// when no BOM is found, by trying to decode the whole file first as strict
    /// UTF-8 and then as ISO-8859-1.
    /// </summary>
    /// <param name="filename">The text file to analyze.</param>
    /// <returns>The detected encoding, or null when no candidate encoding decodes the file.</returns>
    public static Encoding GetEncoding(string filename)
    {
        var encodingByBOM = GetEncodingByBOM(filename);
        if (encodingByBOM != null)
            return encodingByBOM;

        // BOM not found :(, so try to parse characters into several encodings
        var encodingByParsingUTF8 = GetEncodingByParsing(filename, Encoding.UTF8);
        if (encodingByParsingUTF8 != null)
            return encodingByParsingUTF8;

        // ISO-8859-1 assigns a character to every byte value, so this attempt acts
        // as the catch-all. The UTF-7 attempt that used to follow it could never
        // be reached and has been dropped.
        return GetEncodingByParsing(filename, Encoding.GetEncoding("iso-8859-1"));
    }

    /// <summary>
    /// Determines a text file's encoding by analyzing its byte order mark (BOM).
    /// </summary>
    /// <param name="filename">The text file to analyze.</param>
    /// <returns>The encoding announced by the BOM, or null when the file has no recognized BOM.</returns>
    private static Encoding GetEncodingByBOM(string filename)
    {
        // Read up to four bytes and remember how many were actually available:
        // the buffer is zero-initialized, so a short file must not be matched
        // against signatures longer than its content.
        var byteOrderMark = new byte[4];
        int available = 0;
        using (var file = new FileStream(filename, FileMode.Open, FileAccess.Read))
        {
            int chunk;
            // Read may return fewer bytes than requested, so loop until full or EOF.
            while (available < byteOrderMark.Length
                   && (chunk = file.Read(byteOrderMark, available, byteOrderMark.Length - available)) > 0)
            {
                available += chunk;
            }
        }

        // Analyze the BOM. Longer signatures are tested before their shorter prefixes.
        if (available >= 3)
        {
            // Encoding.UTF7 is obsolete on .NET 5 and later; kept so existing callers get the same result.
            if (byteOrderMark[0] == 0x2b && byteOrderMark[1] == 0x2f && byteOrderMark[2] == 0x76) return Encoding.UTF7;
            if (byteOrderMark[0] == 0xef && byteOrderMark[1] == 0xbb && byteOrderMark[2] == 0xbf) return Encoding.UTF8;
        }
        if (available >= 4 && byteOrderMark[0] == 0xff && byteOrderMark[1] == 0xfe && byteOrderMark[2] == 0 && byteOrderMark[3] == 0)
        {
            return Encoding.UTF32; // UTF-32LE; must precede the UTF-16LE test, which shares the FF FE prefix
        }
        if (available >= 2)
        {
            if (byteOrderMark[0] == 0xff && byteOrderMark[1] == 0xfe) return Encoding.Unicode; // UTF-16LE
            if (byteOrderMark[0] == 0xfe && byteOrderMark[1] == 0xff) return Encoding.BigEndianUnicode; // UTF-16BE
        }
        if (available >= 4 && byteOrderMark[0] == 0 && byteOrderMark[1] == 0 && byteOrderMark[2] == 0xfe && byteOrderMark[3] == 0xff)
        {
            // Encoding.UTF32 is little-endian; a 00 00 FE FF BOM needs the big-endian variant.
            return new UTF32Encoding(true, true); // UTF-32BE
        }
        return null; // no BOM found
    }

    /// <summary>
    /// Tries to decode the whole file with a strict (throw-on-invalid-bytes)
    /// version of the given encoding.
    /// </summary>
    /// <param name="filename">The text file to decode.</param>
    /// <param name="encoding">The candidate encoding.</param>
    /// <returns>The encoding the reader ended up using when every byte decoded cleanly; otherwise null.</returns>
    private static Encoding GetEncodingByParsing(string filename, Encoding encoding)
    {
        // Look the strict encoding up by code page rather than BodyName: BodyName is
        // a mail-oriented name and does not always identify the same encoding.
        var encodingVerifier = Encoding.GetEncoding(encoding.CodePage, new EncoderExceptionFallback(), new DecoderExceptionFallback());
        try
        {
            using (var textReader = new StreamReader(filename, encodingVerifier, detectEncodingFromByteOrderMarks: true))
            {
                while (!textReader.EndOfStream)
                {
                    textReader.ReadLine(); // in order to increment the stream position
                }
                // all text parsed ok
                return textReader.CurrentEncoding;
            }
        }
        catch (DecoderFallbackException)
        {
            // The content is not valid in this encoding. Other failures (for
            // example I/O errors) are deliberately not caught here.
            return null;
        }
    }
}
public class FileHelper
{
/// <summary>
/// 通过分析文本文件的字节顺序标记(BOM)确定文本文件的编码;如果未找到 BOM,则尝试按不同的编码进行解析。
/// 当无法检测文本文件的字节序时,默认为 UTF8。
/// </summary>
/// <param name="filename">要分析的文本文件。</param>
/// <returns>检测到的编码,或 null。</returns>
public static Encoding GetEncoding(string filename)
{
var encodingByBOM = GetEncodingByBOM(filename);
if (encodingByBOM != null)
return encodingByBOM;
// 找不到 BOM :(,因此尝试按多种编码解析字符
var encodingByParsingUTF8 = GetEncodingByParsing(filename, Encoding.UTF8);
if (encodingByParsingUTF8 != null)
return encodingByParsingUTF8;
var encodingByParsingLatin1 = GetEncodingByParsing(filename, Encoding.GetEncoding("iso-8859-1"));
if (encodingByParsingLatin1 != null)
return encodingByParsingLatin1;
var encodingByParsingUTF7 = GetEncodingByParsing(filename, Encoding.UTF7);
if (encodingByParsingUTF7 != null)
return encodingByParsingUTF7;
return null; // 未找到编码
}
/// <summary>
/// 通过分析文本文件的字节顺序标记(BOM)来确定文本文件的编码
/// </summary>
/// <param name="filename">要分析的文本文件。</param>
/// <returns>检测到的编码。</returns>
私人sta
// Ask the helper for the file's encoding; a null result means none of the
// candidate encodings matched.
var encoding = FileHelper.GetEncoding(filePath);
if (encoding == null)
{
    // NotSupportedException is more specific than the base Exception type and
    // is still caught by existing catch (Exception) handlers.
    throw new NotSupportedException("The file encoding is not supported. Please choose one of the following encodings: UTF8/UTF7/iso-8859-1");
}
/// <summary>
/// Helpers for working out which encoding a text file uses.
/// </summary>
public class FileHelper
{
    /// <summary>
    /// Determines a text file's encoding by analyzing its byte order mark (BOM) and,
    /// when no BOM is found, by trying to decode the whole file first as strict
    /// UTF-8 and then as ISO-8859-1.
    /// </summary>
    /// <param name="filename">The text file to analyze.</param>
    /// <returns>The detected encoding, or null when no candidate encoding decodes the file.</returns>
    public static Encoding GetEncoding(string filename)
    {
        var encodingByBOM = GetEncodingByBOM(filename);
        if (encodingByBOM != null)
            return encodingByBOM;

        // BOM not found :(, so try to parse characters into several encodings
        var encodingByParsingUTF8 = GetEncodingByParsing(filename, Encoding.UTF8);
        if (encodingByParsingUTF8 != null)
            return encodingByParsingUTF8;

        // ISO-8859-1 assigns a character to every byte value, so this attempt acts
        // as the catch-all. The UTF-7 attempt that used to follow it could never
        // be reached and has been dropped.
        return GetEncodingByParsing(filename, Encoding.GetEncoding("iso-8859-1"));
    }

    /// <summary>
    /// Determines a text file's encoding by analyzing its byte order mark (BOM).
    /// </summary>
    /// <param name="filename">The text file to analyze.</param>
    /// <returns>The encoding announced by the BOM, or null when the file has no recognized BOM.</returns>
    private static Encoding GetEncodingByBOM(string filename)
    {
        // Read up to four bytes and remember how many were actually available:
        // the buffer is zero-initialized, so a short file must not be matched
        // against signatures longer than its content.
        var byteOrderMark = new byte[4];
        int available = 0;
        using (var file = new FileStream(filename, FileMode.Open, FileAccess.Read))
        {
            int chunk;
            // Read may return fewer bytes than requested, so loop until full or EOF.
            while (available < byteOrderMark.Length
                   && (chunk = file.Read(byteOrderMark, available, byteOrderMark.Length - available)) > 0)
            {
                available += chunk;
            }
        }

        // Analyze the BOM. Longer signatures are tested before their shorter prefixes.
        if (available >= 3)
        {
            // Encoding.UTF7 is obsolete on .NET 5 and later; kept so existing callers get the same result.
            if (byteOrderMark[0] == 0x2b && byteOrderMark[1] == 0x2f && byteOrderMark[2] == 0x76) return Encoding.UTF7;
            if (byteOrderMark[0] == 0xef && byteOrderMark[1] == 0xbb && byteOrderMark[2] == 0xbf) return Encoding.UTF8;
        }
        if (available >= 4 && byteOrderMark[0] == 0xff && byteOrderMark[1] == 0xfe && byteOrderMark[2] == 0 && byteOrderMark[3] == 0)
        {
            return Encoding.UTF32; // UTF-32LE; must precede the UTF-16LE test, which shares the FF FE prefix
        }
        if (available >= 2)
        {
            if (byteOrderMark[0] == 0xff && byteOrderMark[1] == 0xfe) return Encoding.Unicode; // UTF-16LE
            if (byteOrderMark[0] == 0xfe && byteOrderMark[1] == 0xff) return Encoding.BigEndianUnicode; // UTF-16BE
        }
        if (available >= 4 && byteOrderMark[0] == 0 && byteOrderMark[1] == 0 && byteOrderMark[2] == 0xfe && byteOrderMark[3] == 0xff)
        {
            // Encoding.UTF32 is little-endian; a 00 00 FE FF BOM needs the big-endian variant.
            return new UTF32Encoding(true, true); // UTF-32BE
        }
        return null; // no BOM found
    }

    /// <summary>
    /// Tries to decode the whole file with a strict (throw-on-invalid-bytes)
    /// version of the given encoding.
    /// </summary>
    /// <param name="filename">The text file to decode.</param>
    /// <param name="encoding">The candidate encoding.</param>
    /// <returns>The encoding the reader ended up using when every byte decoded cleanly; otherwise null.</returns>
    private static Encoding GetEncodingByParsing(string filename, Encoding encoding)
    {
        // Look the strict encoding up by code page rather than BodyName: BodyName is
        // a mail-oriented name and does not always identify the same encoding.
        var encodingVerifier = Encoding.GetEncoding(encoding.CodePage, new EncoderExceptionFallback(), new DecoderExceptionFallback());
        try
        {
            using (var textReader = new StreamReader(filename, encodingVerifier, detectEncodingFromByteOrderMarks: true))
            {
                while (!textReader.EndOfStream)
                {
                    textReader.ReadLine(); // in order to increment the stream position
                }
                // all text parsed ok
                return textReader.CurrentEncoding;
            }
        }
        catch (DecoderFallbackException)
        {
            // The content is not valid in this encoding. Other failures (for
            // example I/O errors) are deliberately not caught here.
            return null;
        }
    }
}