C# 获取字符串的编码_C#_.net_Encoding_Html Agility Pack

C# 获取字符串的编码

c# .net encoding

C# 获取字符串的编码,c#,.net,encoding,html-agility-pack,C#,.net,Encoding,Html Agility Pack,我从另一个页面获取html，在编码方面有问题。例如：我得到： aparelho nas sa??das 原文是： aparelho nas saídas 如何获得编码并转换为原始字符串我的代码： var GetResponse = API_GET("..."); //this returns html of an http request. HtmlDocument doc = new HtmlDocument(); //the html-parsing doc.LoadHtml

我从另一个页面获取html，在编码方面有问题。例如：我得到：

aparelho nas sa??das

原文是：

aparelho nas saídas

如何检测响应的编码，并将其正确转换回原始字符串？

我的代码：

var GetResponse = API_GET("..."); // API_GET (not shown here) returns the HTML of an HTTP request.
HtmlDocument doc = new HtmlDocument();  // Html Agility Pack document used to parse the HTML.
doc.LoadHtml(GetResponse);
// Select every <div> element whose class attribute is exactly "para".
var body = doc.DocumentNode.SelectNodes("//div[@class='para']"); 
... 
// Problem: the text comes back with '?' in place of accented characters,
// e.g. "sa??das" instead of "saídas".
// NOTE(review): LoadHtml is handed an already-built value here, so the characters were
// presumably lost earlier, when API_GET decoded the response bytes - confirm against API_GET.
var para = body[i].InnerHtml;

我该怎么做？

提前感谢

可以使用类似下面这样的函数来检测。它只检查内容是否为 UTF-8；我认为要准确判断任意编码是很困难的。

如有疑问，请将其转换为UTF-8或编码=WTF

这是部分源代码。看看IsUtf8方法。这可能非常有用

using System;
using System.IO;

namespace Unicode
{
    /// <summary>
    /// Checks whether a file, a stream or a byte buffer consists solely of
    /// well-formed UTF-8 byte sequences (shortest form only, no encoded
    /// surrogates, nothing above U+10FFFF).
    ///
    /// Background reading:
    /// http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335
    /// http://www.cl.cam.ac.uk/~mgk25/ucs/ISO-10646-UTF-8.html
    /// http://www.unicode.org/versions/corrigendum1.html
    /// http://www.ietf.org/rfc/rfc2279.txt
    /// </summary>
    public class Utf8Checker : IUtf8Checker
    {
        /// <summary>
        /// Check if the file with the given name is utf8 encoded.
        /// </summary>
        /// <param name="fileName">Path of the file to open for reading.</param>
        /// <returns>True if the whole file is utf8 encoded.</returns>
        public bool Check(string fileName)
        {
            using (BufferedStream fstream = new BufferedStream(File.OpenRead(fileName)))
            {
                return this.IsUtf8(fstream);
            }
        }

        /// <summary>
        /// Check if stream is utf8 encoded.
        /// Notice: stream is read completely in memory!
        /// A seekable stream is rewound to its beginning first; a non-seekable
        /// stream is read from its current position.
        /// </summary>
        /// <param name="stream">Stream to read from.</param>
        /// <returns>True if the whole stream is utf8 encoded.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="stream"/> is null.</exception>
        public bool IsUtf8(Stream stream)
        {
            if (stream == null)
            {
                throw new ArgumentNullException("stream");
            }

            if (stream.CanSeek)
            {
                stream.Seek(0, SeekOrigin.Begin);
            }

            using (MemoryStream contents = new MemoryStream())
            {
                byte[] chunk = new byte[4 * 1024];
                int read;

                // Stream.Read may return fewer bytes than requested without being
                // at the end of the stream; only a return value of 0 means "done".
                while ((read = stream.Read(chunk, 0, chunk.Length)) > 0)
                {
                    contents.Write(chunk, 0, read);
                }

                // GetBuffer avoids a copy; only the first contents.Length bytes are data.
                return IsUtf8(contents.GetBuffer(), (int)contents.Length);
            }
        }

        /// <summary>
        /// Check if the first <paramref name="length"/> bytes of a buffer are utf8 encoded.
        /// </summary>
        /// <param name="buffer">Bytes to examine.</param>
        /// <param name="length">Number of bytes, counted from index 0, to examine.</param>
        /// <returns>True if every byte belongs to a well-formed sequence; true for a length of 0 or less.</returns>
        public static bool IsUtf8(byte[] buffer, int length)
        {
            int position = 0;
            int bytes = 0;

            while (position < length)
            {
                if (!IsValid(buffer, position, length, ref bytes))
                {
                    return false;
                }

                position += bytes;
            }

            return true;
        }

        /// <summary>
        /// Check whether a well-formed utf8 sequence starts at <paramref name="position"/>.
        /// </summary>
        /// <param name="buffer">Bytes to examine.</param>
        /// <param name="position">Index of the first byte of the sequence.</param>
        /// <param name="length">Number of valid bytes in <paramref name="buffer"/>.</param>
        /// <param name="bytes">
        /// Receives the length of the sequence (1 to 4) when it is valid, or 0 when it
        /// is invalid or when <paramref name="position"/> is at or past <paramref name="length"/>.
        /// </param>
        /// <returns>
        /// True if the sequence is valid or there is nothing left to examine; false otherwise.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="buffer"/> is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="length"/> exceeds the buffer size.</exception>
        public static bool IsValid(byte[] buffer, int position, int length, ref int bytes)
        {
            if (buffer == null)
            {
                throw new ArgumentNullException("buffer");
            }

            if (length > buffer.Length)
            {
                throw new ArgumentException("Invalid length");
            }

            if (position >= length)
            {
                bytes = 0;
                return true;
            }

            byte lead = buffer[position];

            if (lead <= 0x7F)
            {
                bytes = 1;
                return true;
            }

            // The lead byte fixes the sequence length. For a few lead bytes the
            // allowed range of the second byte is narrower than 0x80..0xBF.
            int sequenceLength;
            byte secondMin = 0x80;
            byte secondMax = 0xBF;

            if (lead >= 0xC2 && lead <= 0xDF)
            {
                sequenceLength = 2;
            }
            else if (lead >= 0xE0 && lead <= 0xEF)
            {
                sequenceLength = 3;

                if (lead == 0xE0)
                {
                    secondMin = 0xA0; // below this the encoding would be overlong
                }
                else if (lead == 0xED)
                {
                    secondMax = 0x9F; // above this lie the UTF-16 surrogates U+D800..U+DFFF
                }
            }
            else if (lead >= 0xF0 && lead <= 0xF4)
            {
                sequenceLength = 4;

                if (lead == 0xF0)
                {
                    secondMin = 0x90; // below this the encoding would be overlong
                }
                else if (lead == 0xF4)
                {
                    secondMax = 0x8F; // above this the code point would exceed U+10FFFF
                }
            }
            else
            {
                // 0x80..0xC1 and 0xF5..0xFF can never start a sequence.
                bytes = 0;
                return false;
            }

            // The sequence is truncated only if it would run past `length`;
            // one that ends exactly on the last byte is complete.
            if (length - position < sequenceLength)
            {
                bytes = 0;
                return false;
            }

            byte second = buffer[position + 1];

            if (second < secondMin || second > secondMax)
            {
                bytes = 0;
                return false;
            }

            for (int offset = 2; offset < sequenceLength; offset++)
            {
                byte continuation = buffer[position + offset];

                if (continuation < 0x80 || continuation > 0xBF)
                {
                    bytes = 0;
                    return false;
                }
            }

            bytes = sequenceLength;
            return true;
        }
    }
}

使用系统；使用System.IO；名称空间Unicode { /// /// http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335 /// /// http://www.cl.cam.ac.uk/~mgk25/ucs/ISO-10646-UTF-8.html /// /// http://www.unicode.org/versions/corrigendum1.html /// /// http://www.ietf.org/rfc/rfc2279.txt /// /// 公共类Utf8Checker:IUtf8Checker { 公共布尔检查（字符串文件名） { 使用（BufferedStream fstream=new BufferedStream（File.OpenRead（fileName））） { 返回此.IsUtf8（fstream）； } } /// ///检查流是否为utf8编码。 ///注意：流在内存中完全读取！ /// ///要从中读取的流。 ///如果整个流是utf8编码的，则为True。公共布尔IsUtf8（流） { 整数计数=4*1024；字节[]缓冲区； int-read； while（true） { 缓冲区=新字节[计数]； stream.Seek（0，SeekOrigin.Begin）； read=stream.read（缓冲区，0，计数）；如果（读取<计数） { 打破 } 缓冲区=空；计数*=2； } 返回IsUtf8（缓冲区，读取）； } /// /// /// /// /// /// 公共静态bool IsUtf8（字节[]缓冲区，整数长度） { int位置=0； int字节=0； while（位置<长度） { 如果（！IsValid（缓冲区、位置、长度、引用字节）） { 返回false； } 位置+=字节； } 返回true； } /// /// /// /// /// /// /// /// 公共静态bool有效（字节[]缓冲区，int位置，int长度，ref int字节） { if（长度>缓冲区长度） { 抛出新ArgumentException（“无效长度”）； } 如果（位置>长度-1） { 字节=0；返回true； } 字节ch=缓冲器[位置]； if（ch=0xc2&&ch=length-2） { 字节=0；返回false； } 如果（缓冲器[位置+1]<0x80 | |缓冲器[位置+1]>0xbf） { 字节=0；返回false； } 字节=2；返回true； } if（ch==0xe0） { 如果（位置>=长度-3） { 字节=0；返回false； } 如果（缓冲器[位置+1]<0xa0 | |缓冲器[位置+1]>0xbf|| 缓冲区[位置+2]<0x80 | |缓冲区[位置+2]>0xbf） { 字节=0；返回false； } 字节=3；返回true； } 如果（ch>=0xe1&&ch=length-3） { 字节=0；返回false； } 如果（缓冲器[位置+1]<0x80 | |缓冲器[位置+1]>0xbf|| 缓冲区[位置+2]<0x80 | |缓冲区[位置+2]>0xbf） { 字节=0；返回false； } 字节=3；返回true； } 如果（ch==0xf0） { 如果（位置>=长度-4） { 字节=0；返回false； } 如果（缓冲器[位置+1]<0x90 | |缓冲器[位置+1]>0xbf|| 缓冲区[位置+2]<0x80 | |缓冲区[位置+2]>0xbf|| 缓冲区[位置+3]<0x80 | |缓冲区[位置+3]>0xbf） { 字节=0；返回false； } 字节=4；返回true； } 如果（ch==0xf4） { 如果（位置>=长度-4） { 字节=0；返回false； } 如果（缓冲器[位置+1]<0x80 | |缓冲器[位置+1]>0x8f|| 缓冲区[位置+2]<0x80 | |缓冲区[位置+2]>0xbf|| 缓冲区[位置+3]<0x80 | |缓冲区[位置+3]>0xbf） { 字节=0；返回false； } 字节=4；返回true； } 如果（ch>=0xf1&&ch=length-4） { 字节=0；返回false； } 如果（缓冲器[位置+1]<0x80 | |缓冲器[位置+1]>0xbf|| 缓冲区[位置+2]<0x80 | |缓冲区[位置+2]>0xbf|| 缓冲区[位置+3]<0x80 | |缓冲区[位置+3]>0xbf） { 字节=0；返回false；