C# 当网页编码不同时，如何获取网页标题_C#_Html_Character Encoding

C# 当网页编码不同时，如何获取网页标题

c# html character-encoding

C# 当网页编码不同时，如何获取网页标题,c#,html,character-encoding,C#,Html,Character Encoding,我有一种方法可以下载网页并提取标题标签，但根据网站的不同，结果可能会被编码或使用错误的字符集。当网站的编码不同时，是否有一种防弹的方法来获取网站标题我用不同的结果测试了一些URL：返回“魁北克-维基百科”。结果是好的 return“Condo，chalet ou maison和agravevendre avec un courtier immobilier | RE/MAXQuébec” 返回“餐厅”Montr�al|重新集中” 我使用的方法是： private string

我有一个方法可以下载网页并提取 title 标签，但根据网站的不同，结果可能仍带有 HTML 实体编码，或者因使用了错误的字符集而出现乱码。当各网站的编码不同时，是否有一种万无一失的方法来获取网站标题？

我测试了几个 URL，得到了不同的结果：

返回“魁北克-维基百科”。结果正确。
返回“Condo, chalet ou maison &agrave; vendre avec un courtier immobilier | RE/MAX Québec”（HTML 实体 &agrave; 没有被解码）
返回“餐厅 Montr�al | 重新集中”（é 因字符集错误变成了替换字符 �）

我使用的方法是：

/// <summary>
/// Downloads the page at <paramref name="uri"/> and returns the text captured between
/// its &lt;title&gt; tags, falling back to the URI string when nothing is captured.
/// </summary>
/// <param name="uri">Address of the page to download.</param>
/// <returns>The captured title text, or <c>uri.ToString()</c> when the capture is empty.</returns>
/// <exception cref="Exception">
/// Thrown when the response status is not a success code. The message is the deserialized
/// <c>HttpError.Message</c> when that can be read, otherwise the response reason phrase.
/// </exception>
private string GetUrlTitle(Uri uri)
{
    string pageTitle = "";

    using (var http = new HttpClient())
    {
        // Blocks the calling thread until the request has completed.
        HttpResponseMessage reply = http.GetAsync(uri).Result;

        if (!reply.IsSuccessStatusCode)
        {
            string reason;

            try
            {
                // Try to read a structured error payload from the response body.
                var serializer = new XmlSerializer(typeof(HttpError));
                var payload = reply.Content.ReadAsStreamAsync().Result;
                var parsed = serializer.Deserialize(payload) as HttpError;
                reason = parsed.Message;
            }
            catch (Exception)
            {
                // Any failure above (including a null cast result) falls back to the reason phrase.
                reason = reply.ReasonPhrase;
            }

            throw new Exception(reason);
        }

        // The body is decoded by ReadAsStringAsync; no explicit encoding is chosen here.
        string markup = reply.Content.ReadAsStringAsync().Result;
        Match hit = Regex.Match(markup, @"\<title\b[^>]*\>\s*(?<Title>[\s\S]*?)\</title\>", RegexOptions.IgnoreCase);
        pageTitle = hit.Groups["Title"].Value;
    }

    // An unmatched regex yields an empty group value, which triggers the URI fallback.
    return pageTitle == string.Empty ? uri.ToString() : pageTitle;
}

private string GetUrlTitle(Uri uri)
{
    string title = "";

    using (HttpClient client = new HttpClient())
    {
        HttpResponseMessage response = null;

        response = client.GetAsync(uri).Result;

        if (!response.IsSuccessStatusCode)
        {
            string errorMessage = "";

            try
            {
                XmlSerializer xml = new XmlSerializer(typeof(HttpError));
                HttpError error = xml.Deserialize(response.Content.ReadAsStreamAsync().Result) as HttpError;
                errorMessage = error.Message;
            }
            catch (Exception)
            {
                errorMessage = response.ReasonPhrase;
            }

            throw new Exception(errorMessage);
        }

        var html = response.Content.ReadAsStringAsync().Result;
        title = Regex.Match(html, @"\<title\b[^>]*\>\s*(?<Title>[\s\S]*?)\</title\>", RegexOptions.IgnoreCase).Groups["Title"].Value;
    }

    if (title == string.Empty)
    {
        title = uri.ToString();
    }

    return title;
}

这可能对您有所帮助。可以利用 System.Globalization 命名空间：

using System;
using System.Globalization;

/// <summary>
/// Console demo that prints several sample strings next to their title-cased form,
/// using the <see cref="TextInfo"/> of the current culture.
/// </summary>
public class Example
{
    /// <summary>
    /// Writes one "original --> title-cased" line per sample string to the console.
    /// </summary>
    public static void Main()
    {
        TextInfo textInfo = CultureInfo.CurrentCulture.TextInfo;

        string[] samples =
        {
            "a tale of two cities",
            "gROWL to the rescue",
            "inside the US government",
            "sports and MLB baseball",
            "The Return of Sherlock Holmes",
            "UNICEF and         children"
        };

        for (int i = 0; i < samples.Length; i++)
        {
            string original = samples[i];
            string titled = textInfo.ToTitleCase(original);
            Console.WriteLine("{0} --> {1}", original, titled);
        }
    }
}

查看此项。

您可以尝试获取所有字节，然后使用 Encoding 类按您想要的任何编码将其转换为字符串（string）。应该是这样的：

/// <summary>
/// Downloads the raw bytes of the page at <paramref name="uri"/>, decodes them as UTF-8
/// and returns the text captured between the &lt;title&gt; tags.
/// </summary>
/// <param name="uri">Address of the page to download.</param>
/// <returns>The captured title text, or an empty string when no title is matched.</returns>
private string GetUrlTitle(Uri uri)
{
    string title = "";

    using (HttpClient client = new HttpClient())
    {
        // The method is synchronous, so block on the task instead of using 'await'
        // (which is not allowed in a non-async method).
        byte[] byteData = client.GetByteArrayAsync(uri).Result;

        // Decoding the bytes ourselves makes the encoding an explicit choice;
        // swap Encoding.UTF8 for whichever encoding the page actually uses.
        string html = Encoding.UTF8.GetString(byteData);

        title = Regex.Match(html, @"\<title\b[^>]*\>\s*(?<Title>[\s\S]*?)\</title\>", RegexOptions.IgnoreCase).Groups["Title"].Value;
    }

    return title;
}

我希望它能帮助您，如果有，请将其标记为答案。

字符集并不总是出现在 HTTP 响应头（Content-Type）中，因此我们还必须检查 meta 标记；如果那里也没有，就回退到 UTF8（或其他编码）。此外，标题可能经过 HTML 实体编码，所以还需要对它进行解码。

结果

返回“魁北克-维基百科”
返回“Condo, chalet ou maison à vendre avec un courtier immobilier | RE/MAX Québec”
返回“餐厅 Montréal | 重新集中”

下面的代码来自github项目。我对它做了一点修改

/// <summary>
/// Downloads the page at <paramref name="uri"/>, decodes it with the charset announced by the
/// Content-Type header (or, failing that, by a meta tag in the body; UTF-8 as last resort),
/// and returns the HTML-decoded, trimmed text of its &lt;title&gt; element.
/// </summary>
/// <param name="uri">Address of the page to download.</param>
/// <returns>
/// The page title, or <c>uri.ToString()</c> when no title is found or it is blank.
/// </returns>
/// <exception cref="HttpRequestException">
/// Thrown when the response status is not a success code; the message is the reason phrase.
/// </exception>
private string GetUrlTitle(Uri uri)
{
    string title = "";

    using (HttpClient client = new HttpClient())
    using (HttpResponseMessage response = client.GetAsync(uri).Result)
    {
        if (!response.IsSuccessStatusCode)
        {
            throw new HttpRequestException(response.ReasonPhrase);
        }

        var contentStream = response.Content.ReadAsStreamAsync().Result;

        // The Content-Type header is optional, so ContentType itself may be null.
        var contentType = response.Content.Headers.ContentType;
        string charset = contentType != null ? contentType.CharSet : null;

        // A missing or blank header charset means we have to look for a meta tag in the body.
        if (string.IsNullOrWhiteSpace(charset))
        {
            charset = GetCharsetFromBody(contentStream);
        }

        Encoding encoding = GetEncodingOrDefaultToUTF8(charset);
        string content = GetContent(contentStream, encoding);

        Match titleMatch = Regex.Match(content, @"\<title\b[^>]*\>\s*(?<Title>[\s\S]*?)\</title\>", RegexOptions.IgnoreCase);

        if (titleMatch.Success)
        {
            // Decode HTML entities (e.g. &eacute;) that may appear in the title text.
            title = WebUtility.HtmlDecode(titleMatch.Groups["Title"].Value).Trim();
        }
    }

    if (string.IsNullOrWhiteSpace(title))
    {
        title = uri.ToString();
    }

    return title;
}

/// <summary>
/// Rewinds <paramref name="contentStream"/> to its start and reads it to the end as text.
/// </summary>
/// <param name="contentStream">Seekable stream holding the content; it is closed when the read finishes.</param>
/// <param name="encoding">Encoding used to decode the bytes.</param>
/// <returns>The whole stream content as a string.</returns>
private string GetContent(Stream contentStream, Encoding encoding)
{
    // Start from the first byte regardless of where a previous reader stopped.
    contentStream.Seek(0L, SeekOrigin.Begin);

    StreamReader reader = new StreamReader(contentStream, encoding);
    try
    {
        string text = reader.ReadToEnd();
        return text;
    }
    finally
    {
        // Disposing the reader also closes the underlying stream.
        reader.Dispose();
    }
}

/// <summary>
/// Tries to find a charset declaration in a meta tag of the body content.
/// </summary>
/// <param name="contentStream">
/// Seekable stream holding the page body. It is rewound before reading and left open
/// (positioned at the end) so the caller can read it again.
/// </param>
/// <returns>The declared charset name, or <c>null</c> when no non-blank declaration is found.</returns>
private string GetCharsetFromBody(Stream contentStream)
{
    contentStream.Seek(0, SeekOrigin.Begin);

    string body;

    // leaveOpen: true lets us dispose the reader without closing the caller's stream.
    // ASCII is enough here because only the markup around "charset=" is inspected.
    using (StreamReader reader = new StreamReader(contentStream, Encoding.ASCII, true, 1024, true))
    {
        body = reader.ReadToEnd();
    }

    //find expression from : http://stackoverflow.com/questions/3458217/how-to-use-regular-expression-to-match-the-charset-string-in-html
    Match match = Regex.Match(body, @"<meta(?!\s*(?:name|value)\s*=)(?:[^>]*?content\s*=[\s""']*)?([^>]*?)[\s""';]*charset\s*=[\s""']*([^\s""'/>]*)", RegexOptions.IgnoreCase);

    if (!match.Success)
    {
        return null;
    }

    // Group 2 holds the charset value; an empty capture counts as "not declared".
    string charset = match.Groups[2].Value;
    return string.IsNullOrWhiteSpace(charset) ? null : charset;
}

/// <summary>
/// Resolves a charset name to an <see cref="Encoding"/>, falling back to UTF-8.
/// </summary>
/// <param name="charset">
/// Charset name as found in a header or meta tag. May be <c>null</c>, blank, padded with
/// whitespace or wrapped in quotes (e.g. <c>"utf-8"</c>).
/// </param>
/// <returns>
/// The matching encoding, or <see cref="Encoding.UTF8"/> when the name is missing or not recognized.
/// </returns>
private Encoding GetEncodingOrDefaultToUTF8(string charset)
{
    if (string.IsNullOrWhiteSpace(charset))
    {
        return Encoding.UTF8;
    }

    // Header values may legally be quoted; strip quotes and padding before the lookup,
    // otherwise a valid name would be rejected and silently replaced by UTF-8.
    string name = charset.Trim().Trim('"', '\'').Trim();

    if (name.Length == 0)
    {
        return Encoding.UTF8;
    }

    try
    {
        return Encoding.GetEncoding(name);
    }
    catch (ArgumentException)
    {
        // Unknown or unsupported charset name: keep the best-effort UTF-8 fallback.
        return Encoding.UTF8;
    }
}

private string GetUrlTitle(Uri uri)
{
    string title = "";

    using (HttpClient client = new HttpClient())
    {
        HttpResponseMessage response = client.GetAsync(uri).Result;

        if (!response.IsSuccessStatusCode)
        {
            throw new Exception(response.ReasonPhrase);
        }

        var contentStream = response.Content.ReadAsStreamAsync().Result;
        var charset = response.Content.Headers.ContentType.CharSet ?? GetCharsetFromBody(contentStream);

        Encoding encoding = GetEncodingOrDefaultToUTF8(charset);
        string content = GetContent(contentStream, encoding);

        Match titleMatch = Regex.Match(content, @"\<title\b[^>]*\>\s*(?<Title>[\s\S]*?)\</title\>", RegexOptions.IgnoreCase);

        if (titleMatch.Success)
        {
            title = titleMatch.Groups["Title"].Value;

            // decode the title in case it have been encoded
            title = WebUtility.HtmlDecode(title).Trim();
        }
    }

    if (string.IsNullOrWhiteSpace(title))
    {
        title = uri.ToString();
    }

    return title;
}
private string GetContent(Stream contentStream, Encoding encoding)
{
    contentStream.Seek(0, SeekOrigin.Begin);

    using (StreamReader sr = new StreamReader(contentStream, encoding))
    {
        return sr.ReadToEnd();
    }
}
/// <summary>
/// Try getting the charset from the body content.
/// </summary>
/// <param name="contentStream"></param>
/// <returns></returns>
private string GetCharsetFromBody(Stream contentStream)
{
    contentStream.Seek(0, SeekOrigin.Begin);

    StreamReader srr = new StreamReader(contentStream, Encoding.ASCII);
    string body = srr.ReadToEnd();
    string charset = null;

    if (body != null)
    {
        //find expression from : http://stackoverflow.com/questions/3458217/how-to-use-regular-expression-to-match-the-charset-string-in-html
        Match match = Regex.Match(body, @"<meta(?!\s*(?:name|value)\s*=)(?:[^>]*?content\s*=[\s""']*)?([^>]*?)[\s""';]*charset\s*=[\s""']*([^\s""'/>]*)", RegexOptions.IgnoreCase);

        if (match.Success)
        {
            charset = string.IsNullOrWhiteSpace(match.Groups[2].Value) ? null : match.Groups[2].Value;
        }
    }

    return charset;
}
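/// <summary>
/// Try parsing the charset or fallback to UTF8
/// </summary>
/// <param name="charset"></param>
/// <returns></returns>
private Encoding GetEncodingOrDefaultToUTF8(string charset)
{
    Encoding e = Encoding.UTF8;

    if (charset != null)
    {
        try
        {
            e = Encoding.GetEncoding(charset);
        }
        catch
        {
        }
    }

    return e;
}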

您是否尝试运行有问题的代码？你真的了解这个问题吗？我也有类似的问题。首先，我使用了

Utf8Checker.IsUtf8

（互联网上的某个地方）。如果不是 UTF-8，我会通过检查 meta 标记的 http-equiv 属性来确定编码（使用 HtmlAgilityPack）。我用你的 URL 测试过，看起来有效。（顺便说一句：问题不在于你的代码，而是某些网站没有正确编码或配置，无法返回正确的编码信息，因此你必须做一些更像浏览器的事情。）HTML 本质上是 XML，请尝试使用 XML 解析器并搜索 title 属性。@Wobbles

HTML本质上是XML

绝对不是。您无法解析