C# 如何使用 HtmlAgilityPack 获取 tr 中的链接和内容?
标签:c#, parsing, html-parsing, html-agility-pack。我使用以下代码配合 HtmlAgilityPack 获取某个 url 的 html 源代码:
/// <summary>
/// Downloads the resource at <paramref name="urlAddress"/> and returns its body as text.
/// </summary>
/// <param name="urlAddress">Absolute URL of the page to download.</param>
/// <returns>
/// The response body decoded with the charset announced by the server (or the
/// <see cref="StreamReader"/> default when none is announced); an empty string when the
/// response status is not <see cref="HttpStatusCode.OK"/>.
/// </returns>
private string GetUrlSource(string urlAddress)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(urlAddress);

    // "using" guarantees the response, stream and reader are released even when
    // reading throws or the status is not OK (the original leaked them in both cases).
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    {
        if (response.StatusCode != HttpStatusCode.OK)
        {
            return string.Empty;
        }

        using (Stream receiveStream = response.GetResponseStream())
        {
            // An empty charset would make Encoding.GetEncoding throw, so treat it
            // the same as a missing one and let StreamReader pick its default.
            string charset = response.CharacterSet;
            using (StreamReader readStream = string.IsNullOrEmpty(charset)
                ? new StreamReader(receiveStream)
                : new StreamReader(receiveStream, Encoding.GetEncoding(charset)))
            {
                return readStream.ReadToEnd();
            }
        }
    }
}
然后,使用以下代码获取数据:
// Download the page and parse it with HtmlAgilityPack.
var source = GetUrlSource(urlAddress);
var htmlDoc = new HtmlAgilityPack.HtmlDocument();
htmlDoc.LoadHtml(source);

// Every <td> of the table inside <div class="linear-view">.
// SelectNodes returns null (not an empty collection) when nothing matches,
// so guard before iterating to avoid a NullReferenceException.
var nodes = htmlDoc.DocumentNode.SelectNodes("//div[@class='linear-view']/table/tr/td");
if (nodes != null)
{
    foreach (var node in nodes)
    {
        HtmlNodeCollection rows = node.SelectNodes(".//a");
        if (rows != null)
        {
            // Cell contains anchors: read link target and link text.
            foreach (HtmlNode anchor in rows)
            {
                // GetAttributeValue tolerates anchors without an href attribute.
                var link = anchor.GetAttributeValue("href", string.Empty);
                var title = anchor.InnerText;
            }
        }
        else
        {
            // Cell without anchors: its text is the publisher.
            var publisher = node.InnerText;
        }
    }
}
我对节点的结果是:
<tr>
<th>Title</th>
<th>Publisher</th>
</tr>
<tr>
<td><a href="http://link1.com" id="14.4">Title1</a></td>
<td>Publisher1</td>
</tr>
<tr>
<td><a href="http://link2.com" id="12.0">Title2</a></td>
<td>Publisher2</td>
</tr>
<tr>
<td><a href="http://link3.com/" id="84.4">Title3</a></td>
<td>Publisher3</td>
</tr>
标题
出版商
出版商1
出版商2
出版商3
我使用此代码获取数据:
// Download the page and parse it with HtmlAgilityPack.
var source = GetUrlSource(urlAddress);
var htmlDoc = new HtmlAgilityPack.HtmlDocument();
htmlDoc.LoadHtml(source);

// Every <td> of the table inside <div class="linear-view">.
// SelectNodes returns null (not an empty collection) when nothing matches,
// so guard before iterating to avoid a NullReferenceException.
var nodes = htmlDoc.DocumentNode.SelectNodes("//div[@class='linear-view']/table/tr/td");
if (nodes != null)
{
    foreach (var node in nodes)
    {
        HtmlNodeCollection rows = node.SelectNodes(".//a");
        if (rows != null)
        {
            // Cell contains anchors: read link target and link text.
            foreach (HtmlNode anchor in rows)
            {
                // GetAttributeValue tolerates anchors without an href attribute.
                var link = anchor.GetAttributeValue("href", string.Empty);
                var title = anchor.InnerText;
            }
        }
        else
        {
            // Cell without anchors: its text is the publisher.
            var publisher = node.InnerText;
        }
    }
}
foreach (var node in nodes)
{
HtmlNodeCollection rows = node.SelectNodes(".//a");
if (rows != null)
{
for (int j = 0; j < rows.Count; ++j)
如何在不使用 if/else 的情况下,按每个 tr 标签一次性取得链接、标题和发布者?
例如:
http://link1.com, Title1, Publisher1
http://link2.com, Title2, Publisher2
http://link3.com/, Title3, Publisher3
许多可能的方法之一:
// Select only the <tr> rows that have at least one <td> child
// (this skips the header row, which contains <th> cells).
var tr = doc.DocumentNode.SelectNodes("//div[@class='linear-view']/table/tr[td]");

// SelectNodes returns null when nothing matches, so guard before iterating.
if (tr != null)
{
    foreach (HtmlNode node in tr)
    {
        // The <a> inside a cell of this row. Selecting the anchor directly avoids
        // relying on FirstChild, which is a text node when whitespace precedes <a>.
        var anchor = node.SelectSingleNode("./td/a");
        // The cell that has no <a> child (alternatively by index: ./td[2]).
        var publisherCell = node.SelectSingleNode("./td[not(a)]");

        // Skip rows that do not have the expected link + publisher layout.
        if (anchor == null || publisherCell == null)
        {
            continue;
        }

        var link = anchor.GetAttributeValue("href", string.Empty);
        var title = anchor.InnerText;
        var publisher = publisherCell.InnerText;
    }
}
//选择具有 <td> 子节点的 <tr>
var tr = doc.DocumentNode.SelectNodes("//div[@class='linear-view']/table/tr[td]");
foreach (HtmlNode node in tr)
{
//选择具有 <a> 子节点的 <td>
var td1 = node.SelectSingleNode("./td[a]"); //或使用索引:./td[1]
var link = td1.FirstChild.Attributes["href"].Value;
var title = td1.InnerText;
//选择没有 <a> 子节点的 <td>
var publisher = node.SelectSingleNode("./td[not(a)]") //或使用索引:./td[2]
.InnerText;
}