C# Web scrape项目写入过多信息
我正试图修改下面的代码,以便从 www.itoworld.com/careers 上抓取职位信息。职位以表格形式列出,但代码会返回整行的全部文本,而不只是职位名称。回答:您只需创建一个“名称节点”(nameNode),并将其用于解析方法即可。我用下面这段代码进行了测试,它对我有效。标签:c#, html, web-scraping。 var parentnode = node.ParentNode.ParentNode.ParentNode.FirstChild.NextSibling; var nameNode = parentnode.FirstChild;
// Fragment intended for the body of the foreach loop over the <a> nodes in ExtractIto;
// `node` is the current anchor element supplied by that loop.
// Climb three ancestors up from the anchor, then take the second child of that ancestor.
// NOTE(review): presumably this is the table row/cell block for the job — confirm against the page markup.
var parentnode = node.ParentNode.ParentNode.ParentNode.FirstChild.NextSibling;
// Narrow to the first child only, so the name comes from that node's text
// instead of the combined text of everything under parentnode.
var nameNode = parentnode.FirstChild;
Links l = new Links();
// Name: cleaned-up text of the name node.
l.Name = ParseHtmlContainingText(nameNode.InnerText);
// Link: the anchor's href attribute, or "" when the attribute is absent.
l.Link = node.GetAttributeValue("href", "");
那太完美了。非常感谢你。我会努力更好地理解节点之间的父子层级关系。
<a class="std-btn" href="http://www.itoworld.com/office-manager/">Office Manager</a>
<a href='http://www.itoworld.com/office-manager/' target='_blank'>Office ManagerOffice & AdminCambridgeFind out more</a>
/// <summary>
/// Downloads the Ito World careers page, collects one <c>Links</c> entry (name + href)
/// for every anchor found under the first element with class <c>btn-wrapper</c>,
/// and passes the resulting list through <c>getXml</c> and <c>WriteXml</c>.
/// </summary>
/// <returns>The value returned by <c>WriteXml</c> for the generated XML.</returns>
public string ExtractIto()
{
    string sUrl = "http://www.itoworld.com/careers/";
    GlobusHttpHelper ghh = new GlobusHttpHelper();
    List<Links> link = new List<Links>();

    // The URL is a fixed literal; passing it through string.Format would only risk
    // a FormatException if it ever contained braces.
    string html = ghh.getHtmlfromUrl(new Uri(sUrl));
    HtmlAgilityPack.HtmlDocument hd = new HtmlAgilityPack.HtmlDocument();
    hd.LoadHtml(html);

    // SelectSingleNode / SelectNodes return null when nothing matches, so guard both
    // instead of dereferencing blindly.
    var hn = hd.DocumentNode.SelectSingleNode("//*[@class='btn-wrapper']");
    var hnc = hn == null ? null : hn.SelectNodes(".//a");

    if (hnc != null)
    {
        foreach (var node in hnc)
        {
            // Climb three ancestors up from the anchor; skip anchors that are not nested that deeply.
            HtmlAgilityPack.HtmlNode ancestor = node;
            for (int level = 0; level < 3 && ancestor != null; level++)
            {
                ancestor = ancestor.ParentNode;
            }
            if (ancestor == null || ancestor.FirstChild == null)
            {
                continue;
            }

            // Second child of that ancestor.
            // NOTE(review): presumably the block holding the job's cells — confirm against the page markup.
            var parentnode = ancestor.FirstChild.NextSibling;
            if (parentnode == null || parentnode.FirstChild == null)
            {
                continue;
            }

            // Use only the first child's text as the name. Taking parentnode.InnerText
            // concatenated every cell (title, department, location, button text).
            var nameNode = parentnode.FirstChild;

            Links l = new Links();
            l.Name = ParseHtmlContainingText(nameNode.InnerText);
            l.Link = node.GetAttributeValue("href", "");
            link.Add(l);
        }
    }

    string Xml = getXml(link);
    return WriteXml(Xml);
}
/// <summary>
/// Reduces an HTML fragment to plain text: HTML-decodes the input, removes every match of the
/// strip pattern (tag-like <c>&lt;...&gt;</c> spans, or the single character in the pattern's
/// second alternative), collapses each run of two or more whitespace characters into one space,
/// and trims the ends.
/// </summary>
/// <param name="htmlString">The HTML text to clean.</param>
/// <returns>The cleaned, trimmed text.</returns>
public string ParseHtmlContainingText(string htmlString)
{
    // Decode entities first, so encoded markup (e.g. &lt;b&gt;) is stripped as well.
    string decoded = WebUtility.HtmlDecode(htmlString);
    string stripped = Regex.Replace(decoded, @"<[^>]+>| ", "");
    string collapsed = Regex.Replace(stripped, @"\s{2,}", " ");
    return collapsed.Trim();
}
// Fragment intended for the body of the foreach loop over the <a> nodes in ExtractIto;
// `node` is the current anchor element supplied by that loop.
// Climb three ancestors up from the anchor, then take the second child of that ancestor.
// NOTE(review): presumably this is the table row/cell block for the job — confirm against the page markup.
var parentnode = node.ParentNode.ParentNode.ParentNode.FirstChild.NextSibling;
// Narrow to the first child only, so the name comes from that node's text
// instead of the combined text of everything under parentnode.
var nameNode = parentnode.FirstChild;
Links l = new Links();
// Name: cleaned-up text of the name node.
l.Name = ParseHtmlContainingText(nameNode.InnerText);
// Link: the anchor's href attribute, or "" when the attribute is absent.
l.Link = node.GetAttributeValue("href", "");