C#:使用 HTML Agility Pack 从 SQL Server 数据库中的字符串加载 HTML
我有一个存满网页的数据库,我想使用 HTML Agility Pack 从中提取信息。我已经编写了一个函数,当我从富文本框加载文本时,它可以获取我想要的信息。但是,当我从 SQL 数据库中的字符串加载 HTML 时,它无法正确地从节点获取全部文本,因此无法得到我需要从网页中提取的所有信息。标签:c#, html, .net, html-agility-pack。函数代码如下:
/// <summary>
/// Extracts product details from the HTML of a product page into a fixed 17-slot string array.
/// </summary>
/// <param name="strWebpage">Raw HTML of the product page.</param>
/// <param name="strURL">URL of the page; copied unchanged into slot 15.</param>
/// <param name="iID">Id of the raw page record; copied (as text) into slot 16.</param>
/// <returns>
/// A 17-element array: 0 category, 1 list price, 2 availability ("1"/"0"), 3 language,
/// 4 arrangement, 5 skill level, 6 publisher, 7 catalogue no., 8 description,
/// 9 published date (yyyy-MM-dd, or "0000-00-00" when unparseable), 10 format, 11 pages,
/// 12 ISBN, 13 title, 14 product image markup with the known img wrapper stripped,
/// 15 source URL, 16 page id. Slots 0 and 8 default to ""; any other slot for which
/// the page supplies no data is left null.
/// </returns>
private static string[] Data(string strWebpage, string strURL, int iID)
{
    // Parse the page.
    HtmlAgilityPack.HtmlDocument document = new HtmlAgilityPack.HtmlDocument();
    document.LoadHtml(strWebpage);

    string[] strData = new string[17];

    // Locate the page regions that are read below. Any of these can be null when the
    // markup does not contain the expected element, so every use is guarded.
    HtmlAgilityPack.HtmlNode topInfoNode = document.DocumentNode.SelectSingleNode("//div[@class='product_info']");
    HtmlAgilityPack.HtmlNode bottomInfoNode = document.DocumentNode.SelectSingleNode("//table[@width='300px']");
    HtmlAgilityPack.HtmlNode titleNode = document.DocumentNode.SelectSingleNode("//h2[@class='name']");
    HtmlAgilityPack.HtmlNode imageNode = document.DocumentNode.SelectSingleNode("//div[@class='product_image']");
    HtmlAgilityPack.HtmlNode contentNode = document.DocumentNode.SelectSingleNode("//div[@class='contentwrapper']");
    HtmlAgilityPack.HtmlNodeCollection categoryNodes = document.DocumentNode.SelectNodes("//div[@class='conttopright']//a[@class='uponelevel']");

    // Collect "label<TAB>value" entries from both product-information regions.
    List<string> labelValuePairs = new List<string>();
    AppendLabelValuePairs(labelValuePairs, topInfoNode);
    AppendLabelValuePairs(labelValuePairs, bottomInfoNode);

    if (titleNode != null)
    {
        strData[13] = titleNode.InnerText.Trim(); // Title
    }

    if (imageNode != null)
    {
        // Escape backslashes, then strip the fixed <img ...> wrapper so only the src value remains.
        strData[14] = imageNode.InnerHtml
            .Replace("\\", "\\\\")
            .Replace("<img id=\"ctl00_ContentPlaceHolder1_ProductImage\" src=\"", "")
            .Replace("\" alt=\"Product Image\" style=\"border-width:0px;\">", "")
            .Trim();
    }

    strData[15] = strURL;           // Page source URL
    strData[16] = iID.ToString();   // Raw page id
    strData[8] = "";                // Description start text
    strData[0] = "";                // Product category start text

    // Build the category path as "A-B-C", dropping the leading "Home-" crumb.
    // SelectNodes returns null (not an empty collection) when nothing matches.
    if (categoryNodes != null)
    {
        foreach (HtmlAgilityPack.HtmlNode categoryNode in categoryNodes)
        {
            strData[0] += "-" + categoryNode.InnerText;
        }
    }
    strData[0] = strData[0].Trim('-').Trim().Replace("Home-", "");

    // The description is every line between a "Description" line and the next
    // "More Product Details" line, concatenated into a single line.
    if (contentNode != null)
    {
        string[] contentLines = TextToNArray(contentNode.InnerText);
        for (int i = 0; i < contentLines.Length; i++)
        {
            if (contentLines[i].Trim() == "Description")
            {
                i++;
                // Bounds check: the terminator line may be missing from the page.
                while (i < contentLines.Length && contentLines[i].Trim() != "More Product Details")
                {
                    strData[8] += contentLines[i].Trim();
                    i++;
                }
            }
        }
    }

    // Route each labelled value into its slot.
    foreach (string pair in labelValuePairs)
    {
        string[] strParts = pair.Split('\t');
        string value = strParts[1];

        // Invariant lower-casing so label matching does not depend on the machine's culture.
        switch (strParts[0].Trim().ToLowerInvariant())
        {
            case "list price*":
                strData[1] = ParseListPrice(value).ToString();
                break;
            case "availability":
                strData[2] = string.Equals(value, "available", StringComparison.OrdinalIgnoreCase) ? "1" : "0";
                break;
            case "language":
                strData[3] = value;
                break;
            case "arrangement":
                strData[4] = value;
                break;
            case "skill level":
                strData[5] = value;
                break;
            case "publisher":
                strData[6] = value;
                break;
            case "catalogue no.":
                strData[7] = value;
                break;
            case "published on":
                {
                    // Normalise to yyyy-MM-dd for the database; placeholder when the date cannot be parsed.
                    // NOTE(review): parsing uses the current culture, as the original did - confirm it matches the site's date format.
                    DateTime publishedOn;
                    strData[9] = DateTime.TryParse(value, out publishedOn)
                        ? publishedOn.ToString("yyyy-MM-dd")
                        : "0000-00-00";
                    break;
                }
            case "format":
                strData[10] = value;
                break;
            case "pages":
                strData[11] = value;
                break;
            case "isbn":
                strData[12] = value;
                break;
        }
    }

    return strData;
}

// Appends one "label<TAB>value" entry per consecutive pair of lines in the node's text.
// A missing node contributes nothing; a trailing unpaired line is ignored.
private static void AppendLabelValuePairs(List<string> target, HtmlAgilityPack.HtmlNode node)
{
    if (node == null)
    {
        return;
    }

    string[] lines = TextToNArray(node.InnerText);
    for (int i = 0; i + 1 < lines.Length; i += 2)
    {
        target.Add(lines[i].Trim() + "\t" + lines[i + 1].Trim());
    }
}

// Parses a price after skipping its first character, then its first two characters
// (presumably a currency prefix - confirm against real pages). Returns 0.0 when neither parses.
// NOTE(review): parsing uses the current culture, as the original Convert.ToDouble calls did.
private static double ParseListPrice(string text)
{
    for (int skip = 1; skip <= 2; skip++)
    {
        double price;
        if (text.Length >= skip && double.TryParse(text.Substring(skip), out price))
        {
            return price;
        }
    }

    return 0.0;
}
private静态字符串[]数据(字符串strWebpage,字符串strURL,int-iID)
{
//声明并加载HTML敏捷包
HtmlAgilityPack.HtmlDocument HPD=新的HtmlAgilityPack.HtmlDocument();
加载HTML(strWebpage);
string[]strData=新字符串[17];//返回字符串数组
//从html节点获取文本
HtmlAgilityPack.HtmlNode HDNA=HPD.DocumentNode.SelectSingleNode(“//div[@class='product\u info']”);//顶级产品信息
HtmlAgilityPack.HtmlNode HDNB=HPD.DocumentNode.SelectSingleNode(“//表[@width='300px']”);//底部产品信息
HtmlAgilityPack.HtmlNode HDNC=HPD.DocumentNode.SelectSingleNode(//h2[@class='name']);//产品名称
HtmlAgilityPack.HtmlNode HDND=HPD.DocumentNode.SelectSingleNode(“//div[@class='product\u image']”);//产品URL
HtmlAgilityPack.HtmlNode HDNE=HPD.DocumentNode.SelectSingleNode(“//div[@class='contentwrapper']”);//产品说明
HtmlAgilityPack.HtmlNodeCollection HDNF=HPD.DocumentNode.SelectNodes(//div[@class='conttopright']//a[@class='uponelevel']);//获取产品类别
//存储准备好处理的临时数据,并确定是否有用
List strElimination=新列表();
字符串[]strBits=TextToNArray(HDNA.InnerText);
对于(int i=0;i