C# 这是CookieContainer bug吗?
标签:c#、cookiecontainer。我在做什么:我正在开发一个多线程的“webscraper”。在从页面提取数据之前,我需要先提交一个表单,流程如下:(1)向 example.com/path/doc.jsp(我的数据所在页面)发送 GET 请求;(2)检查页面源码中是否存在确认表单:如果存在,则继续执行步骤 3(我的数据尚未显示,需要先提交表单),否则直接返回(没有要提交的表单,我的数据已在页面中);(3)向 example.com/path/sub/other.jsp 发送 GET 请求(获取必要的键值);(4)向 example.com/path/submit.jsp 发送 POST 请求……
我正在开发一个“webscraper”(多线程),就是这样,lol.
在从页面提取数据之前,我需要提交一个表单,因此布局如下:
问题:
表单中有一个值需要从 cookie 中提取,因此我使用了 GetCookies() 函数。但是,如果提交表单后的响应要求我回到步骤 1,那么之后的所有请求(包括 GET 和 POST)都会丢失原有的 cookie(并且会多出一些奇怪的 cookie)。请参见下图(图像说明):
/// <summary>
/// Wraps a cookie-aware <c>WebClientEx</c> to download a page and, for as long as
/// <c>needSubmit</c> reports that the page source still requires it, submit the
/// form and download the page again.
/// </summary>
public class CWeb : IDisposable
{
    // Web client created in the constructor with its own CookieContainer; disposed in Dispose().
    private readonly WebClientEx _wc;

    // URL passed to the most recent downloadPage call; sendForm reads it back.
    private string _originalUrl;

    /// <summary>Creates the wrapper with a web client that starts from an empty cookie container.</summary>
    public CWeb()
    {
        _wc = new WebClientEx(new CookieContainer());
    }

    /// <summary>
    /// Downloads <paramref name="url"/> (up to three attempts), submits the form while
    /// <c>needSubmit</c> returns true for the page source, and returns the HTML-decoded result.
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    /// <returns>
    /// The HTML-decoded page source; the sentinel text "error" flows through the same path
    /// when all three download attempts throw.
    /// </returns>
    public string downloadPage(string url)
    {
        _originalUrl = url;
        string pgSrc = "error";
        int tries = 0;
        // "error" is the sentinel meaning "no successful download yet".
        while (tries < 3 && pgSrc == "error")
        {
            try
            {
                pgSrc = _wc.DownloadString(url);
            }
            catch (Exception err)
            {
                tries += 1;
                pgSrc = "error";
                ...
            }
        }
        // Equivalent to the former if + do/while pair: repeat until no form is required.
        while (needSubmit(pgSrc)) // needSubmit just performs IndexOf on pgSrc
        {
            pgSrc = sendForm(pgSrc);
        }
        return WebUtility.HtmlDecode(pgSrc);
    }

    /// <summary>
    /// Builds the form from the JSESSIONID cookie and values parsed out of the pages,
    /// posts it, and downloads the original URL again.
    /// </summary>
    /// <param name="pageSource">Source of the page that contains the form's hidden values.</param>
    /// <returns>The page source returned by re-downloading the original URL.</returns>
    /// <exception cref="InvalidOperationException">
    /// No JSESSIONID cookie is stored for the original URL.
    /// </exception>
    public string sendForm(string pageSource)
    {
        // 1- Get Cookie Value
        // The name indexer returns null when no cookie matches, so check before reading Value.
        Cookie sessionCookie = _wc.CookieContainer.GetCookies(new Uri(_originalUrl))["JSESSIONID"];
        if (sessionCookie == null)
        {
            throw new InvalidOperationException("No JSESSIONID cookie is stored for " + _originalUrl);
        }
        string cookie = sessionCookie.Value;
        // 2- Get hidden values in pageSource parameter
        // skip this, since there's no web request here, only some html parsing
        // with Html Agility Pack
        ...
        // 3- Get key value
        string tmpStr = _wc.DownloadString("http://example.com/path/sub/other.jsp");
        ... more html parsing ...
        // 4- Build form
        NameValueCollection nvc = new NameValueCollection();
        nvc["param1"] = cookie;
        nvc["param2"] = key;
        ...
        // 5- Send
        // Absolute URL with scheme, matching the GET above; a scheme-less address is not an absolute URI.
        _wc.UploadValues("http://example.com/path/submit.jsp", nvc);
        // 6- Return
        return _wc.DownloadString(_originalUrl);
    }

    /// <summary>Disposes the underlying web client.</summary>
    public void Dispose()
    {
        _wc.Dispose();
    }
}
// Entry point: runs one crawl per URL in the list, in parallel.
static void Main(string[] args)
{
// Load tons of 'doc' url list from database...
List<string> urls = new List<string>();
...
// Each iteration constructs its own CWeb, so no crawler instance is shared between iterations.
Parallel.ForEach(urls, (url) =>
{
// The using block disposes the crawler when the iteration finishes.
using (CWeb crawler = new CWeb())
{
string pageData = crawler.downloadPage(url);
... parse html data here ...
}
});
}
- 第一个调用是对doc.jsp的GET请求,我的数据在这里
- 第二个调用是other.jsp请求,因为确认表单出现在doc.jsp源代码中
- 第三个调用是当我提交所有值时
- 第四个调用是对doc.jsp的GET请求,因为提交表单的响应(第三个调用)要求我重复这个过程。基本上,第 4~6 个调用与第 1~3 个调用相同,只是带上了 cookies
我的代码:
/// <summary>
/// Wraps a cookie-aware <c>WebClientEx</c> to download a page and, for as long as
/// <c>needSubmit</c> reports that the page source still requires it, submit the
/// form and download the page again.
/// </summary>
public class CWeb : IDisposable
{
    // Web client created in the constructor with its own CookieContainer; disposed in Dispose().
    private readonly WebClientEx _wc;

    // URL passed to the most recent downloadPage call; sendForm reads it back.
    private string _originalUrl;

    /// <summary>Creates the wrapper with a web client that starts from an empty cookie container.</summary>
    public CWeb()
    {
        _wc = new WebClientEx(new CookieContainer());
    }

    /// <summary>
    /// Downloads <paramref name="url"/> (up to three attempts), submits the form while
    /// <c>needSubmit</c> returns true for the page source, and returns the HTML-decoded result.
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    /// <returns>
    /// The HTML-decoded page source; the sentinel text "error" flows through the same path
    /// when all three download attempts throw.
    /// </returns>
    public string downloadPage(string url)
    {
        _originalUrl = url;
        string pgSrc = "error";
        int tries = 0;
        // "error" is the sentinel meaning "no successful download yet".
        while (tries < 3 && pgSrc == "error")
        {
            try
            {
                pgSrc = _wc.DownloadString(url);
            }
            catch (Exception err)
            {
                tries += 1;
                pgSrc = "error";
                ...
            }
        }
        // Equivalent to the former if + do/while pair: repeat until no form is required.
        while (needSubmit(pgSrc)) // needSubmit just performs IndexOf on pgSrc
        {
            pgSrc = sendForm(pgSrc);
        }
        return WebUtility.HtmlDecode(pgSrc);
    }

    /// <summary>
    /// Builds the form from the JSESSIONID cookie and values parsed out of the pages,
    /// posts it, and downloads the original URL again.
    /// </summary>
    /// <param name="pageSource">Source of the page that contains the form's hidden values.</param>
    /// <returns>The page source returned by re-downloading the original URL.</returns>
    /// <exception cref="InvalidOperationException">
    /// No JSESSIONID cookie is stored for the original URL.
    /// </exception>
    public string sendForm(string pageSource)
    {
        // 1- Get Cookie Value
        // The name indexer returns null when no cookie matches, so check before reading Value.
        Cookie sessionCookie = _wc.CookieContainer.GetCookies(new Uri(_originalUrl))["JSESSIONID"];
        if (sessionCookie == null)
        {
            throw new InvalidOperationException("No JSESSIONID cookie is stored for " + _originalUrl);
        }
        string cookie = sessionCookie.Value;
        // 2- Get hidden values in pageSource parameter
        // skip this, since there's no web request here, only some html parsing
        // with Html Agility Pack
        ...
        // 3- Get key value
        string tmpStr = _wc.DownloadString("http://example.com/path/sub/other.jsp");
        ... more html parsing ...
        // 4- Build form
        NameValueCollection nvc = new NameValueCollection();
        nvc["param1"] = cookie;
        nvc["param2"] = key;
        ...
        // 5- Send
        // Absolute URL with scheme, matching the GET above; a scheme-less address is not an absolute URI.
        _wc.UploadValues("http://example.com/path/submit.jsp", nvc);
        // 6- Return
        return _wc.DownloadString(_originalUrl);
    }

    /// <summary>Disposes the underlying web client.</summary>
    public void Dispose()
    {
        _wc.Dispose();
    }
}
// Entry point: runs one crawl per URL in the list, in parallel.
static void Main(string[] args)
{
// Load tons of 'doc' url list from database...
List<string> urls = new List<string>();
...
// Each iteration constructs its own CWeb, so no crawler instance is shared between iterations.
Parallel.ForEach(urls, (url) =>
{
// The using block disposes the crawler when the iteration finishes.
using (CWeb crawler = new CWeb())
{
string pageData = crawler.downloadPage(url);
... parse html data here ...
}
});
}
公共类CWeb:IDisposable
{
私人网络客户端;
私有字符串_originalUrl;
公共CWeb()
{
_wc=新的WebClientEx(新的CookieContainer());
}
公共字符串下载页面(字符串url)
{
_originalUrl=url;
字符串pgSrc=“error”;
int=0;
while(尝试<3&&pgSrc==”错误)
{
尝试
{
pgSrc=_wc.DownloadString(url);
}
捕获(异常错误)
{
尝试次数+=1;
pgSrc=“错误”;
...
}
}
if(needSubmit(pgSrc))//needSubmit只需在pgSrc上设置IndexOf
做
{
pgSrc=sendForm(pgSrc);
}while(needSubmit(pgSrc));
返回WebUtility.HtmlDecode(pgSrc);
}
公共字符串sendForm(pageSource)
{
//1-获取Cookie值
字符串cookie=_wc.CookieContainer.GetCookies(新Uri(_originalUrl))[“JSESSIONID”].Value;
//2-获取pageSource参数中的隐藏值
//跳过这个,因为这里没有web请求,只有一些html解析
//使用Html敏捷包
...
//3-获取关键值
字符串tmpStr=_wc.DownloadString(“http://example.com/path/sub/other.jsp");
…更多html解析。。。
//4-建造形式
NameValueCollection nvc=新的NameValueCollection();
nvc[“param1”]=cookie;
nvc[“参数2”]=键;
...
//5-发送
_UploadValues(“example.com/path/submit.jsp”,nvc);
//6-返回
返回_wc.DownloadString(_originalUrl);
}
公共空间处置()
{
_wc.Dispose();
}
}
主程序:
/// <summary>
/// Wraps a cookie-aware <c>WebClientEx</c> to download a page and, for as long as
/// <c>needSubmit</c> reports that the page source still requires it, submit the
/// form and download the page again.
/// </summary>
public class CWeb : IDisposable
{
    // Web client created in the constructor with its own CookieContainer; disposed in Dispose().
    private readonly WebClientEx _wc;

    // URL passed to the most recent downloadPage call; sendForm reads it back.
    private string _originalUrl;

    /// <summary>Creates the wrapper with a web client that starts from an empty cookie container.</summary>
    public CWeb()
    {
        _wc = new WebClientEx(new CookieContainer());
    }

    /// <summary>
    /// Downloads <paramref name="url"/> (up to three attempts), submits the form while
    /// <c>needSubmit</c> returns true for the page source, and returns the HTML-decoded result.
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    /// <returns>
    /// The HTML-decoded page source; the sentinel text "error" flows through the same path
    /// when all three download attempts throw.
    /// </returns>
    public string downloadPage(string url)
    {
        _originalUrl = url;
        string pgSrc = "error";
        int tries = 0;
        // "error" is the sentinel meaning "no successful download yet".
        while (tries < 3 && pgSrc == "error")
        {
            try
            {
                pgSrc = _wc.DownloadString(url);
            }
            catch (Exception err)
            {
                tries += 1;
                pgSrc = "error";
                ...
            }
        }
        // Equivalent to the former if + do/while pair: repeat until no form is required.
        while (needSubmit(pgSrc)) // needSubmit just performs IndexOf on pgSrc
        {
            pgSrc = sendForm(pgSrc);
        }
        return WebUtility.HtmlDecode(pgSrc);
    }

    /// <summary>
    /// Builds the form from the JSESSIONID cookie and values parsed out of the pages,
    /// posts it, and downloads the original URL again.
    /// </summary>
    /// <param name="pageSource">Source of the page that contains the form's hidden values.</param>
    /// <returns>The page source returned by re-downloading the original URL.</returns>
    /// <exception cref="InvalidOperationException">
    /// No JSESSIONID cookie is stored for the original URL.
    /// </exception>
    public string sendForm(string pageSource)
    {
        // 1- Get Cookie Value
        // The name indexer returns null when no cookie matches, so check before reading Value.
        Cookie sessionCookie = _wc.CookieContainer.GetCookies(new Uri(_originalUrl))["JSESSIONID"];
        if (sessionCookie == null)
        {
            throw new InvalidOperationException("No JSESSIONID cookie is stored for " + _originalUrl);
        }
        string cookie = sessionCookie.Value;
        // 2- Get hidden values in pageSource parameter
        // skip this, since there's no web request here, only some html parsing
        // with Html Agility Pack
        ...
        // 3- Get key value
        string tmpStr = _wc.DownloadString("http://example.com/path/sub/other.jsp");
        ... more html parsing ...
        // 4- Build form
        NameValueCollection nvc = new NameValueCollection();
        nvc["param1"] = cookie;
        nvc["param2"] = key;
        ...
        // 5- Send
        // Absolute URL with scheme, matching the GET above; a scheme-less address is not an absolute URI.
        _wc.UploadValues("http://example.com/path/submit.jsp", nvc);
        // 6- Return
        return _wc.DownloadString(_originalUrl);
    }

    /// <summary>Disposes the underlying web client.</summary>
    public void Dispose()
    {
        _wc.Dispose();
    }
}
// Entry point: runs one crawl per URL in the list, in parallel.
static void Main(string[] args)
{
// Load tons of 'doc' url list from database...
List<string> urls = new List<string>();
...
// Each iteration constructs its own CWeb, so no crawler instance is shared between iterations.
Parallel.ForEach(urls, (url) =>
{
// The using block disposes the crawler when the iteration finishes.
using (CWeb crawler = new CWeb())
{
string pageData = crawler.downloadPage(url);
... parse html data here ...
}
});
}
static void Main(字符串[]args)
{
//从数据库加载成吨的“文档”url列表。。。
列表URL=新列表();
...
Parallel.ForEach(url,(url)=>
{
使用(CWeb爬虫程序=新CWeb())
{
字符串pageData=crawler.downloadPage(url);
…在此解析html数据。。。
}
});
}
我的环境:
/// <summary>
/// Wraps a cookie-aware <c>WebClientEx</c> to download a page and, for as long as
/// <c>needSubmit</c> reports that the page source still requires it, submit the
/// form and download the page again.
/// </summary>
public class CWeb : IDisposable
{
    // Web client created in the constructor with its own CookieContainer; disposed in Dispose().
    private readonly WebClientEx _wc;

    // URL passed to the most recent downloadPage call; sendForm reads it back.
    private string _originalUrl;

    /// <summary>Creates the wrapper with a web client that starts from an empty cookie container.</summary>
    public CWeb()
    {
        _wc = new WebClientEx(new CookieContainer());
    }

    /// <summary>
    /// Downloads <paramref name="url"/> (up to three attempts), submits the form while
    /// <c>needSubmit</c> returns true for the page source, and returns the HTML-decoded result.
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    /// <returns>
    /// The HTML-decoded page source; the sentinel text "error" flows through the same path
    /// when all three download attempts throw.
    /// </returns>
    public string downloadPage(string url)
    {
        _originalUrl = url;
        string pgSrc = "error";
        int tries = 0;
        // "error" is the sentinel meaning "no successful download yet".
        while (tries < 3 && pgSrc == "error")
        {
            try
            {
                pgSrc = _wc.DownloadString(url);
            }
            catch (Exception err)
            {
                tries += 1;
                pgSrc = "error";
                ...
            }
        }
        // Equivalent to the former if + do/while pair: repeat until no form is required.
        while (needSubmit(pgSrc)) // needSubmit just performs IndexOf on pgSrc
        {
            pgSrc = sendForm(pgSrc);
        }
        return WebUtility.HtmlDecode(pgSrc);
    }

    /// <summary>
    /// Builds the form from the JSESSIONID cookie and values parsed out of the pages,
    /// posts it, and downloads the original URL again.
    /// </summary>
    /// <param name="pageSource">Source of the page that contains the form's hidden values.</param>
    /// <returns>The page source returned by re-downloading the original URL.</returns>
    /// <exception cref="InvalidOperationException">
    /// No JSESSIONID cookie is stored for the original URL.
    /// </exception>
    public string sendForm(string pageSource)
    {
        // 1- Get Cookie Value
        // The name indexer returns null when no cookie matches, so check before reading Value.
        Cookie sessionCookie = _wc.CookieContainer.GetCookies(new Uri(_originalUrl))["JSESSIONID"];
        if (sessionCookie == null)
        {
            throw new InvalidOperationException("No JSESSIONID cookie is stored for " + _originalUrl);
        }
        string cookie = sessionCookie.Value;
        // 2- Get hidden values in pageSource parameter
        // skip this, since there's no web request here, only some html parsing
        // with Html Agility Pack
        ...
        // 3- Get key value
        string tmpStr = _wc.DownloadString("http://example.com/path/sub/other.jsp");
        ... more html parsing ...
        // 4- Build form
        NameValueCollection nvc = new NameValueCollection();
        nvc["param1"] = cookie;
        nvc["param2"] = key;
        ...
        // 5- Send
        // Absolute URL with scheme, matching the GET above; a scheme-less address is not an absolute URI.
        _wc.UploadValues("http://example.com/path/submit.jsp", nvc);
        // 6- Return
        return _wc.DownloadString(_originalUrl);
    }

    /// <summary>Disposes the underlying web client.</summary>
    public void Dispose()
    {
        _wc.Dispose();
    }
}
// Entry point: runs one crawl per URL in the list, in parallel.
static void Main(string[] args)
{
// Load tons of 'doc' url list from database...
List<string> urls = new List<string>();
...
// Each iteration constructs its own CWeb, so no crawler instance is shared between iterations.
Parallel.ForEach(urls, (url) =>
{
// The using block disposes the crawler when the iteration finishes.
using (CWeb crawler = new CWeb())
{
string pageData = crawler.downloadPage(url);
... parse html data here ...
}
});
}
- 使用Visual Studio Professional 2013
- 目标框架是.NETFramework 4.5
- 平台x86(调试)
- WebClientEx 是 WebClient 的扩展版本,用于处理 Cookie(原文此处附有链接)。我曾尝试实现 BugFix_CookieDomain() 修复(原文附有来源链接),但即使使用了该修复,此问题仍然会发生
- 我所有的url都包含http://前缀
- 使用Fiddler查看请求信息
- 英语不是我的母语…“-”