Web scraping：自动登录网站并抓取文件
Web scraping：自动登录网站并抓取文件。标签：web-scraping, screen-scraping, jsoup, scrapy。 我需要弄清楚如何从一个需要身份验证的网站抓取页面并下载文件。脚本需要完成以下步骤： 使用用户名/密码登录该网站； 浏览各个页面以进入下载页面； 在表单中设置一些字段并点击下载按钮； 保存下载的文件。 我一直在研究 Jsoup（因为 Java 是我的首选语言），但也可以尝试 Scrapy 等。不过我需要了解这类做法是否常见，以及是否有其他技术支持这一点。 我可以使用 Selenium 之类的工具来实现，但我不想要一个把浏览器用作 UA 的工具，因为它会带来巨大的额外开销。 我正在取得进展，但整个 cookie 管理变得非常混乱。 谢谢， Vivek
如果您需要与您描述的网页进行大量交互，那么（至少就我的经验而言）不使用真正的浏览器是行不通的。不过，Selenium WebDriver 配合 PhantomJS 运行良好，因此开销不会太大。
正如下面的评论中所指出的，您也可以使用类似 mechanize 的工具，但当页面上有 JavaScript 修改 DOM 时，这类方案往往无能为力。 我建议您使用 Fiddler2，并像往常一样浏览网站。完成之后，您应该能够轻松复现所需的页面请求以及 JavaScript 所做的任何操作，只需很少的麻烦和代码。 我通常使用以下方法一次下载多个带表单的页面，并保存 cookie 以便登录站点等：
/**
 * Fetches $href with the shared cURL handle and returns the body, transfer
 * info, error text and an XPath object over the parsed body.
 *
 * This snippet is a method excerpt: it relies on $this->ch and on the
 * COOKIE_FILE, CURL_TIMEOUT and WEBBOT_NAME constants, none of which are
 * defined here.
 *
 * @param string $href URL to fetch; also sent as the Referer header.
 * @return array Keys: 'FILE' (response body, or false when curl_exec() fails),
 *               'STATUS' (curl_getinfo() result), 'ERRORS' (curl_error()
 *               text), 'DOM' (DOMXPath over the parsed body; over an empty
 *               document when there is no body to parse).
 */
function Download($href)
{
    curl_setopt_array($this->ch, array(
        CURLOPT_COOKIEJAR      => COOKIE_FILE,  // Cookie management.
        CURLOPT_COOKIEFILE     => COOKIE_FILE,
        CURLOPT_TIMEOUT        => CURL_TIMEOUT, // Timeout
        CURLOPT_USERAGENT      => WEBBOT_NAME,  // Webbot name
        CURLOPT_VERBOSE        => FALSE,        // Minimize logs
        // NOTE(review): certificate verification is disabled, so HTTPS peers
        // are not authenticated. Kept as in the original; enable it if the
        // target sites have valid certificates.
        CURLOPT_SSL_VERIFYPEER => FALSE,
        CURLOPT_FOLLOWLOCATION => TRUE,         // Follow redirects
        CURLOPT_MAXREDIRS      => 4,            // Limit redirections to four
        CURLOPT_RETURNTRANSFER => TRUE,         // Return the body as a string
        CURLOPT_URL            => $href,        // Target site
        CURLOPT_REFERER        => $href,        // Referer value
    ));

    # Create return arrays
    $return_array = array();
    $return_array['FILE'] = curl_exec($this->ch);
    $return_array['STATUS'] = curl_getinfo($this->ch);
    $return_array['ERRORS'] = curl_error($this->ch);

    $dom_document = new DOMDocument();
    $body = $return_array['FILE'];
    // Only parse a real, non-empty body: loadHTML() rejects an empty source
    // (a ValueError on PHP 8, which "@" cannot silence), and curl_exec()
    // returns false on failure.
    if (is_string($body) && trim($body) !== '') {
        // Collect markup warnings internally instead of suppressing with "@".
        $previous = libxml_use_internal_errors(true);
        $dom_document->loadHTML($body);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);
    }
    $return_array['DOM'] = new DOMXpath($dom_document);

    return $return_array;
}
这是我的 HttpHelper 类，易于使用，且仅适用于 HTML：
<?php
/**
 * Small cURL-based helper: fetch pages, submit requests, and post-process the
 * returned HTML with the DOM extension.
 *
 * Constructing an instance defines the global constants WEBBOT_NAME,
 * CURL_TIMEOUT, COOKIE_FILE, HEAD, GET, POST, EXCL_HEAD and INCL_HEAD (unless
 * they already exist), so callers may pre-define any of them to override the
 * defaults.
 */
class HttpHelper {
    /**
     * cURL handle created by the constructor and reused by Download().
     * Declared explicitly because creating it dynamically is deprecated as of
     * PHP 8.2; kept public since it was implicitly public before.
     */
    public $ch;

    /**
     * Creates the shared cURL handle, defines the helper constants and
     * installs browser-like request headers on the shared handle.
     */
    public function __construct() {
        $this->ch = curl_init();
        self::defineConstants();

        $header = array();
        $header[0] = "Accept: text/xml,application/xml,application/xhtml+xml,";
        $header[0] .= "text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5";
        $header[] = "Cache-Control: max-age=0";
        $header[] = "Connection: keep-alive";
        $header[] = "Keep-Alive: 300";
        $header[] = "Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7";
        $header[] = "Accept-Language: en-us,en;q=0.5";
        $header[] = "Pragma: "; // browsers keep this blank.
        curl_setopt($this->ch, CURLOPT_HTTPHEADER, $header); // Set Header Information
    }

    /**
     * Defines the global constants this class relies on, skipping any that
     * already exist so that creating a second instance does not trigger
     * "constant already defined" diagnostics.
     */
    private static function defineConstants()
    {
        $defaults = array(
            'WEBBOT_NAME'  => "Test Webbot",
            # Length of time cURL will wait for a response (seconds)
            'CURL_TIMEOUT' => 25,
            # Location of the cookie file. The default is a relative path, so
            # it resolves against the current working directory.
            'COOKIE_FILE'  => "cookie.txt",
            # Method constants
            'HEAD'         => "HEAD",
            'GET'          => "GET",
            'POST'         => "POST",
            # Header inclusion flags
            'EXCL_HEAD'    => FALSE,
            'INCL_HEAD'    => TRUE,
        );
        foreach ($defaults as $name => $value) {
            if (!defined($name)) {
                define($name, $value);
            }
        }
    }

    /**
     * cURL options shared by Download() and http().
     *
     * @param string $url     Target URL.
     * @param string $referer Value for the Referer header.
     * @return array Option map suitable for curl_setopt_array().
     */
    private function sharedOptions($url, $referer)
    {
        return array(
            CURLOPT_COOKIEJAR      => COOKIE_FILE,  // Cookie management.
            CURLOPT_COOKIEFILE     => COOKIE_FILE,
            CURLOPT_TIMEOUT        => CURL_TIMEOUT, // Timeout
            CURLOPT_USERAGENT      => WEBBOT_NAME,  // Webbot name
            CURLOPT_URL            => $url,         // Target site
            CURLOPT_REFERER        => $referer,     // Referer value
            CURLOPT_VERBOSE        => FALSE,        // Minimize logs
            // NOTE(review): certificate verification is disabled, so HTTPS
            // peers are not authenticated. Kept as in the original; enable it
            // if the target sites have valid certificates.
            CURLOPT_SSL_VERIFYPEER => FALSE,
            CURLOPT_FOLLOWLOCATION => TRUE,         // Follow redirects
            CURLOPT_MAXREDIRS      => 4,            // Limit redirections to four
            CURLOPT_RETURNTRANSFER => TRUE,         // Return the body as a string
        );
    }

    /**
     * Parses HTML into a DOMDocument without emitting libxml warnings.
     *
     * @param mixed $html Markup to parse. Anything that is not a non-blank
     *                    string (e.g. false from a failed curl_exec()) yields
     *                    an empty document instead of an error.
     * @return DOMDocument
     */
    private function loadHtmlDocument($html)
    {
        $document = new DOMDocument();
        if (!is_string($html) || trim($html) === '') {
            return $document;
        }
        // Collect markup warnings internally rather than printing them.
        $previous = libxml_use_internal_errors(true);
        $document->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);
        return $document;
    }

    /**
     * Fetches a URL with the shared handle and collects the HTML, status,
     * errors and a DOM query object.
     *
     * @param string $href URL to fetch; also sent as the Referer header.
     * @return array Keys: 'FILE' (body, or false on failure), 'STATUS'
     *               (curl_getinfo() result), 'ERRORS' and 'ERROR' (both the
     *               curl_error() text; 'ERROR' mirrors the key used by
     *               http()), 'DOM' (DOMXPath over the parsed body).
     */
    public function Download($href)
    {
        curl_setopt_array($this->ch, $this->sharedOptions($href, $href));

        $return_array = array();
        $return_array['FILE'] = curl_exec($this->ch);
        $return_array['STATUS'] = curl_getinfo($this->ch);
        $return_array['ERRORS'] = curl_error($this->ch);
        $return_array['ERROR'] = $return_array['ERRORS'];
        $return_array['DOM'] = new DOMXPath($this->loadHtmlDocument($return_array['FILE']));
        return $return_array;
    }

    /**
     * POSTs form data and returns the response without HTTP headers.
     *
     * @param string       $target     URL to post to.
     * @param string       $ref        Referer value.
     * @param array|string $data_array Form fields, or a ready-made body.
     * @return array See http().
     */
    public function http_post_form($target, $ref, $data_array)
    {
        return $this->http($target, $ref, "POST", $data_array, EXCL_HEAD);
    }

    /**
     * POSTs form data and returns the response including HTTP headers.
     *
     * @param string       $target     URL to post to.
     * @param string       $ref        Referer value.
     * @param array|string $data_array Form fields, or a ready-made body.
     * @return array See http().
     */
    public function http_post_withheader($target, $ref, $data_array)
    {
        // Must be invoked on $this: a bare http() call is an undefined function.
        return $this->http($target, $ref, "POST", $data_array, INCL_HEAD);
    }

    /**
     * Turns request data into a query string (e.g. animal=dog&sport=baseball).
     *
     * @param mixed $data Array of key => value pairs; any other value is
     *                    returned untouched.
     * @return mixed Query string for arrays ('' for an empty array),
     *               otherwise $data itself.
     */
    private function buildQueryString($data)
    {
        if (!is_array($data)) {
            return $data;
        }
        $pairs = array();
        foreach ($data as $key => $value) {
            $value = (string) $value;
            // Blank values are sent as a bare key, without "=".
            if (strlen(trim($value)) > 0) {
                $pairs[] = $key . "=" . urlencode($value);
            } else {
                $pairs[] = $key;
            }
        }
        return implode('&', $pairs);
    }

    /**
     * Performs a HEAD, GET or POST request on a fresh cURL handle.
     *
     * The custom headers installed by the constructor belong to the shared
     * handle and are therefore not sent by this method.
     *
     * @param string       $target     URL to request.
     * @param string       $ref        Referer value.
     * @param string       $method     One of the HEAD, GET or POST constants.
     * @param array|string $data_array Request data (ignored for HEAD).
     * @param bool         $incl_head  Whether to include response headers in
     *                                 'FILE' (HEAD requests always do).
     * @return array Keys: 'FILE' (response, or false on failure), 'STATUS'
     *               (curl_getinfo() result), 'ERROR' and 'ERRORS' (both the
     *               curl_error() text; 'ERRORS' mirrors the key used by
     *               Download()).
     */
    public function http($target, $ref, $method, $data_array, $incl_head)
    {
        # Initialize PHP/CURL handle
        $ch = curl_init();

        # Process data, if presented
        $query_string = $this->buildQueryString($data_array);

        if ($method === HEAD) {
            # HEAD method configuration
            curl_setopt($ch, CURLOPT_HEADER, TRUE); // Return the http head
            curl_setopt($ch, CURLOPT_NOBODY, TRUE); // Do not return the body
        } else {
            if ($method === GET) {
                # GET method configuration
                if ($query_string !== null && $query_string !== '') {
                    // Extend an existing query string instead of adding a second "?".
                    $separator = (strpos($target, '?') === false) ? "?" : "&";
                    $target = $target . $separator . $query_string;
                }
                curl_setopt($ch, CURLOPT_HTTPGET, TRUE);
                curl_setopt($ch, CURLOPT_POST, FALSE);
            }
            if ($method === POST) {
                # POST method configuration
                if ($query_string !== null) {
                    curl_setopt($ch, CURLOPT_POSTFIELDS, $query_string);
                }
                curl_setopt($ch, CURLOPT_POST, TRUE);
                curl_setopt($ch, CURLOPT_HTTPGET, FALSE);
            }
            curl_setopt($ch, CURLOPT_HEADER, $incl_head); // Include head as needed
            curl_setopt($ch, CURLOPT_NOBODY, FALSE);      // Return body
        }

        curl_setopt_array($ch, $this->sharedOptions($target, $ref));

        # Create return array
        $return_array = array();
        $return_array['FILE'] = curl_exec($ch);
        $return_array['STATUS'] = curl_getinfo($ch);
        $return_array['ERROR'] = curl_error($ch);
        $return_array['ERRORS'] = $return_array['ERROR'];

        # Close PHP/CURL handle
        curl_close($ch);

        # Return results
        return $return_array;
    }

    /**
     * Serialises the children of a DOM node to an HTML string.
     *
     * @param DOMNode|null $element Node whose children are rendered.
     * @return string Concatenated, individually trimmed child markup; '' when
     *                $element is not a node or has no children.
     */
    public function InnerHtml($element)
    {
        $innerHTML = "";
        if (!($element instanceof DOMNode) || !$element->hasChildNodes()) {
            return $innerHTML;
        }
        foreach ($element->childNodes as $child) {
            // Each child is rendered through its own throw-away document.
            $tmp_dom = new DOMDocument();
            $tmp_dom->appendChild($tmp_dom->importNode($child, true));
            $innerHTML .= trim($tmp_dom->saveHTML());
        }
        return $innerHTML;
    }

    /**
     * Splits $data on the delimiter $split (thin wrapper around explode()).
     *
     * @param string $data  Text to split.
     * @param string $split Delimiter.
     * @return array
     */
    public function Split($data, $split)
    {
        return explode($split, $data);
    }

    /**
     * Tells whether a URL carries its own scheme ("http:", "data:", ...) or is
     * protocol-relative ("//host/..."), i.e. must not get a base prepended.
     *
     * @param string $value Attribute value to inspect.
     * @return bool
     */
    private function isAbsoluteUrl($value)
    {
        return preg_match('#^(?:[a-z][a-z0-9+.\-]*:|//)#i', $value) === 1;
    }

    /**
     * Prepends $url to the given attribute of every $tag element whose value
     * is relative and does not already start with $url.
     *
     * @param string $html      Markup to rewrite.
     * @param string $tag       Element name, e.g. 'img'.
     * @param string $attribute Attribute holding the URL, e.g. 'src'.
     * @param string $url       Base URL to prepend.
     * @return string Markup as serialised by DOMDocument::saveHTML().
     */
    private function prefixAttribute($html, $tag, $attribute, $url)
    {
        $url = (string) $url;
        $document = $this->loadHtmlDocument($html);
        foreach ($document->getElementsByTagName($tag) as $node) {
            // Elements without the attribute (e.g. named anchors) stay untouched.
            if (!$node->hasAttribute($attribute)) {
                continue;
            }
            $value = $node->getAttribute($attribute);
            $alreadyPrefixed = ($url !== '' && strpos($value, $url) === 0);
            if ($alreadyPrefixed || $this->isAbsoluteUrl($value)) {
                continue;
            }
            $node->setAttribute($attribute, $url . $value);
        }
        return $document->saveHTML();
    }

    /**
     * Makes relative <img src> values absolute by prepending $url.
     *
     * @param string $html Markup to rewrite.
     * @param string $url  Base URL to prepend.
     * @return string Rewritten markup.
     */
    public function correctImgUrls($html, $url)
    {
        return $this->prefixAttribute($html, 'img', 'src', $url);
    }

    /**
     * Makes relative <a href> values absolute by prepending $url.
     *
     * @param string $html Markup to rewrite.
     * @param string $url  Base URL to prepend.
     * @return string Rewritten markup.
     */
    public function correctUrls($html, $url)
    {
        // The result is written to "href"; it used to go to a bogus "a" attribute.
        return $this->prefixAttribute($html, 'a', 'href', $url);
    }

    /**
     * Neutralises every link by setting the href of each <a> element to "#".
     *
     * @param string $html Markup to rewrite.
     * @return string Rewritten markup.
     */
    public function removeHref($html)
    {
        $document = $this->loadHtmlDocument($html);
        foreach ($document->getElementsByTagName('a') as $anchor) {
            $anchor->setAttribute('href', "#");
        }
        return $document->saveHTML();
    }

    /**
     * Runs an XPath expression against a DOMXPath object, such as the 'DOM'
     * entry returned by Download().
     *
     * @param DOMXPath $dom   XPath evaluator.
     * @param string   $xPath XPath expression.
     * @return mixed Whatever $dom->query() returns.
     */
    public function QuerySelector($dom, $xPath)
    {
        return $dom->query($xPath);
    }

    // No destructor is defined: the disabled one referenced an undefined $ch
    // instead of $this->ch, so it was dropped rather than revived.
}
?>
你欠我的 不是这样的，任何类似 mechanize 的库都能做到这一点。我修改了我的答案。这并非毫无用处：您可能需要弄清楚一些 ajax 调用，但根据我的经验，这比 Selenium 带来的麻烦要好。我并不是在提倡 Selenium，我只是用任何有用的东西。也许我应该选择不那么激烈的措辞。最后，提问者（OP）得到了一些提示，可以自行决定怎么做。一般来说，我认为类似 Selenium 的方案用于单元测试，而类似 mechanize 的方案用于抓取。
// Usage example: load the helper class and submit the login form.
include "Business/Http/HttpHelper.php";

$bot = new HttpHelper();

// Optionally fetch the login page first:
//$download = $bot->Download("https://www.odesk.com/login");

// Placeholder credentials; replace them with real values before running.
$data = array(
    'username' => "myusername",
    'password' => "myPassword",
);

// The login URL is both the POST target and the Referer value.
$loginUrl = "https://www.odesk.com/login";
$bot->http_post_form($loginUrl, $loginUrl, $data);