Web scraping 自动从网站登录/刮取文件_Web Scraping_Screen Scraping_Jsoup_Scrapy

Web scraping 自动登录网站并抓取文件

web-scraping scrapy

Web scraping 自动从网站登录/刮取文件,web-scraping,screen-scraping,jsoup,scrapy,Web Scraping,Screen Scraping,Jsoup,Scrapy,我需要找出如何刮网站和下载文件从一个认证的网站脚本需要修改使用用户名/密码登录此网站浏览各个页面以进入下载页面在表单中设置一些字段并点击下载按钮保存下载的文件我一直在研究Jsoup（因为Java是我的首选），但也可以尝试scrapy等。但我需要了解这些是否普遍存在，以及是否有其他技术支持这一点。我可以使用Selenium之类的工具来设置，但我不想要一个将浏览器用作UA的工具，因为它会带来巨大的额外开销。我正在取得进展，但整个饼干管理变得非常混乱谢谢， Vivek如果您需要与您描

我需要弄清楚如何抓取一个需要身份验证的网站，并从中下载文件

脚本需要完成以下步骤：

使用用户名/密码登录此网站

浏览各个页面以进入下载页面

在表单中设置一些字段并点击下载按钮

保存下载的文件

我一直在研究 Jsoup（因为 Java 是我的首选），但也可以尝试 scrapy 等工具。不过我需要了解这种做法是否常见，以及是否有其他技术可以实现这一点。我可以使用 Selenium 之类的工具来实现，但我不想要一个以浏览器作为 UA 的工具，因为它会带来巨大的额外开销。我正在取得进展，但整个 cookie 管理变得非常混乱

谢谢，

Vivek

如果您需要与所描述的网页进行大量交互，那么就绕不开使用真正的浏览器——至少从我的经验来看是这样。不过，Selenium WebDriver 可以很好地配合 PhantomJS 使用，因此开销不会太大

正如下面的评论中指出的，您也可以使用类似 mechanize 的工具，但是当页面上有 JavaScript 修改 DOM 时，这类方案往往就无能为力了。（原文此处附有参考链接，现已缺失）

我建议您使用Fiddler2，并像往常一样浏览网站

一旦您完成了，您应该能够轻松地复制所需的页面调用以及Javascript所做的任何事情，只需很少的麻烦和代码

我倾向于使用以下方法一次下载多个表单中的页面，并为登录站点等保存cookies：

/**
 * Fetches $href with the shared cURL handle and returns the raw response
 * together with transfer details and an XPath wrapper around the parsed HTML.
 *
 * This is a method excerpt: it relies on $this->ch (a cURL handle) and on the
 * constants COOKIE_FILE, CURL_TIMEOUT and WEBBOT_NAME being defined elsewhere.
 *
 * @param string $href URL to fetch; it is also sent as the Referer header.
 * @return array Keys: 'FILE' (response body, or false when curl_exec() fails),
 *               'STATUS' (curl_getinfo() result), 'ERRORS' (curl_error()
 *               message) and 'DOM' (DOMXPath over the parsed body; wraps an
 *               empty document when there is no body to parse).
 */
function Download($href)
   {

        curl_setopt($this->ch, CURLOPT_COOKIEJAR, COOKIE_FILE);   // Write received cookies to this file
        curl_setopt($this->ch, CURLOPT_COOKIEFILE, COOKIE_FILE);  // Send cookies stored in this file
        curl_setopt($this->ch, CURLOPT_TIMEOUT, CURL_TIMEOUT);    // Timeout
        curl_setopt($this->ch, CURLOPT_USERAGENT, WEBBOT_NAME);   // Webbot name
        curl_setopt($this->ch, CURLOPT_VERBOSE, FALSE);           // Minimize logs
        // SECURITY NOTE: peer certificate verification is switched off here.
        curl_setopt($this->ch, CURLOPT_SSL_VERIFYPEER, FALSE);
        curl_setopt($this->ch, CURLOPT_FOLLOWLOCATION, TRUE);     // Follow redirects
        curl_setopt($this->ch, CURLOPT_MAXREDIRS, 4);             // Limit redirections to four
        curl_setopt($this->ch, CURLOPT_RETURNTRANSFER, TRUE);     // Return the body as a string
        curl_setopt($this->ch, CURLOPT_URL, $href);               // Target site
        curl_setopt($this->ch, CURLOPT_REFERER, $href);           // Referer value

        # Create return arrays
        $return_array = array();
        $return_array['FILE']   = curl_exec($this->ch);
        $return_array['STATUS'] = curl_getinfo($this->ch);
        $return_array['ERRORS'] = curl_error($this->ch);

        $dom_document = new DOMDocument();
        $body = $return_array['FILE'];
        // curl_exec() yields false on failure; only parse a real, non-empty body
        // so a failed transfer does not make loadHTML() raise an error.
        if (is_string($body) && trim($body) !== '')
        {
            // Collect parser complaints internally instead of silencing with "@".
            $previous = libxml_use_internal_errors(true);
            $dom_document->loadHTML($body);
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }
        $return_array['DOM'] = new DOMXpath($dom_document);

        return $return_array;
   }

这是我的HttpHelper类。易于使用且仅限于Html：

<?php
/**
 * Minimal cURL-based helper for fetching pages, posting forms and doing small
 * HTML manipulations with DOMDocument.
 *
 * Constructing the class defines a set of global constants (WEBBOT_NAME,
 * CURL_TIMEOUT, COOKIE_FILE, HEAD, GET, POST, EXCL_HEAD, INCL_HEAD) that the
 * methods below read. Cookies are persisted to COOKIE_FILE so that a login
 * performed by one request is visible to later requests.
 *
 * SECURITY NOTE: every request disables peer certificate verification
 * (CURLOPT_SSL_VERIFYPEER = FALSE), as the original code did.
 */
class HttpHelper {

    /** @var mixed cURL handle created in the constructor and reused by Download(). */
    public $ch;

    /**
     * Creates the shared cURL handle, defines the configuration constants and
     * installs the default request headers on the shared handle.
     */
    public function __construct() {
        $this->ch = curl_init();

        // define() warns when a name already exists, so each constant is only
        // defined once; this lets the class be instantiated more than once.
        $constants = array(
            "WEBBOT_NAME"  => "Test Webbot",
            # Length of time cURL will wait for a response (seconds)
            "CURL_TIMEOUT" => 25,
            # Location of the cookie file
            "COOKIE_FILE"  => "cookie.txt",
            # Method constants
            "HEAD"         => "HEAD",
            "GET"          => "GET",
            "POST"         => "POST",
            # Header inclusion flags
            "EXCL_HEAD"    => FALSE,
            "INCL_HEAD"    => TRUE,
        );
        foreach ($constants as $name => $value) {
            if (!defined($name)) {
                define($name, $value);
            }
        }

        $header = array(
            "Accept: text/xml,application/xml,application/xhtml+xml,"
                . "text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5",
            "Cache-Control: max-age=0",
            "Connection: keep-alive",
            "Keep-Alive: 300",
            "Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7",
            "Accept-Language: en-us,en;q=0.5",
            "Pragma: ", // browsers keep this blank.
        );

        // These headers apply to the shared handle only, i.e. to Download();
        // http() creates its own handle and does not set them.
        curl_setopt($this->ch, CURLOPT_HTTPHEADER, $header);
    }

    /**
     * Fetches a page with the shared handle and collects the HTML, status,
     * errors and an XPath wrapper.
     *
     * @param string $href URL to fetch; it is also sent as the Referer header.
     * @return array Keys: 'FILE' (response body, or false when curl_exec()
     *               fails), 'STATUS' (curl_getinfo() result), 'ERRORS'
     *               (curl_error() message) and 'DOM' (DOMXPath over the parsed
     *               body; wraps an empty document when there is no body).
     */
    public function Download($href)
    {
        $this->applyCommonOptions($this->ch, $href, $href);

        $return_array = array();
        $return_array['FILE']   = curl_exec($this->ch);
        $return_array['STATUS'] = curl_getinfo($this->ch);
        $return_array['ERRORS'] = curl_error($this->ch);
        $return_array['DOM']    = new DOMXpath($this->loadHtmlDocument($return_array['FILE']));

        return $return_array;
    }

    /**
     * POSTs form data and returns the response without response headers.
     *
     * @param string       $target     URL to post to.
     * @param string       $ref        Referer header value.
     * @param array|string $data_array Form fields (key => value) or a ready-made body string.
     * @return array See http().
     */
    public function http_post_form($target, $ref, $data_array)
    {
        return $this->http($target, $ref, POST, $data_array, EXCL_HEAD);
    }

    /**
     * POSTs form data and returns the response including response headers.
     *
     * @param string       $target     URL to post to.
     * @param string       $ref        Referer header value.
     * @param array|string $data_array Form fields (key => value) or a ready-made body string.
     * @return array See http().
     */
    public function http_post_withheader($target, $ref, $data_array)
    {
        // Must go through $this: there is no global http() function.
        return $this->http($target, $ref, POST, $data_array, INCL_HEAD);
    }

    /**
     * Performs a HEAD, GET or POST request on a fresh cURL handle.
     *
     * @param string            $target     URL to request.
     * @param string            $ref        Referer header value.
     * @param string            $method     One of the HEAD, GET or POST constants.
     * @param array|string|null $data_array Request data. An array is turned into a
     *                                      query string; any other value is used as-is.
     * @param bool              $incl_head  Whether response headers are included in
     *                                      the returned body (ignored for HEAD, which
     *                                      always includes them).
     * @return array Keys: 'FILE' (response, or false on failure), 'STATUS'
     *               (curl_getinfo() result) and 'ERROR' (curl_error() message).
     *               Note the key is 'ERROR' here but 'ERRORS' in Download().
     */
    public function http($target, $ref, $method, $data_array, $incl_head)
    {
        # Initialize PHP/CURL handle
        $ch = curl_init();

        # Process data, if presented
        $query_string = $this->buildQueryString($data_array);

        if ($method === HEAD) {
            # HEAD method configuration
            curl_setopt($ch, CURLOPT_HEADER, TRUE);   // Include response headers in the output
            curl_setopt($ch, CURLOPT_NOBODY, TRUE);   // Do not fetch the body
        } else {
            if ($method === GET) {
                # GET method configuration
                // Only append "?" when there is something to append.
                if ($query_string !== null && $query_string !== '') {
                    $target = $target . "?" . $query_string;
                }
                curl_setopt($ch, CURLOPT_HTTPGET, TRUE);
                curl_setopt($ch, CURLOPT_POST, FALSE);
            }
            if ($method === POST) {
                # POST method configuration
                if ($query_string !== null) {
                    curl_setopt($ch, CURLOPT_POSTFIELDS, $query_string);
                }
                curl_setopt($ch, CURLOPT_POST, TRUE);
                curl_setopt($ch, CURLOPT_HTTPGET, FALSE);
            }
            curl_setopt($ch, CURLOPT_HEADER, $incl_head);   // Include head as needed
            curl_setopt($ch, CURLOPT_NOBODY, FALSE);        // Return body
        }

        $this->applyCommonOptions($ch, $target, $ref);

        # Create return array
        $return_array = array();
        $return_array['FILE']   = curl_exec($ch);
        $return_array['STATUS'] = curl_getinfo($ch);
        $return_array['ERROR']  = curl_error($ch);

        # Close PHP/CURL handle
        curl_close($ch);

        # Return results
        return $return_array;
    }

    /**
     * Returns the serialized HTML of all child nodes of $element.
     *
     * @param DOMNode|null $element Node whose children are serialized.
     * @return string Concatenated, trimmed HTML of each child; '' when $element
     *                is not a DOM node or has no children.
     */
    public function InnerHtml($element)
    {
        $innerHTML = "";
        if ($element instanceof DOMNode && $element->hasChildNodes()) {
            foreach ($element->childNodes as $child) {
                // Each child is imported into its own document so saveHTML()
                // serializes just that subtree.
                $tmp_dom = new DOMDocument();
                $tmp_dom->appendChild($tmp_dom->importNode($child, true));
                $innerHTML .= trim($tmp_dom->saveHTML());
            }
        }
        return $innerHTML;
    }

    /**
     * Splits $data on the delimiter $split.
     *
     * @param string $data  String to split.
     * @param string $split Delimiter.
     * @return array Result of explode($split, $data).
     */
    public function Split($data, $split)
    {
        return explode($split, $data);
    }

    /**
     * Prefixes the src of every <img> that does not already start with $url.
     *
     * Any src not beginning with $url is prefixed, including absolute URLs that
     * point at a different host.
     *
     * @param string $html HTML fragment or document.
     * @param string $url  Prefix to prepend.
     * @return string The document as serialized by DOMDocument::saveHTML().
     */
    public function correctImgUrls($html, $url)
    {
        return $this->prefixAttribute($html, 'img', 'src', $url);
    }

    /**
     * Prefixes the href of every <a> that does not already start with $url.
     *
     * Any href not beginning with $url is prefixed, including absolute URLs
     * that point at a different host.
     *
     * @param string $html HTML fragment or document.
     * @param string $url  Prefix to prepend.
     * @return string The document as serialized by DOMDocument::saveHTML().
     */
    public function correctUrls($html, $url)
    {
        // The rewritten value belongs in "href"; writing it to an attribute
        // named "a" left the link itself untouched.
        return $this->prefixAttribute($html, 'a', 'href', $url);
    }

    /**
     * Replaces the href of every <a> element with "#".
     *
     * @param string $html HTML fragment or document.
     * @return string The document as serialized by DOMDocument::saveHTML().
     */
    public function removeHref($html)
    {
        $DOM = $this->loadHtmlDocument($html);

        foreach ($DOM->getElementsByTagName('a') as $anchor) {
            $anchor->setAttribute('href', "#");
        }

        return $DOM->saveHTML();
    }

    /**
     * Runs an XPath query.
     *
     * @param DOMXPath $dom   XPath object, e.g. the 'DOM' entry returned by Download().
     * @param string   $xPath XPath expression.
     * @return mixed Whatever $dom->query($xPath) returns.
     */
    public function QuerySelector($dom, $xPath)
    {
        return $dom->query($xPath);
    }

    /**
     * Applies the cURL options shared by Download() and http() to $ch.
     */
    private function applyCommonOptions($ch, $url, $referer)
    {
        curl_setopt_array($ch, array(
            CURLOPT_COOKIEJAR      => COOKIE_FILE,   // Write received cookies to this file
            CURLOPT_COOKIEFILE     => COOKIE_FILE,   // Send cookies stored in this file
            CURLOPT_TIMEOUT        => CURL_TIMEOUT,  // Timeout
            CURLOPT_USERAGENT      => WEBBOT_NAME,   // Webbot name
            CURLOPT_URL            => $url,          // Target site
            CURLOPT_REFERER        => $referer,      // Referer value
            CURLOPT_VERBOSE        => FALSE,         // Minimize logs
            CURLOPT_SSL_VERIFYPEER => FALSE,         // SECURITY NOTE: certificate check disabled
            CURLOPT_FOLLOWLOCATION => TRUE,          // Follow redirects
            CURLOPT_MAXREDIRS      => 4,             // Limit redirections to four
            CURLOPT_RETURNTRANSFER => TRUE,          // Return in string
        ));
    }

    /**
     * Turns an array into a query string (ie animal=dog&sport=baseball); any
     * non-array value is returned unchanged. Values that are empty after
     * trimming contribute the bare key without "=".
     */
    private function buildQueryString($data_array)
    {
        if (!is_array($data_array)) {
            return $data_array;
        }

        $pairs = array();
        foreach ($data_array as $key => $value) {
            $value = (string) $value;
            $pairs[] = strlen(trim($value)) > 0
                ? $key . "=" . urlencode($value)
                : $key;
        }
        return join('&', $pairs);
    }

    /**
     * Parses $html into a DOMDocument; returns an empty document when $html is
     * not a non-blank string (e.g. false from a failed curl_exec()).
     */
    private function loadHtmlDocument($html)
    {
        $document = new DOMDocument();
        if (!is_string($html) || trim($html) === '') {
            return $document;
        }

        // Collect parser complaints internally rather than emitting warnings,
        // then restore whatever setting the caller had.
        $previous = libxml_use_internal_errors(true);
        $document->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        return $document;
    }

    /**
     * Prepends $url to $attribute on every <$tag> element whose current value
     * does not already start with $url, and returns the serialized document.
     */
    private function prefixAttribute($html, $tag, $attribute, $url)
    {
        $DOM = $this->loadHtmlDocument($html);

        foreach ($DOM->getElementsByTagName($tag) as $node) {
            $current = $node->getAttribute($attribute);
            if (strpos($current, $url) !== 0) {
                $node->setAttribute($attribute, $url . $current);
            }
        }

        return $DOM->saveHTML();
    }

}
?>

你欠我的

并非如此，任何类似 mechanize 的库都能做到这一点。我修改了我的答案。这类方案并非毫无用处：您可能需要弄清楚一些 ajax 调用，但根据我的经验，这比 Selenium 带来的麻烦要好。我并不是在提倡 Selenium，我会使用任何有用的工具。也许我应该选择不那么激烈的措辞。最后，提问者（OP）得到了一些提示，可以自行决定怎么做。一般来说，我认为类似 Selenium 的方案适用于单元测试，而类似 mechanize 的方案适用于抓取。

// Usage example: load HttpHelper and submit the login form. The login URL is
// used both as the POST target and as the Referer value. The credentials
// below are placeholders to be replaced with real ones.
include "Business/Http/HttpHelper.php";

$credentials = array(
    'username' => "myusername",
    'password' => "myPassword",
);
$loginUrl = "https://www.odesk.com/login";

$client = new HttpHelper();
// Optional: fetch the login page first.
//$download = $client->Download($loginUrl);
$client->http_post_form($loginUrl, $loginUrl, $credentials);