Warning: file_get_contents(/data/phpspider/zhask/data//catemap/1/php/285.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
Php 使用SSL和重定向的多线程cURL_Php_Multithreading_Ssl_Curl_Scraper - Fatal编程技术网

Php 使用SSL和重定向的多线程cURL

Php 使用SSL和重定向的多线程cURL,php,multithreading,ssl,curl,scraper,Php,Multithreading,Ssl,Curl,Scraper,我有一个非常简单的刮板现在可以做我需要的,但它非常慢,它在3秒钟内刮掉2张图片,我需要做的是在几秒钟内至少1000张图片 这就是我现在使用的代码 <?php require_once('config.php'); //Calling PHasher class file. include_once('classes/phasher.class.php'); $I = PHasher::Instance(); //Prevent execution timeout. set_ti

我有一个非常简单的刮板现在可以做我需要的,但它非常慢,它在3秒钟内刮掉2张图片,我需要做的是在几秒钟内至少1000张图片

这就是我现在使用的代码

    <?php
// Sequentially walks Facebook ids, downloads each profile picture, hashes it with
// PHasher and stores (fid, hash) in the `images` table. $con is expected to be the
// mysqli connection created by config.php.
require_once('config.php');

//Calling PHasher class file.
include_once('classes/phasher.class.php');
$I = PHasher::Instance();

//Prevent execution timeout.
set_time_limit(0);

// Stream context used by file_get_contents() below.
// NOTE(review): peer verification is switched off here (and for cURL below), which
// accepts any certificate. Kept as-is to preserve behavior; prefer a proper CA bundle.
$arrContextOptions = array(
    "ssl" => array(
        "verify_peer" => false,
        "verify_peer_name" => false,
    ),
);

// Resume after the highest fid already stored, or start from 4 when the table is empty.
$check = mysqli_query($con, "SELECT fid FROM images ORDER BY fid DESC LIMIT 1;");
if ($check === false) {
    die('Could not read the latest fid: ' . mysqli_error($con));
}
if (mysqli_num_rows($check) > 0) {
    $max_fid = mysqli_fetch_row($check);
    $fid = $max_fid[0] + 1;
} else {
    $fid = 4;
}

// URL the picture endpoint redirects to when the profile no longer exists.
$deletedProfile = "https://z-1-static.xx.fbcdn.net/rsrc.php/v2/yo/r/UlIqmHJn-SK.gif";

// Prepared once and reused for every row instead of interpolating values into SQL.
$insert = mysqli_prepare($con, "INSERT INTO images(fid, hash) VALUES (?, ?)");
if ($insert === false) {
    die('Could not prepare the insert statement: ' . mysqli_error($con));
}

$savedImage = dirname(__FILE__) . '/avatar/image.jpg';

// Intentionally endless: every iteration handles one id and then moves to the next,
// whether the id was stored or skipped.
for (; ; $fid++) {

    $url = 'https://graph.facebook.com/' . $fid . '/picture?width=378&height=378';

    // Body-less request that follows redirects, only to learn the final URL.
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); // follow the redirects
    curl_setopt($ch, CURLOPT_HEADER, false); // no needs to pass the headers to the data stream
    curl_setopt($ch, CURLOPT_NOBODY, true); // get the resource without a body
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); // accept any server certificate
    curl_exec($ch);

    // get the last used URL
    $lastUrl = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);

    curl_close($ch);

    // Deleted profiles redirect to the placeholder image; nothing to store.
    if ($lastUrl == $deletedProfile) {
        continue;
    }

    $imageData = file_get_contents($url, false, stream_context_create($arrContextOptions));
    if ($imageData === false) {
        continue; // download failed, skip this id
    }

    // If the write fails the previous image.jpg would still be on disk and would be
    // hashed under the wrong fid, so skip the id instead.
    if (file_put_contents($savedImage, $imageData) === false) {
        continue;
    }

    //Exclude deleted profiles or corrupted pictures.
    if (getimagesize($savedImage) === false) {
        continue;
    }

    //PHasher class call to hash the images to hexdecimal values or binary values.
    $hash = $I->FastHashImage($savedImage);
    $hex = $I->HashAsString($hash);

    //Store Facebook id and hashed values for the images in hexa values.
    // Bound through string copies because bind_param takes its arguments by reference.
    $fidParam = (string) $fid;
    $hexParam = (string) $hex;
    mysqli_stmt_bind_param($insert, 'ss', $fidParam, $hexParam);
    mysqli_stmt_execute($insert);
}

?>
2-然后我想在 multi cURL(curl_multi)中使用输出数组,以便异步处理多个 curl 句柄


3-检查输出URL是否等于已删除的配置文件,如果不传递,则使用PHasher将其转换为散列值并存储在数据库中。

我只提供您需要的内容,尽管我无法达到那种吞吐量(每秒1000个并行请求)

我忘了我以前从哪里得到这个,但我正在使用它下载reddit内容:

/**
 * Thin wrapper around the curl_multi_* API: keeps up to $max_requests transfers
 * in flight at once and invokes a user callback as each transfer completes.
 */
class ParallelCurl {

    // Maximum number of transfers allowed in flight (values <= 0 disable the limit).
    public $max_requests;
    // Extra cURL options, in curl_setopt_array() format, applied to every request.
    public $options;
    // Map of handle key => array('link_url', 'callback', 'user_data') for running transfers.
    public $outstanding_requests;
    // The curl_multi handle all easy handles are attached to.
    public $multi_handle;

    public function __construct($in_max_requests = 10, $in_options = array()) {
        $this->max_requests = $in_max_requests;
        $this->options = $in_options;

        $this->outstanding_requests = array();
        $this->multi_handle = curl_multi_init();
    }

    //Ensure all the requests finish nicely
    public function __destruct() {
        $this->finishAllRequests();
    }

    // Sets how many requests can be outstanding at once before we block and wait for one to
    // finish before starting the next one
    public function setMaxRequests($in_max_requests) {
        $this->max_requests = $in_max_requests;
    }

    // Sets the options to pass to curl, using the format of curl_setopt_array()
    public function setOptions($in_options) {
        $this->options = $in_options;
    }

    // Start a fetch from the $url address. When the transfer completes, $callback is invoked
    // with 4 arguments: the response body, the url, the curl handle and the optional
    // $user_data value, eg on_request_done($content, $url, $ch, $user_data);
    // If $post_fields is set the request is sent as a POST; $headers, when an array, is
    // passed as CURLOPT_HTTPHEADER.
    public function startRequest($url, $callback, $user_data = array(), $post_fields = null, $headers = null) {
        if ($this->max_requests > 0) {
            $this->waitForOutstandingRequestsToDropBelow($this->max_requests);
        }

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
        curl_setopt_array($ch, $this->options);
        curl_setopt($ch, CURLOPT_URL, $url);
        if (isset($post_fields)) {
            curl_setopt($ch, CURLOPT_POST, TRUE);
            curl_setopt($ch, CURLOPT_POSTFIELDS, $post_fields);
        }
        if (is_array($headers)) {
            curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
        }

        curl_multi_add_handle($this->multi_handle, $ch);

        $this->outstanding_requests[$this->handleKey($ch)] = array(
            'link_url' => $url,
            'callback' => $callback,
            'user_data' => $user_data,
        );

        $this->checkForCompletedRequests();
    }

    // Waits for any running requests to complete, and calls their callback functions.
    // The destructor calls this too, but call it explicitly when you need the results
    // before the object goes away.
    public function finishAllRequests() {
        $this->waitForOutstandingRequestsToDropBelow(1);
    }

    // Returns a unique array key for a curl easy handle. Handles are resources on
    // PHP < 8 (an int cast yields the resource id) and CurlHandle objects on PHP >= 8,
    // where an int cast would warn and give every handle the same key.
    private function handleKey($ch) {
        return is_object($ch) ? spl_object_id($ch) : (int) $ch;
    }

    // Gives curl a chance to make progress, then dispatches the callbacks of every
    // transfer that has finished. Never blocks: looping here until nothing is active
    // would make each startRequest() wait for its own transfer and serialize the downloads.
    private function checkForCompletedRequests() {
        do {
            $mrc = curl_multi_exec($this->multi_handle, $active);
        } while ($mrc == CURLM_CALL_MULTI_PERFORM);

        // Now grab the information about the completed requests
        while ($info = curl_multi_info_read($this->multi_handle)) {

            $ch = $info['handle'];
            $ch_array_key = $this->handleKey($ch);

            if (!isset($this->outstanding_requests[$ch_array_key])) {
                // The key is printed instead of the handle: a CurlHandle object
                // cannot be converted to a string.
                die("Error - handle wasn't found in requests: '$ch_array_key' in " .
                    print_r($this->outstanding_requests, true));
            }

            $request = $this->outstanding_requests[$ch_array_key];
            $url = $request['link_url'];
            $content = curl_multi_getcontent($ch);
            $callback = $request['callback'];
            $user_data = $request['user_data'];

            call_user_func($callback, $content, $url, $ch, $user_data);

            unset($this->outstanding_requests[$ch_array_key]);

            curl_multi_remove_handle($this->multi_handle, $ch);
        }
    }

    // Blocks until there's less than the specified number of requests outstanding,
    // polling every 10ms so all in-flight transfers keep progressing together.
    private function waitForOutstandingRequestsToDropBelow($max) {
        while (1) {
            $this->checkForCompletedRequests();
            if (count($this->outstanding_requests) < $max) {
                break;
            }

            usleep(10000);
        }
    }

}

谢谢,它正在工作,但是您能告诉我如何将循环的输出保存为数组吗?它将仅保存为数组中的一个元素[0]什么是“保存为数组”?只需在一个数组中收集您的url,并在该数组中循环,将每个url传递到$pcurl->startRequest。请参见:。在命令行中运行此命令(与此处的类一起),您可能会注意到下载可能没有顺序,因为它们不是按顺序保存的,而是并行保存的
Array ( [28990] => Array ( [0] => https://graph.facebook.com/28990/picture?width=378&height=378 )
[28991] => Array ( [0] => https://graph.facebook.com/28991/picture?width=378&height=378 )
[28992] => Array ( [0] => https://graph.facebook.com/28992/picture?width=378&height=378 )
[28993] => Array ( [0] => https://graph.facebook.com/28993/picture?width=378&height=378 )
[28994] => Array ( [0] => https://graph.facebook.com/28994/picture?width=378&height=378 )
[28995] => Array ( [0] => https://graph.facebook.com/28995/picture?width=378&height=378 )
[28996] => Array ( [0] => https://graph.facebook.com/28996/picture?width=378&height=378 )
[28997] => Array ( [0] => https://graph.facebook.com/28997/picture?width=378&height=378 )
/**
 * Thin wrapper around the curl_multi_* API: keeps up to $max_requests transfers
 * in flight at once and invokes a user callback as each transfer completes.
 */
class ParallelCurl {

    // Maximum number of transfers allowed in flight (values <= 0 disable the limit).
    public $max_requests;
    // Extra cURL options, in curl_setopt_array() format, applied to every request.
    public $options;
    // Map of handle key => array('link_url', 'callback', 'user_data') for running transfers.
    public $outstanding_requests;
    // The curl_multi handle all easy handles are attached to.
    public $multi_handle;

    public function __construct($in_max_requests = 10, $in_options = array()) {
        $this->max_requests = $in_max_requests;
        $this->options = $in_options;

        $this->outstanding_requests = array();
        $this->multi_handle = curl_multi_init();
    }

    //Ensure all the requests finish nicely
    public function __destruct() {
        $this->finishAllRequests();
    }

    // Sets how many requests can be outstanding at once before we block and wait for one to
    // finish before starting the next one
    public function setMaxRequests($in_max_requests) {
        $this->max_requests = $in_max_requests;
    }

    // Sets the options to pass to curl, using the format of curl_setopt_array()
    public function setOptions($in_options) {
        $this->options = $in_options;
    }

    // Start a fetch from the $url address. When the transfer completes, $callback is invoked
    // with 4 arguments: the response body, the url, the curl handle and the optional
    // $user_data value, eg on_request_done($content, $url, $ch, $user_data);
    // If $post_fields is set the request is sent as a POST; $headers, when an array, is
    // passed as CURLOPT_HTTPHEADER.
    public function startRequest($url, $callback, $user_data = array(), $post_fields = null, $headers = null) {
        if ($this->max_requests > 0) {
            $this->waitForOutstandingRequestsToDropBelow($this->max_requests);
        }

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
        curl_setopt_array($ch, $this->options);
        curl_setopt($ch, CURLOPT_URL, $url);
        if (isset($post_fields)) {
            curl_setopt($ch, CURLOPT_POST, TRUE);
            curl_setopt($ch, CURLOPT_POSTFIELDS, $post_fields);
        }
        if (is_array($headers)) {
            curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
        }

        curl_multi_add_handle($this->multi_handle, $ch);

        $this->outstanding_requests[$this->handleKey($ch)] = array(
            'link_url' => $url,
            'callback' => $callback,
            'user_data' => $user_data,
        );

        $this->checkForCompletedRequests();
    }

    // Waits for any running requests to complete, and calls their callback functions.
    // The destructor calls this too, but call it explicitly when you need the results
    // before the object goes away.
    public function finishAllRequests() {
        $this->waitForOutstandingRequestsToDropBelow(1);
    }

    // Returns a unique array key for a curl easy handle. Handles are resources on
    // PHP < 8 (an int cast yields the resource id) and CurlHandle objects on PHP >= 8,
    // where an int cast would warn and give every handle the same key.
    private function handleKey($ch) {
        return is_object($ch) ? spl_object_id($ch) : (int) $ch;
    }

    // Gives curl a chance to make progress, then dispatches the callbacks of every
    // transfer that has finished. Never blocks: looping here until nothing is active
    // would make each startRequest() wait for its own transfer and serialize the downloads.
    private function checkForCompletedRequests() {
        do {
            $mrc = curl_multi_exec($this->multi_handle, $active);
        } while ($mrc == CURLM_CALL_MULTI_PERFORM);

        // Now grab the information about the completed requests
        while ($info = curl_multi_info_read($this->multi_handle)) {

            $ch = $info['handle'];
            $ch_array_key = $this->handleKey($ch);

            if (!isset($this->outstanding_requests[$ch_array_key])) {
                // The key is printed instead of the handle: a CurlHandle object
                // cannot be converted to a string.
                die("Error - handle wasn't found in requests: '$ch_array_key' in " .
                    print_r($this->outstanding_requests, true));
            }

            $request = $this->outstanding_requests[$ch_array_key];
            $url = $request['link_url'];
            $content = curl_multi_getcontent($ch);
            $callback = $request['callback'];
            $user_data = $request['user_data'];

            call_user_func($callback, $content, $url, $ch, $user_data);

            unset($this->outstanding_requests[$ch_array_key]);

            curl_multi_remove_handle($this->multi_handle, $ch);
        }
    }

    // Blocks until there's less than the specified number of requests outstanding,
    // polling every 10ms so all in-flight transfers keep progressing together.
    private function waitForOutstandingRequestsToDropBelow($max) {
        while (1) {
            $this->checkForCompletedRequests();
            if (count($this->outstanding_requests) < $max) {
                break;
            }

            usleep(10000);
        }
    }

}
// cURL options handed to ParallelCurl, which applies them to every request it starts.
$curlOptions = array(
    CURLOPT_RETURNTRANSFER  => 1,
    CURLOPT_FOLLOWLOCATION  => 1,
    CURLOPT_SSL_VERIFYPEER  => 1,
);

// Completion callback: $data is the response body (html or binary, whatever was
// requested) and is simply written to the output.
$printResponse = function($data) {
     echo $data;
};

// At most 10 requests are kept in flight at once.
// NOTE(review): $url is not defined in this snippet; presumably it is set by the
// surrounding script - confirm before running.
$pcurl = new ParallelCurl(10, $curlOptions);
$pcurl->startRequest($url, $printResponse);