PHP并行curl请求
标签:php, curl, file-get-contents。我正在做一个简单的应用程序,从15个不同的URL读取JSON数据。我有一个特殊的需求:必须在服务器端完成这件事。我正在使用 file_get_contents($url)。
file_get_contents($url)
因为我使用的是 file_get_contents($url),所以我写了一个简单的脚本,如下:
// Example from the question: collect the body of every URL, one request per loop iteration.
// The `...` line is a placeholder standing in for the URLs omitted from the example ($url4 .. $url14).
$websites = array(
$url1,
$url2,
$url3,
...
$url15
);
// Each file_get_contents() call runs inside the loop body, so the downloads happen
// sequentially; the result of each call is appended to $data in list order.
foreach ($websites as $website) {
$data[] = file_get_contents($website);
}
结果证明这非常慢,因为它要等前一个请求完成之后才会执行下一个请求。如果你指的是 curl_multi(并行 cURL),那么类似下面的做法可能会有所帮助:
// Download every URL in $nodes concurrently through a single cURL multi handle and
// print the response bodies, indexed in the same order as $nodes.
$nodes = array($url1, $url2, $url3);
$node_count = count($nodes);
$curl_arr = array();
$master = curl_multi_init();
for($i = 0; $i < $node_count; $i++)
{
    $url = $nodes[$i];
    $curl_arr[$i] = curl_init($url);
    // Return the body from curl_multi_getcontent() instead of echoing it.
    curl_setopt($curl_arr[$i], CURLOPT_RETURNTRANSFER, true);
    curl_multi_add_handle($master, $curl_arr[$i]);
}
$running = 0;
do {
    $status = curl_multi_exec($master, $running);
    if ($running > 0 && $status === CURLM_OK) {
        // Block until a transfer has activity (or 1s passes) instead of spinning
        // on curl_multi_exec(), which pins a CPU core for the whole download.
        if (curl_multi_select($master, 1.0) === -1) {
            // select could not wait on anything; back off briefly so we still do not spin.
            usleep(1000);
        }
    }
    // Stop on a real multi error rather than looping forever; CURLM_CALL_MULTI_PERFORM
    // (old libcurl) only means "call curl_multi_exec() again".
} while ($running > 0 && ($status === CURLM_OK || $status === CURLM_CALL_MULTI_PERFORM));
$results = array();
for($i = 0; $i < $node_count; $i++)
{
    $results[] = curl_multi_getcontent($curl_arr[$i]);
    // Detach each easy handle before the multi handle is closed.
    curl_multi_remove_handle($master, $curl_arr[$i]);
}
curl_multi_close($master);
print_r($results);
$nodes = array($url1, $url2, $url3);
$node_count = count($nodes);
$curl_arr = array();
$master = curl_multi_init();
for($i = 0; $i < $node_count; $i++)
{
$url = $nodes[$i];
$curl_arr[$i] = curl_init($url);
curl_setopt($curl_arr[$i], CURLOPT_RETURNTRANSFER, true);
curl_multi_add_handle($master, $curl_arr[$i]);
}
do {
curl_multi_exec($master, $running);
} while($running > 0);
for($i = 0; $i < $node_count; $i++)
{
$results[] = curl_multi_getcontent($curl_arr[$i]);
}
print_r($results);
希望这能在某种程度上有所帮助。我想提供一个更完整的示例:在出现轻微错误或意外情况时,它不会把CPU占用到100%并崩溃。它还向您展示了如何获取响应头、正文、请求信息,以及如何手动跟随重定向(见下文)。免责声明:此代码旨在被扩展并实现到库中,或作为快速起点,因此其中的函数保持在最低限度。
/**
 * Current Unix timestamp as a float (seconds, with the fractional part
 * that microtime(true) provides).
 *
 * @return float
 */
function mtime(){
    $now = microtime(true);
    return $now;
}
/**
 * Formats the time elapsed since $prev for log output.
 *
 * @param float $prev Earlier timestamp in seconds, as returned by microtime(true).
 * @return string Elapsed milliseconds, right-padded with "0" to at least 20 characters.
 */
function ptime($prev){
    // seconds -> milliseconds in a single expression
    $elapsedMs = (microtime(true) - $prev) * 1000;
    return str_pad($elapsedMs, 20, 0, STR_PAD_RIGHT);
}
/**
 * Runs curl_multi_exec(), repeating the call for as long as libcurl answers
 * CURLM_CALL_MULTI_PERFORM while transfers are still running.
 *
 * CURLM_CALL_MULTI_PERFORM is deprecated and modern libcurl should not return it,
 * in which case this is equivalent to a single curl_multi_exec() call.
 *
 * @param mixed $mh            cURL multi handle.
 * @param int   $still_running Receives the number of transfers still in progress.
 * @return int The last status returned by curl_multi_exec().
 */
function curl_multi_exec_full($mh, &$still_running) {
    do {
        $state = curl_multi_exec($mh, $still_running);
        $retry = $still_running > 0 && $state === CURLM_CALL_MULTI_PERFORM;
        if ($retry) {
            // Throttle the retry so a misbehaving (old) libcurl cannot pin the CPU.
            // The select result must NOT decide whether we retry: a timeout (0) would
            // otherwise hand CURLM_CALL_MULTI_PERFORM back to the caller, whose
            // "status == CURLM_OK" loop would then stop with transfers still running.
            if (curl_multi_select($mh, 0.1) === -1) {
                // select could not wait; sleep briefly instead of spinning.
                usleep(1000);
            }
        }
    } while ($retry);
    return $state;
}
/**
 * Waits for activity on the multi handle, guaranteeing a minimum pause between polls.
 *
 * Wraps curl_multi_select() and then sleeps for whatever is left of $minTime, so a
 * polling loop cannot run thousands of times per second when select returns
 * immediately (including the -1 "cannot wait" case).
 *
 * @param mixed $mh      cURL multi handle.
 * @param float $minTime Minimum time, in seconds, that a call must take.
 * @param float $maxTime Maximum time, in seconds, to block in curl_multi_select().
 * @return void
 */
function curl_multi_wait($mh, $minTime = 0.001, $maxTime = 1){
    $start_time = microtime(true);
    // Sleeps until there is activity on one of the transfers or $maxTime elapses.
    // A return of -1 means select could not wait at all; the top-up below covers
    // that case, so no separate branch is needed.
    curl_multi_select($mh, $maxTime);
    // Both operands are in seconds. (The previous code compared seconds against
    // microseconds, so it always slept roughly $minTime extra.)
    $remaining = $minTime - (microtime(true) - $start_time);
    if ($remaining > 0) {
        // usleep() takes whole microseconds.
        usleep((int) round($remaining * 1000000));
    }
}
// One option set per request; each entry is later applied to its own cURL handle
// with curl_setopt_array().
$handles = array(
    // Plain request: body only, returned as a string, redirects not followed.
    array(
        CURLOPT_URL            => "http://example.com/",
        CURLOPT_HEADER         => false,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_FOLLOWLOCATION => false,
    ),
    // Same settings plus a callback that cURL invokes once per received header line.
    array(
        CURLOPT_URL            => "http://www.php.net",
        CURLOPT_HEADER         => false,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_FOLLOWLOCATION => false,
        // Byte-oriented on purpose (RFC 822 / RFC 2616): strlen(), not mb_strlen().
        // See https://stackoverflow.com/a/41135574
        // To collect headers instead of printing them, split the line on the first
        // ':' and store trim(value) under strtolower(trim(name)).
        CURLOPT_HEADERFUNCTION => function($ch, $headerLine)
        {
            print "header from http://www.php.net: ".$headerLine;
            // cURL expects the number of bytes handled.
            return strlen($headerLine);
        },
    ),
);
// Runs every option set in $handles as a parallel request, printing progress,
// request info and the body of each transfer as it completes.

// Create the multi handle and one easy handle per option set.
$mh = curl_multi_init();
$chandles = [];
foreach($handles as $opts) {
    $ch = curl_init();
    // set URL and other appropriate options
    curl_setopt_array($ch, $opts);
    curl_multi_add_handle($mh, $ch);
    $chandles[] = $ch;
}
// Start from the number of handles added. With the previous `null`, the comparison
// `$running < $prevRunning` was false on the first pass, so transfers that finished
// (or failed) before a decrement was ever observed were never read.
$prevRunning = count($chandles);
$count = 0;
do {
    $time = mtime();
    // $running receives the number of requests still in progress
    $status = curl_multi_exec_full($mh, $running);
    $count++;
    print ptime($time).": curl_multi_exec status=$status running $running".PHP_EOL;
    // Fewer running than before means at least one transfer has finished
    if($running < $prevRunning){
        print ptime($time).": curl_multi_info_read".PHP_EOL;
        // Each message carries:
        //   msg:    the CURLMSG_DONE constant
        //   result: one of the CURLE_* constants (CURLE_OK on success)
        //   handle: the easy handle the message concerns
        while ($read = curl_multi_info_read($mh, $msgs_in_queue)) {
            $info = curl_getinfo($read['handle']);
            if($read['result'] !== CURLE_OK){
                // handle the error somehow
                print "Error: ".$info['url'].PHP_EOL;
            } else {
                /*
                // This will automatically follow the redirect and still give you control over the previous page
                // TODO: max redirect checks and redirect timeouts
                if(isset($info['redirect_url']) && trim($info['redirect_url'])!==''){
                    print "running redirect: ".$info['redirect_url'].PHP_EOL;
                    $ch3 = curl_init();
                    curl_setopt($ch3, CURLOPT_URL, $info['redirect_url']);
                    curl_setopt($ch3, CURLOPT_HEADER, 0);
                    curl_setopt($ch3, CURLOPT_RETURNTRANSFER, 1);
                    curl_setopt($ch3, CURLOPT_FOLLOWLOCATION, 0);
                    curl_multi_add_handle($mh,$ch3);
                }
                */
                print_r($info);
                $body = curl_multi_getcontent($read['handle']);
                print $body;
            }
        }
    }
    // Still running? Wait for activity before polling again.
    if ($running > 0) {
        curl_multi_wait($mh);
    }
    $prevRunning = $running;
} while ($running > 0 && $status == CURLM_OK);
// Detach the easy handles, then close the multi handle.
foreach($chandles as $ch){
    curl_multi_remove_handle($mh, $ch);
}
curl_multi_close($mh);
// Number of polling iterations performed.
print $count.PHP_EOL;
function mtime(){
return microtime(true);
}
function ptime($prev){
$t = microtime(true) - $prev;
$t = $t * 1000;
return str_pad($t, 20, 0, STR_PAD_RIGHT);
}
// 此函数用于为旧版本 curl 的 CURLM_CALL_MULTI_PERFORM 提供兼容;在现代 curl 上它只运行一次,相当于直接调用 curl_multi_exec
function curl_multi_exec_full($mh, &$still_running) {
// 理论上,curl_multi_exec 不应返回 CURLM_CALL_MULTI_PERFORM (-1),因为它已被弃用
// 实际上,有时确实会返回
// 所以可以把它想象成只运行一次 curl_multi_exec 并返回其值
do {
$state = curl_multi_exec($mh, $still_running);
// 当 curl_multi_exec() 返回 CURLM_CALL_MULTI_PERFORM 时,curl_multi_select($mh, $timeout) 只是阻塞 $timeout 秒
// 我们添加它是为了防止出现异常时 CPU 使用率达到 100%(特别是 windows 上的旧版 curl)
} while ($still_running > 0 && $state === CURLM_CALL_MULTI_PERFORM && curl_multi_select($mh, 0.1));
return $state;
}
// 这个函数取代了 curl_multi_select,并使名称更有意义,因为我们所做的只是等待 curl;它还强制在请求之间留出最短的睡眠时间,以避免 CPU 过度使用。
function curl_multi_wait($mh, $minTime = 0.001, $maxTime = 1){
$umin = $minTime*1000000;
$start_time = microtime(true);
// 它会一直休眠,直到任何描述符(curl 文件)上出现活动为止
// 它返回描述符的数量(可能有活动的 curl 文件)
$num_descriptors = curl_multi_select($mh, $maxTime);
// 如果系统返回 -1,这意味着等待时间未知,我们必须自行决定最短等待时间
// 但是下面的 `$timespan` 检查已经覆盖了这个边界情况,所以这个 `if` 并不是真的必要
if($num_descriptors === -1){
usleep($umin);
}
$timespan = (microtime(true) - $start_time);
// 这个循环运行得非常快,2 个 URL 的运行次数可高达 1000 次,这会浪费大量 CPU
// 这将减少运行次数,使每次间隔至少相隔 minTime
if($timespan < $umin){
usleep($umin - $timespan);
//print "sleep for ".($umin - $timeDiff).PHP_EOL;
}
}
$handles = [
[
CURLOPT_URL=>"http://example.com/",
CURLOPT_HEADER=>false,
CURLOPT_RETURNTRANSFER=>true,
CURLOPT_FOLLOWLOCATION=>false,
],
[
CURLOPT_URL=>"http://www.php.net",
CURLOPT_HEADER=>false,
CURLOPT_RETURNTRANSFER=>true,
CURLOPT_FOLLOWLOCATION=>false,
// curl 为接收到的每个标头调用此函数
// 这符合 RFC822 和 RFC2616,请不要建议改用 mb_ 字符串函数,那是不正确的!
// https://stackoverflow.com/a/41135574
CURLOPT_HEADERFUNCTION=>function($ch, $header)
{
print "header from http://www.php.net: ".$header;
//$header = explode(':', $header, 2);
//if (count($header) < 2){ // 忽略无效的标头
// return $len;
//}
//$headers[strtolower(trim($header[0]))][] = trim($header[1]);
return strlen($header);
}
]
];
// 创建 cURL 多句柄(multi handle)
$mh = curl_multi_init();
$chandles = [];
foreach($handles as $opts) {
// 创建 cURL 资源
$ch = curl_init();
// 设置 URL 和其他适当的选项
curl_setopt_array($ch, $opts);
// 添加句柄
curl_multi_add_handle($mh, $ch);
$chandles[] = $ch;
}
// 执行多句柄
$prevRunning = null;
$count = 0;
do {
$time = mtime();
// $running 包含当前正在运行的请求数
$status = curl_multi_exec_full($mh, $running);
$count++;
print ptime($time).": curl_multi_exec status=$status running $running".PHP_EOL;
// 正在运行的数量减少,意味着有一个请求已经完成
if($running < $prevRunning){
print ptime($time).": curl_multi_info_read".PHP_EOL;
// msg:CURLMSG_DONE 常量。其他返回值当前不可用。
// result:CURLE_* 常量之一。如果一切正常,结果为 CURLE_OK。
// handle:curl 类型的资源,指示该消息所涉及的句柄。
while ($read = curl_multi_info_read($mh, $msgs_in_queue)) {
$info = curl_getinfo($read['handle']);
if($read['result'] !== CURLE_OK){
// 以某种方式处理错误
// Downloads every URL in $websites in parallel and dumps an array of
// effective URL => response body.
$websites = array(
    "http://google.com",
    "http://example.org"
    // $url2,
    // $url3,
    // ...
    // $url15
);
$mh = curl_multi_init();
foreach ($websites as $website) {
    $worker = curl_init($website);
    curl_setopt_array($worker, [
        // return the body from curl_multi_getcontent() instead of echoing it
        CURLOPT_RETURNTRANSFER => 1
    ]);
    curl_multi_add_handle($mh, $worker);
}
for (;;) {
    $still_running = null;
    do {
        $err = curl_multi_exec($mh, $still_running);
    } while ($err === CURLM_CALL_MULTI_PERFORM);
    if ($err !== CURLM_OK) {
        // Report and stop: retrying a failing multi handle could loop forever.
        trigger_error("curl_multi_exec failed: " . curl_multi_strerror($err), E_USER_WARNING);
        break;
    }
    if ($still_running < 1) {
        // all downloads completed
        break;
    }
    // some haven't finished downloading, sleep until more data arrives:
    if (curl_multi_select($mh, 1) === -1) {
        // select could not wait; back off briefly instead of busy-looping
        usleep(1000);
    }
}
$results = [];
while (false !== ($info = curl_multi_info_read($mh))) {
    if ($info["result"] !== CURLE_OK) {
        // Surface the failure; the (possibly empty) body is still stored below,
        // so the shape of $results is unchanged.
        trigger_error(
            "download of " . curl_getinfo($info["handle"], CURLINFO_EFFECTIVE_URL)
                . " failed: " . curl_strerror($info["result"]),
            E_USER_WARNING
        );
    }
    $results[curl_getinfo($info["handle"], CURLINFO_EFFECTIVE_URL)] = curl_multi_getcontent($info["handle"]);
    curl_multi_remove_handle($mh, $info["handle"]);
    curl_close($info["handle"]);
}
curl_multi_close($mh);
var_export($results);
// Example call: download the listed URLs with at most 50 simultaneous
// connections and dump the url => result map. Append further URLs
// ($url2 ... $url15) to the list as needed.
$websites = [
    "http://google.com",
    "http://example.org",
];
var_dump(fetch_urls($websites, 50));
/**
 * Downloads a list of URLs in parallel with a cap on simultaneous connections.
 *
 * @param array $urls                URLs to fetch; every element must be a string, empty strings are skipped.
 * @param int   $max_connections     Maximum number of transfers in flight at once (>= 1).
 * @param int   $timeout_ms          Per-transfer timeout in milliseconds (CURLOPT_TIMEOUT_MS).
 * @param bool  $return_fault_reason When true, failed URLs get an entry describing the error;
 *                                   when false, failed URLs are absent from the result.
 * @return array Map of url => response body. For a failed transfer (with $return_fault_reason)
 *               the value is the print_r() text of array(false, errno, message); for a failed
 *               curl_init() it is the array(false, -1, "curl_init() failed") itself.
 * @throws InvalidArgumentException If $max_connections < 1 or a URL is not a string.
 */
function fetch_urls(array $urls, int $max_connections, int $timeout_ms = 10000, bool $return_fault_reason = true): array
{
    if ($max_connections < 1) {
        throw new InvalidArgumentException("max_connections MUST be >=1");
    }
    foreach ($urls as $key => $foo) {
        if (! is_string($foo)) {
            throw new \InvalidArgumentException("all urls must be strings!");
        }
        if (empty($foo)) {
            unset($urls[$key]); // ?
        }
    }
    unset($foo);
    // DISABLED for benchmarking purposes: $urls = array_unique($urls); // remove duplicates.

    // Stable integer key for a cURL handle. Handles are objects on PHP >= 8, where an
    // (int) cast emits a warning and yields 1 for every handle, so all workers would
    // share one key; on older versions they are resources and the cast gives the id.
    $handle_id = static function ($handle): int {
        return is_object($handle) ? spl_object_id($handle) : (int) $handle;
    };

    $ret = array();
    $mh = curl_multi_init();
    $workers = array(); // handle id => url, for transfers currently in flight
    // Drives the multi handle until at least one worker finishes, then harvests every
    // finished transfer into $ret and releases its handle.
    $work = function () use (&$ret, &$workers, &$mh, $return_fault_reason, $handle_id) {
        // > If an added handle fails very quickly, it may never be counted as a running_handle
        while (1) {
            do {
                $err = curl_multi_exec($mh, $still_running);
            } while ($err === CURLM_CALL_MULTI_PERFORM);
            if ($still_running < count($workers)) {
                // some workers finished, fetch their response and close them
                break;
            }
            // nothing finished yet: wait (up to 1s) for activity
            curl_multi_select($mh, 1);
        }
        while (false !== ($info = curl_multi_info_read($mh))) {
            if ($info['msg'] !== CURLMSG_DONE) {
                continue;
            }
            $id = $handle_id($info['handle']);
            if ($info['result'] !== CURLE_OK) {
                if ($return_fault_reason) {
                    $ret[$workers[$id]] = print_r(array(
                        false,
                        $info['result'],
                        "curl_exec error " . $info['result'] . ": " . curl_strerror($info['result'])
                    ), true);
                }
            } elseif (CURLE_OK !== ($err = curl_errno($info['handle']))) {
                if ($return_fault_reason) {
                    $ret[$workers[$id]] = print_r(array(
                        false,
                        $err,
                        "curl error " . $err . ": " . curl_strerror($err)
                    ), true);
                }
            } else {
                $ret[$workers[$id]] = curl_multi_getcontent($info['handle']);
            }
            curl_multi_remove_handle($mh, $info['handle']);
            assert(isset($workers[$id]));
            unset($workers[$id]);
            curl_close($info['handle']);
        }
    };
    foreach ($urls as $url) {
        // Respect the connection cap: drain finished workers before adding another.
        while (count($workers) >= $max_connections) {
            $work();
        }
        $neww = curl_init($url);
        if (! $neww) {
            trigger_error("curl_init() failed! probably means that max_connections is too high and you ran out of system resources", E_USER_WARNING);
            if ($return_fault_reason) {
                $ret[$url] = array(
                    false,
                    - 1,
                    "curl_init() failed"
                );
            }
            continue;
        }
        $workers[$handle_id($neww)] = $url;
        curl_setopt_array($neww, array(
            CURLOPT_RETURNTRANSFER => 1,
            // NOTE(review): TLS host/peer verification is disabled here, so HTTPS
            // responses are not authenticated. Kept as-is to preserve behaviour;
            // enable both options before using this against anything sensitive.
            CURLOPT_SSL_VERIFYHOST => 0,
            CURLOPT_SSL_VERIFYPEER => 0,
            CURLOPT_TIMEOUT_MS => $timeout_ms
        ));
        curl_multi_add_handle($mh, $neww);
        // curl_multi_exec($mh, $unused_here); LIKELY TO BE MUCH SLOWER IF DONE IN THIS LOOP: TOO MANY SYSCALLS
    }
    // All URLs queued: keep working until every in-flight transfer is harvested.
    while (count($workers) > 0) {
        $work();
    }
    curl_multi_close($mh);
    return $ret;
}