PHP 数据抓取与多线程进程数限制
PHP数据刮取和多线程进程数限制,php,parallel-processing,curl-multi,Php,Parallel Processing,Curl Multi,我正在编写一个脚本,它将使用cURL和DOMDocument从另一个站点抓取数据。 我将发布3部分代码来更好地解释我的脚本(抱歉,如果它太长了),但我会检查构建脚本的逻辑是否正确(我从未使用过scrape和multi_cURL),而且我不是专业程序员 作为第一步我已经让一个进程运行起来,并且成功了。 代码是这样的: $urlCurl = '[url to scrap]'; $options = Array( CURLOPT_RETURNTRANSFER => TR
我正在编写一个脚本,使用 cURL 和 DOMDocument 从另一个站点抓取数据。我会贴出三段代码来更好地说明我的脚本(如果篇幅太长,先说声抱歉),希望大家帮我检查脚本的构建逻辑是否正确(我以前从未用过数据抓取和 multi_cURL),而且我不是专业程序员。第一步,我先让单个请求的流程运行起来,并且成功了。代码如下:
// Single-request scraper: download one market page with cURL, parse it with
// DOMDocument and collect one associative array per seller link into $result.
$urlCurl = '[url to scrap]';
$options = Array(
    CURLOPT_RETURNTRANSFER => TRUE, // Setting cURL's option to return the webpage data
    CURLOPT_FOLLOWLOCATION => TRUE, // Setting cURL to follow 'location' HTTP headers
    CURLOPT_AUTOREFERER => TRUE, // Automatically set the referer where following 'location' HTTP headers
    CURLOPT_CONNECTTIMEOUT => 300, // Setting the amount of time (in seconds) before the request times out
    CURLOPT_TIMEOUT => 300, // Setting the maximum amount of time for cURL to execute queries
    CURLOPT_MAXREDIRS => 10, // Setting the maximum number of redirections to follow
    CURLOPT_USERAGENT => "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.1a2pre) Gecko/2008073000 Shredder/3.0a2pre ThunderBrowse/3.2.1.8", // Setting the useragent
    CURLOPT_URL => $urlCurl, // Setting cURL's URL option with the $url variable passed into the function
);
$chCurl = curl_init($urlCurl);
curl_setopt_array($chCurl, $options); // Setting cURL's options using the previously assigned array data in $options
$resultChCurl = curl_exec($chCurl);
$html = $resultChCurl;
// Always defined, so print_r() below never receives an undefined variable.
$result = array();
if (!is_string($html) || trim($html) === '') {
    // curl_exec() returns false on failure; read the reason before closing the handle.
    echo "Error retrieving URL " . $urlCurl . ": " . curl_error($chCurl) . "<br/>";
    curl_close($chCurl);
} else {
    curl_close($chCurl); // the handle is no longer needed once the body is in $html
    $dom = new DOMDocument();
    $dom->preserveWhiteSpace = false; // only has an effect when set before loading
    // Real-world HTML is rarely valid; collect libxml complaints instead of emitting warnings.
    $previousLibxmlSetting = libxml_use_internal_errors(true);
    $dom->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previousLibxmlSetting);
    foreach ($dom->getElementsByTagName('table') as $table) {
        if (!$table->hasAttribute('class')) {
            continue;
        }
        // Rows are taken from the current table only; asking $dom would re-read
        // every row of the document once per matching table.
        foreach ($table->getElementsByTagName('tr') as $row) {
            $cols = $row->getElementsByTagName('td');
            if ($cols->length < 3) {
                continue; // e.g. <th>-only rows: item(n) would be null
            }
            $isDataRow = ($cols->item(0)->nodeValue != "Seller")
                and ($cols->item(1)->nodeValue != "Ratio")
                and ($cols->item(2)->nodeValue != "Amount");
            if (!$isDataRow) {
                continue; // header row: nothing to record
            }
            foreach ($row->getElementsByTagName('a') as $profile) {
                $seller = $cols->item(0)->nodeValue;
                $sellerID = intval(preg_replace('/[^0-9]+/', '', ($profile->getAttribute( 'href' ))), 10);
                $sellerType = sellerTypeExtract($profile->getAttribute( 'href' ));
                $currencysell = preg_replace('/[.0-9]/','',($cols->item(1)->nodeValue));
                $amount = floatval($cols->item(1)->nodeValue);
                $rate = rateExtractor($cols->item(2)->nodeValue);
                // preg_quote() keeps characters such as ']' or '-' in the currency
                // text from breaking the character class.
                $currencybought = preg_replace('/[.0-9'.preg_quote($currencysell, '/').'=]/','',($cols->item(2)->nodeValue));
                $marketLink = $urlCurl;
                $server = "servername";
                $tenGoldref = tenGoldref($rate);
                // NOTE(review): $dayTime, $date and $hour are not assigned in this
                // listing; presumably set earlier in the script. TODO confirm.
                $result[] = array (
                    'seller' => $seller,
                    'sellerprofile'=> $sellerID,
                    'sellertype' => $sellerType,
                    'currencysold' => $currencysell,
                    'amount'=> $amount,
                    'rate'=> $rate,
                    'currencybought'=> $currencybought,
                    'marketLink' => $marketLink,
                    'server' => $server,
                    'tengold' => $tenGoldref,
                    'timeMoment' => $dayTime,
                    'date' => $date,
                    'hour' => $hour
                );
            }
        }
    }
}
print_r($result);
/**
 * Download a list of URLs with at most 50 concurrent cURL transfers.
 *
 * The first batch of handles is started up front. Each time a transfer
 * finishes, $callback($info, $output) is invoked, the collected results and
 * the elapsed time are echoed, and the next pending URL (if any) is queued.
 *
 * @param array    $urls           URLs to fetch; keys are ignored.
 * @param callable $callback       Receives curl_getinfo() data and the response body.
 * @param array    $custom_options Extra CURLOPT_* options. Because arrays are merged
 *                                 with "+", the three built-in defaults win on key collision.
 * @return bool Always true.
 */
function curl_multi_download(array $urls, callable $callback, array $custom_options = array())
{
    $time_start_insert = microtime(true);
    // Re-index so that $urls[$i] is valid whatever keys the caller used.
    $urls = array_values($urls);
    $url_count = count($urls);
    // make sure the rolling window isn't greater than the # of urls
    $rolling_window = min(50, $url_count);
    $master = curl_multi_init();
    $fullArr = array();
    $options = array(
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_MAXREDIRS => 5,
    ) + $custom_options;
    // Number of handles currently attached to $master. The loop is driven by this
    // counter because $running is stale right after a new handle has been queued.
    $in_flight = 0;
    // start the first batch of requests
    for ( $i = 0; $i < $rolling_window; $i++ )
    {
        $ch = curl_init();
        $options[CURLOPT_URL] = $urls[$i];
        curl_setopt_array($ch, $options);
        curl_multi_add_handle($master, $ch);
        $in_flight++;
    }
    while ( $in_flight > 0 )
    {
        do
        {
            $execrun = curl_multi_exec($master, $running);
        } while ( $execrun == CURLM_CALL_MULTI_PERFORM );
        if ( $execrun != CURLM_OK )
            break;
        $processed = 0;
        // a request was just completed -- find out which one
        while ( $done = curl_multi_info_read($master) )
        {
            $processed++;
            $info = curl_getinfo($done['handle']);
            // hand the response to the callback; it decides what counts as success
            $output = curl_multi_getcontent($done['handle']);
            $extract = call_user_func_array($callback, array($info, $output));
            $fullArr[] = $extract;
            echo "<br><br>*******************NOW THERE SHOULD BE FULL ARRAY*************************<br>";
            echo 'done';
            echo "<br>";
            print_r($fullArr);
            echo "<br><br>*******************END FULL ARRAY*************************<br>";
            $time_end_insert = microtime(true);
            //dividing with 60 will give the execution time in minutes other wise seconds
            $execution_time_insert = ($time_end_insert - $time_start_insert)/60;
            //execution time of the script
            echo '<br><br><p class="finalTime">++++++++ total execution time: '.round($execution_time_insert,2,PHP_ROUND_HALF_DOWN).' minutes +++++++++++</p><br>';
            // $i is the index of the next URL not yet started. The original tested
            // $urls[$i+1], which left the final URL unfetched.
            if ( $i < $url_count )
            {
                // start a new request (it's important to do this before removing the old one)
                $ch = curl_init();
                $options[CURLOPT_URL] = $urls[$i];
                $i++;
                curl_setopt_array($ch, $options);
                curl_multi_add_handle($master, $ch);
                $in_flight++;
            }
            // remove the curl handle that just completed and free it
            curl_multi_remove_handle($master, $done['handle']);
            curl_close($done['handle']);
            $in_flight--;
        }
        if ( $running > 0 )
        {
            // Wait for socket activity instead of spinning; -1 means "nothing to
            // wait on", so back off briefly in that case.
            if ( curl_multi_select($master, 1.0) === -1 )
                usleep(1000);
        }
        elseif ( $processed === 0 )
        {
            // Nothing is running and nothing was reported: avoid looping forever.
            break;
        }
    }
    curl_multi_close($master);
    return true;
}
/**
 * Return the body of a finished cURL handle converted to UTF-8.
 *
 * The source charset is looked up in this order: HTTP Content-Type header,
 * <meta http-equiv> element, XML declaration, mb_detect_encoding(), and
 * finally ISO-8859-1 for text/html responses.
 *
 * @param resource|CurlHandle $ch Handle whose transfer has completed.
 * @return mixed The UTF-8 body; the unconverted body if conversion fails; or
 *               whatever curl_multi_getcontent() returned when it is not a string.
 */
function curl_multi_getcontent_utf8( $ch )
{
    $data = curl_multi_getcontent( $ch );
    if ( !is_string($data) )
        return $data;
    $charset = null;
    // curl_getinfo() yields a non-string when the server sent no Content-Type.
    $content_type = (string) curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
    /* 1: HTTP Content-Type: header */
    preg_match( '@([\w/+]+)(;\s*charset=(\S+))?@i', $content_type, $matches );
    if ( isset( $matches[3] ) )
    {
        // The value may be quoted (charset="utf-8") or followed by ';'.
        $candidate = trim($matches[3], "\"';");
        if ( $candidate !== '' )
            $charset = $candidate;
    }
    /* 2: <meta> element in the page */
    if ( $charset === null )
    {
        preg_match( '@<meta\s+http-equiv="Content-Type"\s+content="([\w/]+)(;\s*charset=([^\s"]+))?@i', $data, $matches );
        if ( isset( $matches[3] ) )
            $charset = $matches[3];
    }
    /* 3: <xml> element in the page */
    if ( $charset === null )
    {
        preg_match( '@<\?xml.+encoding="([^\s"]+)@si', $data, $matches );
        if ( isset( $matches[1] ) )
            $charset = $matches[1];
    }
    /* 4: PHP's heuristic detection */
    if ( $charset === null )
    {
        $encoding = mb_detect_encoding($data);
        if ($encoding)
            $charset = $encoding;
    }
    /* 5: Default for HTML */
    if ( $charset === null )
    {
        // strpos() reports the offset; the original used strstr(), which returns a
        // string or false and therefore could never be === 0.
        if (strpos($content_type, "text/html") === 0)
            $charset = "ISO-8859-1";
    }
    /* Convert it if it is anything but UTF-8 */
    /* You can change "UTF-8" to "UTF-8//IGNORE" to
       ignore conversion errors and still output something reasonable */
    if ( $charset !== null && strtoupper($charset) != "UTF-8" )
    {
        $converted = iconv($charset, 'UTF-8', $data);
        // iconv() returns false for unknown charsets or invalid input; keep the
        // original bytes rather than throwing the whole page away.
        if ( $converted !== false )
            $data = $converted;
    }
    return $data;
}
// Start the rolling-window download; every finished response is passed to process_response().
curl_multi_download(array('[LINKS TO SCRAPE]'), 'process_response');
/**
 * Callback for curl_multi_download(): turn one HTTP response into scraped rows.
 *
 * @param array  $info     curl_getinfo() data for the finished transfer.
 * @param string $response Response body.
 * @return mixed Whatever scrape() returns, or null (after echoing an error
 *               line) when the HTTP status is not 200.
 */
function process_response( $info, $response )
{
    if ( $info['http_code'] != 200 )
    {
        echo "Error retrieving URL " . $info['url'] . "<br/>";
        return null;
    }
    // $dayTime, $date and $hour were used here without ever being defined in
    // function scope, so scrape() always received null. Read them from the
    // script-level variables of the same name when those exist.
    // NOTE(review): assumes the script sets them globally - confirm.
    $dayTime = isset($GLOBALS['dayTime']) ? $GLOBALS['dayTime'] : null;
    $date = isset($GLOBALS['date']) ? $GLOBALS['date'] : null;
    $hour = isset($GLOBALS['hour']) ? $GLOBALS['hour'] : null;
    $infos = scrape($response,$dayTime,$date,$hour);
    return $infos;
}
/**
 * Fetch a single URL with cURL and return its body.
 *
 * @param string $url URL to download.
 * @return string|false Response body, or false if the transfer failed.
 */
function curl_download( $url )
{
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
    // curl_exec() is a plain function; the original "self::curl_exec" is a
    // fatal error outside of a class.
    $data = curl_exec($ch);
    curl_close($ch);
    return $data;
}
/**
 * Extract every seller offer from the class-bearing tables of one HTML page.
 *
 * A row is used only when it has at least three <td> cells and is not the
 * "Seller / Ratio / Amount" header row. One entry is produced per <a> element
 * in such a row.
 *
 * The original assigned "$result = array(...)" on every iteration, so only the
 * last offer survived; offers are now appended and all of them are returned.
 *
 * @param string $data    Raw HTML of the page.
 * @param mixed  $dayTime Stored unchanged under 'timeMoment'.
 * @param mixed  $date    Stored unchanged under 'date'.
 * @param mixed  $hour    Stored unchanged under 'hour'.
 * @return array List of associative arrays, one per offer; empty when the
 *               input is empty or contains no matching rows.
 */
function scrape($data,$dayTime,$date,$hour) {
    $result = array();
    // DOMDocument::loadHTML() rejects an empty document, and failed downloads
    // arrive here as null/false/''.
    if (!is_string($data) || trim($data) === '') {
        return $result;
    }
    $dom = new DOMDocument();
    $dom->preserveWhiteSpace = false; // only has an effect when set before loading
    // Real-world HTML is rarely valid; collect libxml complaints instead of emitting warnings.
    $previousLibxmlSetting = libxml_use_internal_errors(true);
    $dom->loadHTML($data);
    libxml_clear_errors();
    libxml_use_internal_errors($previousLibxmlSetting);
    foreach ($dom->getElementsByTagName('table') as $table) {
        if (!$table->hasAttribute('class')) {
            continue;
        }
        // Rows are taken from the current table only; asking $dom would re-read
        // every row of the document once per matching table.
        foreach ($table->getElementsByTagName('tr') as $row) {
            $cols = $row->getElementsByTagName('td');
            if ($cols->length < 3) {
                continue; // e.g. <th>-only rows: item(n) would be null
            }
            $isDataRow = ($cols->item(0)->nodeValue != "Seller")
                and ($cols->item(1)->nodeValue != "Ratio")
                and ($cols->item(2)->nodeValue != "Amount");
            if (!$isDataRow) {
                continue; // header row: nothing to record
            }
            foreach ($row->getElementsByTagName('a') as $profile) {
                $seller = $cols->item(0)->nodeValue;
                $sellerID = intval(preg_replace('/[^0-9]+/', '', ($profile->getAttribute( 'href' ))), 10);
                $sellerType = sellerTypeExtract($profile->getAttribute( 'href' ));
                $currencysell = preg_replace('/[.0-9]/','',($cols->item(1)->nodeValue));
                $amount = floatval($cols->item(1)->nodeValue);
                $rate = rateExtractor($cols->item(2)->nodeValue);
                // preg_quote() keeps characters such as ']' or '-' in the currency
                // text from breaking the character class.
                $currencybought = preg_replace('/[.0-9'.preg_quote($currencysell, '/').'=]/','',($cols->item(2)->nodeValue));
                //$marketLink = $nodes;
                $server = "servername";
                $tenGoldref = tenGoldref($rate);
                $result[] = array (
                    'seller' => $seller,
                    'sellerprofile'=> $sellerID,
                    'sellertype' => $sellerType,
                    'currencysold' => $currencysell,
                    'amount'=> $amount,
                    'rate'=> $rate,
                    'currencybought'=> $currencybought,
                    //'marketLink' => $marketLink,
                    'server' => $server,
                    'tengold' => $tenGoldref,
                    'timeMoment' => $dayTime,
                    'date' => $date,
                    'hour' => $hour
                );
            }
        }
    }
    return $result;
}
为了节省时间,我尝试改用 multi_cURL 来实现,并从下面这个简单的脚本开始:
/**
 * Build the list of URLs to scrape.
 *
 * Echoes each position (1..50) wrapped in <br> tags and returns an array
 * holding the placeholder URL once per position.
 *
 * @return array List of 50 URL strings.
 */
function urlBuilder() {
    $urlList = array();
    foreach (range(1, 50) as $position) {
        echo "<br>", $position, "<br>";
        // Same placeholder for every position, exactly as in the original loop.
        $urlList[] = "[VARIABLE URL TO SCRAP]";
    }
    return $urlList;
}
/**
 * Fetch every URL from urlBuilder() in parallel, scrape each response and
 * print the combined result.
 *
 * All handles are started at once. Transfers that fail or return an empty body
 * contribute an empty array, so positions in the output still line up with the
 * URL list.
 *
 * @return void
 */
function start() {
    // These were used below without being defined in function scope; read them
    // from the script-level variables of the same name when those exist.
    // NOTE(review): assumes the script sets them globally - confirm.
    $dayTime = isset($GLOBALS['dayTime']) ? $GLOBALS['dayTime'] : null;
    $date = isset($GLOBALS['date']) ? $GLOBALS['date'] : null;
    $hour = isset($GLOBALS['hour']) ? $GLOBALS['hour'] : null;
    $fullArr = array();
    $nodes = urlBuilder();
    $node_count = count($nodes);
    $curl_arr = array();
    $master = curl_multi_init();
    for($i = 0; $i < $node_count; $i++)
    {
        $url = $nodes[$i];
        $curl_arr[$i] = curl_init($url);
        curl_setopt($curl_arr[$i], CURLOPT_RETURNTRANSFER, true);
        curl_multi_add_handle($master, $curl_arr[$i]);
    }
    do {
        $status = curl_multi_exec($master,$running);
        if ($status != CURLM_OK && $status != CURLM_CALL_MULTI_PERFORM) {
            break;
        }
        // Wait for socket activity instead of spinning at 100% CPU; -1 means
        // "nothing to wait on", so back off briefly in that case.
        if ($running > 0 && $status == CURLM_OK && curl_multi_select($master, 1.0) === -1) {
            usleep(1000);
        }
    } while($running > 0);
    echo "results: ";
    for($i = 0; $i < $node_count; $i++)
    {
        $results = curl_multi_getcontent ( $curl_arr[$i] );
        // Detach and free each handle once its body has been read.
        curl_multi_remove_handle($master, $curl_arr[$i]);
        curl_close($curl_arr[$i]);
        if (is_string($results) && $results !== '') {
            // The scrape() visible in this file takes ($data, $dayTime, $date, $hour);
            // the original call also passed $nodes[$i], which shifted every argument.
            $infos = scrape($results,$dayTime,$date,$hour);
        } else {
            $infos = array(); // failed or empty transfer
        }
        $fullArr[] = $infos;
    }
    curl_multi_close($master);
    echo "<br><br>*******************NOW THERE SHOULD BE FULL ARRAY*************************<br>";
    echo 'done';
    echo "<br>";
    print_r($fullArr);
    echo "<br><br>*******************END FULL ARRAY*************************<br>";
}
函数urlBuilder(){
$countryList=array();
(i=1;i=0);
回声“结果:”;
对于($i=0;$i<$node_count;$i++)
{
$results=curl\u multi\u getcontent($curl\u arr[$i]);
$infos=scrap($results,$nodes[$i],$days,$date,$hour);
如果(空($fullArr)){
$fullArr=阵列($infos);
}否则{
(阵列推送($fullArr,$infos));
}
}
echo“
*******************现在应该有完整的数组******************************************
”;
回音“完成”;
回声“
”;
印刷费($fullArr);
echo“
*********************************************************************************************
”;
但我遇到的第一个问题是:在 155 个并行连接的情况下,最后 30 到 40 个请求会返回 "NULL" 数据(并非每次都会出现,但大约 90% 的测试中会出现)。
所以第一个问题是:脚本是否因为连接太多而返回空数据?
我猜是这个原因,所以我尝试按照一篇教程(原文链接已缺失)在代码中实现一个更好的多连接处理程序,最终的代码是:
// Single-request scraper: download one market page with cURL, parse it with
// DOMDocument and collect one associative array per seller link into $result.
$urlCurl = '[url to scrap]';
$options = Array(
    CURLOPT_RETURNTRANSFER => TRUE, // Setting cURL's option to return the webpage data
    CURLOPT_FOLLOWLOCATION => TRUE, // Setting cURL to follow 'location' HTTP headers
    CURLOPT_AUTOREFERER => TRUE, // Automatically set the referer where following 'location' HTTP headers
    CURLOPT_CONNECTTIMEOUT => 300, // Setting the amount of time (in seconds) before the request times out
    CURLOPT_TIMEOUT => 300, // Setting the maximum amount of time for cURL to execute queries
    CURLOPT_MAXREDIRS => 10, // Setting the maximum number of redirections to follow
    CURLOPT_USERAGENT => "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.1a2pre) Gecko/2008073000 Shredder/3.0a2pre ThunderBrowse/3.2.1.8", // Setting the useragent
    CURLOPT_URL => $urlCurl, // Setting cURL's URL option with the $url variable passed into the function
);
$chCurl = curl_init($urlCurl);
curl_setopt_array($chCurl, $options); // Setting cURL's options using the previously assigned array data in $options
$resultChCurl = curl_exec($chCurl);
$html = $resultChCurl;
// Always defined, so print_r() below never receives an undefined variable.
$result = array();
if (!is_string($html) || trim($html) === '') {
    // curl_exec() returns false on failure; read the reason before closing the handle.
    echo "Error retrieving URL " . $urlCurl . ": " . curl_error($chCurl) . "<br/>";
    curl_close($chCurl);
} else {
    curl_close($chCurl); // the handle is no longer needed once the body is in $html
    $dom = new DOMDocument();
    $dom->preserveWhiteSpace = false; // only has an effect when set before loading
    // Real-world HTML is rarely valid; collect libxml complaints instead of emitting warnings.
    $previousLibxmlSetting = libxml_use_internal_errors(true);
    $dom->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previousLibxmlSetting);
    foreach ($dom->getElementsByTagName('table') as $table) {
        if (!$table->hasAttribute('class')) {
            continue;
        }
        // Rows are taken from the current table only; asking $dom would re-read
        // every row of the document once per matching table.
        foreach ($table->getElementsByTagName('tr') as $row) {
            $cols = $row->getElementsByTagName('td');
            if ($cols->length < 3) {
                continue; // e.g. <th>-only rows: item(n) would be null
            }
            $isDataRow = ($cols->item(0)->nodeValue != "Seller")
                and ($cols->item(1)->nodeValue != "Ratio")
                and ($cols->item(2)->nodeValue != "Amount");
            if (!$isDataRow) {
                continue; // header row: nothing to record
            }
            foreach ($row->getElementsByTagName('a') as $profile) {
                $seller = $cols->item(0)->nodeValue;
                $sellerID = intval(preg_replace('/[^0-9]+/', '', ($profile->getAttribute( 'href' ))), 10);
                $sellerType = sellerTypeExtract($profile->getAttribute( 'href' ));
                $currencysell = preg_replace('/[.0-9]/','',($cols->item(1)->nodeValue));
                $amount = floatval($cols->item(1)->nodeValue);
                $rate = rateExtractor($cols->item(2)->nodeValue);
                // preg_quote() keeps characters such as ']' or '-' in the currency
                // text from breaking the character class.
                $currencybought = preg_replace('/[.0-9'.preg_quote($currencysell, '/').'=]/','',($cols->item(2)->nodeValue));
                $marketLink = $urlCurl;
                $server = "servername";
                $tenGoldref = tenGoldref($rate);
                // NOTE(review): $dayTime, $date and $hour are not assigned in this
                // listing; presumably set earlier in the script. TODO confirm.
                $result[] = array (
                    'seller' => $seller,
                    'sellerprofile'=> $sellerID,
                    'sellertype' => $sellerType,
                    'currencysold' => $currencysell,
                    'amount'=> $amount,
                    'rate'=> $rate,
                    'currencybought'=> $currencybought,
                    'marketLink' => $marketLink,
                    'server' => $server,
                    'tengold' => $tenGoldref,
                    'timeMoment' => $dayTime,
                    'date' => $date,
                    'hour' => $hour
                );
            }
        }
    }
}
print_r($result);
/**
 * Download a list of URLs with at most 50 concurrent cURL transfers.
 *
 * The first batch of handles is started up front. Each time a transfer
 * finishes, $callback($info, $output) is invoked, the collected results and
 * the elapsed time are echoed, and the next pending URL (if any) is queued.
 *
 * @param array    $urls           URLs to fetch; keys are ignored.
 * @param callable $callback       Receives curl_getinfo() data and the response body.
 * @param array    $custom_options Extra CURLOPT_* options. Because arrays are merged
 *                                 with "+", the three built-in defaults win on key collision.
 * @return bool Always true.
 */
function curl_multi_download(array $urls, callable $callback, array $custom_options = array())
{
    $time_start_insert = microtime(true);
    // Re-index so that $urls[$i] is valid whatever keys the caller used.
    $urls = array_values($urls);
    $url_count = count($urls);
    // make sure the rolling window isn't greater than the # of urls
    $rolling_window = min(50, $url_count);
    $master = curl_multi_init();
    $fullArr = array();
    $options = array(
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_MAXREDIRS => 5,
    ) + $custom_options;
    // Number of handles currently attached to $master. The loop is driven by this
    // counter because $running is stale right after a new handle has been queued.
    $in_flight = 0;
    // start the first batch of requests
    for ( $i = 0; $i < $rolling_window; $i++ )
    {
        $ch = curl_init();
        $options[CURLOPT_URL] = $urls[$i];
        curl_setopt_array($ch, $options);
        curl_multi_add_handle($master, $ch);
        $in_flight++;
    }
    while ( $in_flight > 0 )
    {
        do
        {
            $execrun = curl_multi_exec($master, $running);
        } while ( $execrun == CURLM_CALL_MULTI_PERFORM );
        if ( $execrun != CURLM_OK )
            break;
        $processed = 0;
        // a request was just completed -- find out which one
        while ( $done = curl_multi_info_read($master) )
        {
            $processed++;
            $info = curl_getinfo($done['handle']);
            // hand the response to the callback; it decides what counts as success
            $output = curl_multi_getcontent($done['handle']);
            $extract = call_user_func_array($callback, array($info, $output));
            $fullArr[] = $extract;
            echo "<br><br>*******************NOW THERE SHOULD BE FULL ARRAY*************************<br>";
            echo 'done';
            echo "<br>";
            print_r($fullArr);
            echo "<br><br>*******************END FULL ARRAY*************************<br>";
            $time_end_insert = microtime(true);
            //dividing with 60 will give the execution time in minutes other wise seconds
            $execution_time_insert = ($time_end_insert - $time_start_insert)/60;
            //execution time of the script
            echo '<br><br><p class="finalTime">++++++++ total execution time: '.round($execution_time_insert,2,PHP_ROUND_HALF_DOWN).' minutes +++++++++++</p><br>';
            // $i is the index of the next URL not yet started. The original tested
            // $urls[$i+1], which left the final URL unfetched.
            if ( $i < $url_count )
            {
                // start a new request (it's important to do this before removing the old one)
                $ch = curl_init();
                $options[CURLOPT_URL] = $urls[$i];
                $i++;
                curl_setopt_array($ch, $options);
                curl_multi_add_handle($master, $ch);
                $in_flight++;
            }
            // remove the curl handle that just completed and free it
            curl_multi_remove_handle($master, $done['handle']);
            curl_close($done['handle']);
            $in_flight--;
        }
        if ( $running > 0 )
        {
            // Wait for socket activity instead of spinning; -1 means "nothing to
            // wait on", so back off briefly in that case.
            if ( curl_multi_select($master, 1.0) === -1 )
                usleep(1000);
        }
        elseif ( $processed === 0 )
        {
            // Nothing is running and nothing was reported: avoid looping forever.
            break;
        }
    }
    curl_multi_close($master);
    return true;
}
/**
 * Return the body of a finished cURL handle converted to UTF-8.
 *
 * The source charset is looked up in this order: HTTP Content-Type header,
 * <meta http-equiv> element, XML declaration, mb_detect_encoding(), and
 * finally ISO-8859-1 for text/html responses.
 *
 * @param resource|CurlHandle $ch Handle whose transfer has completed.
 * @return mixed The UTF-8 body; the unconverted body if conversion fails; or
 *               whatever curl_multi_getcontent() returned when it is not a string.
 */
function curl_multi_getcontent_utf8( $ch )
{
    $data = curl_multi_getcontent( $ch );
    if ( !is_string($data) )
        return $data;
    $charset = null;
    // curl_getinfo() yields a non-string when the server sent no Content-Type.
    $content_type = (string) curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
    /* 1: HTTP Content-Type: header */
    preg_match( '@([\w/+]+)(;\s*charset=(\S+))?@i', $content_type, $matches );
    if ( isset( $matches[3] ) )
    {
        // The value may be quoted (charset="utf-8") or followed by ';'.
        $candidate = trim($matches[3], "\"';");
        if ( $candidate !== '' )
            $charset = $candidate;
    }
    /* 2: <meta> element in the page */
    if ( $charset === null )
    {
        preg_match( '@<meta\s+http-equiv="Content-Type"\s+content="([\w/]+)(;\s*charset=([^\s"]+))?@i', $data, $matches );
        if ( isset( $matches[3] ) )
            $charset = $matches[3];
    }
    /* 3: <xml> element in the page */
    if ( $charset === null )
    {
        preg_match( '@<\?xml.+encoding="([^\s"]+)@si', $data, $matches );
        if ( isset( $matches[1] ) )
            $charset = $matches[1];
    }
    /* 4: PHP's heuristic detection */
    if ( $charset === null )
    {
        $encoding = mb_detect_encoding($data);
        if ($encoding)
            $charset = $encoding;
    }
    /* 5: Default for HTML */
    if ( $charset === null )
    {
        // strpos() reports the offset; the original used strstr(), which returns a
        // string or false and therefore could never be === 0.
        if (strpos($content_type, "text/html") === 0)
            $charset = "ISO-8859-1";
    }
    /* Convert it if it is anything but UTF-8 */
    /* You can change "UTF-8" to "UTF-8//IGNORE" to
       ignore conversion errors and still output something reasonable */
    if ( $charset !== null && strtoupper($charset) != "UTF-8" )
    {
        $converted = iconv($charset, 'UTF-8', $data);
        // iconv() returns false for unknown charsets or invalid input; keep the
        // original bytes rather than throwing the whole page away.
        if ( $converted !== false )
            $data = $converted;
    }
    return $data;
}
// Start the rolling-window download; every finished response is passed to process_response().
curl_multi_download(array('[LINKS TO SCRAPE]'), 'process_response');
/**
 * Callback for curl_multi_download(): turn one HTTP response into scraped rows.
 *
 * @param array  $info     curl_getinfo() data for the finished transfer.
 * @param string $response Response body.
 * @return mixed Whatever scrape() returns, or null (after echoing an error
 *               line) when the HTTP status is not 200.
 */
function process_response( $info, $response )
{
    if ( $info['http_code'] != 200 )
    {
        echo "Error retrieving URL " . $info['url'] . "<br/>";
        return null;
    }
    // $dayTime, $date and $hour were used here without ever being defined in
    // function scope, so scrape() always received null. Read them from the
    // script-level variables of the same name when those exist.
    // NOTE(review): assumes the script sets them globally - confirm.
    $dayTime = isset($GLOBALS['dayTime']) ? $GLOBALS['dayTime'] : null;
    $date = isset($GLOBALS['date']) ? $GLOBALS['date'] : null;
    $hour = isset($GLOBALS['hour']) ? $GLOBALS['hour'] : null;
    $infos = scrape($response,$dayTime,$date,$hour);
    return $infos;
}
/**
 * Fetch a single URL with cURL and return its body.
 *
 * @param string $url URL to download.
 * @return string|false Response body, or false if the transfer failed.
 */
function curl_download( $url )
{
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
    // curl_exec() is a plain function; the original "self::curl_exec" is a
    // fatal error outside of a class.
    $data = curl_exec($ch);
    curl_close($ch);
    return $data;
}
/**
 * Extract every seller offer from the class-bearing tables of one HTML page.
 *
 * A row is used only when it has at least three <td> cells and is not the
 * "Seller / Ratio / Amount" header row. One entry is produced per <a> element
 * in such a row.
 *
 * The original assigned "$result = array(...)" on every iteration, so only the
 * last offer survived; offers are now appended and all of them are returned.
 *
 * @param string $data    Raw HTML of the page.
 * @param mixed  $dayTime Stored unchanged under 'timeMoment'.
 * @param mixed  $date    Stored unchanged under 'date'.
 * @param mixed  $hour    Stored unchanged under 'hour'.
 * @return array List of associative arrays, one per offer; empty when the
 *               input is empty or contains no matching rows.
 */
function scrape($data,$dayTime,$date,$hour) {
    $result = array();
    // DOMDocument::loadHTML() rejects an empty document, and failed downloads
    // arrive here as null/false/''.
    if (!is_string($data) || trim($data) === '') {
        return $result;
    }
    $dom = new DOMDocument();
    $dom->preserveWhiteSpace = false; // only has an effect when set before loading
    // Real-world HTML is rarely valid; collect libxml complaints instead of emitting warnings.
    $previousLibxmlSetting = libxml_use_internal_errors(true);
    $dom->loadHTML($data);
    libxml_clear_errors();
    libxml_use_internal_errors($previousLibxmlSetting);
    foreach ($dom->getElementsByTagName('table') as $table) {
        if (!$table->hasAttribute('class')) {
            continue;
        }
        // Rows are taken from the current table only; asking $dom would re-read
        // every row of the document once per matching table.
        foreach ($table->getElementsByTagName('tr') as $row) {
            $cols = $row->getElementsByTagName('td');
            if ($cols->length < 3) {
                continue; // e.g. <th>-only rows: item(n) would be null
            }
            $isDataRow = ($cols->item(0)->nodeValue != "Seller")
                and ($cols->item(1)->nodeValue != "Ratio")
                and ($cols->item(2)->nodeValue != "Amount");
            if (!$isDataRow) {
                continue; // header row: nothing to record
            }
            foreach ($row->getElementsByTagName('a') as $profile) {
                $seller = $cols->item(0)->nodeValue;
                $sellerID = intval(preg_replace('/[^0-9]+/', '', ($profile->getAttribute( 'href' ))), 10);
                $sellerType = sellerTypeExtract($profile->getAttribute( 'href' ));
                $currencysell = preg_replace('/[.0-9]/','',($cols->item(1)->nodeValue));
                $amount = floatval($cols->item(1)->nodeValue);
                $rate = rateExtractor($cols->item(2)->nodeValue);
                // preg_quote() keeps characters such as ']' or '-' in the currency
                // text from breaking the character class.
                $currencybought = preg_replace('/[.0-9'.preg_quote($currencysell, '/').'=]/','',($cols->item(2)->nodeValue));
                //$marketLink = $nodes;
                $server = "servername";
                $tenGoldref = tenGoldref($rate);
                $result[] = array (
                    'seller' => $seller,
                    'sellerprofile'=> $sellerID,
                    'sellertype' => $sellerType,
                    'currencysold' => $currencysell,
                    'amount'=> $amount,
                    'rate'=> $rate,
                    'currencybought'=> $currencybought,
                    //'marketLink' => $marketLink,
                    'server' => $server,
                    'tengold' => $tenGoldref,
                    'timeMoment' => $dayTime,
                    'date' => $date,
                    'hour' => $hour
                );
            }
        }
    }
    return $result;
}
函数curl\u multi\u下载(数组$URL,可调用$callback,数组$custom\u options=array())
{
$time\u start\u insert=微时间(真);
//确保滚动窗口不大于URL的#
$rolling_window=50;
$rolling\u window=(sizeof($url)<$rolling\u window)?sizeof($url):$rolling\u window;
$master=curl_multi_init();
$curl_arr=array();
$options=array(
CURLOPT_RETURNTRANSFER=>true,
CURLOPT_FOLLOWLOCATION=>true,
CURLOPT_MAXREDIRS=>5,
)+$custom_选项;
//启动第一批请求
对于($i=0;$i<$rolling_window;$i++)
{
$ch=curl_init();
$options[CURLOPT_URL]=$URL[$i];
curl_setopt_数组($ch$options);
卷曲多加手柄($master,$ch);
}
做
{
而($execrun=curl\u multi\u exec($master,$running))==CURLM\u CALL\u multi\u PERFORM);
如果($execrun!=CURLM_OK)
打破
//一个请求刚刚完成--找出哪一个
而($done=curl\u multi\u info\u read($master))
{
$info=curl_getinfo($done['handle']);
//请求成功。使用回调函数处理输出。
$output=curl\u multi\u getcontent($done['handle']);
$extract=call_user_func_数组($callback,array($info,$output));
如果(空($fullArr)){
$fullArr=数组($extract);
}否则{
(数组推送($fullArr,$extract));
}
echo“
*******************现在应该有完整的数组******************************************
”;
回音“完成”;
回声“
”;
印刷费($fullArr);
echo“
*********************************************************************************************
”;
$time\u end\u insert=微时间(真);
//除以60将给出以分钟或秒为单位的执行时间
$execution\u time\u insert=($time\u end\u insert-$time\u start\u insert)/60;
//脚本的执行时间
echo'
+total execution time:'.round($execution\u time\u insert,2,PHP\u round\u HALF\u DOWN)。'minutes+;
如果(isset($URL[$i+1]))
{
//启动新请求(在删除旧请求之前执行此操作很重要)
$ch=curl_init();
$options[CURLOPT_URL]=$URL[$i++];//增量i
curl_setopt_数组($ch$options);
卷曲多加手柄($master,$ch);
}
//移除刚刚完成的卷曲手柄
卷曲多重移除手柄($master,$done['handle');
}
}同时($运行);
卷曲多合($master);
返回true;
}
函数curl\u multi\u getcontent\u utf8($ch)
{
$data=curl\u multi\u getcontent($ch);
如果(!是字符串($data))
返回$data;
未设置($字符集);
$content\u type=curl\u getinfo($ch,CURLINFO\u content\u type);
/*1:HTTP内容类型:标头*/
preg_match('@([\w/+]+)(;\s*字符集=(\s+)))@i',$content_type,$matches);
如果(isset($matches[3]))
$charset=$matches[3];
/*2:页面中的元素*/
如果(!isset($charset))
{
preg_match('@loadHTML($html);
$dom->preserveWhiteSpace=false;
$result=array();
$tables=$dom->getElementsByTagName('table');
foreach($tables作为$table){
如果($table->hasAttribute('class')){
$rows=$dom->getElementsByTagName(“tr”);
foreach($行作为$行){
$cols=$row->getElementsByTagName('td');
$aref=$row->getElementsByTagName('a');
foreach($aref作为$profile){
如果($cols->item(0)->nodeValue!=“卖方”)和($cols->item(1)->nodeValue!=“比率”)和($cols->item(2)->nodeValue!=“金额”)){
$seller=$cols->item(0)->nodeValue;
$sellerID=intval(preg_replace('/[^0-9]+/','',($profile->getAttribute('href')),10);
$sellerType=sellerTypeExtract($profile->getAttribute('href');
$currencysell=preg_replace('/[0-9]/','',($cols->item(1)->nodeValue));
$amount=floatval($cols->item(1)->nodeValue);
$rate=rateExtractor($cols->item(2)->nodeValue);
$currencybo