带有页面循环的php抓取
我有一个循环,它能准确地显示抓取页面的结果。现在我在需要抓取的地方加了一个循环,一直循环到最后一个分页的页码。我取得了最后一个页码值并把它放入 for 循环中,假设范围是 1 到 1474。现在我想按给定的次数抓取该 URL,即 page=1、page=2 …… page=1474,并希望把结果显示在一组 ul/li 中。以下是我目前写出的代码。请指教,因为它把第一页的数据显示了大约 1474 次。带有页面循环的php抓取,php,loops,curl,web-scraping,Php,Loops,Curl,Web Scraping,我有一个循环,它能准确地显示抓取页面的结果。现在我在需要抓取的地方加了一个循环,一直循环到最后一个分页的页码。我取得了最后一个页码值并把它放入 for 循环中,假设范围是 1 到 1474。现在我想按给定的次数抓取该 URL,即 page=1、page=2 …… page=1474,并希望把结果显示在一组 ul/li 中。以下是我目前写出的代码。请指教,因为它把第一页的数据显示了大约 1474 次。 $ch = curl_init('http://www.qatarliving.com/v3/classifieds/search/categ
// Scrapes the "mobile-devices" classifieds category: prints the category
// title(s), works out the last pagination page from the first response, then
// downloads every page (?page=1 .. ?page=N) and lists its ad descriptions.
$baseUrl = 'http://www.qatarliving.com/v3/classifieds/search/category/mobile-devices';

// One cURL handle is reused for every request; only the URL changes per page.
$ch = curl_init();
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.116 Safari/537.36');

// Downloads $url and parses the response into a DOMDocument.
// Stops the script with the cURL error message when the transfer fails.
$loadDocument = function ($url) use ($ch) {
    curl_setopt($ch, CURLOPT_URL, $url);
    $html = curl_exec($ch);
    if ($html === false) {
        die('error: ' . curl_error($ch));
    }
    $doc = new DOMDocument();
    // loadHTML() rejects an empty string, so an empty body yields an empty document.
    if ($html !== '') {
        // Collect markup warnings internally instead of silencing them with "@".
        $previous = libxml_use_internal_errors(true);
        $doc->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);
    }
    return $doc;
};

// Scraped text is untrusted: escape it before writing it into our own HTML.
$escape = function ($text) {
    return htmlspecialchars($text, ENT_QUOTES, 'UTF-8');
};

$d = $loadDocument($baseUrl);
$x = new DOMXPath($d);

foreach ($x->query('//p[@class="b-filters-block--el-title"]') as $title) {
    echo "<h1>" . $escape($title->nodeValue) . "</h1>";
}

// Highest "page" query parameter among the pagination links. Reading the
// parameter (instead of stripping a fixed absolute URL prefix) also works
// when the site emits relative hrefs.
$arrayvalue = 0;
foreach ($x->query('//a[@class="b-pagination--el-page b-pagination--el-item"]') as $link) {
    parse_str((string) parse_url($link->getAttribute('href'), PHP_URL_QUERY), $params);
    if (isset($params['page']) && is_string($params['page']) && ctype_digit($params['page'])) {
        $arrayvalue = max($arrayvalue, (int) $params['page']);
    }
}

if ($arrayvalue > 0) {
    echo "[ Last pagging value is : " . $arrayvalue . " ]<br><br>";
    // The counter has its own name: reusing $x here would overwrite the
    // DOMXPath object, and each page must be fetched and parsed separately,
    // otherwise the first page's nodes are printed once per iteration.
    for ($page = 1; $page <= $arrayvalue; $page++) {
        $pageXpath = new DOMXPath($loadDocument($baseUrl . '?page=' . $page));
        echo "<h3>Page = " . $page . "</h3>";
        echo "<ul>";
        foreach ($pageXpath->query('//p[@class="b-card--el-description"]') as $description) {
            echo "<li> " . $escape($description->nodeValue) . "</li>";
        }
        echo "</ul>";
    }
}
curl_close($ch);
$ch = curl_init('http://www.qatarliving.com/v3/classifieds/search/category/mobile-devices');
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.116 Safari/537.36');
$res = curl_exec($ch);
if ($res === false) {
die('error: ' . curl_error($ch));
}
curl_close($ch);
$d = new DOMDocument();
@$d->loadHTML($res);
$x = new DOMXPath($d);
$review = $x->query('//p[@class="b-filters-block--el-title"]');
if($review->length > 0) {
foreach($review as $row){
echo "<h1>".$row->nodeValue . "</h1>";
}
}
$pagging = $x->query('//a[@class="b-pagination--el-page b-pagination--el-item"]');
if($pagging->length > 0) {
echo "<ul>";
foreach($pagging as $row){
$tag= '';
$ddt= $row->getAttribute('href');
$url = str_replace('http://www.qatarliving.com/v3/classifieds/search/category/mobile-devices?page=','',$ddt);
$array[$tag] = $url;
}
$arrayvalue = end($array);
echo "[ Last pagging value is : " . end($array) . " ]<br><br>";
$myname = $x->query('//p[@class="b-card--el-description"]');
for ($x = 1; $x <= $arrayvalue; $x++) {
echo "<h3>Page = " . $x . "</h3>";
foreach ($myname as $tag) {
echo "<li> " . $tag->nodeValue . "</li>";
}
}
echo "</ul>";
}
可以试试类似这样的写法:
// Fetches each result page in turn and lists its ad descriptions.
// Relies on $arrayvalue (the last page number) having been set by earlier code.
for ($lp = 1; $lp <= $arrayvalue; $lp++) {
    $url = "http://www.qatarliving.com/v3/classifieds/search/category/mobile-devices?page=" . $lp;
    $html = file_get_contents($url);
    if ($html === false || $html === '') {
        // Nothing to parse (PHP has already raised a warning for a failed
        // request); move on to the next page instead of feeding loadHTML() junk.
        continue;
    }
    $doc = new DOMDocument();
    // Collect markup warnings internally instead of silencing them with "@".
    $previous = libxml_use_internal_errors(true);
    $doc->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);
    // Bind the XPath to the document that was just loaded. Binding it to the
    // first page's $d made every iteration print page-one data again.
    $xpath = new DOMXPath($doc);
    $myname = $xpath->query('//p[@class="b-card--el-description"]');
    echo "<div><h1>" . $url . "</h1>";
    foreach ($myname as $items) {
        // Scraped text is untrusted; escape it before echoing.
        echo htmlspecialchars($items->nodeValue, ENT_QUOTES, 'UTF-8') . "<br>";
    }
    echo "</div>";
}
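for($lp = 1; $lp <= $arrayvalue; $lp++){
$url="http://www.qatarliving.com/v3/classifieds/search/category/mobile-devices?page=" . $lp;
$html = file_get_contents($url);
$doc = new DOMDocument();
@$doc->loadHTML($html);
$xpath = new DOMXPath($d);
$myname = $xpath->query('//p[@class="b-card--el-description"]');
echo "<div><h1>".$url."</div>";
foreach($myname as $items){
echo $items->nodeValue . "<br>";
}
echo "</div>";
}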
这并没有给出预期的结果
// Downloads the first page of the "mobile-devices" category and dumps one
// record per ad card (link, price, time, category, name, poster, image).
$ch = curl_init('http://www.qatarliving.com/v3/classifieds/search/category/mobile-devices');
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.116 Safari/537.36');
$res = curl_exec($ch);
if ($res === false) {
    die('error: ' . curl_error($ch));
}
curl_close($ch);

$d = new DOMDocument();
// loadHTML() rejects an empty string, so an empty body yields an empty document.
if ($res !== '') {
    // Collect markup warnings internally instead of silencing them with "@".
    $previous = libxml_use_internal_errors(true);
    $d->loadHTML($res);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);
}
$x = new DOMXPath($d);

// Not read anywhere in this section; kept in case later code refers to them.
$output = array(
    'class' => '',
    'src' => '',
);
$pagging = $x->query('//a[@class="b-pagination--el-page b-pagination--el-item"]');

// <p> class attribute (compared as-is) => record key.
$paragraphTags = array(
    'b-card--el-deposit-val' => 'price',
    'b-card--el-deposit-time' => 'deposittime',
    'b-ad-excerpt b-par-mod-clear b-line-mod-thin--mix-item' => 'category',
    'b-card--el-description' => 'name',
);
// <a> class attribute (compared after trim) => record key.
$linkTags = array(
    'b-card--el-view img-responsive' => 'addedby',
    'b-card--el-agency-title' => 'by',
);

// Collects the tagged <p>, <a> and <img> descendants of $node into one record.
// When several elements map to the same key, the last one wins.
$buildRecord = function (DOMElement $node) use ($paragraphTags, $linkTags) {
    $record = array();
    foreach ($node->getElementsByTagName('p') as $paragraph) {
        $class = $paragraph->getAttribute('class');
        if (isset($paragraphTags[$class])) {
            $record[$paragraphTags[$class]] = $paragraph->nodeValue;
        }
    }
    foreach ($node->getElementsByTagName('a') as $anchor) {
        $class = trim($anchor->getAttribute('class'));
        if (isset($linkTags[$class])) {
            $record[$linkTags[$class]] = $anchor->nodeValue;
        }
    }
    foreach ($node->getElementsByTagName('img') as $image) {
        $class = trim($image->getAttribute('class'));
        $src = $image->getAttribute('src');
        if ($class === 'b-card--el-view img-responsive') {
            // Use the placeholder only when the card image has NO src. The
            // previous test was inverted and replaced real image URLs.
            $record['image'] = ($src !== '') ? $src : 'no.jpg';
        } else if ($class === 'b-pagination--el-page') {
            $record['pagging'] = $src;
        }
    }
    return $record;
};

$myspan = $x->query('//div[@class="b-items-list"]');
$data_array = array();
if ($myspan->length > 0) {
    foreach ($myspan as $row) {
        // Look for cards inside this list only (relative query with a context
        // node) and start a fresh record per card. A single shared array made
        // every ad overwrite the previous one, leaving one merged record.
        // NOTE(review): assumes the b-card--el-* elements are nested inside
        // each card <span>, as the class naming suggests; verify on the page.
        $cards = $x->query('.//span[@class="b-card b-card-mod-h item "]', $row);
        if ($cards->length === 0) {
            // No card wrappers found: fall back to one record for the whole list.
            $data_array[] = $buildRecord($row);
            continue;
        }
        foreach ($cards as $card) {
            $data_array[] = array('anch' => $card->getAttribute('href')) + $buildRecord($card);
        }
    }
    echo "<pre>";
    // Scraped text is untrusted; escape the dump before sending it to the browser.
    echo htmlspecialchars(print_r($data_array, true), ENT_QUOTES, 'UTF-8');
    echo "</pre>";
}
$ch = curl_init('http://www.qatarliving.com/v3/classifieds/search/category/mobile-devices');
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.116 Safari/537.36');
$res = curl_exec($ch);
if ($res === false) {die('error: ' . curl_error($ch));}
curl_close($ch);
$d = new DOMDocument();
@$d->loadHTML($res);
$x = new DOMXPath($d);
$output = array(
'class' => '',
'src' => '',
);
$pagging = $x->query('//a[@class="b-pagination--el-page b-pagination--el-item"]');
$myspan = $x->query('//div[@class="b-items-list"]');
$data_array = array();
if($myspan->length > 0) {
$array = array();
foreach ($myspan as $row) {
$data = $x->query('//span[@class="b-card b-card-mod-h item "]');
if($data->length > 0) {
foreach ($data as $dt) {
$tag = '';
$class = $dt->getAttribute('class');
$value = $dt->getAttribute('href');
//if (trim($class) == 'b-ad-excerpt b-par-mod-clear b-line-mod-thin--mix-item'){
$tag='anch';
//}
if ($tag) {
$array[$tag] = $value;
}
}
}
$data = $row->getElementsByTagName('p');
foreach ($data as $dt) {
$tag = '';
$class = $dt->getAttribute('class');
$value = $dt->nodeValue;
if ($class == 'b-card--el-deposit-val') {
$tag = 'price';
} else if ($class == 'b-card--el-deposit-time') {
$tag = 'deposittime';
} else if ($class == 'b-ad-excerpt b-par-mod-clear b-line-mod-thin--mix-item') {
$tag = 'category';
} else if ($class == 'b-card--el-description') {
$tag = 'name';
}
if ($tag) {
$array[$tag] = $value;
}
}
$data = $row->getElementsByTagName('a');
foreach ($data as $dt) {
$tag = '';
$class = $dt->getAttribute('class');
$value = $dt->nodeValue;
if (trim($class) == 'b-card--el-view img-responsive') {
$tag = 'addedby';
} else if(trim($class) == 'b-card--el-agency-title'){
$tag = 'by';
}
if ($tag) {
$array[$tag] = $value;
}
}
$data = $row->getElementsByTagName('img');
foreach ($data as $dt) {
$tag = '';
$class = $dt->getAttribute('class');
$src = $dt->getAttribute('src');
$value = $dt->getAttribute('src');
if (trim($class) == 'b-card--el-view img-responsive') {
$tag = 'image';
} else if (!empty($src)) {
$tag = 'image';
$value = "no.jpg";
} else if (trim($class) == 'b-pagination--el-page') {
$tag = 'pagging';
}
if ($tag) {
$array[$tag] = $value;
}
}
$data_array[] = $array;
}
echo "<pre>";
print_r($data_array);
echo "</pre>";
}
在循环中,必须针对每一页分别获取该页的列表。下面是对你的评论“在循环中 new DOMDocument() 可行吗?”的回答:不要在循环中创建任何对象,这会增加内存使用并导致内存不足异常。——但计数器确实会随着页数的增加而递增。请修复我的代码。