PHP 网站爬取返回空结果(使用 preg_match_all)
我正在尝试使用网络爬虫(下面的代码)提取本文中列出的所有查询,但我好像遗漏了什么。标签:php、web-crawler。我的代码如下:
<?php
/**
 * Download a URL with cURL and return the response body.
 *
 * Redirects are followed, and cookies are read from and written to
 * "cookie.txt" (resolved against the current working directory).
 *
 * @param string $url URL to download.
 * @return string|false Response body, or false when the cURL handle could
 *                      not be created or the transfer failed. A warning
 *                      carrying the cURL error text is raised in that case.
 */
function getSslPage($url){
    $ch = curl_init();
    if ($ch === false) {
        trigger_error('getSslPage: could not initialise cURL', E_USER_WARNING);
        return false;
    }

    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    // NOTE(review): peer verification is disabled (kept from the original,
    // where it was set twice). This makes HTTPS transfers open to
    // interception; enable it once a CA bundle is available.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, TRUE);
    curl_setopt($ch, CURLOPT_COOKIEJAR, "cookie.txt");
    curl_setopt($ch, CURLOPT_COOKIEFILE, "cookie.txt");
    curl_setopt($ch, CURLOPT_USERAGENT, "Chrome/36.0.1985.125");

    $body = curl_exec($ch);
    if ($body === false) {
        // Without this the caller only ever sees an "empty" page and has no
        // way to tell a transport failure from a pattern that did not match.
        trigger_error('getSslPage: request for ' . $url . ' failed: ' . curl_error($ch), E_USER_WARNING);
    }
    return $body;
}
// Download the forum index, pull every <td class="FootNotes2"> cell out of it
// and print the link found in each cell, with an <h3> heading whenever the
// linked site changes.
$milesfeed = getSslPage('http://www.usmleforum.com/forum/index.php?forum=1');
if (!is_string($milesfeed) || $milesfeed === '') {
    // getSslPage() hands back false when the transfer fails.
    echo 'Could not download the forum index.<br>';
    $milesfeed = '';
}
preg_match_all('/<td class="FootNotes2">(.*?)<\/td>/s', $milesfeed, $links);
$milesfeed_links = [];
$milesfeed_text = [];
// Site label of the previously printed entry; null until the first heading.
$third = null;
foreach ($links[1] as $miles) {
    // A cell without the expected anchor has no URL to print; skipping it
    // also keeps $milesfeed_text and $milesfeed_links index-aligned.
    if (!preg_match('/<a target="_top" class="Links2" href="(.*?)">/s', $miles, $anchor)) {
        continue;
    }
    $href = $anchor[1];
    $milesfeed_text[] = strip_tags($miles);
    $milesfeed_links[] = strip_tags($href);

    // Derive the site label from the host name. Relative hrefs have no host
    // and therefore get no heading.
    $label = null;
    $host = parse_url($href, PHP_URL_HOST);
    if (is_string($host) && $host !== '') {
        $hostParts = explode('.', $host);
        // "www.example.com" is labelled "example", not "www".
        $label = ($hostParts[0] === 'www' && isset($hostParts[1])) ? $hostParts[1] : $hostParts[0];
    }

    // Compare on the label itself: comparing on the first host segment would
    // treat every "www." site as the same site.
    if ($label !== null && $label !== $third) {
        echo "<h3>".ucfirst($label)."</h3>";
        $third = $label;
    }

    echo '<a href="'.$href.'" target="_blank">'.wordwrap(strip_tags($miles),30).'</a><br><br>';
}
?>
`class="FootNotes2"` 这个类不在 `tr` 上,而是在 `td` 上,这可能会改变您的结果。
编辑:
在这种情况下,您的正则表达式(regex)不正确。您搜索的是……(原文中的代码片段在此处缺失)
搞定了。您的 `curl_exec` 是否返回了 HTML?还是返回结果为空?顺便提一句——值得了解一下关于使用 `CURLOPT_SSL_VERIFYPEER, false` 的普遍看法
,不幸的是,它没有。但是谢谢你提出来。你在 `milesfeed_links=[]` 中也有一个拼写错误:你漏掉了 `$` 符号。@HarishK 我更新了我的答案,我注意到你的正则表达式中也有一个拼写错误。还是不行:`preg_match_all('(.*?)', $milesfeed, $links)`。谢谢您的时间。这是因为您还需要修改第二个正则表达式调用。我使用了 `/(.*)/`
,结果在 `$links[1]` 中
<?php
/**
 * Return the MySQL connection to the local "usmle" database.
 *
 * The connection is opened on first use and then reused, so repeated calls
 * within one request no longer open one connection each. A failed attempt is
 * not cached; the next call tries again.
 *
 * @return mysqli|false Connection object, or false when connecting failed.
 */
function con(){
    static $q = null;
    if (!$q) {
        $q = mysqli_connect('localhost','root','','usmle');
    }
    return $q;
}
/**
 * Insert one row into the `query` table.
 *
 * The values are bound through a prepared statement on a single connection
 * rather than escaped one by one and spliced into the SQL text.
 *
 * @param mixed $a Value for the Query column.
 * @param mixed $b Value for the QueryBy column.
 * @param mixed $c Value for the QueryLink column.
 * @param mixed $d Value for the Date column.
 * @return bool True when the row was inserted, false otherwise.
 */
function addquery($a,$b,$c,$d){
    $db = con();
    if (!$db) {
        return false;
    }

    $stmt = mysqli_prepare($db, "insert into query(Query,QueryBy,QueryLink,Date)values(?,?,?,?)");
    if (!$stmt) {
        return false;
    }

    // Cast to string so a null argument is stored as '' (as the escaped
    // string version did) rather than as SQL NULL.
    $a = (string)$a;
    $b = (string)$b;
    $c = (string)$c;
    $d = (string)$d;
    mysqli_stmt_bind_param($stmt, 'ssss', $a, $b, $c, $d);

    $ok = mysqli_stmt_execute($stmt);
    mysqli_stmt_close($stmt);
    return $ok;
}
/**
 * Insert one row into the `replies` table.
 *
 * The id used to be placed into the SQL text without quotes, where string
 * escaping offers no protection and an empty value produced invalid SQL.
 * It is now validated and bound as an integer.
 *
 * @param mixed $a Numeric value for the QueryID column.
 * @param mixed $b Value for the Reply column.
 * @return bool True when the row was inserted, false when $a is not numeric
 *              or the statement could not be prepared or executed.
 */
function addreply($a,$b){
    if (!is_numeric($a)) {
        return false;
    }

    $db = con();
    if (!$db) {
        return false;
    }

    $stmt = mysqli_prepare($db, "insert into replies(QueryID,Reply)values(?,?)");
    if (!$stmt) {
        return false;
    }

    $queryId = (int)$a;
    $reply = (string)$b;
    mysqli_stmt_bind_param($stmt, 'is', $queryId, $reply);

    $ok = mysqli_stmt_execute($stmt);
    mysqli_stmt_close($stmt);
    return $ok;
}
/**
 * Query the `query` table for its largest QueryID, aliased as "LastID".
 *
 * @return mysqli_result|bool Whatever mysqli_query() returned for the
 *                            statement; the row is not fetched here.
 */
function lastID(){
    $sql = "select MAX(QueryID) as LastID from query LIMIT 1";
    return mysqli_query(con(), $sql);
}
/**
 * Download a URL with cURL and return the response body.
 *
 * Redirects are followed, and cookies are read from and written to
 * "cookie.txt" (resolved against the current working directory).
 *
 * @param string $url URL to download.
 * @return string|false Response body, or false when the cURL handle could
 *                      not be created or the transfer failed. A warning
 *                      carrying the cURL error text is raised in that case.
 */
function getSslPage($url){
    $ch = curl_init();
    if ($ch === false) {
        trigger_error('getSslPage: could not initialise cURL', E_USER_WARNING);
        return false;
    }

    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    // NOTE(review): peer verification is disabled (kept from the original,
    // where it was set twice). This makes HTTPS transfers open to
    // interception; enable it once a CA bundle is available.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, TRUE);
    curl_setopt($ch, CURLOPT_COOKIEJAR, "cookie.txt");
    curl_setopt($ch, CURLOPT_COOKIEFILE, "cookie.txt");
    curl_setopt($ch, CURLOPT_USERAGENT, "Chrome/36.0.1985.125");

    $body = curl_exec($ch);
    if ($body === false) {
        // Without this the caller only ever sees an "empty" page and has no
        // way to tell a transport failure from a pattern that did not match.
        trigger_error('getSslPage: request for ' . $url . ' failed: ' . curl_error($ch), E_USER_WARNING);
    }
    return $body;
}
/**
 * Flatten an HTML fragment into one line of plain text.
 *
 * Every line-break sequence becomes a single space, HTML tags are removed
 * and surrounding whitespace is trimmed. Commas in the text are preserved:
 * line breaks are no longer routed through a "," placeholder, which used to
 * turn every genuine comma into a space and left the spaces produced from
 * leading/trailing line breaks untrimmed.
 *
 * @param string $str Raw HTML fragment.
 * @return string Single-line plain text.
 */
function cleannr($str)
{
    // "\r\n" and "\n\r" are listed first so a two-character line break
    // yields one space rather than two.
    $str = str_replace(array("\r\n", "\n\r", "\n", "\r"), ' ', $str);
    return trim(strip_tags($str));
}
// Crawl page 1 of forum 1: for every thread linked from the index, download
// the thread page, print its first post and the replies, and store them via
// addquery() / addreply().
$set=0;
$athomepage = getSslPage('http://www.usmleforum.com/forum/index.php?forum=1&Page=1');
if (!is_string($athomepage)) {
    // getSslPage() hands back false on a failed transfer; treat as no threads.
    $athomepage = '';
}
preg_match_all('/<td width="64%" height="25" class="FootNotes2"><a href="(.*?)" target="_top" class="Links2">(.*?)<\/a>(.*?)<\/td>/s',$athomepage,$pages);

// Reads the scalar LastID out of the result returned by lastID();
// null when the query failed or the table is empty.
$readLastId = function () {
    $result = lastID();
    if (!($result instanceof mysqli_result)) {
        return null;
    }
    $row = mysqli_fetch_assoc($result);
    mysqli_free_result($result);
    return isset($row["LastID"]) ? $row["LastID"] : null;
};

foreach($pages[1] as $links){
    // ob_flush() raises a notice when no output buffer is active.
    if (ob_get_level() > 0) {
        ob_flush();
    }
    flush();
    $set++;
    echo 'Query No.'.$set.'<br />';

    $link="http://www.usmleforum.com".$links;
    $ipage=getSslPage($link);
    if (!is_string($ipage)) {
        $ipage = '';
    }
    preg_match_all('/<td width="95%" valign="top" colspan="2" class="FormText2">(.*?)<\/td>/s',$ipage,$query);
    preg_match_all('/<td width="97%" colspan="2" valign="top" class="FootNotes2">(.*?)<\/td>/s',$ipage,$by);

    // Nothing matched: the page failed to load or its markup differs.
    if (!isset($query[1][0])) {
        echo 'Skipped : no post found at '.$link.'<br />';
        continue;
    }

    // The first FootNotes2 cell is split on "-" into name and date
    // (presumably "name - date"; verify against the live markup).
    $byline=isset($by[1][0]) ? $by[1][0] : '';
    $explodation=explode("-",$byline);
    $name=$explodation[0];
    $date=isset($explodation[1]) ? $explodation[1] : '';
    $actualquery=cleannr($query[1][0]);

    echo '<h2>Query : </h2><br />';
    echo 'Query : '.$actualquery.'<br />';
    echo 'Query By : '.$name.'<br />';
    echo 'Link : '.$link.'<br />';
    echo 'Date : '.$date.'<br />';

    // Compare MAX(QueryID) before and after the insert. If it did not move,
    // the insert failed and the replies would be attached to an older query.
    // Assumes QueryID grows with every insert (e.g. AUTO_INCREMENT) - confirm.
    $previousID=$readLastId();
    addquery($actualquery,$name,$link,$date);
    $lastID=$readLastId();
    if ($lastID === null || $lastID === $previousID) {
        echo 'Query could not be stored; replies skipped.<br />';
        continue;
    }

    echo '<h2>Replies : </h2><br />';
    // Every FormText2 cell after the first one is a reply.
    foreach (array_slice($query[1], 1) as $reply) {
        if ($reply != "") {
            $replyquery=cleannr($reply);
            echo 'Reply : '.$replyquery.'<br />';
            addreply($lastID,$replyquery);
        }
    }
}
echo "<center><h2>Scraping Done</h2></center>";
?>