Php 文件获取内容脚本适用于某些网站，但不适用于其他网站_Php_Regex_Parsing_File Get Contents

Php 文件获取内容脚本适用于某些网站，但不适用于其他网站

php regex parsing

Php 文件获取内容脚本适用于某些网站，但不适用于其他网站,php,regex,parsing,file-get-contents,Php,Regex,Parsing,File Get Contents,我希望构建一个PHP脚本，用于解析特定标记的HTML。我一直在使用这个代码块，改编自：该脚本适用于某些网站（如上面的google），但当我在其他网站（如freshdirect）上尝试时，我会出现以下错误： “警告：文件获取内容（）[函数.文件获取内容]：无法打开流：HTTP请求失败！” 我在StackOverflow上看到了很多很棒的例子，比如在php.ini中启用extension=php\u openssl.dll。但是（1）我的php.ini版本中没有extension=php_op

我希望构建一个PHP脚本，用于解析特定标记的HTML。我一直在使用这个代码块，改编自：

该脚本适用于某些网站（如上面的google），但当我在其他网站（如freshdirect）上尝试时，我会出现以下错误：

“警告：文件获取内容（）[函数.文件获取内容]：无法打开流：HTTP请求失败！”

我在StackOverflow上看到了很多很棒的例子，比如在php.ini中启用

extension=php\u openssl.dll

。但是（1）我的php.ini版本中没有

extension=php_openssl.dll

，并且（2）当我将其添加到extensions部分并重新启动WAMP服务器时，仍然没有成功

有人能给我指一下正确的方向吗？多谢各位

$html = file_get_html('http://google.com/');
$title = $html->find('title')->innertext;

或者如果你更喜欢preg_match，你应该使用cURL而不是fgc

function curl($url){

    $headers[]  = "User-Agent:Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13";
    $headers[]  = "Accept:text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
    $headers[]  = "Accept-Language:en-us,en;q=0.5";
    $headers[]  = "Accept-Encoding:gzip,deflate";
    $headers[]  = "Accept-Charset:ISO-8859-1,utf-8;q=0.7,*;q=0.7";
    $headers[]  = "Keep-Alive:115";
    $headers[]  = "Connection:keep-alive";
    $headers[]  = "Cache-Control:max-age=0";

    $curl = curl_init();
    curl_setopt($curl, CURLOPT_URL, $url);
    curl_setopt($curl, CURLOPT_HTTPHEADER, $headers);
    curl_setopt($curl, CURLOPT_ENCODING, "gzip");
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, 1);
    $data = curl_exec($curl);
    curl_close($curl);
    return $data;

}


$data = curl('http://www.google.com');
$regex = '#<title>(.*?)</title>#mis';
preg_match($regex,$data,$match);
var_dump($match); 
echo $match[1];

函数curl（$url）{
$headers[]=“用户代理：Mozilla/5.0（Windows；U；Windows NT 5.1；en-US；rv:1.9.2.13）Gecko/20101203 Firefox/3.6.13”；
$headers[]=“接受：text/html，application/xhtml+xml，application/xml；q=0.9，*/*；q=0.8”；
$headers[]=“接受语言：en-us，en；q=0.5”；
$headers[]=“接受编码：gzip，deflate”；
$headers[]=“接受字符集：ISO-8859-1，utf-8；q=0.7，*；q=0.7”；
$headers[]=“保持活动状态：115”；
$headers[]=“连接：保持活动状态”；
$headers[]=“缓存控制：最大使用期限=0”；
$curl=curl_init（）；
curl_setopt（$curl，CURLOPT_URL，$URL）；
curl_setopt（$curl，CURLOPT_HTTPHEADER，$headers）；
curl_setopt（$curl，CURLOPT_编码，“gzip”）；
curl_setopt（$curl，CURLOPT_RETURNTRANSFER，1）；
curl_setopt（$curl，CURLOPT_FOLLOWLOCATION，1）；
$data=curl\u exec（$curl）；
curl_close（$curl）；
返回$data；
}
$data=curl（$data）http://www.google.com');
$regex='#（.*）#mis'；
preg_match（$regex，$data，$match）；
var_dump（$match）；
echo$match[1]；

它只需要一个用户代理（“any”实际上，任何字符串都足够）：

看

当然，您可以在ini中设置：

 ini_set("user_agent","any");
 echo file_get_contents("http://www.freshdirect.com");

。。。但是我更愿意为下一个程序员明确说明这一点。

另一个选项：一些主机禁用

CURLOPT\u FOLLOWLOCATION

，因此递归是您想要的，如果出现任何错误，也会登录到文本文件中。这也是如何使用

DOMDocument（）

提取内容的一个简单示例，显然它并不广泛，但您可以构建它

<?php 
function file_get_site($url){
(function_exists('curl_init')) ? '' : die('cURL Must be installed. Ask your host to enable it or uncomment extension=php_curl.dll in php.ini');
$curl = curl_init();
$header[0] = "Accept: text/xml,application/xml,application/xhtml+xml,";
$header[0] .= "text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5";
$header[] = "Cache-Control: max-age=0";
$header[] = "Connection: keep-alive";
$header[] = "Keep-Alive: 300";
$header[] = "Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7";
$header[] = "Accept-Language: en-us,en;q=0.5";
$header[] = "Pragma: ";

curl_setopt($curl, CURLOPT_URL, $url);
curl_setopt($curl, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0 Firefox/5.0');
curl_setopt($curl, CURLOPT_HTTPHEADER, $header);
curl_setopt($curl, CURLOPT_HEADER, true);
curl_setopt($curl, CURLOPT_REFERER, $url);
curl_setopt($curl, CURLOPT_ENCODING, 'gzip,deflate');
curl_setopt($curl, CURLOPT_AUTOREFERER, true);
curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
curl_setopt($curl, CURLOPT_TIMEOUT, 60);

$html = curl_exec($curl);

$status = curl_getinfo($curl);
curl_close($curl);

if($status['http_code']!=200){
    if($status['http_code'] == 301 || $status['http_code'] == 302) {
        list($header) = explode("\r\n\r\n", $html, 2);
        $matches = array();
        preg_match("/(Location:|URI:)[^(\n)]*/", $header, $matches);
        $url = trim(str_replace($matches[1],"",$matches[0]));
        $url_parsed = parse_url($url);
        return (isset($url_parsed))? file_get_site($url):'';
    }
    $oline='';
    foreach($status as $key=>$eline){$oline.='['.$key.']'.$eline.' ';}
    $line =$oline." \r\n ".$url."\r\n-----------------\r\n";
    $handle = @fopen('./curl.error.log', 'a');
    fwrite($handle, $line);
    return FALSE;
}
return $html;
}


function get_content_tags($source,$tag,$id=null,$value=null){
    $xml = new DOMDocument();
    @$xml->loadHTML($source);

    foreach($xml->getElementsByTagName($tag) as $tags) {
        if($id!=null){
            if($tags->getAttribute($id)==$value){
                return $tags->getAttribute('content');
            }
        }
        return $tags->nodeValue;
    }
}


$source = file_get_site('http://www.freshdirect.com/about/index.jsp');

echo get_content_tags($source,'title'); //FreshDirect

echo get_content_tags($source,'meta','name','description'); //Online grocer providing high quality fresh......

?>

$eline）{$oline.='['.$key.].$eline.'；}
$line=$oline.\r\n.$url.\r\n--------------\r\n；
$handle=@fopen（'./curl.error.log'，'a'）；
fwrite（$handle，$line）；
返回FALSE；
}
返回$html；
}
函数get_content_tags（$source，$tag，$id=null，$value=null）{
$xml=newdomdocument（）；
@$xml->loadHTML（$source）；
foreach（$xml->getElementsByTagName（$tag）作为$tags）{
如果（$id！=null）{
如果（$tags->getAttribute（$id）==$value）{
返回$tags->getAttribute（'content'）；
}
}
返回$tags->nodeValue；
}
}
$source=文件\u获取\u站点（'http://www.freshdirect.com/about/index.jsp');
echo获取内容标签（$source，'title'）//生鲜直达
echo获取内容标签（$source，'meta'，'name'，'description'）//提供优质新鲜食品的在线杂货商。。。。。。
?>

也许可以看看下面这样的curl：如果您不能在@Wrikken、更快的dom解析和搜索/更改dom之间做出选择，那么只能使用简单的html dom（但如果它不习惯于DOM，那么进行实际更改可能会有点冗长，因此可以使用与simplehtmldom一样简单的包链接到答案，但仍然使用DOM/libxml以加快处理速度）@Wrikken，为了更快地获取页面，我将simplehtmldom修改为使用cURL，但并没有真正注意处理速度。我使用regex，尽管它不是“应该被播放”的方式。谢谢你的链接。啊！谢谢！为什么有些网站需要这个，顺便说一句？@ChuckLeButt Naive WAF和/或一些用户代理的自定义响应。但是，正如所述，你应该使用用户代理（尽管不是必需的），在RFC中遵循应该是一个好主意。

 ini_set("user_agent","any");
 echo file_get_contents("http://www.freshdirect.com");

<?php 
function file_get_site($url){
(function_exists('curl_init')) ? '' : die('cURL Must be installed. Ask your host to enable it or uncomment extension=php_curl.dll in php.ini');
$curl = curl_init();
$header[0] = "Accept: text/xml,application/xml,application/xhtml+xml,";
$header[0] .= "text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5";
$header[] = "Cache-Control: max-age=0";
$header[] = "Connection: keep-alive";
$header[] = "Keep-Alive: 300";
$header[] = "Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7";
$header[] = "Accept-Language: en-us,en;q=0.5";
$header[] = "Pragma: ";

curl_setopt($curl, CURLOPT_URL, $url);
curl_setopt($curl, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0 Firefox/5.0');
curl_setopt($curl, CURLOPT_HTTPHEADER, $header);
curl_setopt($curl, CURLOPT_HEADER, true);
curl_setopt($curl, CURLOPT_REFERER, $url);
curl_setopt($curl, CURLOPT_ENCODING, 'gzip,deflate');
curl_setopt($curl, CURLOPT_AUTOREFERER, true);
curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
curl_setopt($curl, CURLOPT_TIMEOUT, 60);

$html = curl_exec($curl);

$status = curl_getinfo($curl);
curl_close($curl);

if($status['http_code']!=200){
    if($status['http_code'] == 301 || $status['http_code'] == 302) {
        list($header) = explode("\r\n\r\n", $html, 2);
        $matches = array();
        preg_match("/(Location:|URI:)[^(\n)]*/", $header, $matches);
        $url = trim(str_replace($matches[1],"",$matches[0]));
        $url_parsed = parse_url($url);
        return (isset($url_parsed))? file_get_site($url):'';
    }
    $oline='';
    foreach($status as $key=>$eline){$oline.='['.$key.']'.$eline.' ';}
    $line =$oline." \r\n ".$url."\r\n-----------------\r\n";
    $handle = @fopen('./curl.error.log', 'a');
    fwrite($handle, $line);
    return FALSE;
}
return $html;
}


function get_content_tags($source,$tag,$id=null,$value=null){
    $xml = new DOMDocument();
    @$xml->loadHTML($source);

    foreach($xml->getElementsByTagName($tag) as $tags) {
        if($id!=null){
            if($tags->getAttribute($id)==$value){
                return $tags->getAttribute('content');
            }
        }
        return $tags->nodeValue;
    }
}


$source = file_get_site('http://www.freshdirect.com/about/index.jsp');

echo get_content_tags($source,'title'); //FreshDirect

echo get_content_tags($source,'meta','name','description'); //Online grocer providing high quality fresh......

?>