Php 加载外部目录中的所有TXT文件
所以我需要加载所有的txt文件:作为一个txt文件,并进入我的服务器,这是一个不同的Orcahub 由于某种原因,它不起作用。我不能让它真正让HTML甚至做正则表达式 我尝试的是:Php 加载外部目录中的所有TXT文件,php,regex,curl,Php,Regex,Curl,所以我需要加载所有的txt文件:作为一个txt文件,并进入我的服务器,这是一个不同的Orcahub 由于某种原因,它不起作用。我不能让它真正让HTML甚至做正则表达式 我尝试的是: <?php $ch = curl_init(); curl_setopt($ch, CURLOPT_URL, 'http://orcahub.com/unchecked-proxy-list'); curl_setopt($ch, CURLOPT_HEADER, FALSE); curl_setopt(
<?php
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, 'http://orcahub.com/unchecked-proxy-list');
curl_setopt($ch, CURLOPT_HEADER, FALSE);
curl_setopt($ch, CURLOPT_NOBODY, FALSE); // remove body
curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
$st = curl_exec($ch);
//curl_close($ch);
//preg_match_all("/(.*\.txt)/", $st, $out);
var_dump($ch);
?>
更新:
新问题,使用以下脚本时出现服务器错误500:
更新:发现此问题来自URL后的换行符
<?php
function disguise_curl($url) {
//Prepare Curl;
$curl = curl_init();
//Setup Headers (Firefox 2.0.0.6);
$header[0] = "Accept: text/xml,application/xml,application/xhtml+xml,";
$header[0] .= "text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5";
$header[] = "Cache-Control: max-age=0";
$header[] = "Connection: keep-alive";
$header[] = "Keep-Alive: 300";
$header[] = "Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7";
$header[] = "Accept-Language: en-us,en;q=0.5";
$header[] = "Pragma: ";
//Setup Curl;
curl_setopt($curl, CURLOPT_URL, $url);
curl_setopt($curl, CURLOPT_USERAGENT, 'Googlebot/2.1 (+http://www.google.com/bot.html)');
curl_setopt($curl, CURLOPT_HTTPHEADER, $header);
curl_setopt($curl, CURLOPT_REFERER, 'http://orcahub.com/unchecked-proxy-list/');
curl_setopt($curl, CURLOPT_ENCODING, 'gzip,deflate');
curl_setopt($curl, CURLOPT_AUTOREFERER, true);
curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($curl, CURLOPT_TIMEOUT, 60);
//Execute Curl;
$html = curl_exec($curl);
//End Curl;
curl_close($curl);
//Output the HTML;
return $html;
}
function rem_href($x) { return substr(strstr($x, '>'), strlen('>')); }
$response = disguise_curl('http://orcahub.com/unchecked-proxy-list/');
preg_match_all("/<a[\s]+[^>]*?href[\s]?=[\s\"\']+"."(.*?)[\"\']+.*?>"."([^<]+|.*?)?<\/a>/", $response, $matches, PREG_SET_ORDER );
foreach($matches as $value) {
$proxylists[] = 'http://orcahub.com/unchecked-proxy-list/'.rem_href($value[0]);
};
echo $proxylists[0];
$response = disguise_curl($proxylists[0]);
//Server Error 500 Here;
echo $response;
?>
“”([^来自一个函数,该函数添加了头以隐藏调用,一个用于解析响应的正则表达式I:
function disguise_curl($url)
{
$curl = curl_init();
// Setup headers - I used the same headers from Firefox version 2.0.0.6
// below was split up because php.net said the line was too long. :/
$header[0] = "Accept: text/xml,application/xml,application/xhtml+xml,";
$header[0] .= "text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5";
$header[] = "Cache-Control: max-age=0";
$header[] = "Connection: keep-alive";
$header[] = "Keep-Alive: 300";
$header[] = "Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7";
$header[] = "Accept-Language: en-us,en;q=0.5";
$header[] = "Pragma: "; // browsers keep this blank.
curl_setopt($curl, CURLOPT_URL, $url);
curl_setopt($curl, CURLOPT_USERAGENT, 'Googlebot/2.1 (+http://www.google.com/bot.html)');
curl_setopt($curl, CURLOPT_HTTPHEADER, $header);
curl_setopt($curl, CURLOPT_REFERER, 'http://www.google.com');
curl_setopt($curl, CURLOPT_ENCODING, 'gzip,deflate');
curl_setopt($curl, CURLOPT_AUTOREFERER, true);
curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($curl, CURLOPT_TIMEOUT, 10);
$html = curl_exec($curl); // execute the curl command
curl_close($curl); // close the connection
return $html; // and finally, return $html
}
$response = disguise_curl('http://orcahub.com/unchecked-proxy-list/');
preg_match_all("/<a[\s]+[^>]*?href[\s]?=[\s\"\']+"."(.*?)[\"\']+.*?>"."([^<]+|.*?)?<\/a>/", $response, $matches, PREG_SET_ORDER );
foreach($matches as $value) {
var_dump($value);
};
函数伪装卷曲($url)
{
$curl=curl_init();
//设置标题-我使用了Firefox版本2.0.0.6中相同的标题
//以下内容被拆分,因为php.net说行太长。:/
$header[0]=“接受:text/xml、application/xml、application/xhtml+xml”;
$header[0]。=“text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5”;
$header[]=“缓存控制:最大年龄=0”;
$header[]=“连接:保持活动状态”;
$header[]=“保持活动状态:300”;
$header[]=“接受字符集:ISO-8859-1,utf-8;q=0.7,*;q=0.7”;
$header[]=“接受语言:en-us,en;q=0.5”;
$header[]=“Pragma:;//浏览器将此项保留为空。
curl_setopt($curl,CURLOPT_URL,$URL);
curl_setopt($curl,CURLOPT_USERAGENT,'Googlebot/2.1(+http://www.google.com/bot.html)');
curl_setopt($curl,CURLOPT_HTTPHEADER,$header);
curl_setopt($curl,CURLOPT_REFERER,'http://www.google.com');
curl_setopt($curl,CURLOPT_编码'gzip,deflate');
curl_setopt($curl,CURLOPT_AUTOREFERER,true);
curl_setopt($curl,CURLOPT_RETURNTRANSFER,1);
curl_setopt($curl,CURLOPT_超时,10);
$html=curl\u exec($curl);//执行curl命令
curl_close($curl);//关闭连接
return$html;//最后,返回$html
}
$response=伪装http://orcahub.com/unchecked-proxy-list/');
preg\u match\u all(“/]*?href[\s]?=[\s\“\']+”(.*?[\“\']+.*?>”([^所以获取所有文件并写入其中,对吗?正确,我相信服务器正在阻止我的请求,但我不这么认为,因为你可以浏览它们:)你可以使用xpath获取所有文件名:然后使用每个txt文件()函数-将所有行作为一个数组并使用fopen附加到您的文件中似乎不起作用+1!!!很抱歉回复太晚,但它很有魅力!我非常感谢您的帮助!好吧,所以我遇到了另一个问题。当我使用您的脚本(稍微修改)进入.txt文件时,它会给我一个服务器错误500。请检查我更新的帖子