如何从页面获取所有URL(php)
我有一个包含URL的页面,其中描述一个接一个地列出(类似于书签/站点列表)。如何使用php从该页面获取所有url并将其写入txt文件(每行一个,仅url不带说明) 页面如下所示: 我希望脚本的txt输出如下所示: 单向如何从页面获取所有URL(php),php,Php,我有一个包含URL的页面,其中描述一个接一个地列出(类似于书签/站点列表)。如何使用php从该页面获取所有url并将其写入txt文件(每行一个,仅url不带说明) 页面如下所示: 我希望脚本的txt输出如下所示: 单向 $url="http://wwww.somewhere.com"; $data=file_get_contents($url); $data = strip_tags($data,"<a>"); $d = preg_split("/<\/a>/"
// Fetch a page and print the target of every <a ... href="..."> link, one URL per line.
$url="http://wwww.somewhere.com";
$data=file_get_contents($url);
if( $data === FALSE ){
    // file_get_contents() returns FALSE when the page cannot be read;
    // treat that as an empty page so the loop below simply prints nothing.
    $data = "";
}
// Drop every tag except anchors so only <a ...>text</a> markup is left to scan.
$data = strip_tags($data,"<a>");
// Split on the closing tag so each piece is scanned for its own opening <a ...> tag.
$d = preg_split("/<\/a>/i",$data);
foreach ( $d as $u ){
    // Capture only the quoted href value. Compared with stripping text around the
    // URL, this keeps multi-line link text out of the output and also accepts
    // single-quoted values, attributes placed before href, and upper-case tags.
    // Group 1 is the opening quote character, group 2 the URL itself.
    if( preg_match("/<a\s(?:[^>]*?\s)?href\s*=\s*([\"'])(.*?)\\1/is",$u,$m) ){
        print $m[2]."\n";
    }
}
$url=”http://wwww.somewhere.com";
$data=文件内容($url);
$data=strip_标签($data,“”);
$d=预分割(“/”,$data);
foreach($d为$k=>$u){
如果(STRPO($u,单向
// Fetch a page and print the target of every <a ... href="..."> link, one URL per line.
$url="http://wwww.somewhere.com";
$data=file_get_contents($url);
if( $data === FALSE ){
    // file_get_contents() returns FALSE when the page cannot be read;
    // treat that as an empty page so the loop below simply prints nothing.
    $data = "";
}
// Drop every tag except anchors so only <a ...>text</a> markup is left to scan.
$data = strip_tags($data,"<a>");
// Split on the closing tag so each piece is scanned for its own opening <a ...> tag.
$d = preg_split("/<\/a>/i",$data);
foreach ( $d as $u ){
    // Capture only the quoted href value. Compared with stripping text around the
    // URL, this keeps multi-line link text out of the output and also accepts
    // single-quoted values, attributes placed before href, and upper-case tags.
    // Group 1 is the opening quote character, group 2 the URL itself.
    if( preg_match("/<a\s(?:[^>]*?\s)?href\s*=\s*([\"'])(.*?)\\1/is",$u,$m) ){
        print $m[2]."\n";
    }
}
$url=”http://wwww.somewhere.com";
$data=文件内容($url);
$data=strip_标签($data,“”);
$d=预分割(“/”,$data);
foreach($d为$k=>$u){
如果(strpos($u),另一种方式
// Fetch a page, parse it as HTML and dump the href attribute of every <a> element.
$url = "http://wwww.somewhere.com";
$html = file_get_contents($url);
// Only parse when something was actually downloaded: file_get_contents() returns
// false on failure, and loadHTML() rejects empty input.
if ($html !== false && $html !== '') {
    // Real-world pages are rarely well formed; collect libxml parse errors
    // internally instead of emitting one PHP warning per malformed tag.
    $previousSetting = libxml_use_internal_errors(true);
    $doc = new DOMDocument();
    $doc->loadHTML($html);
    libxml_clear_errors();
    // Restore whatever error mode was active before this snippet ran.
    libxml_use_internal_errors($previousSetting);

    $xpath = new DOMXpath($doc);
    // '//a' selects every anchor element anywhere in the document.
    $nodes = $xpath->query('//a');
    foreach ($nodes as $node) {
        var_dump($node->getAttribute('href'));
    }
}
另一种方式
// Fetch a page, parse it as HTML and dump the href attribute of every <a> element.
$url = "http://wwww.somewhere.com";
$html = file_get_contents($url);
// Only parse when something was actually downloaded: file_get_contents() returns
// false on failure, and loadHTML() rejects empty input.
if ($html !== false && $html !== '') {
    // Real-world pages are rarely well formed; collect libxml parse errors
    // internally instead of emitting one PHP warning per malformed tag.
    $previousSetting = libxml_use_internal_errors(true);
    $doc = new DOMDocument();
    $doc->loadHTML($html);
    libxml_clear_errors();
    // Restore whatever error mode was active before this snippet ran.
    libxml_use_internal_errors($previousSetting);

    $xpath = new DOMXpath($doc);
    // '//a' selects every anchor element anywhere in the document.
    $nodes = $xpath->query('//a');
    foreach ($nodes as $node) {
        var_dump($node->getAttribute('href'));
    }
}
您可以使用它来获取给定网页中的所有链接
<?php
// Download the page and print the href of every link it contains, each followed by <br>.
// NOTE(review): $url is not assigned in this snippet - presumably the surrounding
// code sets it before this point; confirm.
$var = fread_url($url);
// $matches is passed normally: preg_match_all() already declares the parameter
// by reference, and call-time "&$matches" is a fatal error since PHP 5.4.
preg_match_all ("/a[\s]+[^>]*?href[\s]?=[\s\"\']+".
"(.*?)[\"\']+.*?>"."([^<]+|.*?)?<\/a>/",
$var, $matches);
// Capture group 1 holds the href values; group 2 (the link text) is not needed.
$matches = $matches[1];
foreach($matches as $var)
{
    print($var."<br>");
}
/**
 * Download the contents of a URL.
 *
 * Uses cURL when the extension is loaded: redirects are followed, a fixed
 * browser User-Agent is sent and cookies are saved to the file 'cookie.txt'.
 * Without cURL the URL is read through fopen() instead.
 *
 * @param string $url URL to fetch.
 * @param string $ref Value for the Referer header (only used on the cURL path).
 * @return string|bool The downloaded content, or false when the transfer fails
 *                     or the URL cannot be opened.
 */
function fread_url($url,$ref="")
{
    if(function_exists("curl_init")){
        $user_agent = "Mozilla/4.0 (compatible; MSIE 5.01; ".
                      "Windows NT 5.0)";
        // One handle only; creating a second one would leak the first.
        $ch = curl_init();
        curl_setopt( $ch, CURLOPT_USERAGENT, $user_agent );
        curl_setopt( $ch, CURLOPT_HTTPGET, 1 );
        // Return the body from curl_exec() instead of printing it.
        curl_setopt( $ch, CURLOPT_RETURNTRANSFER, 1 );
        curl_setopt( $ch, CURLOPT_FOLLOWLOCATION, 1 );
        curl_setopt( $ch, CURLOPT_URL, $url );
        curl_setopt( $ch, CURLOPT_REFERER, $ref );
        curl_setopt( $ch, CURLOPT_COOKIEJAR, 'cookie.txt' );
        $html = curl_exec($ch);
        curl_close($ch);
        return $html;
    }

    // Fallback without cURL: read the URL as a stream.
    $hfile = fopen($url,"r");
    if(!$hfile){
        // Report failure the same way the cURL branch does.
        return false;
    }
    // Start from an empty string so the appends below never touch an undefined variable.
    $html = "";
    while(($chunk = fgets($hfile,1024)) !== false){
        $html .= $chunk;
    }
    fclose($hfile);
    return $html;
}
?>
您可以使用此选项获取给定网页中的所有链接
<?php
// Download the page and print the href of every link it contains, each followed by <br>.
// NOTE(review): $url is not assigned in this snippet - presumably the surrounding
// code sets it before this point; confirm.
$var = fread_url($url);
// $matches is passed normally: preg_match_all() already declares the parameter
// by reference, and call-time "&$matches" is a fatal error since PHP 5.4.
preg_match_all ("/a[\s]+[^>]*?href[\s]?=[\s\"\']+".
"(.*?)[\"\']+.*?>"."([^<]+|.*?)?<\/a>/",
$var, $matches);
// Capture group 1 holds the href values; group 2 (the link text) is not needed.
$matches = $matches[1];
foreach($matches as $var)
{
    print($var."<br>");
}
/**
 * Download the contents of a URL.
 *
 * Uses cURL when the extension is loaded: redirects are followed, a fixed
 * browser User-Agent is sent and cookies are saved to the file 'cookie.txt'.
 * Without cURL the URL is read through fopen() instead.
 *
 * @param string $url URL to fetch.
 * @param string $ref Value for the Referer header (only used on the cURL path).
 * @return string|bool The downloaded content, or false when the transfer fails
 *                     or the URL cannot be opened.
 */
function fread_url($url,$ref="")
{
    if(function_exists("curl_init")){
        $user_agent = "Mozilla/4.0 (compatible; MSIE 5.01; ".
                      "Windows NT 5.0)";
        // One handle only; creating a second one would leak the first.
        $ch = curl_init();
        curl_setopt( $ch, CURLOPT_USERAGENT, $user_agent );
        curl_setopt( $ch, CURLOPT_HTTPGET, 1 );
        // Return the body from curl_exec() instead of printing it.
        curl_setopt( $ch, CURLOPT_RETURNTRANSFER, 1 );
        curl_setopt( $ch, CURLOPT_FOLLOWLOCATION, 1 );
        curl_setopt( $ch, CURLOPT_URL, $url );
        curl_setopt( $ch, CURLOPT_REFERER, $ref );
        curl_setopt( $ch, CURLOPT_COOKIEJAR, 'cookie.txt' );
        $html = curl_exec($ch);
        curl_close($ch);
        return $html;
    }

    // Fallback without cURL: read the URL as a stream.
    $hfile = fopen($url,"r");
    if(!$hfile){
        // Report failure the same way the cURL branch does.
        return false;
    }
    // Start from an empty string so the appends below never touch an undefined variable.
    $html = "";
    while(($chunk = fgets($hfile,1024)) !== false){
        $html .= $chunk;
    }
    fclose($hfile);
    return $html;
}
?>
如果我的链接是这样的:“>,上面的代码就找不到该链接。
这是因为user2066719的答案才是正确的(至少在2018年是如此)。你不应该使用正则表达式搜索HTML/XML文档——它们太脆弱,而且现在即使在原生PHP中也有更强大的工具。
如果我的链接是这样的:>,上面的代码就找不到该链接。
这是因为user2066719的答案才是正确的(至少在2018年是如此)。你不应该使用正则表达式来搜索HTML/XML文档——它们太脆弱了,而且现在即使在原生PHP中也有更强大的工具。