用我的网络爬虫(php)扫描元标记
我有一个php网络爬虫程序,我想在其中添加get_meta_tags()函数。它扫描给定网页中的所有URL等等。是否可以将get_meta_tags()方法添加到web爬虫程序,以便它从扫描的URL获取meta?用我的网络爬虫(php)扫描元标记,php,Php,我有一个php网络爬虫程序,我想在其中添加get_meta_tags()函数。它扫描给定网页中的所有URL等等。是否可以将get_meta_tags()方法添加到web爬虫程序,以便它从扫描的URL获取meta? session_start(); $domain = "www.ebay.com"; if(empty($_SESSION['page'])) { $original_file = file_get_contents("http://" . $domain . "/"); $
/*
 * One crawl step, driven by the PHP session.
 *
 * On the first request (no page stored in the session) the front page of
 * $domain is fetched; on later requests the page stored in
 * $_SESSION['page'] is fetched instead. Every <a href="..."> found in the
 * fetched document is inserted into the `pages` table unless it is already
 * there, and the first row of `pages` ordered by `rank` becomes the page
 * for the next request.
 *
 * NOTE(review): this script uses the legacy mysql_* extension, which only
 * exists up to PHP 5.x. It is kept here so the script stays consistent with
 * its original environment.
 */
session_start();
$domain = "www.ebay.com";

/**
 * Opens the MySQL connection and selects the working database.
 *
 * Terminates the script with the original error messages when either step
 * fails.
 *
 * @param string $host     MySQL server host name.
 * @param string $user     MySQL user name.
 * @param string $password MySQL password.
 * @param string $database Name of the database to select.
 * @return resource The link identifier returned by mysql_connect().
 */
function crawler_connect($host, $user, $password, $database)
{
    $link = mysql_connect($host, $user, $password);
    if (!$link)
    {
        die("MySQL could not connect!");
    }
    if (!mysql_select_db($database))
    {
        die("MySQL could not select Database!");
    }
    return $link;
}

// The two cases are mutually exclusive. The original used two separate
// if-statements, so a stored page of '' or '0' ran both branches.
if (empty($_SESSION['page']))
{
    // First request: start from the domain's front page and reset the counter.
    $original_file = file_get_contents("http://" . $domain . "/");
    $_SESSION['i'] = 0;
    $connect = crawler_connect("cust-mysql-123-05", "uthe_774575_0001", "rooney08", 'theqlickcom_774575_db1');
}
else
{
    // Later requests: continue with the page chosen by the previous step.
    $connect = crawler_connect("xxxxx", "xxxxx", "xxxx", 'xxxx');
    $PAGE = $_SESSION['page'];
    $original_file = file_get_contents("$PAGE");
}

// A failed download yields false; treat it as an empty document so the step
// still finishes and selects the next page.
if ($original_file === false)
{
    $original_file = '';
}

// Keep only the anchor tags, then capture the href value of each of them.
$stripped_file = strip_tags($original_file, "<a>");
preg_match_all("/<a(?:[^>]*)href=\"([^\"]*)\"(?:[^>]*)>(?:[^<]*)<\/a>/is", $stripped_file, $matches);

foreach ($matches[1] as $key => $value)
{
    // strpos() returns 0 when the scheme is at the very start of the string
    // and false when it is absent, so the position must be compared strictly.
    // The original compared against the string 'FALSE' with !=, which only
    // behaved like this through loose type juggling.
    if (strpos($value, "http://") !== 0 && strpos($value, "https://") !== 0)
    {
        // No scheme in front: treat the link as a path on the crawled domain.
        $New_URL = "http://" . $domain . $value;
    }
    else
    {
        $New_URL = $value;
    }

    // The URL comes from a remote page, so escape it with the
    // connection-aware function rather than addslashes().
    $New_URL = mysql_real_escape_string($New_URL);

    $Check = mysql_query("SELECT * FROM pages WHERE url='$New_URL'");
    if ($Check === false)
    {
        // The lookup itself failed: report it and move on to the next link.
        echo mysql_error();
        continue;
    }

    $Num = mysql_num_rows($Check);
    if ($Num == 0)
    {
        // Unknown URL: store it and print the running count of new URLs.
        mysql_query("INSERT INTO pages (url)
VALUES ('$New_URL')");
        $_SESSION['i']++;
        echo $_SESSION['i'] . "";
    }
    echo mysql_error();
}

// Pick the page to crawl on the next request: first row ordered by rank.
$RandQuery = mysql_query("SELECT DISTINCT * FROM pages ORDER BY rank LIMIT 0,1");
$RandReturn = mysql_num_rows($RandQuery);
while ($row1 = mysql_fetch_assoc($RandQuery))
{
    $_SESSION['page'] = $row1['url'];
}
echo $RandReturn;
echo $_SESSION['page'];
mysql_close();
?>
session_start();
$domain=“www.ebay.com”;
if(空($_会话['page']))
{
$original\u file=file\u get\u contents(“http://“$domain.”/”;
$\会话['i']=0;
$connect=mysql_connect(“cust-mysql-123-05”、“uthe_774575_0001”、“鲁尼08”);
如果(!$connect)
{
死(“MySQL无法连接!”);
}
$DB=mysql_select_DB('theqlickcom_774575_db1');
如果(!$DB)
{
死(“MySQL无法选择数据库!”);
}
}
如果(isset($_会话['page']))
{
$connect=mysql_connect(“xxxxx”、“xxxxx”、“xxxxx”);
如果(!$connect)
{
死(“MySQL无法连接!”);
}
$DB=mysql\u select\u DB('xxxx');
如果(!$DB)
{
死(“MySQL无法选择数据库!”);
}
$PAGE=$\会话['PAGE'];
$original_file=文件获取内容($PAGE);
}
$stripped_file=strip_标签($original_file,“”);
preg\u match\u all(“/]*)href=\”([^\“]*)\”(?:[^>]*)>(?:[^首先,为什么在这行加引号?:
$original_file = file_get_contents("$PAGE");
其次,所有元标记都可以通过
// get_meta_tags() reads the given page and returns its <meta name="..." content="..."> pairs as an associative array.
$tags = get_meta_tags('http://www.example.com/');
参见PHP手册中get_meta_tags()的文档。
因此,在您的示例中,我想您必须使用:
// Fetch the meta tags of $New_URL, the link URL built inside the crawler's foreach loop.
$tags = get_meta_tags($New_URL);
并将该数组保存到您的数据库中。我以前在从外部源读取html标记时遇到过这个问题。Jstel为我提供了一个很好的解决方案,尽管我相信您可以将她的解决方案合并到您的解决方案中
根据您的代码,以下是它的工作原理:
// Download the front page of the domain and list its <meta> tags.
$domain = "www.ebay.com";
$front_page_url = "http://" . $domain . "/";
$original_file = file_get_contents($front_page_url);

// Capture groups: 1 = attribute kind (http-equiv or name),
// 2 = that attribute's value, 3 = the value of the content attribute.
$meta_pattern = "/<meta[^>]+(http\-equiv|name)=\"([^\"]*)\"[^>]"
    . "+content=\"([^\"]*)\"[^>]*>/i";

preg_match_all($meta_pattern, $original_file, $result);
print_r($result);
$domain=“www.ebay.com”;
$original\u file=file\u get\u contents(“http://“$domain.”/”;
preg\u match\u all(“/”+(http \-equiv | name)=\”([^\“]*)\“[^>]”“+”content=\”([^\“]*)\“[^>]*>/i”,$original\u file,$result);
打印(结果);
下面是我从这个正则表达式得到的示例结果:
Array
(
[0] => Array
(
[0] => <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
[1] => <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
[2] => <meta name="keywords" content="ebay, electronics, cars, clothing, apparel, collectibles, sporting goods, digital cameras, antiques, tickets, jewelry, online shopping, auction, online auction">
[3] => <meta name="description" content="Buy and sell electronics, cars, fashion apparel, collectibles, sporting goods, digital cameras, baby items, coupons, and everything else on eBay, the world's online marketplace">
[4] => <meta name="verify-v1" content="j6ZKbG61n+f9pUtbkf69zFRBrRSeUqyfEJ2BjiRxWDQ=">
[5] => <meta name="y_key" content="acf32e2a69cbc2b0">
[6] => <meta name="msvalidate.01" content="31154A785F516EC9842FC3BA2A70FB1A">
)
[1] => Array
(
[0] => http-equiv
[1] => http-equiv
[2] => name
[3] => name
[4] => name
[5] => name
[6] => name
)
[2] => Array
(
[0] => Content-Type
[1] => Content-Type
[2] => keywords
[3] => description
[4] => verify-v1
[5] => y_key
[6] => msvalidate.01
)
[3] => Array
(
[0] => text/html; charset=UTF-8
[1] => text/html; charset=UTF-8
[2] => ebay, electronics, cars, clothing, apparel, collectibles, sporting goods, digital cameras, antiques, tickets, jewelry, online shopping, auction, online auction
[3] => Buy and sell electronics, cars, fashion apparel, collectibles, sporting goods, digital cameras, baby items, coupons, and everything else on eBay, the world's online marketplace
[4] => j6ZKbG61n+f9pUtbkf69zFRBrRSeUqyfEJ2BjiRxWDQ=
[5] => acf32e2a69cbc2b0
[6] => 31154A785F516EC9842FC3BA2A70FB1A
)
)
数组
(
[0]=>阵列
(
[0] =>
[1] =>
[2] =>
[3] =>
[4] =>
[5] =>
[6] =>
)
[1] =>阵列
(
[0]=>http等效
[1] =>http等价
[2] =>名称
[3] =>名称
[4] =>名称
[5] =>名称
[6] =>名称
)
[2] =>阵列
(
[0]=>内容类型
[1] =>内容类型
[2] =>关键字
[3] =>说明
[4] =>验证-v1
[5] =>y_键
[6] =>msvalidate.01
)
[3] =>阵列
(
[0]=>text/html;字符集=UTF-8
[1] =>文本/html;字符集=UTF-8
[2] =>易趣、电子产品、汽车、服装、服装、收藏品、体育用品、数码相机、古董、门票、珠宝、在线购物、拍卖、在线拍卖
[3] =>在全球在线市场易趣上买卖电子产品、汽车、时尚服装、收藏品、体育用品、数码相机、婴儿用品、优惠券以及其他一切
[4] =>j6ZKbG61n+F9PUTBKF69ZFRBRSEUQYFEJ2BJIRXWDQ=
[5] =>acf32e2a69cbc2b0
[6] =>31154A785F516EC9842FC3BA2A70FB1A
)
)
每次你用正则表达式解析HTML,都会有一只小海豹惨死——请改用HTML解析器。(相关)
"$x"这种写法只是cargo cult编程。
我如何将数组保存到数据库中?示例:((
这取决于你。你现在使用的是mysql_query("INSERT INTO pages (url) VALUES ('$New_URL')")
那么您是否可以重写代码以保存该数组?