用我的网络爬虫(php)扫描元标记

用我的网络爬虫(php)扫描元标记,php,Php,我有一个php网络爬虫程序,我想在其中添加get_meta_tags()函数。它扫描给定网页中的所有URL等等。是否可以将get_meta_标记方法添加到web爬虫程序,以便它从扫描的URL获取meta? session_start(); $domain = "www.ebay.com"; if(empty($_SESSION['page'])) { $original_file = file_get_contents("http://" . $domain . "/"); $

我有一个 PHP 网络爬虫程序,我想在其中加入 get_meta_tags() 函数。这个爬虫会扫描给定网页中的所有 URL 等内容。能否把 get_meta_tags() 加入到爬虫程序中,让它从扫描到的 URL 中获取 meta 标签?
 // Single-step web crawler. Each request downloads one page, stores every
 // link not yet present in the `pages` table, and then remembers in the
 // session which page to download on the next request.
 //
 // NOTE(review): the mysql_* extension used here was removed in PHP 7.
 // Port to mysqli or PDO with prepared statements when upgrading.
 session_start();

 $domain = "www.ebay.com";

 // Connect once up front: both the first request and every follow-up request
 // need the database, so the connection code is no longer duplicated per branch.
 // The values below are placeholders; load real credentials from configuration
 // instead of hard-coding them in the script.
 $connect = mysql_connect("xxxxx", "xxxxx", "xxxx");

 if (!$connect)
 {
     die("MySQL could not connect!");
 }

 $DB = mysql_select_db('xxxx');

 if (!$DB)
 {
     die("MySQL could not select Database!");
 }

 if (empty($_SESSION['page']))
 {
     // First request of the session: reset the counter of newly stored URLs
     // and start crawling from the domain's home page.
     $_SESSION['i'] = 0;
     $original_file = file_get_contents("http://" . $domain . "/");
 }
 else
 {
     // Follow-up request: download the page chosen at the end of the last run.
     $PAGE = $_SESSION['page'];
     $original_file = file_get_contents("$PAGE");
 }

 // file_get_contents() returns false on failure. Treat that as an empty
 // document so the run still advances to the next page instead of aborting.
 if ($original_file === false)
 {
     $original_file = "";
 }

 // Drop every tag except <a>, then capture the double-quoted href of each link.
 $stripped_file = strip_tags($original_file, "<a>");
 preg_match_all("/<a(?:[^>]*)href=\"([^\"]*)\"(?:[^>]*)>(?:[^<]*)<\/a>/is", $stripped_file, $matches);

 foreach ($matches[1] as $value)
 {
     // strpos() returns 0 when the needle is found at the very start and
     // false when it is absent, so the result must be compared with === 0;
     // a loose comparison against the string 'FALSE' does not test either.
     $is_absolute = (strpos($value, "http://") === 0 || strpos($value, "https://") === 0);

     // Links without a scheme are resolved against the crawled domain.
     $New_URL = $is_absolute ? $value : "http://" . $domain . $value;

     // The URL comes from a remote page, i.e. it is untrusted input:
     // escape it with the connection-aware function, not addslashes().
     $New_URL = mysql_real_escape_string($New_URL);

     $Check = mysql_query("SELECT * FROM pages WHERE url='$New_URL'");

     // Insert only URLs that are not stored yet; skip when the lookup failed.
     if ($Check !== false && mysql_num_rows($Check) == 0)
     {
         mysql_query("INSERT INTO pages (url) VALUES ('$New_URL')");

         $_SESSION['i']++;

         echo $_SESSION['i'] . "";
     }

     // Prints the last MySQL error text, or an empty string when there was none.
     echo mysql_error();
 }

 // Pick the page for the next request: the first row ordered by `rank`.
 $RandQuery = mysql_query("SELECT DISTINCT * FROM pages ORDER BY rank LIMIT 0,1");
 $RandReturn = 0;

 if ($RandQuery !== false)
 {
     $RandReturn = mysql_num_rows($RandQuery);

     while ($row1 = mysql_fetch_assoc($RandQuery))
     {
         $_SESSION['page'] = $row1['url'];
     }
 }

 echo $RandReturn;
 echo $_SESSION['page'];
 mysql_close();

 ?>
session_start();
$domain=“www.ebay.com”;
if(空($_会话['page']))
{
$original\u file=file\u get\u contents(“http://“$domain.”/”;
$\会话['i']=0;
$connect=mysql_connect(“cust-mysql-123-05”、“uthe_774575_0001”、“鲁尼08”);
如果(!$connect)
{
死(“MySQL无法连接!”);
}
$DB=mysql_select_DB('theqlickcom_774575_db1');
如果(!$DB)
{
死(“MySQL无法选择数据库!”);
}
}
如果(isset($_会话['page']))
{
$connect=mysql_connect(“xxxxx”、“xxxxx”、“xxxxx”);
如果(!$connect)
{
死(“MySQL无法连接!”);
}
$DB=mysql\u select\u DB('xxxx');
如果(!$DB)
{
死(“MySQL无法选择数据库!”);
}
$PAGE=$\会话['PAGE'];
$original_file=文件获取内容($PAGE);
}
$stripped_file=strip_标签($original_file,“”);

preg\u match\u all(“/]*)href=\”([^\“]*)\”(?:[^>]*)>(?:[^
首先,为什么要给这一行里的变量加上引号?

$original_file = file_get_contents("$PAGE");
其次,所有元标记都可以通过下面的方式获取:

$tags = get_meta_tags('http://www.example.com/');

因此,在您的示例中,我想您必须使用:

$tags = get_meta_tags($New_URL);

并将该数组保存到您的数据库中。

我以前在从外部源读取html标记时遇到过这个问题。Jstel为我提供了一个很好的解决方案,尽管我相信您可以将她的解决方案合并到您的解决方案中

根据您的代码,以下是它的工作原理:

// Download the home page of $domain and collect every <meta> tag that has a
// name or http-equiv attribute followed by a content attribute.
$domain = "www.ebay.com";
$home_url = "http://" . $domain . "/";
$original_file = file_get_contents($home_url);

// Capture groups: 1 = attribute kind (http-equiv or name),
// 2 = that attribute's value, 3 = the content value.
// Only double-quoted attribute values are matched; the match is case-insensitive.
$meta_pattern = "/<meta[^>]+(http\-equiv|name)=\"([^\"]*)\"[^>]"
              . "+content=\"([^\"]*)\"[^>]*>/i";
preg_match_all($meta_pattern, $original_file, $result);

// Dump the full match array: whole tags first, then one sub-array per group.
print_r($result);
$domain=“www.ebay.com”;
$original\u file=file\u get\u contents(“http://“$domain.”/”;
preg\u match\u all(“/”+(http \-equiv | name)=\”([^\“]*)\“[^>]”“+”content=\”([^\“]*)\“[^>]*>/i”,$original\u file,$result);
打印(结果);
下面是我从这个正则表达式得到的示例结果:

Array
(
    [0] => Array
        (
            [0] => <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
            [1] => <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
            [2] => <meta name="keywords" content="ebay, electronics, cars, clothing, apparel, collectibles, sporting goods, digital cameras, antiques, tickets, jewelry, online shopping, auction, online auction">
            [3] => <meta name="description" content="Buy and sell electronics, cars, fashion apparel, collectibles, sporting goods, digital cameras, baby items, coupons, and everything else on eBay, the world's online marketplace">
            [4] => <meta name="verify-v1" content="j6ZKbG61n+f9pUtbkf69zFRBrRSeUqyfEJ2BjiRxWDQ=">
            [5] => <meta name="y_key" content="acf32e2a69cbc2b0">
            [6] => <meta name="msvalidate.01" content="31154A785F516EC9842FC3BA2A70FB1A">
        )

    [1] => Array
        (
            [0] => http-equiv
            [1] => http-equiv
            [2] => name
            [3] => name
            [4] => name
            [5] => name
            [6] => name
        )

    [2] => Array
        (
            [0] => Content-Type
            [1] => Content-Type
            [2] => keywords
            [3] => description
            [4] => verify-v1
            [5] => y_key
            [6] => msvalidate.01
        )

    [3] => Array
        (
            [0] => text/html; charset=UTF-8
            [1] => text/html; charset=UTF-8
            [2] => ebay, electronics, cars, clothing, apparel, collectibles, sporting goods, digital cameras, antiques, tickets, jewelry, online shopping, auction, online auction
            [3] => Buy and sell electronics, cars, fashion apparel, collectibles, sporting goods, digital cameras, baby items, coupons, and everything else on eBay, the world's online marketplace
            [4] => j6ZKbG61n+f9pUtbkf69zFRBrRSeUqyfEJ2BjiRxWDQ=
            [5] => acf32e2a69cbc2b0
            [6] => 31154A785F516EC9842FC3BA2A70FB1A
        )

)
数组
(
[0]=>阵列
(
[0] => 
[1] => 
[2] => 
[3] => 
[4] => 
[5] => 
[6] => 
)
[1] =>阵列
(
[0]=>http等效
[1] =>http等价
[2] =>名称
[3] =>名称
[4] =>名称
[5] =>名称
[6] =>名称
)
[2] =>阵列
(
[0]=>内容类型
[1] =>内容类型
[2] =>关键字
[3] =>说明
[4] =>验证-v1
[5] =>y_键
[6] =>msvalidate.01
)
[3] =>阵列
(
[0]=>text/html;字符集=UTF-8
[1] =>文本/html;字符集=UTF-8
[2] =>易趣、电子产品、汽车、服装、服装、收藏品、体育用品、数码相机、古董、门票、珠宝、在线购物、拍卖、在线拍卖
[3] =>在全球在线市场易趣上买卖电子产品、汽车、时尚服装、收藏品、体育用品、数码相机、婴儿用品、优惠券以及其他一切
[4] =>j6ZKbG61n+F9PUTBKF69ZFRBRSEUQYFEJ2BJIRXWDQ=
[5] =>acf32e2a69cbc2b0
[6] =>31154A785F516EC9842FC3BA2A70FB1A
)
)

每次你用正则表达式解析 HTML,就会有一只小海豹悲惨地死去——请改用 HTML/DOM 解析器。(相关)
“$x” 这种写法只是货物崇拜(cargo cult)式编程。
我该如何把这个数组保存到数据库中?示例:((
这取决于你。你现在使用的是
mysql_query("INSERT INTO pages (url) VALUES ('$New_URL')")
那么您能否改写这条语句,使其保存该数组?