Php 外部站点的标题和元标记

Php 外部站点的标题和元标记,php,curl,Php,Curl,我需要的标题和元标签和购物网站使用url图像标签。 这是我的代码,它使用亚马逊产品链接工作。但它不像url那样工作: ? 获取标记的我的代码: $url ="http://rads.stackoverflow.com/amzn/click/B009T9QCWI"; $ch = curl_init(); $timeout = 5; curl_setopt($ch, CURLOPT_URL, $url); curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); c

我需要根据 URL 获取购物网站页面的标题、元标签和图像标签。这是我的代码,它对亚马逊产品链接有效,但对下面这样的 URL 无效:

  • ?
  • 我用来获取这些标记的代码:

    // Fetch a product page and collect its <title>, its description/keywords
    // meta tags and every absolute JPEG image URL, then dump the result with
    // print_r(). The collected record is appended to $domarray.
    $url = "http://rads.stackoverflow.com/amzn/click/B009T9QCWI";
    $timeout = 5;

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
    // Many shop links (including the one above) are redirectors; without
    // following redirects we would parse the redirect stub, not the product page.
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    curl_setopt($ch, CURLOPT_MAXREDIRS, 5);
    $data = curl_exec($ch);
    // curl_error() must be read before the handle is closed.
    $curl_error = ($data === false) ? curl_error($ch) : '';
    curl_close($ch);

    if ($data === false) {
        // Report the failure instead of silently parsing "false" as HTML.
        trigger_error('cURL request for ' . $url . ' failed: ' . $curl_error, E_USER_WARNING);
    }
    $returned_content = ($data === false) ? '' : $data;

    $doc = new \DOMDocument();
    if ($returned_content !== '') {
        // Real-world HTML is rarely well-formed: collect libxml's parse
        // warnings internally instead of hiding everything behind "@".
        $previous_libxml_setting = libxml_use_internal_errors(true);
        $doc->loadHTML($returned_content);
        libxml_clear_errors();
        libxml_use_internal_errors($previous_libxml_setting);
    }

    // Page title; empty string when the document has no <title> element.
    $nodes = $doc->getElementsByTagName("title");
    $title = ($nodes->length > 0) ? trim($nodes->item(0)->nodeValue) : '';
    $product_title = str_replace("'", " ", $title);

    // Absolute http(s) image URLs whose path ends in .jpg/.jpeg. Reading the
    // attributes straight from the DOM yields plain strings and avoids
    // re-interpreting the HTML as XML through SimpleXML.
    $image = array();
    foreach ($doc->getElementsByTagName('img') as $img) {
        $src = $img->getAttribute('src');
        $scheme = strtolower((string) parse_url($src, PHP_URL_SCHEME));
        if ($scheme !== 'http' && $scheme !== 'https') {
            continue;
        }
        // Look only at the URL path so a query string ("a.jpg?w=100") does not
        // leak into the extension; computed afresh for every image so a value
        // from a previous iteration is never reused.
        $path = (string) parse_url($src, PHP_URL_PATH);
        $extension = strtolower(pathinfo($path, PATHINFO_EXTENSION));
        if ($extension === 'jpg' || $extension === 'jpeg') {
            $image[] = $src;
        }
    }

    // Meta description / keywords; the name attribute is matched
    // case-insensitively. Defaults keep the variables defined when a page
    // lacks the tag. If a tag occurs more than once, the last one wins.
    $description = '';
    $keywords = '';
    foreach ($doc->getElementsByTagName('meta') as $meta) {
        $meta_name = strtolower($meta->getAttribute('name'));
        if ($meta_name === 'description') {
            $description = $meta->getAttribute('content');
        } elseif ($meta_name === 'keywords') {
            $keywords = $meta->getAttribute('content');
        }
    }

    // The 'img' key is only present when at least one image was found;
    // key order (img, desc, title) is kept.
    $entry = array('desc' => $description, 'title' => $product_title);
    if (!empty($image)) {
        $entry = array('img' => $image) + $entry;
    }
    $domarray[] = $entry;
    print_r($domarray);
    
    $url ="http://rads.stackoverflow.com/amzn/click/B009T9QCWI";
    $ch = curl_init();
    $timeout = 5;
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
    $data = curl_exec($ch);
    curl_close($ch);
    $returned_content = $data;
    $doc = new \DOMDocument();
    @$doc->loadHTML($returned_content);
    $nodes = $doc->getElementsByTagName("title");
    //$title = $nodes->item(0)->nodeValue;
    $product_title = str_replace("'", " ", $title);
    $xml=simplexml_import_dom($doc);
    $images=$xml->xpath("//img");
    $j=0;
    foreach($images as $img) {
        $host = explode(":",$img["src"]);
        $ht = $host[0];
        if ($ht == "http" || $ht == "https" ) {
            $info = pathinfo($img["src"]);
            if (array_key_exists('extension', $info)) {
                $extension =  $info["extension"];
            }
            if ($extension == "jpg" || $extension == "jpeg") {
                $imagesrc[] = $img["src"];
                $j++;
                $image[] = $img["src"] ;
            }
        }
    }
    $metas = $doc->getElementsByTagName('meta');
    for ($i = 0; $i < $metas->length; $i++) {
        $meta = $metas->item($i);
        if ($meta->getAttribute('name') == 'description' || $meta->getAttribute('name') == 'Description') {
            $description = $meta->getAttribute('content');
        }
        if ($meta->getAttribute('name') == 'keywords') {
            $keywords = $meta->getAttribute('content');
        }
    }
    if (empty($image)) {
        $domarray[] = array('desc' => $description, 'title'=>$product_title);
        print_r($domarray);
    } else {
        $domarray[] = array('img' =>$image, 'desc' => $description, 'title'=>$product_title);
        print_r($domarray) ;
    }
    
    为什么不使用 HTML 解析器(例如 simple_html_dom)呢?

    示例:

    // Collect the title, all named meta tags and all image sources of a page
    // with the third-party "PHP Simple HTML DOM Parser" and dump them.
    require_once 'simple_html_dom.php';

    $url ="http://rads.stackoverflow.com/amzn/click/B009T9QCWI";

    // all results stored in this array; the keys are pre-initialised so the
    // shape is the same even for pages without meta tags or images
    $result = array(
        'title' => '',
        'meta'  => array(),
        'image' => array()
    );

    // file_get_html() yields false instead of a DOM object when the page could
    // not be loaded (e.g. empty response) - guard before calling methods on it
    $html = file_get_html( $url );

    if ( $html === false ) {
        trigger_error( 'Could not load or parse ' . $url, E_USER_WARNING );
    } else {
        // page title; find() returns null when the page has no <title>
        $title_node = $html->find( 'title', 0 );
        if ( $title_node !== null ) {
            $result[ 'title' ] = $title_node->plaintext;
        }

        // get all meta tags, which have an attribute "name"
        foreach( $html->find( 'meta[name]' ) as $meta ) {
            $result[ 'meta' ][] = array(
                'name' => $meta->name,
                'content' => $meta->content
            );
        }

        // get all images
        foreach( $html->find( 'img' ) as $image ) {
            $result[ 'image' ][] = $image->src;
        }
    }

    print_r( $result );
    
    输出

    Array
    (
        [title] => Amazon.com: Samsung Galaxy S III, Black 16GB (Verizon Wireless): Cell Phones &amp; Accessories
        [meta] => Array
            (
                [0] => Array
                    (
                        [name] => description
                        [content] => Shop cell phones and accessories at Amazon.com. You&#39;ll find great prices on cases, headsets, and the latest smartphones from carriers like Verizon, AT&amp;T, and Sprint
                    )
    
                [1] => Array
                    (
                        [name] => title
                        [content] => Amazon.com: Samsung Galaxy S III, Black 16GB (Verizon Wireless): Cell Phones &amp; Accessories
                    )
    
                [2] => Array
                    (
                        [name] => keywords
                        [content] => Samsung Galaxy S III, Black 16GB (Verizon Wireless),Samsung,Galaxy S III
                    )
    
            )
    
        [image] => Array
            (
                [0] => http://g-ecx.images-amazon.com/images/G/01/gno/beacon/BeaconSprite-US-01._V397411194_.png
                [1] => http://g-ecx.images-amazon.com/images/G/01/x-locale/common/transparent-pixel._V386942464_.gif
                [2] => http://g-ecx.images-amazon.com/images/G/01/x-locale/common/transparent-pixel._V386942464_.gif
                [3] => http://ecx.images-amazon.com/images/I/41%2Bh%2BUmrcRL._SY300_.jpg
                [4] => http://ecx.images-amazon.com/images/I/41%2Bh%2BUmrcRL._SL500_AA280_.jpg
                [5] => http://g-ecx.images-amazon.com/images/G/01/icons/icon-offsite-sl-7069-t4._V171196157_.png
                [6] => http://g-ecx.images-amazon.com/images/G/01/icons/icon-offsite-sl-7069-t4._V171196157_.png
                [7] => http://ecx.images-amazon.com/images/I/41FBSaIC4AL._SL500_SS100_.jpg
                [8] => http://ecx.images-amazon.com/images/I/41HGvd6-jwL._SL500_SS100_.jpg
                [9] => http://ecx.images-amazon.com/images/I/51jiU%2BiYWUL._SL500_SS100_.jpg
                [10] => http://ecx.images-amazon.com/images/I/317JogSYmkL._SL500_SS100_.jpg
                [11] => http://ecx.images-amazon.com/images/I/41d6B11BDuL._SL500_SS100_.jpg
                [12] => http://ecx.images-amazon.com/images/I/41a94BWHXbL._SL500_SS100_.jpg
                [13] => http://g-ecx.images-amazon.com/images/G/01/wireless/detail-page/B009T9QCWI.main_SM.jpg
                [14] => http://g-ecx.images-amazon.com/images/G/01/wireless/detail-page/B009T9QCWI.pt01_SM.jpg
                [15] => http://g-ecx.images-amazon.com/images/G/01/wireless/detail-page/wireless-box-logo-verizon-box.jpg
                [16] => http://g-ecx.images-amazon.com/images/G/01/th/aplus/a-plus_bottom-217._V180545591_.gif
                [17] => http://g-ecx.images-amazon.com/images/G/01/wireless/detail-page/B009T9QCWI.pt02_SM.jpg
                [18] => http://g-ecx.images-amazon.com/images/G/01/wireless/detail-page/amazon_app_suite_1_sma.jpg
                [19] => http://g-ecx.images-amazon.com/images/G/01/wireless/detail-page/amazon_app_suite_5_sm.jpg
                [20] => http://ecx.images-amazon.com/images/I/41HGvd6-jwL._SL75_SS50_.jpg
                [21] => http://ecx.images-amazon.com/images/I/41FBSaIC4AL._SL75_SS50_.jpg
                [22] => http://ecx.images-amazon.com/images/I/51jiU%2BiYWUL._SL75_SS50_.jpg
                [23] => http://ecx.images-amazon.com/images/I/41a94BWHXbL._SL75_SS50_.jpg
                [24] => http://g-ecx.images-amazon.com/images/G/01/x-locale/communities/reputation/suggestionbox._V192249929_.gif
                [25] => http://g-ecx.images-amazon.com/images/G/01/icons/orange-arrow._V192570247_.gif
                [26] => http://g-ecx.images-amazon.com/images/G/01/icons/orange-arrow._V192570247_.gif
                [27] => http://g-ecx.images-amazon.com/images/G/01/icons/orange-arrow._V192570247_.gif
                [28] => http://g-ecx.images-amazon.com/images/G/01/gno/images/general/navAmazonLogoFooter._V169459313_.gif
                [29] => /gp/uedata/unsticky/182-7026578-6696341//ntpoffrw?noscript&amp;id=158FKQCX6TYATFBQQW0V
            )
    
    )
    
    您可以在循环中传递url,并对所有url执行相同的操作。为了简单起见,我留下了您在图像和元标记上所做的检查。
    希望有帮助。

    您正在尝试将 HTML 当作 XML 来解析,请不要这样做。我认为亚马逊有一个 API,可以用更容易解析的方式获取这些信息。
    @ToBe 请注意代码中用的是 loadHTML()。
    非常感谢。但我的目标不仅仅是 amazon;这段代码对 amazon 是有效的。
    您应该使用格式不正确的 HTML 进行测试以确认这一点,否则它可能会破坏 DOM 和 XPath 查询。您还可以尝试使用正则表达式进行简单的字符串解析(不是 xml/html 解析,而是搜索某些字符串,如“…”)。
    @LalMohan 请参阅我的答案。
    谢谢。但它不适用于这样的 URL:?