如何通过simple_php_dom将刮取的数据插入数据库_Php_Web_Screen Scraping

如何通过simple_php_dom将刮取的数据插入数据库

php web

如何通过simple_php_dom将刮取的数据插入数据库,php,web,screen-scraping,Php,Web,Screen Scraping,我正在实现一个代码，将其中一个站点的刮取值插入到数据库中，但它被插入到数据库中两次。经过太多的分析，我仍然无法理解为什么它会被两次插入数据库：我的代码如下： include('simple_html_dom.php'); $aflink3 = "http://aliveforfootball.com/blog/david-moyes-confident-manchester-united- future/"; $linkurl = $aflink3; // Loading the url

我正在编写一段代码，把从某个网站抓取的数据插入数据库，但同一条数据被插入了两次。经过大量分析，我仍然不明白为什么会被插入两次：

我的代码如下：

include('simple_html_dom.php');

// Address of the article to scrape.
// NOTE(review): the URL contains a space before "future/" - possibly a paste
// artifact; confirm the intended address.
$aflink3 = "http://aliveforfootball.com/blog/david-moyes-confident-manchester-united- future/";
$linkurl = $aflink3;

// Download and parse the page. Stop here when loading fails: every later step
// calls methods on $html and would otherwise die with an unrelated fatal error.
$html = file_get_html($linkurl);
if (!$html) {
    die("Unable to load the page: " . $linkurl);
}

// Selector table used to locate the article body.
// Row 0 is a header row: its entries (from column 1 on) are descendant selectors.
// In every following row, element 0 is a container selector, and a 1 in column c
// means "try the selector '<container> <header entry c>'".
$States = array
(
array("state","div.entry-content",""),
array("article.post",1,1)
);

// Title of the article, read from the <meta property="og:title"> tag.
// The element is looked up once and reused, so the existence check and the
// read can never disagree; $title is null when the tag is absent.
$metatitle = null;
$titlemeta = $html->find("meta[property='og:title']", 0);
if (is_object($titlemeta)) {
    $metatitle = $titlemeta->content;
}
$title = $metatitle;

// Image of the article, read from the <meta property="og:image"> tags.
// $metaimages stays an empty array when no tag exists; otherwise it holds a
// single 'image' entry, and the last matching tag wins.
$metaimages = array();
foreach ($html->find("meta[property='og:image']") as $metaimage) {
    $item['image'] = $metaimage->content;
    $metaimages = $item;
}

/**
 * Collect the paragraph texts of an article element.
 *
 * Every <p> inside $article contributes its plain text. A paragraph is
 * replaced by an empty string when it contains the first five bytes of the
 * text of any <span> nested in a paragraph. Empty spans are ignored: an empty
 * needle matches every string on PHP 8 and would wipe out all paragraphs.
 *
 * Results are published through globals (the function returns nothing):
 *   $articlecontent / $content - paragraph texts after blanking
 *   $spancontent               - plain text of every 'p span' element
 *
 * @param object $article element exposing find($selector), whose results
 *                        expose a ->plaintext property
 */
function findParagraphs($article){
    global $articlecontent;
    global $content;
    global $spancontent;

    $articlecontent = array();
    foreach ($article->find('p') as $p) {
        $articlecontent[] = $p->plaintext;
    }

    // Keep the full span texts for callers, and precompute the non-empty
    // five-byte prefixes that are used for matching.
    $spancontent = array();
    $prefixes = array();
    foreach ($article->find('p span') as $spandiv) {
        $spancontent[] = $spandiv->plaintext;
        $prefix = substr((string) $spandiv->plaintext, 0, 5);
        if ($prefix !== false && $prefix !== '') {
            $prefixes[] = $prefix;
        }
    }

    foreach ($articlecontent as $i => $text) {
        foreach ($prefixes as $prefix) {
            if (strpos($text, $prefix) !== false) {
                $articlecontent[$i] = "";
                // Already blanked; no further prefix can change the result.
                break;
            }
        }
    }

    $content = $articlecontent;
}


$flag = 0;
$article = null;
$state = 0;
// Walk the selector table and stop at the first selector that matches an
// element on the page; that element is handed to findParagraphs().
// The bounds come from the table itself, so adding rows or columns to $States
// needs no change here. Row 0 supplies the descendant selectors, hence its
// length defines the number of columns.
$rows = count($States);
$cols = ($rows > 0) ? count($States[0]) : 0;
for ($row = 0; $row < $rows; $row++) {
    for ($col = 0; $col < $cols; $col++) {
        // Progress trace printed for every visited cell.
        echo "[".$row."][".$col."]<BR>";
        // Only cells holding the integer 1 mark a selector combination to try.
        if (isset($States[$row][$col]) && $States[$row][$col] === 1) {
            $statefound = $States[$row][0]." ".$States[0][$col];
            $article = $html->find($statefound, 0);
            if (isset($article) && ($state == 0)) {
                $state = 1;
                findParagraphs($article);
                // First match wins: leave both loops.
                break 2;
            }
        }
    }
}

// Gather the scraped fields into one record: title, image and content.
$stuff = array();
$stuff['title'] = $title;
$stuff['image'] = $metaimages;
$stuff['content'] = $content;


// Store the scraped record as one row of the Sportparse table.
// NOTE(review): this runs once per execution of the script, so a page that is
// requested twice inserts two rows; a UNIQUE index on linkurl would reject
// repeats - confirm against the table schema.
if ($stuff != null) {
    // Each paragraph becomes its own <p> element; a missing content list
    // yields an empty string.
    $dbcontent = "";
    $paragraphs = (isset($stuff['content']) && is_array($stuff['content'])) ? $stuff['content'] : array();
    foreach ($paragraphs as $paragraph) {
        $dbcontent .= "<p>".$paragraph."</p>";
    }

    // The title is a single string (or null when the page had none), so it is
    // wrapped once instead of being counted like an array.
    $dbtitle = "";
    if (isset($stuff['title'])) {
        $dbtitle = "<p>".$stuff['title']."</p>";
    }

    // The image record is either empty or holds one 'image' entry.
    $dbimage = "";
    if (isset($stuff['image']['image'])) {
        $dbimage = "<p>".$stuff['image']['image']."</p>";
    }

    // The values come from a third-party page, so they are bound as
    // parameters and never concatenated into the SQL text.
    try {
        $pdo = new PDO(
            "mysql:host=localhost;dbname=Parsing",
            "root",
            "password",
            array(PDO::ATTR_ERRMODE => PDO::ERRMODE_EXCEPTION)
        );
        $statement = $pdo->prepare(
            "INSERT INTO Sportparse (linkurl,linktitle,linkimage,linkcontent) VALUES (?, ?, ?, ?)"
        );
        $statement->execute(array($linkurl, $dbtitle, $dbimage, $dbcontent));
    } catch (PDOException $e) {
        // Same outcome as before: stop the script and show the database error.
        die($e->getMessage());
    }
    echo "Data Inserted Successfully";

    // Release the parsed DOM; simple_html_dom keeps circular references.
    $html->clear();
    unset($html);
}
?>

include（'simple_html_dom.php'）；
$aflink3=”http://aliveforfootball.com/blog/david-moyes-confident-manchester-united- 未来/“；
$linkurl=$aflink3；
//加载url
$html=file\u get\u html（$linkurl）；
//用于查找用于抓取数据的html元素的数组状态
$States=数组
(
数组（“state”、“div.entry-content”和“”），
数组（“article.post”，1,1）
);
//查找文章的标题
如果（（$html->find（“meta[property='og:title']”，0））！=null）{$metatitle=$html->find（“meta[property='og:title']”，0）->content；}
$title=$metatitle；
//Foreach查找类型映像的元属性。
$metaimages=array（）；
if（（$html->find（“meta[property='og:image']）！=null）{
foreach（$html->find（“meta[property='og:image']）作为$metaimage）{
$item['image']=$metaimage->content；
$metaimages=$item；
}         
}else{}
//函数查找特定文章的段落
函数findParagraphs（$article）{
全球1美元；
全球内容；
全球$内容；
全球内容；
$spancontent=array（）；
$content=array（）；
$articlecontent=array（）；
foreach（$article->find（'p'）作为$p）{
$articlecontent[]=$p->纯文本；
}
foreach（$article->find（'p span'）作为$spandiv）{
$spancontent[]=$spandiv->纯文本；
}
$articlelength=计数（$articlecontent）；
$spanlength=计数（$spancontent）；
对于（$i=0；$ifind（$statefound，0）；
如果（isset（$article）&（$state==0））{
$state=1；
findParagraphs（$文章）；
破口2；
}   
}   
}
} 
//创建刮取数据的JSON对象
$stuff=数组(
“title”=>$title，
“图像”=>$metaimages，
“内容”=>$content）；
//函数将刮取的数据插入数据库
如果（$stuff！=null）{
全球$linkurl；
$jsencode=json_encode（$stuff）；
$obj=json_decode（$jsencode，TRUE）；
$dbcontent=“”；
对于（$i=0；$iclear（）；
未结算（$html）；
} 
?>

在最后一段负责插入数据库的代码中，数据被插入了两次，而不是只插入一次。

我已经尝试了所有方法，但我无法修复它，我认为这是值得研究的问题，因为我的许多同事都无法解决它。

在某个地方添加一个唯一约束怎么样？这就是为什么我添加了一个变量$state，并将其初始值设置为0；一旦找到匹配的html模式，我就将$state设置为1，并在条件中检查它。