如何通过simple_php_dom将刮取的数据插入数据库

如何通过simple_php_dom将刮取的数据插入数据库,php,web,screen-scraping,Php,Web,Screen Scraping,我正在实现一个代码,将其中一个站点的刮取值插入到数据库中,但它被插入到数据库中两次。经过太多的分析,我仍然无法理解为什么它会被两次插入数据库: 我的代码如下: include('simple_html_dom.php'); $aflink3 = "http://aliveforfootball.com/blog/david-moyes-confident-manchester-united- future/"; $linkurl = $aflink3; // Loading the url

我正在实现一个代码,将其中一个站点的刮取值插入到数据库中,但它被插入到数据库中两次。经过太多的分析,我仍然无法理解为什么它会被两次插入数据库:

我的代码如下:

include('simple_html_dom.php');

$aflink3 = "http://aliveforfootball.com/blog/david-moyes-confident-manchester-united- future/";
$linkurl = $aflink3;

// Loading the url
$html = file_get_html($linkurl);

// an array state to find the html elements for scraping the data
$States = array
(
array("state","div.entry-content",""),
array("article.post",1,1)                   
);

// Finding the title of the article
if(($html->find("meta[property='og:title']",0))!=null){ $metatitle = $html->find   ("meta[property ='og:title']",0)->content;}
$title = $metatitle;

// Foreach to find the meta property of type images.
$metaimages = array();
if(($html->find("meta[property='og:image']"))!=null){
foreach($html->find("meta[property='og:image']") as $metaimage){
       $item['image'] = $metaimage->content;
       $metaimages = $item;
  }         
  }else {}

// Function to find the paragraphs of a particular article
function findParagraphs($article){
global $subtitle1;
global $articlecontent;
global $content;
global $spancontent;

$spancontent = array();

$content = array();
    $articlecontent = array();  
foreach($article->find('p') as $p){
    $articlecontent[] = $p->plaintext;
}

foreach($article->find('p span') as $spandiv){
    $spancontent[] = $spandiv->plaintext;       
}

$articlelength = count($articlecontent);
$spanlength = count($spancontent);

for($i=0;$i<$articlelength;$i++){
    for($j=0;$j<$spanlength;$j++){
    if(strpos($articlecontent[$i],(substr($spancontent[$j],0,5))) === false){
    }else{ $articlecontent[$i] = ""; }
    }
}
$content = $articlecontent;
}


$flag = 0;
$article = null;
$state = 0;
// Function to match the html elements to construct the data for the article section
$rows = count($States);
for($row = 0; $row < 2; $row++) {
 for($col = 0; $col < 3; $col++ ) {
echo "[".$row."][".$col."]<BR>";
if($States[$row][$col] == 1){
    $statefound = $States[$row][0]." ".$States[0][$col];
    $article = $html->find($statefound,0);
    if(isset($article) && ($state == 0)){
        $state = 1;         
        findParagraphs($article);       
        break 2;        
    }   
}   
 }
} 

// Creating the JSON Object of the scraped data
$stuff = array(
     'title' => $title , 
 'image' => $metaimages, 
 'content' => $content );


//Function to insert the Scraped-data into the database
if($stuff != null){ 
global $linkurl;

$jsencode = json_encode($stuff);

$obj = json_decode($jsencode, TRUE);
$dbcontent = "";
for($i=0; $i<count($obj['content']); $i++) {
    $dbcontent .= "<p>".$obj['content'][$i]."</p>";
}

$dbtitle = "";
for($i=0; $i<count($obj['title']); $i++) {
    $dbtitle .= "<p>".$obj['title']."</p>";
}

$dbimage = "";  
for($i=0; $i<count($obj['image']); $i++) {
    $dbimage = "<p>".$obj['image']['image']."</p>";
}

//Intializing the MySql Connections
mysql_connect("localhost", "root", "password") or die(mysql_error());
mysql_select_db("Parsing") or die(mysql_error());
mysql_query("INSERT INTO Sportparse 
(linkurl,linktitle,linkimage,linkcontent)  VALUES('$linkurl','$dbtitle','$dbimage','$dbcontent') ") 
or die(mysql_error());  
echo "Data Inserted Successfully";

//Cleaning up the memory to prevent the memory Leak
$html->clear(); 
unset($html);
} 
?>
include('simple_html_dom.php');
$aflink3=”http://aliveforfootball.com/blog/david-moyes-confident-manchester-united- 未来/“;
$linkurl=$aflink3;
//加载url
$html=file\u get\u html($linkurl);
//用于查找用于抓取数据的html元素的数组状态
$States=数组
(
数组(“state”、“div.entry-content”和“”),
数组(“article.post”,1,1)
);
//查找文章的标题
如果(($html->find(“meta[property='og:title']”,0))!=null){$metatitle=$html->find(“meta[property='og:title']”,0)->content;}
$title=$metatitle;
//Foreach查找类型映像的元属性。
$metaimages=array();
if(($html->find(“meta[property='og:image'])!=null){
foreach($html->find(“meta[property='og:image'])作为$metaimage){
$item['image']=$metaimage->content;
$metaimages=$item;
}         
}else{}
//函数查找特定文章的段落
函数findParagraphs($article){
全球1美元;
全球内容;
全球$内容;
全球内容;
$spancontent=array();
$content=array();
$articlecontent=array();
foreach($article->find('p')作为$p){
$articlecontent[]=$p->纯文本;
}
foreach($article->find('p span')作为$spandiv){
$spancontent[]=$spandiv->纯文本;
}
$articlelength=计数($articlecontent);
$spanlength=计数($spancontent);
对于($i=0;$ifind($statefound,0);
如果(isset($article)&($state==0)){
$state=1;
findParagraphs($文章);
破口2;
}   
}   
}
} 
//创建刮取数据的JSON对象
$stuff=数组(
“title”=>$title,
“图像”=>$metaimages,
“内容”=>$content);
//函数将刮取的数据插入数据库
如果($stuff!=null){
全球$linkurl;
$jsencode=json_encode($stuff);
$obj=json_decode($jsencode,TRUE);
$dbcontent=“”;
对于($i=0;$iclear();
未结算($html);
} 
?>
在插入数据库的最后一个代码中,数据被插入两次,而不是只插入一次。
我已经尝试了所有方法,但我无法修复它,我认为这是值得研究的问题,因为我的许多同事都无法解决它。

在某个地方添加一个唯一的约束怎么样?这就是为什么我添加了一个变量$state,并将其初始设置为0,一旦找到匹配的html模式,我就将其设置为maki将$state设置为1并检查条件