Warning: file_get_contents(/data/phpspider/zhask/data//catemap/7/kubernetes/5.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
PHP简单HTML DOM解析器_Php_Parsing_Screen Scraping_Simple Html Dom - Fatal编程技术网

PHP简单HTML DOM解析器

PHP简单HTML DOM解析器,php,parsing,screen-scraping,simple-html-dom,Php,Parsing,Screen Scraping,Simple Html Dom,我正在使用简单的HTMLDOM解析器,用一堆子页面进行屏幕抓取。出于某种原因,它解析前40个子页面很好,但当它解析到第41个子页面时,没有错误 我已经制作并尝试在我的脚本中记录我所做的一切,以及在简单的HTMLDOM解析器中记录一些venets,但是我没有找到错误 有人知道为什么解析41号URL时会这样吗?或者有人知道简单的HTMLDOM解析器在某些情况下会失败吗 我的测试页面: 这是我的脚本,我使用的是未修改版本的简单HTMLDOM解析器。 有趣的事情发生在get_-lections()中,我

我正在使用简单的HTMLDOM解析器,用一堆子页面进行屏幕抓取。出于某种原因,它解析前40个子页面很好,但当它解析到第41个子页面时,没有错误

我已经制作并尝试在我的脚本中记录我所做的一切,以及在简单的HTMLDOM解析器中记录一些venets,但是我没有找到错误

有人知道为什么解析41号URL时会这样吗?或者有人知道简单的HTMLDOM解析器在某些情况下会失败吗

我的测试页面:

这是我的脚本,我使用的是未修改版本的简单HTMLDOM解析器。 有趣的事情发生在get_-lections()中,我在其中标记了一个简单的HTML DOM解析器

define("LECTION_STATUS_REGULAR", 0);
define("LECTION_STATUS_CHANGED", 1);
define("LECTION_STATUS_CANCELLED", 2);

define("LECTION_DOCUMENTS_NONE", 0);
define("LECTION_DOCUMENTS_TRUE", 1);

define("AMOUNT_OF_WEEKS_IN_A_YEAR", 52);

include_once("simple_html_dom.php");

function clean_text($text)
{
    $text = trim($text);
    $text = strip_tags($text);
    $text = html_entity_decode($text, ENT_QUOTES, "UTF-8");
    $text = utf8_decode($text);

    return $text;
}

function get_links_for_lections($weeks)
{
    echo "Finding links<br /><textarea style=\"width:70%;height:150px;\">";

    foreach($weeks as $week)
    {
        // **
        // 
        // THIS IS WHERE I CALL SIMPLE HTML DOM PARSER
        //
        // **

        echo " * Retrieving HTML...\n";
        $html = file_get_html("http://www.lectio.dk/lectio/285/SkemaNy.aspx?type=elev&elevid=2444366210&week=" . $week['week'] . $week['year']);
        echo " * HTML retrieved...\n";

        $lections_regular = $html->find('a[class="s2skemabrik s2bgbox s2withlink"]');
        $lections_changed = $html->find('a[class="s2skemabrik s2bgbox s2changed s2withlink"]');
        $lections_cancelled = $html->find('a[class="s2skemabrik s2bgbox s2cancelled s2withlink"]');
        $lections = array_merge($lections_regular, $lections_changed, $lections_cancelled);

        foreach($lections as $lection)
        {
            $links[] = "http://www.lectio.dk" . $lection->href;
        }
    }

    echo "</textarea>
    <hr />";

    return $links;
}

function get_lections($links)
{
    // Create array to hold lections
    $lections = array();

    // Loop through links
    $num = 1;
    foreach($links as $link)
    {   
        echo $num . ". " . $link . "<br />
        <textarea style=\"width:70%;height:150px;\">";

        // Initialize lection
        $lection = array();
        $lection['status'] = LECTION_STATUS_REGULAR;
        $lection['documents'] = LECTION_DOCUMENTS_NONE;

        echo " * Retrieving HTML...\n";
        $html = file_get_html($link);
        echo " * HTML retrieved\n";

        // Loop through rows
        foreach($html->find("tr") as $row)
        {
            echo " * New cell\n";

            // Get name of row
            $row_name = $row->find("th");
            $row_name = $row_name['0']->innertext;

            echo " - Row name: \"" . $row_name . "\"\n";

            if ($row_name == "Type:")
            {
                echo " - Checking type...\n";

                // Row tells what type it is
                $cell = $row->find("td");
                $content = $cell['0']->innertext;
                $lection['type'] = clean_text($content);

                echo " - Type checked\n";
            }
            else if ($row_name == "Titel:")
            {
                echo " - Checking title...\n";

                // Row tells the title
                $cell = $row->find("td");
                $content = $cell['0']->innertext;
                $lection['title'] = clean_text($content);

                echo " - Title checked\n";
            }
            else if ($row_name == "Hold:")
            {
                echo " - Checking subject...\n";

                // Row tells what the subject is
                $cell = $row->find("td");
                $content = $cell['0']->innertext;
                $lection['subject'] = clean_text($content);

                echo " - Subject checked\n";
            }
            else if ($row_name == "Lærere:")
            {
                echo " - Checking teachers...\n";

                // Row tells who the teacher is 
                $cell = $row->find("td");
                $content = $cell['0']->innertext;
                $lection['teachers'] = clean_text($content);

                echo " - Teachers checked\n";
            }
            else if ($row_name == "Lokaler:")
            {
                echo " - Checking location...\n";

                // Row tells the location
                $cell = $row->find("td");
                $content = $cell['0']->innertext;
                $lection['location'] = clean_text($content);

                echo " - Location checked\n";
            }
            else if ($row_name == "Note:")
            {
                echo " - Checking note...\n";

                // Row contains a note
                $cell = $row->find("td");
                $content = $cell['0']->innertext;
                $lection['note'] = clean_text($content);

                echo " - Note checked\n";
            }
            elseif ($row_name == "Dokumenter:")
            {
                echo " - Checking documents...\n";

                // Row contains the documents
                $cell = $row->find("td");
                $content = $cell['0']->plaintext;
                $content = clean_text($content);
                if ($content)
                {
                    // We can't get the titles of the documents as we are not logged in
                    // Instead we tell the user that there are documents available
                    $lection['documents'] = LECTION_DOCUMENTS_TRUE;
                }

                echo " - Documents checked\n";
            }
            else if ($row_name == "Lektier:")
            {
                echo " - Checking homework...\n";

                // Row contains the homework
                $cell = $row->find("td");
                $content = $cell['0']->innertext;
                $lection['homework'] = clean_text($content);

                echo " - Homework checked\n";
            }
            else if ($row_name == "Vises:")
            {
                echo " - Checking status (part 1)...\n";

                // Row tells where the lection is shown 
                $cell = $row->find("td");
                $content = $cell['0']->plaintext;
                $content = clean_text($content);
                if (strstr($content, ","))
                {
                    // If the above is true, the lection is NOT REGULAR
                    // Now we know that the lection is either changed or cancellde
                    // We assume it is changed
                    // Below we check if the lection is cancelled (Where $row_namme == "Status:")
                    $lection['status'] = LECTION_STATUS_CHANGED;
                }

                echo " - Status (part 1) checked\n";
            }
        }

        // Add lection to array of lections
        $lections[] = $lection;
        print_r($lection);

        echo " - Lection added!</textarea><br /><br />";

        $num += 1;
    }

    return $lections;
}

function get_weeks($amount_of_weeks)
{
    $weeks = array();

    // Current week
    $week_now = date('W');
    $year_now = date('Y');

    // Demo
    $week_now = 44;

    // Last week to fetch
    $last_week = $week_now + $amount_of_weeks;

    // Add weeks to array
    for ($i = $week_now; $i <= $last_week; $i++)
    {
        $week = array();

        if ($i > AMOUNT_OF_WEEKS_IN_A_YEAR)
        {
            // Week is next year
            $week['week'] = $i - AMOUNT_OF_WEEKS_IN_A_YEAR;
            $week['year'] = $year_now + 1;
        }
        else
        {
            // Week is in this year
            $week['week'] = $i;
            $week['year'] = $year_now;
        }

        // Add week to weeks
        $weeks[] = $week;
    }

    return $weeks;
}

$weeks = get_weeks(5);
$links = get_links_for_lections($weeks);
$lections = get_lections($links);
echo "<hr />";
print_r($lections);
echo "<hr />";
define(“选择状态常规”,0);
定义(“选择状态改变”,1);
定义(“选择状态取消”,2);
定义(“选择\文档\无”,0);
定义(“选择文件”1);
定义(“每年的周数”,52);
包括一次(“simple_html_dom.php”);
函数clean_text($text)
{
$text=修剪($text);
$text=带标签($text);
$text=html_entity_decode($text,ENT_引号,“UTF-8”);
$text=utf8\u解码($text);
返回$text;
}
函数获取选项链接(周)
{
回显“查找链接
”; foreach($weeks作为$week) { // ** // //这就是我称之为简单HTMLDOM解析器的地方 // // ** echo“*正在检索HTML…\n”; $html=文件\u获取\u html(“http://www.lectio.dk/lectio/285/SkemaNy.aspx?type=elev&elevid=2444366210&week=“$周['周]。$周['年]); echo“*检索到HTML…\n”; $lections_regular=$html->find('a[class=“s2skemabrik s2bgbox s2withlink”]); $selections_changed=$html->find('a[class=“s2skemabrik s2bgbox s2changed s2withlink”]); $lections_cancelled=$html->find('a[class=“s2skemabrik s2bgbox s2cancelled s2withlink”]); $lections=array\u merge($lections\u regular,$lections\u changed,$lections\u cancelled); foreach($lection作为$lection) { $links[]=”http://www.lectio.dk“$lection->href; } } 回声“
; 返回$links; } 函数获取选项($links) { //创建数组以保存选择 $lections=array(); //循环链接 $num=1; foreach($links作为$link) { echo$num.“.”$link.“
"; //初始化选择 $lection=array(); $rection['status']=rection_status_REGULAR; $lection['documents']=lection\u documents\u NONE; echo“*正在检索HTML…\n”; $html=file\u get\u html($link); echo“*检索到HTML\n”; //循环行 foreach($html->find(“tr”)作为$row) { echo“*新单元格\n”; //获取行的名称 $row_name=$row->find(“th”); $row_name=$row_name['0']->innertext; echo“-行名称:\”.$Row\u名称。“\”\n”; 如果($row_name==“Type:”) { echo“-检查类型…\n”; //行告诉它是什么类型的 $cell=$row->find(“td”); $content=$cell['0']->innertext; $lection['type']=干净的文本($content); echo“-已选中类型\n”; } else if($row_name==“Titel:”) { echo“-正在检查标题…\n”; //行显示标题 $cell=$row->find(“td”); $content=$cell['0']->innertext; $lection['title']=干净的文本($content); echo“-已选中标题\n”; } else if($row_name==“Hold:”) { echo“-正在检查主题…\n”; //行表示主题是什么 $cell=$row->find(“td”); $content=$cell['0']->innertext; $selection['subject']=干净的文本($content); echo“-已检查主题\n”; } else if($row_name==“Lærere:”) { echo“-检查教师…\n”; //行告诉谁是老师 $cell=$row->find(“td”); $content=$cell['0']->innertext; $lection['teachers']=干净的文本($content); echo“-已选中教师\n”; } else if($row_name==“Lokaler:”) { echo“-正在检查位置…\n”; //行显示位置 $cell=$row->find(“td”); $content=$cell['0']->innertext; $lection['location']=干净的文本($content); echo“-已检查位置\n”; } else if($row_name==“注意:”) { echo“-检查注释…\n”; //行中包含一个注释 $cell=$row->find(“td”); $content=$cell['0']->innertext; $lection['note']=干净的文本($content); echo“-已选中注释\n”; } elseif($row_name==“dokumcenter:”) { echo“-检查文档…\n”; //行包含文档 $cell=$row->find(“td”); $content=$cell['0']->纯文本; $content=clean_text($content); 如果($content) { //我们无法获取文档的标题,因为我们未登录 //相反,我们告诉用户有可用的文档 $lection['documents']=lection_documents_TRUE; } echo“-已检查文档\n”; } else if($row_name==“Lektier:”) { echo“-检查作业…\n”; //这一行包含家庭作业 $cel