Warning: file_get_contents(/data/phpspider/zhask/data//catemap/3/xpath/2.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
Python 爬虫规则不起作用_Python_Xpath_Web Scraping_Web Crawler_Scrapy Spider - Fatal编程技术网

Python 爬虫规则不起作用

Python 爬虫规则不起作用,python,xpath,web-scraping,web-crawler,scrapy-spider,Python,Xpath,Web Scraping,Web Crawler,Scrapy Spider,我正在尝试构建一个spider,使用python的scrapy框架为纽约理工学院的课程刮取数据。。。下面是我的蜘蛛(nyitspider.py)。谁能告诉我哪里出了问题 from scrapy.spiders import CrawlSpider, Rule, BaseSpider, Spider from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor from scrapy.selector import Selector fr

我正在尝试使用 Python 的 Scrapy 框架构建一个 spider,用来抓取纽约理工学院(NYIT)的课程数据。下面是我的 spider(nyitspider.py)。谁能告诉我哪里出了问题?

from scrapy.spiders import CrawlSpider, Rule, BaseSpider, Spider
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.selector import Selector
from scrapy.http import HtmlResponse

from nyit_sample.items import NyitSampleItem


# Spider from the question: subclasses CrawlSpider, starts at the NYIT course
# listing and declares two link-extraction rules plus a parse_item callback.
# NOTE(review): the indentation below was flattened by the paste (class body
# and one assignment sit at column 0), so this is not valid Python as shown.
class nyitspider(CrawlSpider):
name = 'nyitspider'
allowed_domains = ['nyit.edu']
start_urls = ['http://www.nyit.edu/academics/courses/']

# First rule has no callback; second rule routes matches to parse_item.
# NOTE(review): every URL matched by the second pattern also matches the
# first; presumably the first rule then takes those links - confirm against
# the Scrapy CrawlSpider rule-ordering documentation.
rules = (
    Rule(LxmlLinkExtractor(
         allow=('.*/academics/courses', ),
    )),

Rule(LxmlLinkExtractor(
         # NOTE(review): the pattern contains a run of literal spaces between
         # the digit classes - looks like a line-wrap artifact; confirm.
         allow=('.*/academics/courses/[a-z][a-z][a-z]-[a-z][a-z]-[0-9][0-9]    [0-9]/', ),
    ), callback='parse_item'),

)

# Fills one item from the first two rows of the #course_catalog_table element
# and yields it. Each field takes element [0] of the extracted list.
def parse_item(self, response):
    # NOTE(review): Course is not among the imports visible in this snippet
    # (only NyitSampleItem is imported) - verify where it is defined.
    item = Course()
    item["institute"] = 'New York Institute of Technology'
    item['site'] = 'www.nyit.edu'
    item['title'] = response.xpath('//*[@id="course_catalog_table"]/tbody/tr[1]/td[2]/a').extract()[0]
item['id'] = response.xpath('//*[@id="course_catalog_table"]/tbody/tr[1]/td[1]/a').extract()[0]
    item['credits'] = response.xpath('//*[@id="course_catalog_table"]/tbody/tr[1]/td[3]').extract()[0]
    item['description'] = response.xpath('//*[@id="course_catalog_table"]/tbody/tr[2]/td/text()[1]').extract()[0]



    yield item

您必须在 parse_item 方法中正确创建 item 对象(使用已导入的 NyitSampleItem),并且该方法应当返回结果。下面是一个建议,但您还需要在此基础上继续改进:

# -*- coding: utf-8 -*-
from scrapy.spiders import CrawlSpider, Rule, BaseSpider, Spider
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.selector import Selector
from scrapy.http import HtmlResponse

from nyit_sample.items import NyitSampleItem


class nyitspider(CrawlSpider):
    """Crawl the NYIT course catalogue and build one item per course page.

    Listing pages under ``/academics/courses`` are only followed; pages whose
    URL looks like a course slug (``abc-de-123/``) are handed to
    :meth:`parse_item`.
    """

    name = 'nyitspider'
    allowed_domains = ['nyit.edu']
    start_urls = ['http://www.nyit.edu/academics/courses/']

    # Rows of the course table. Browsers insert <tbody> when rendering, but
    # the raw HTML may not contain it, so accept both layouts.
    _ROWS = (
        '(//*[@id="course_catalog_table"]/tbody/tr'
        ' | //*[@id="course_catalog_table"]/tr)'
    )

    # Order matters: a link is handled by the first rule that extracts it,
    # so the specific course rule must come before the catch-all one.
    rules = (
        Rule(
            LxmlLinkExtractor(
                # The original pattern had stray spaces between the digit
                # classes, which can never match an encoded URL.
                allow=(r'.*/academics/courses/[a-z]{3}-[a-z]{2}-[0-9]{3}/', ),
            ),
            callback='parse_item',
        ),
        Rule(
            LxmlLinkExtractor(
                allow=(r'.*/academics/courses', ),
            ),
            # No callback: these pages are only traversed to find course links.
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Build a ``NyitSampleItem`` from a course page.

        Args:
            response: the downloaded course page.

        Returns:
            The populated item. ``description`` is ``None`` when the page has
            no matching text node, instead of raising ``IndexError``.
        """
        rows = self._ROWS
        item = NyitSampleItem()
        item['institute'] = 'New York Institute of Technology'
        item['site'] = 'www.nyit.edu'
        # string(...) collapses the matched element to its text content.
        item['title'] = response.xpath(
            'string(%s[1]/td[2]/a)' % rows).extract_first()
        item['id'] = response.xpath(
            'string(%s[1]/td[1]/a)' % rows).extract_first()
        item['credits'] = response.xpath(
            'string(%s[1]/td[3])' % rows).extract_first()
        item['description'] = response.xpath(
            rows + '[2]/td/text()[1]').extract_first()
        return item

我们能从中得到什么?运行后的输出如下:
2017-03-17 07:20:59 [scrapy.extensions.telnet] DEBUG: Telnet console listening on 127.0.0.1:6026
2017-03-17 07:20:59 [scrapy.core.engine] DEBUG: Crawled (200) (referer: None) ['cached']
首先,您可以从所有 XPath 表达式中删除 tbody 标记:它是由浏览器添加的,而页面的原始响应中并没有它。然后尝试将第二条规则中的正则表达式改为 r'\/academics\/courses\/(.*)'(第一条规则也可以删除)。