Python Scrapy: crawling but not scraping


With the suggestions provided and a lot of digging through related threads, I was able to get the crawl working for a single page. Now I am trying to change the code to implement multiple rules, but the results are not looking good. Here is a brief description of what I am trying to do:

For the start url = http://sfbay.craigslist.org/, I use parse_items_1 to identify the links and parse them.

At level 2, for the links within http://sfbay.craigslist.org/npo, I need to use parse_items_2 to identify similar links and parse those as well.

The spider is able to crawl (I can see it in the output), but the links are not getting scraped:

2013-02-13 11:23:55+0530 [craigs] DEBUG: Crawled (200) <GET http://sfbay.craigslist.org/npo/index100.html> (referer: http://sfbay.craigslist.org/npo/)
('**parse_items_2:', [u'Development Associate'], [u'http://sfbay.craigslist.org/eby/npo/3610841951.html'])
('**parse_items_2:', [u'Resource Development Assistant'], [u'http://sfbay.craigslist.org/eby/npo/3610835088.html'])
Any help is much appreciated.

Thanks.

In the Scrapy tutorial, items are created inside the callbacks and then returned to be passed further down the line, rather than being bound to an instance of the spider class. So removing the __init__ part and reworking some of the callback code seems to fix the problem:

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from tutorial.items import CraigslistSampleItem

class MySpider(CrawlSpider):
    name = "craigs"
    allowed_domains = ["sfbay.craigslist.org"]
    start_urls = ["http://sfbay.craigslist.org/"]

    rules = (
        # Paginated listing pages such as index100.html, index200.html, ...
        Rule(SgmlLinkExtractor(allow=(r"index\d00\.html",)), callback="parse_items_2", follow=True),
        # The nonprofit section of the site
        Rule(SgmlLinkExtractor(allow=(r"sfbay\.craigslist\.org/npo",)), callback="parse_items_1", follow=True),
    )

    def parse_items_1(self, response):
        # A fresh item per match, returned from the callback rather than
        # stored on the spider instance.
        items = []
        hxs = HtmlXPathSelector(response)
        titles = hxs.select("//div")
        for title in titles:
            item = CraigslistSampleItem()
            item["title"] = title.select("//li/a/text()").extract()
            item["link"] = title.select("//li/a/@href").extract()
            print('**parse_items_1:', item["title"])
            items.append(item)
        return items

    def parse_items_2(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.select("//p")
        items = []
        for title in titles:
            item = CraigslistSampleItem()
            item["title"] = title.select("a/text()").extract()
            item["link"] = title.select("a/@href").extract()
            print('**parse_items_2:', item["title"], item["link"])
            items.append(item)
        return items

To test, I dumped the crawled items to a file (scrapy crawl craigs -t json -o items.json). I noticed the occasional empty entry and a lot of "terms of use" links. These suggest that your XPath extraction could be tightened up, but apart from that it appears to be working.
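
For example, one way to tighten parse_items_2 is to restrict the selector to listing rows and to skip anchors with no text. This is only a sketch: the p[@class="row"] selector is an assumption about the Craigslist markup of the time and should be verified against the actual page source.

    def parse_items_2(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        # Only look at listing rows, not every <p> on the page (assumed markup).
        for row in hxs.select('//p[@class="row"]'):
            title = row.select("a/text()").extract()
            link = row.select("a/@href").extract()
            # Skip anchors with no text, e.g. footer/"terms of use" links.
            if not title:
                continue
            item = CraigslistSampleItem()
            item["title"] = title
            item["link"] = link
            items.append(item)
        return items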

Comments on this answer:

"Since the answers payala and I gave helped solve the initial problem of the pages not being crawled, please pick the best answer and accept it (and remember to upvote those answers once your reputation is high enough)."

"Of course, I will. Do you know what is wrong with the current code? I mean, it crawls, and when I print the items list I can see the exact titles and links, but the information is not getting scraped."

"Yes, I have fixed it. Please accept one of the answers on your previous question and I will post my fix here as an answer."

"Done, and would have anyway. Thanks in advance."

"Works great, thank you very much."

"Defining items = [] will not reinitialize the items list. What if you need to use the same list to capture content from two different parses?"

For reference, here is the original spider, which bound a single item and a shared list to the spider instance:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from myspider.items import CraigslistSampleItem

class MySpider(CrawlSpider):
    name = "craigs"
    allowed_domains = ["sfbay.craigslist.org"]
    start_urls = ["http://sfbay.craigslist.org/"]

    rules = (
        Rule(SgmlLinkExtractor(allow=(r"index\d00\.html",)), callback="parse_items_2", follow=True),
        Rule(SgmlLinkExtractor(allow=(r"sfbay\.craigslist\.org/npo",)), callback="parse_items_1", follow=True),
    )

    def __init__(self, *a, **kw):
        super(MySpider, self).__init__(*a, **kw)
        # A single list and a single item shared by every callback invocation.
        self.items = []
        self.item = CraigslistSampleItem()

    def parse_items_1(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.select("//div")
        for title in titles:
            self.item["title"] = title.select("//li/a/text()").extract()
            self.item["link"] = title.select("//li/a/@href").extract()
            print('**parse_items_1:', self.item["title"])
            self.items.append(self.item)
        return self.items

    def parse_items_2(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.select("//p")
        for title in titles:
            self.item["title"] = title.select("a/text()").extract()
            self.item["link"] = title.select("a/@href").extract()
            print('**parse_items_2:', self.item["title"], self.item["link"])
            self.items.append(self.item)
        return self.items
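
The problem with this version is that a single CraigslistSampleItem is created once in __init__ and then mutated and re-appended on every iteration, so self.items ends up holding many references to one and the same object, and the list keeps growing across responses. If the goal is to collect results from two different callbacks in one place, the idiomatic answer is not a shared list at all: yield each item and let the feed export (or an item pipeline) do the aggregating. A minimal sketch of one callback, using the same old Scrapy API as above:

    def parse_items_1(self, response):
        hxs = HtmlXPathSelector(response)
        for title in hxs.select("//div"):
            # A fresh item per result, so entries do not overwrite each other.
            item = CraigslistSampleItem()
            item["title"] = title.select("//li/a/text()").extract()
            item["link"] = title.select("//li/a/@href").extract()
            # Yielded items from every callback end up in the same output feed.
            yield item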
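
As a closing note, the scrapy.contrib modules, SgmlLinkExtractor, and HtmlXPathSelector used throughout this page were deprecated and later removed from Scrapy. A rough modern equivalent of the fixed spider, assuming the same CraigslistSampleItem definition, would look like:

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from tutorial.items import CraigslistSampleItem

class MySpider(CrawlSpider):
    name = "craigs"
    allowed_domains = ["sfbay.craigslist.org"]
    start_urls = ["http://sfbay.craigslist.org/"]

    rules = (
        Rule(LinkExtractor(allow=(r"index\d00\.html",)), callback="parse_items_2", follow=True),
        Rule(LinkExtractor(allow=(r"sfbay\.craigslist\.org/npo",)), callback="parse_items_1", follow=True),
    )

    def parse_items_1(self, response):
        # response.xpath() replaces HtmlXPathSelector in current Scrapy.
        for title in response.xpath("//div"):
            item = CraigslistSampleItem()
            item["title"] = title.xpath("//li/a/text()").getall()
            item["link"] = title.xpath("//li/a/@href").getall()
            yield item

    def parse_items_2(self, response):
        for title in response.xpath("//p"):
            item = CraigslistSampleItem()
            item["title"] = title.xpath("a/text()").getall()
            item["link"] = title.xpath("a/@href").getall()
            yield item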