使用mongodb编写简单的python scrapy爬虫程序_Python_Mongodb_Web Scraping_Scrapy

使用mongodb编写简单的python scrapy爬虫程序

python mongodb web-scraping scrapy

使用mongodb编写简单的python scrapy爬虫程序,python,mongodb,web-scraping,scrapy,Python,Mongodb,Web Scraping,Scrapy,我已经开始编写一个简单的scrapy模块供mongodb使用。我是python新手，我写的代码有问题：国会议员 import scrapy from scrapy.selector import Selector from scrapy.loader import ItemLoader from scrapy.spiders import CrawlSpider, Rule from scrapy.linkextractors import LinkExtractor from scrapy

我已经开始编写一个简单的scrapy模块供mongodb使用。我是python新手，我写的代码有问题：

国会议员

import scrapy

from scrapy.selector import Selector
from scrapy.loader import ItemLoader
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.http import HtmlResponse
from congress.items import CongressItem

class CongressSpider(CrawlSpider):
    name = "congres"
    allowed_domains = ["www.congress.gov"]
    start_urls = [
            'https://www.congress.gov/members',
        ]
    #creating a rule for my crawler. I only want it to continue to the next page, don't follow any other links.
    rules = (Rule(LinkExtractor(allow=(),restrict_xpaths=("//a[@class='next']",)), callback="parse_page", follow=True),)

    def parse_page(self, response):
        for search in response.selector.xpath(".//li[@class='compact']"):
            yield {'member' : ' '.join(search.xpath("normalize-space(span/a/text())").extract()).strip(),
               'state' : ' '.join(search.xpath("normalize-space(div[@class='quick-search-member']//span[@class='result-item']/span/text())").extract()).strip(),
                'District' : ' '.join(search.xpath("normalize-space(div[@class='quick-search-member']//span[@class='result-item'][2]/span/text())").extract()).strip(),
                'party' : ' '.join(search.xpath("normalize-space(div[@class='quick-search-member']//span[@class='result-item'][3]/span/text())").extract()).strip(),
                'Served' : ' '.join(search.xpath("normalize-space(div[@class='quick-search-member']//span[@class='result-item'][4]/span//li/text())").extract()).strip(),
            }

items.py

import scrapy
class CongressItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()

    member = scrapy.Field()
    state = scrapy.Field()
    District = scrapy.Field()
    party = scrapy.Field()
    served = scrapy.Field()

管道.py

from pymongo import MongoClient
from scrapy.conf import settings
from scrapy.exceptions import DropItem
from scrapy import log

class CongressPipeline(object):
    collection_name= 'members'
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db
    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'items')
        )
    def open_spider(self,spider):
            self.client = pymongo.MongoClient(self.mongo_uri)
            self.db = self.client[self.mongo_db]
    def close_spider(self, spider):
        self.client.close()
    def process_item(self, item, spider):
        self.db[self.collection_name].insert(dict(item))
        return item

设置.py

BOT_NAME = 'congres'

SPIDER_MODULES = ['congres.spiders']
NEWSPIDER_MODULE = 'congres.spiders'





MONGO_URI = 'mongodb://localhost:27017'
MONGO_DATABASE = 'congres'
ROBOTSTXT_OBEY = True
DOWNLOAD_DELAY = 3
ITEM_PIPELINES = {
   'congress.pipelines.CongresPipeline': 300,
}

它显示的错误是

Unhandled error in Deferred:
2017-07-09 11:15:33 [twisted] CRITICAL: Unhandled error in Deferred:

2017-07-09 11:15:34 [twisted] CRITICAL:
Traceback (most recent call last):
File "c:\python27\lib\site-packages\twisted\internet\defer.py", line 1386, 
in _inlineCallbacks
result = g.send(result)
File "c:\python27\lib\site-packages\scrapy\crawler.py", line 95, in crawl
six.reraise(*exc_info)
File "c:\python27\lib\site-packages\scrapy\crawler.py", line 79, in crawl
yield self.engine.open_spider(self.spider, start_requests)
NameError: global name 'pymongo' is not defined

您只是在

pipelines.py中导入MongoClient

from pymongo import MongoClient

在open\u spider
方法中，您是这样使用它的
self.client = pymongo.MongoClient(self.mongo_uri)

由于未导入pymongo
，因此出现错误。将最后一行更改为
self.client = MongoClient(self.mongo_uri)

把错误放在问题中；不作为链接代码和错误之间没有联系。图中显示blackberry_spider中存在缩进
错误，但这里您提到的代码来自国会
spider。@Mani更新了错误