Writing a simple Python Scrapy crawler with MongoDB
I have started writing a simple Scrapy project that stores its output in MongoDB. I'm new to Python, and there is a problem with my code.

The congress spider:
import scrapy
from scrapy.selector import Selector
from scrapy.loader import ItemLoader
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.http import HtmlResponse
from congress.items import CongressItem
class CongressSpider(CrawlSpider):
    name = "congres"
    allowed_domains = ["www.congress.gov"]
    start_urls = [
        'https://www.congress.gov/members',
    ]

    # creating a rule for my crawler. I only want it to continue to the next page, don't follow any other links.
    rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=("//a[@class='next']",)), callback="parse_page", follow=True),)

    def parse_page(self, response):
        for search in response.selector.xpath(".//li[@class='compact']"):
            yield {'member': ' '.join(search.xpath("normalize-space(span/a/text())").extract()).strip(),
                   'state': ' '.join(search.xpath("normalize-space(div[@class='quick-search-member']//span[@class='result-item']/span/text())").extract()).strip(),
                   'District': ' '.join(search.xpath("normalize-space(div[@class='quick-search-member']//span[@class='result-item'][2]/span/text())").extract()).strip(),
                   'party': ' '.join(search.xpath("normalize-space(div[@class='quick-search-member']//span[@class='result-item'][3]/span/text())").extract()).strip(),
                   'Served': ' '.join(search.xpath("normalize-space(div[@class='quick-search-member']//span[@class='result-item'][4]/span//li/text())").extract()).strip(),
                   }
items.py
import scrapy
class CongressItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    member = scrapy.Field()
    state = scrapy.Field()
    District = scrapy.Field()
    party = scrapy.Field()
    served = scrapy.Field()
pipelines.py
from pymongo import MongoClient
from scrapy.conf import settings
from scrapy.exceptions import DropItem
from scrapy import log
class CongressPipeline(object):
    collection_name = 'members'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'items')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        self.db[self.collection_name].insert(dict(item))
        return item
settings.py
BOT_NAME = 'congres'
SPIDER_MODULES = ['congres.spiders']
NEWSPIDER_MODULE = 'congres.spiders'
MONGO_URI = 'mongodb://localhost:27017'
MONGO_DATABASE = 'congres'
ROBOTSTXT_OBEY = True
DOWNLOAD_DELAY = 3
ITEM_PIPELINES = {
    'congress.pipelines.CongresPipeline': 300,
}
The error it shows is:
Unhandled error in Deferred:
2017-07-09 11:15:33 [twisted] CRITICAL: Unhandled error in Deferred:
2017-07-09 11:15:34 [twisted] CRITICAL:
Traceback (most recent call last):
  File "c:\python27\lib\site-packages\twisted\internet\defer.py", line 1386, in _inlineCallbacks
    result = g.send(result)
  File "c:\python27\lib\site-packages\scrapy\crawler.py", line 95, in crawl
    six.reraise(*exc_info)
  File "c:\python27\lib\site-packages\scrapy\crawler.py", line 79, in crawl
    yield self.engine.open_spider(self.spider, start_requests)
NameError: global name 'pymongo' is not defined
You only import MongoClient in pipelines.py:

from pymongo import MongoClient

but in the open_spider method you use it like this:

self.client = pymongo.MongoClient(self.mongo_uri)

Since the pymongo module itself is never imported, you get the NameError. Change that line to:

self.client = MongoClient(self.mongo_uri)
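For reference, here is a corrected pipelines.py in full. This is only a sketch that keeps the structure of the code you posted; it drops the scrapy.conf, scrapy.log, and DropItem imports since the pipeline never uses them:

from pymongo import MongoClient


class CongressPipeline(object):
    collection_name = 'members'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # pull the connection settings out of settings.py
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'items')
        )

    def open_spider(self, spider):
        # MongoClient was imported by name above, so call it directly
        self.client = MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        self.db[self.collection_name].insert(dict(item))
        return item

The equivalent fix is to add import pymongo at the top and keep pymongo.MongoClient(self.mongo_uri) as written; either way the name resolves. Separately, note that newer PyMongo versions deprecate Collection.insert in favor of insert_one (PyMongo 4 removes it entirely), but on your setup that would only be a warning, not this error.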
Please put the error in the question itself, not as a link; as posted, there is no connection between the code and the error. The screenshot shows an indentation error in blackberry_spider, but the code you mention here is from the congress spider.

@Mani Updated the error.