如何在scrapy中使用管道项目
我是scrapy的新用户,可以抓取我的网站。我想将抓取的数据存储到mysql数据库中。 myspider.py: 对于pipelines.py,我修改并使用了googldir的示例。因此,当我运行crawl时,会出现以下错误: exceptions.AttributeError:“MininovaSpider”对象没有属性“iterkeys” exceptions.TypeError:“MininovaSpider”对象不可下标 pipeline.py:如何在scrapy中使用管道项目,scrapy,pipeline,web-crawler,Scrapy,Pipeline,Web Crawler,我是scrapy的新用户,可以抓取我的网站。我想将抓取的数据存储到mysql数据库中。 myspider.py: 对于pipelines.py,我修改并使用了googldir的示例。因此,当我运行crawl时,会出现以下错误: exceptions.AttributeError:“MininovaSpider”对象没有属性“iterkeys” exceptions.TypeError:“MininovaSpider”对象不可下标 pipeline.py: from scrapy import l
from scrapy import log
from twisted.enterprise import adbapi
import time
import MySQLdb.cursors
class Pipeline(object):
def __init__(self):
self.dbpool = adbapi.ConnectionPool('MySQLdb',
db='test',
user='root',
passwd='',
cursorclass=MySQLdb.cursors.DictCursor,
charset='utf8',
use_unicode=True
)
def process_item(self, spider, item):
query = self.dbpool.runInteraction(self._conditional_insert, item)
query.addErrback(self.handle_error)
return item
def _conditional_insert(self, tx, item):
tx.execute("select * from database where url = %s", (item['url'] ))
result = tx.fetchone()
if result:
log.msg("Item already stored in db: %s" % item, level=log.DEBUG)
else:
tx.execute(\
"insert into database (wilaya,titre, site, lien,resume,timestamp) "
"values (%s, %s, %s, %s,%s,%s)",
(item['wilaya'],
item['title'],
'example.com',item['url'],item['description'],
time.time())
)
log.msg("Item stored in db: %s" % item, level=log.DEBUG)
def handle_error(self, e):
log.err(e)
和回溯:
Traceback (most recent call last):
File "/usr/lib/python2.7/twisted/internet/defer.py", line 287, in addCallbacks
self._runCallbacks()
File "/usr/lib/python2.7/twisted/internet/defer.py", line 545, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "/usr/lib/python2.7/site-packages/scrapy/core/scraper.py", line 208, in _itemproc_finished
item=output, response=response, spider=spider)
File "/usr/lib/python2.7/site-packages/scrapy/utils/signal.py", line 53, in send_catch_log_deferred
*arguments, **named)
--- <exception caught here> ---
File "/usr/lib/python2.7/twisted/internet/defer.py", line 134, in maybeDeferred
result = f(*args, **kw)
File "/usr/lib/python2.7/site-packages/scrapy/xlib/pydispatch/robustapply.py", line 47, in robustApply
return receiver(*arguments, **named)
File "/usr/lib/python2.7/site-packages/scrapy/contrib/feedexport.py", line 177, in item_scraped
slot.exporter.export_item(item)
File "/usr/lib/python2.7/site-packages/scrapy/contrib/exporter/__init__.py", line 109, in export_item
itemdict = dict(self._get_serialized_fields(item))
File "/usr/lib/python2.7/site-packages/scrapy/contrib/exporter/__init__.py", line 60, in _get_serialized_fields
field_iter = item.iterkeys()
**exceptions.AttributeError: 'MininovaSpider' object has no attribute 'iterkeys'
2012-01-18 16:00:43-0600 [scrapy] Unhandled Error
Traceback (most recent call last):
File "/usr/lib/python2.7/threading.py", line 503, in __bootstrap
self.__bootstrap_inner()
File "/usr/lib/python2.7/threading.py", line 530, in __bootstrap_inner
self.run()
File "/usr/lib/python2.7/threading.py", line 483, in run
self.__target(*self.__args, **self.__kwargs)
--- <exception caught here> ---
File "/usr/lib/python2.7/twisted/python/threadpool.py", line 207, in _worker
result = context.call(ctx, function, *args, **kwargs)
File "/usr/lib/python2.7/twisted/python/context.py", line 118, in callWithContext
return self.currentContext().callWithContext(ctx, func, *args, **kw)
File "/usr/lib/python2.7/twisted/python/context.py", line 81, in callWithContext
return func(*args,**kw)
File "/usr/lib/python2.7/twisted/enterprise/adbapi.py", line 448, in _runInteraction
result = interaction(trans, *args, **kw)
File "/opt/scrapy/test/pipelines.py", line 33, in _conditional_insert
tx.execute("select * from database where url = %s", (item['url'] ))
**exceptions.TypeError: 'MininovaSpider' object is not subscriptable
看起来您在某个地方生成了一个spider实例,而不是一个项目。我想你有更多的代码没有显示
在Pipeline.process_项目中,将其放入以确认:
def process_item(self, spider, item):
assert isinstance(item, Torrent), 'Here should be Torrent instance!'
query = self.dbpool.runInteraction(self._conditional_insert, item)
query.addErrback(self.handle_error)
return item
这个问题不是很清楚。能否显示管道的代码和完整的异常回溯?
exceptions.TypeError: 'MininovaSpider' object is not subscriptable
def process_item(self, spider, item):
assert isinstance(item, Torrent), 'Here should be Torrent instance!'
query = self.dbpool.runInteraction(self._conditional_insert, item)
query.addErrback(self.handle_error)
return item