刮皮不'；尽管xpath是正确的，但不能下载数据_Xpath_Web Scraping_Scrapy_Scrapy Spider

刮皮不'；尽管xpath是正确的，但不能下载数据

xpath web-scraping scrapy

刮皮不'；尽管xpath是正确的，但不能下载数据,xpath,web-scraping,scrapy,scrapy-spider,Xpath,Web Scraping,Scrapy,Scrapy Spider,我正在从中爬网数据（仅此页面用于测试我的爬网程序） items.py import scrapy class ShipItem(scrapy.Item): name = scrapy.Field() imo = scrapy.Field() category = scrapy.Field() image_urls = scrapy.Field() images = scrapy.Field() class

我正在从中爬网数据（仅此页面用于测试我的爬网程序）

items.py

import scrapy

class ShipItem(scrapy.Item):
    name        = scrapy.Field()
    imo         = scrapy.Field()
    category    = scrapy.Field()
    image_urls  = scrapy.Field()
    images      = scrapy.Field()

class CategoryItem(scrapy.Item):
    name = scrapy.Field()
    link = scrapy.Field()

BOT_NAME = 'ship'
SPIDER_MODULES = ['ship.spiders']
NEWSPIDER_MODULE = 'ship.spiders'
DOWNLOAD_DELAY = 0.5

import scrapy
from ship.items import ShipItem

class ShipSpider(scrapy.Spider):

    name = "shipspider"
    allowed_domains = ["shipspotting.com"]
    page_url = "http://www.shipspotting.com"
    start_urls = [
    page_url + "/gallery/search.php?limit=192&limitstart=2112&sortkey=p.lid&sortorder=desc&page_limit=192&viewtype=2"
]

    def parse(self, response):
        ships = response.xpath('/html/body/center/table/tbody/tr/td[1]/table[1]/tbody/tr/td[2]/div[3]/center/table/tbody/tr/td/table[4]/tbody/tr')

        for ship in ships:
            item = ShipItem()
            item['name'] = ship.xpath('td/center/table[1]/tbody/tr/td[2]/span').extract()[0]

            yield item

import scrapy
from ship.items import CategoryItem

class CategorySpider(scrapy.Spider):
    name = "catspider"
    allowed_domains = ["shipspotting.com"]
    page_url = "http://www.shipspotting.com"
    start_urls = [
        page_url + "/gallery/categories.php"
    ]

    def parse(self, response):
        cats = response.xpath('//td[@class="whiteboxstroke"]/a')
        file = open('categories.txt', 'a')

        for cat in cats:
            item = CategoryItem()

            item['name'] = cat.xpath('img/@title').extract()[0]
            item['link'] = cat.xpath('@href').extract()[0]

            yield item

        file.close()

设置.py

import scrapy

class ShipItem(scrapy.Item):
    name        = scrapy.Field()
    imo         = scrapy.Field()
    category    = scrapy.Field()
    image_urls  = scrapy.Field()
    images      = scrapy.Field()

class CategoryItem(scrapy.Item):
    name = scrapy.Field()
    link = scrapy.Field()

BOT_NAME = 'ship'
SPIDER_MODULES = ['ship.spiders']
NEWSPIDER_MODULE = 'ship.spiders'
DOWNLOAD_DELAY = 0.5

import scrapy
from ship.items import ShipItem

class ShipSpider(scrapy.Spider):

    name = "shipspider"
    allowed_domains = ["shipspotting.com"]
    page_url = "http://www.shipspotting.com"
    start_urls = [
    page_url + "/gallery/search.php?limit=192&limitstart=2112&sortkey=p.lid&sortorder=desc&page_limit=192&viewtype=2"
]

    def parse(self, response):
        ships = response.xpath('/html/body/center/table/tbody/tr/td[1]/table[1]/tbody/tr/td[2]/div[3]/center/table/tbody/tr/td/table[4]/tbody/tr')

        for ship in ships:
            item = ShipItem()
            item['name'] = ship.xpath('td/center/table[1]/tbody/tr/td[2]/span').extract()[0]

            yield item

import scrapy
from ship.items import CategoryItem

class CategorySpider(scrapy.Spider):
    name = "catspider"
    allowed_domains = ["shipspotting.com"]
    page_url = "http://www.shipspotting.com"
    start_urls = [
        page_url + "/gallery/categories.php"
    ]

    def parse(self, response):
        cats = response.xpath('//td[@class="whiteboxstroke"]/a')
        file = open('categories.txt', 'a')

        for cat in cats:
            item = CategoryItem()

            item['name'] = cat.xpath('img/@title').extract()[0]
            item['link'] = cat.xpath('@href').extract()[0]

            yield item

        file.close()

spider/shipspider.py

import scrapy

class ShipItem(scrapy.Item):
    name        = scrapy.Field()
    imo         = scrapy.Field()
    category    = scrapy.Field()
    image_urls  = scrapy.Field()
    images      = scrapy.Field()

class CategoryItem(scrapy.Item):
    name = scrapy.Field()
    link = scrapy.Field()

BOT_NAME = 'ship'
SPIDER_MODULES = ['ship.spiders']
NEWSPIDER_MODULE = 'ship.spiders'
DOWNLOAD_DELAY = 0.5

import scrapy
from ship.items import ShipItem

class ShipSpider(scrapy.Spider):

    name = "shipspider"
    allowed_domains = ["shipspotting.com"]
    page_url = "http://www.shipspotting.com"
    start_urls = [
    page_url + "/gallery/search.php?limit=192&limitstart=2112&sortkey=p.lid&sortorder=desc&page_limit=192&viewtype=2"
]

    def parse(self, response):
        ships = response.xpath('/html/body/center/table/tbody/tr/td[1]/table[1]/tbody/tr/td[2]/div[3]/center/table/tbody/tr/td/table[4]/tbody/tr')

        for ship in ships:
            item = ShipItem()
            item['name'] = ship.xpath('td/center/table[1]/tbody/tr/td[2]/span').extract()[0]

            yield item

import scrapy
from ship.items import CategoryItem

class CategorySpider(scrapy.Spider):
    name = "catspider"
    allowed_domains = ["shipspotting.com"]
    page_url = "http://www.shipspotting.com"
    start_urls = [
        page_url + "/gallery/categories.php"
    ]

    def parse(self, response):
        cats = response.xpath('//td[@class="whiteboxstroke"]/a')
        file = open('categories.txt', 'a')

        for cat in cats:
            item = CategoryItem()

            item['name'] = cat.xpath('img/@title').extract()[0]
            item['link'] = cat.xpath('@href').extract()[0]

            yield item

        file.close()

spider/categoryspider.py

import scrapy

class ShipItem(scrapy.Item):
    name        = scrapy.Field()
    imo         = scrapy.Field()
    category    = scrapy.Field()
    image_urls  = scrapy.Field()
    images      = scrapy.Field()

class CategoryItem(scrapy.Item):
    name = scrapy.Field()
    link = scrapy.Field()

BOT_NAME = 'ship'
SPIDER_MODULES = ['ship.spiders']
NEWSPIDER_MODULE = 'ship.spiders'
DOWNLOAD_DELAY = 0.5

import scrapy
from ship.items import ShipItem

class ShipSpider(scrapy.Spider):

    name = "shipspider"
    allowed_domains = ["shipspotting.com"]
    page_url = "http://www.shipspotting.com"
    start_urls = [
    page_url + "/gallery/search.php?limit=192&limitstart=2112&sortkey=p.lid&sortorder=desc&page_limit=192&viewtype=2"
]

    def parse(self, response):
        ships = response.xpath('/html/body/center/table/tbody/tr/td[1]/table[1]/tbody/tr/td[2]/div[3]/center/table/tbody/tr/td/table[4]/tbody/tr')

        for ship in ships:
            item = ShipItem()
            item['name'] = ship.xpath('td/center/table[1]/tbody/tr/td[2]/span').extract()[0]

            yield item

import scrapy
from ship.items import CategoryItem

class CategorySpider(scrapy.Spider):
    name = "catspider"
    allowed_domains = ["shipspotting.com"]
    page_url = "http://www.shipspotting.com"
    start_urls = [
        page_url + "/gallery/categories.php"
    ]

    def parse(self, response):
        cats = response.xpath('//td[@class="whiteboxstroke"]/a')
        file = open('categories.txt', 'a')

        for cat in cats:
            item = CategoryItem()

            item['name'] = cat.xpath('img/@title').extract()[0]
            item['link'] = cat.xpath('@href').extract()[0]

            yield item

        file.close()

catspider

运行非常完美。但是，

shipspider

不起作用。它只显示输出：

2015-06-24 20:15:16+0800 [scrapy] INFO: Scrapy 0.24.6 started (bot: ship)
2015-06-24 20:15:16+0800 [scrapy] INFO: Optional features available: ssl, http11
2015-06-24 20:15:16+0800 [scrapy] INFO: Overridden settings: {'NEWSPIDER_MODULE': 'ship.spiders', 'SPIDER_MODULES': ['ship.spiders'], 'DOWNLOAD_DELAY': 0.5, 'BOT_NAME': 'ship'}
2015-06-24 20:15:16+0800 [scrapy] INFO: Enabled extensions: LogStats, TelnetConsole, CloseSpider, WebService, CoreStats, SpiderState
2015-06-24 20:15:16+0800 [scrapy] INFO: Enabled downloader middlewares: HttpAuthMiddleware, DownloadTimeoutMiddleware, UserAgentMiddleware, RetryMiddleware, DefaultHeadersMiddleware, MetaRefreshMiddleware, HttpCompressionMiddleware, RedirectMiddleware, CookiesMiddleware, ChunkedTransferMiddleware, DownloaderStats
2015-06-24 20:15:16+0800 [scrapy] INFO: Enabled spider middlewares: HttpErrorMiddleware, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware
2015-06-24 20:15:16+0800 [scrapy] INFO: Enabled item pipelines: 
2015-06-24 20:15:16+0800 [shipspider] INFO: Spider opened
2015-06-24 20:15:16+0800 [shipspider] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2015-06-24 20:15:16+0800 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6023
2015-06-24 20:15:16+0800 [scrapy] DEBUG: Web service listening on 127.0.0.1:6080
2015-06-24 20:15:19+0800 [shipspider] DEBUG: Crawled (200) <GET http://www.shipspotting.com/gallery/search.php?limit=192&limitstart=2112&sortkey=p.lid&sortorder=desc&page_limit=192&viewtype=2> (referer: None)
2015-06-24 20:15:19+0800 [shipspider] INFO: Closing spider (finished)
2015-06-24 20:15:19+0800 [shipspider] INFO: Dumping Scrapy stats:
    {'downloader/request_bytes': 318,
     'downloader/request_count': 1,
     'downloader/request_method_count/GET': 1,
     'downloader/response_bytes': 477508,
     'downloader/response_count': 1,
     'downloader/response_status_count/200': 1,
     'finish_reason': 'finished',
     'finish_time': datetime.datetime(2015, 6, 24, 12, 15, 19, 620358),
     'log_count/DEBUG': 3,
     'log_count/INFO': 7,
     'response_received_count': 1,
     'scheduler/dequeued': 1,
     'scheduler/dequeued/memory': 1,
     'scheduler/enqueued': 1,
     'scheduler/enqueued/memory': 1,
     'start_time': datetime.datetime(2015, 6, 24, 12, 15, 16, 319378)}
2015-06-24 20:15:19+0800 [shipspider] INFO: Spider closed (finished)

2015-06-2420:15:16+0800[scrapy]信息：scrapy 0.24.6已启动（bot:ship）
2015-06-24 20:15:16+0800[scrapy]信息：可选功能可用：ssl、http11
2015-06-24 20:15:16+0800[scrapy]信息：覆盖的设置：{'NEWSPIDER_MODULE'：'ship.SPIDER'，'SPIDER_MODULES'：['ship.SPIDER']，'DOWNLOAD_DELAY'：0.5，'BOT_NAME'：'ship'}
2015-06-24 20:15:16+0800[scrapy]信息：启用的扩展：LogStats、TelnetConsole、CloseSpider、WebService、CoreStats、SpiderState
2015-06-24 20:15:16+0800[scrapy]信息：启用的下载中间件：HttpAuthMiddleware、DownloadTimeoutMiddleware、UserAgentMiddleware、RetryMiddleware、DefaultHeadersMiddleware、MetaRefreshMiddleware、HttpCompressionMiddleware、RedirectMiddleware、Cookies Middleware、ChunkedTransferMiddleware、DownloadersStats
2015-06-24 20:15:16+0800[scrapy]信息：启用的spider中间件：HttpErrorMiddleware、OffsiteMiddleware、referermidleware、urlengthmiddleware、DepthMiddleware
2015-06-24 20:15:16+0800[scrapy]信息：启用的项目管道：
2015-06-24 20:15:16+0800[船蜘蛛]信息：蜘蛛已打开
2015-06-24 20:15:16+0800[shipspider]信息：爬网0页（0页/分钟），刮取0项（0项/分钟）
2015-06-24 20:15:16+0800[scrapy]调试：Telnet控制台监听127.0.0.1:6023
2015-06-24 20:15:16+0800[scrapy]调试：在127.0.0.1:6080上侦听Web服务
2015-06-24 20:15:19+0800[shipspider]调试：爬网（200）（参考：无）
2015-06-24 20:15:19+0800【shipspider】信息：关闭卡盘（已完成）
2015-06-24 20:15:19+0800【shipspider】信息：倾倒刮擦数据：
{'downloader/request_bytes'：318，
“下载程序/请求计数”：1，
“downloader/request\u method\u count/GET”：1，
“downloader/response_字节”：477508，
“下载程序/响应计数”：1，
“下载程序/响应状态\计数/200”：1，
“完成原因”：“完成”，
“完成时间”：datetime.datetime（2015,6,24,12,15,19,620358），
“日志计数/调试”：3，
“日志计数/信息”：7，
“响应\u已接收\u计数”：1，
“调度程序/出列”：1，
“调度程序/出列/内存”：1，
“调度程序/排队”：1，
“调度程序/排队/内存”：1，
“开始时间”：datetime.datetime（2015,6,24,12,15,16,319378）}
2015-06-24 20:15:19+0800[船蜘蛛]信息：蜘蛛已关闭（完成）

我想知道我的xpath是否错误。但当我尝试在Chrome中使用这些元素时，一切都正常工作

那么，我的shipspider有一些微妙的问题吗？

浏览器将tbody添加到表元素中，这就是为什么xpath在开发工具中工作，但在scrapy中失败的原因

通常您需要自己找到xpath，不要相信自动生成的xpath，因为它们通常都是不必要的长。例如，要获取有关船舶的数据，可以像这样使用xpath

//tr[td[@class='whiteboxstroke']]

要测试你的XPath，你应该使用scrapy shell

> scrapy shell "http://www.shipspotting.com/gallery/search.php?limit=192&limitstart=2112&sortkey=p.lid&sortorder=desc&page_limit=192&viewtype=2"
[s] Available Scrapy objects:
[s]   crawler    <scrapy.crawler.Crawler object at 0x7fbf52c122d0>
[s]   item       {}
[s]   request    <GET http://www.shipspotting.com/gallery/search.php?limit=192&limitstart=2112&sortkey=p.lid&sortorder=desc&page_limit=192&viewtype=2>
[s]   response   <200 http://www.shipspotting.com/gallery/search.php?limit=192&limitstart=2112&sortkey=p.lid&sortorder=desc&page_limit=192&viewtype=2>
[s]   settings   <scrapy.settings.Settings object at 0x7fbf54f5cf90>
[s]   spider     <DefaultSpider 'default' at 0x7fbf51f6a1d0>
[s] Useful shortcuts:
[s]   shelp()           Shell help (print this help)
[s]   fetch(req_or_url) Fetch request (or URL) and update local objects
[s]   view(response)    View response in a browser

In [1]: x = "/html/body/center/table/tbody/tr/td[1]/table[1]/tbody/tr/td[2]/div[3]/center/table/tbody/tr/td/table[4]/tbody/tr"

In [2]: response.xpath(x)
Out[2]: []

In [4]: response.xpath("//tr[td[@class='whiteboxstroke']]")
Out[4]: 
[<Selector xpath="//tr[td[@class='whiteboxstroke']]" data=u'<tr><td class="whiteboxstroke" style="pa'>,
 <Selector xpath="//tr[td[@class='whiteboxstroke']]" data=u'<tr><td class="whiteboxstroke" style="pa'>,
 <Selector xpath="//tr[td[@class='whiteboxstroke']]" data=u'<tr><td class="whiteboxstroke" style="pa'>,

>刮壳“http://www.shipspotting.com/gallery/search.php?limit=192&limitstart=2112&sortkey=p.lid&sortorder=desc&page_limit=192&viewtype=2"
[s] 可用的刮擦对象：
[s] 爬虫
[s] 项目{}
[s] 请求
[s] 回应
[s] 背景
[s] 蜘蛛
[s] 有用的快捷方式：
[s] shelp（）Shell帮助（打印此帮助）
[s] 获取（请求或url）获取请求（或url）并更新本地对象
[s] 查看（响应）在浏览器中查看响应
在[1]中：x=“/html/body/center/table/tbody/tr/td[1]/table[1]/tbody/tr/td[2]/div[3]/center/table/tbody/tr/td/table[4]/tbody/tr”
在[2]中：response.xpath（x）
输出[2]：[]
在[4]：response.xpath（//tr[td[@class='whiteboxstroke']]）中
出[4]：
[,
,
,

浏览器将tbody添加到表元素中，这就是xpath在开发工具中工作，但在scrapy中失败的原因，这就是原因

通常你需要自己找到xpath，不要相信自动生成的xpath，因为它们通常都很长

//tr[td[@class='whiteboxstroke']]

要测试你的XPath，你应该使用scrapy shell

> scrapy shell "http://www.shipspotting.com/gallery/search.php?limit=192&limitstart=2112&sortkey=p.lid&sortorder=desc&page_limit=192&viewtype=2"
[s] Available Scrapy objects:
[s]   crawler    <scrapy.crawler.Crawler object at 0x7fbf52c122d0>
[s]   item       {}
[s]   request    <GET http://www.shipspotting.com/gallery/search.php?limit=192&limitstart=2112&sortkey=p.lid&sortorder=desc&page_limit=192&viewtype=2>
[s]   response   <200 http://www.shipspotting.com/gallery/search.php?limit=192&limitstart=2112&sortkey=p.lid&sortorder=desc&page_limit=192&viewtype=2>
[s]   settings   <scrapy.settings.Settings object at 0x7fbf54f5cf90>
[s]   spider     <DefaultSpider 'default' at 0x7fbf51f6a1d0>
[s] Useful shortcuts:
[s]   shelp()           Shell help (print this help)
[s]   fetch(req_or_url) Fetch request (or URL) and update local objects
[s]   view(response)    View response in a browser

In [1]: x = "/html/body/center/table/tbody/tr/td[1]/table[1]/tbody/tr/td[2]/div[3]/center/table/tbody/tr/td/table[4]/tbody/tr"

In [2]: response.xpath(x)
Out[2]: []

In [4]: response.xpath("//tr[td[@class='whiteboxstroke']]")
Out[4]: 
[<Selector xpath="//tr[td[@class='whiteboxstroke']]" data=u'<tr><td class="whiteboxstroke" style="pa'>,
 <Selector xpath="//tr[td[@class='whiteboxstroke']]" data=u'<tr><td class="whiteboxstroke" style="pa'>,
 <Selector xpath="//tr[td[@class='whiteboxstroke']]" data=u'<tr><td class="whiteboxstroke" style="pa'>,

>刮壳“http://www.shipspotting.com/gallery/search.php?limit=192&limitstart=2112&sortkey=p.lid&sortorder=desc&page_limit=192&viewtype=2"
[s] 可用的刮擦对象：
[s] 爬虫
[s] 项目{}
[s] 请求
[s] 回应
[s] 背景
[s] 蜘蛛
[s] 有用的快捷方式：
[s] shelp（）Shell帮助（打印此帮助）
[s] 获取（请求或url）获取请求（或url）并更新本地对象
[s] 查看（响应）在浏览器中查看响应
在[1]中：x=“/html/body/center/table/tbody/tr/td[1]/table[1]/tbody/tr/td[2]/div[3]/center/table/tbody/tr/td/table[4]/tbody/tr”
在[2]中：response.xpath（x）
输出[2]：[]
在[4]：response.xpath（//tr[td[@class='whiteboxstroke']]）中
出[4]：
[,
,
,

非常感谢。您的回答教会了我很多关于xpath和scrapy的知识。非常感谢您。您的回答教会了我很多关于xpath和scrapy的知识