Warning: file_get_contents(/data/phpspider/zhask/data//catemap/8/logging/2.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
Python 在Scrapy中,如何使用嵌套项目加载器而无需进一步';添加xpath';电话_Python_Scrapy - Fatal编程技术网

Python 在 Scrapy 中,如何使用嵌套的项目加载器(nested item loader)而无需再次调用 `add_xpath`

Python 在Scrapy中,如何使用嵌套项目加载器而无需进一步';添加xpath';电话,python,scrapy,Python,Scrapy,我正试图利用这一点重构一只粘乎乎的蜘蛛。我已经讲到了一个部分,我将从从单个XPath表达式提取的数据中填充几个字段,为了简洁起见,我想使用这些字段 这是迄今为止的蜘蛛: from scrapy.spiders import SitemapSpider from apkmirror_scraper.items import ApkmirrorScraperItem, ApkmirrorItemLoader class ApkmirrorSitemapSpider(SitemapSpider):

我正在尝试利用项目加载器(Item Loader)重构一个 Scrapy 爬虫。我已经进行到这样一个部分:需要用同一个 XPath 表达式提取出的数据来填充多个字段,为了简洁起见,我想为此使用嵌套加载器(nested loader)。

这是迄今为止的蜘蛛:

from scrapy.spiders import SitemapSpider
from apkmirror_scraper.items import ApkmirrorScraperItem, ApkmirrorItemLoader

class ApkmirrorSitemapSpider(SitemapSpider):
    """Sitemap-based spider that builds one item per APK download page.

    Sitemap URLs matching the pattern in ``sitemap_rules`` are routed to
    :meth:`parse`.
    """

    name = 'apkmirror-spider'
    sitemap_urls = ['http://www.apkmirror.com/sitemap_index.xml']
    sitemap_rules = [(r'.*-android-apk-download/$', 'parse')]

    def parse(self, response):
        """Populate and return an ``ApkmirrorScraperItem`` for ``response``."""
        page_loader = ApkmirrorItemLoader(item=ApkmirrorScraperItem(), response=response)
        page_loader.add_value('url', response.url)

        # Fields read straight from the page, each with its own XPath.
        page_fields = (
            ('title', '//h1[@title]/text()'),
            ('developer', '//h3[@title]/a/text()'),
            ('app', '//*[contains(@data-channel-name, "App Updates")]/@data-channel-name'),
        )
        for field, expression in page_fields:
            page_loader.add_xpath(field_name=field, xpath=expression)

        # Both version fields are parsed out of the same "APK details" text
        # nodes; their input processors pick the relevant part.
        details_loader = page_loader.nested_xpath(
            '//*[@title="APK details"]/following-sibling::*[@class="appspec-value"]'
        )
        for field in ("version_name", "version_code"):
            details_loader.add_xpath(field_name=field, xpath=".//text()")

        return page_loader.load_item()
其中
items.py

import re

import scrapy
import scrapy.loader

from scrapy.loader.processors import Compose, MapCompose, TakeFirst

class ApkmirrorScraperItem(scrapy.Item):
    '''Container for the data scraped from a single APK download page.'''

    # Filled by the spider's parse() via ApkmirrorItemLoader.
    url = scrapy.Field()
    title = scrapy.Field()
    developer = scrapy.Field()
    app = scrapy.Field()
    version_name = scrapy.Field()
    version_code = scrapy.Field()

    # Declared here but not populated by the loader code visible in this file.
    architectures = scrapy.Field()
    package = scrapy.Field()
    apk_file_size = scrapy.Field()
    android_min_version = scrapy.Field()
    android_target_version = scrapy.Field()
    supported_dpis = scrapy.Field()
    md5_signature = scrapy.Field()
    time_uploaded = scrapy.Field()
    time_scraped = scrapy.Field()
    download_link = scrapy.Field()


def parse_app(data_channel_name):
    '''Parse the name of the app from the "data-channel-name" attribute of the button named "Follow [app_name] Updates".

    Returns the text that precedes " App Updates", or None when the attribute
    value does not contain that marker (instead of failing with an
    AttributeError on the missing match object).
    '''
    match = re.search(r'(?P<app>.+) App Updates', data_channel_name)
    if match is None:
        return None
    return match.group("app")

def get_version_line(apk_details):
    '''Get the line containing the version from the 'APK details' section.

    Returns the first line starting with "Version:", or None when there is no
    such line (rather than letting StopIteration escape from next()).
    '''
    return next((line for line in apk_details if line.startswith("Version:")), None)

def get_architectures_line(apk_details):
    '''Get the line containing the supported architectures (e.g. "arm", "x64") from the 'APK details' section, if present.

    The architectures line is taken to be the second line unless that line is
    the package line. Returns None when it is absent, including when fewer
    than two lines were extracted.
    '''
    # The line does not contain any keywords and may not be present, in which case None is returned
    if len(apk_details) < 2:
        return None
    candidate = apk_details[1]
    return None if candidate.startswith("Package:") else candidate

def get_package_line(apk_details):
    '''Get the line containing the package name from the 'APK details' section.'''
    # The 'package line' is always present and starts with "Package:"
    package_lines = (entry for entry in apk_details if entry.startswith("Package:"))
    return next(package_lines)

def parse_version_line(version_line):
    '''Parse the 'versionName' and 'versionCode' from the relevant line in 'APK details'.

    Returns a dict with the keys "version_name" and "version_code", or None
    when the line does not have the expected "Version: <name> (<code>)" shape
    (instead of failing with an AttributeError on the missing match object).
    '''
    # Note that the pattern includes the end-of-line character ($). This is necessary because some package names (e.g. Google Play) themselves contain brackets.
    PATTERN = r"^Version: (?P<version_name>.+) \((?P<version_code>\d+)\)\s*$"
    match = re.match(PATTERN, version_line)
    return match.groupdict() if match else None


def _version_field_in(key):
    '''Build the input processor that extracts `key` from the parsed version line.'''
    return Compose(get_version_line, parse_version_line, lambda parsed: parsed.get(key))


class ApkmirrorItemLoader(scrapy.loader.ItemLoader):
    '''Item loader declaring the per-field input (`*_in`) and output (`*_out`) processors.'''

    # Stored as given; only the first collected value is kept.
    url_out = TakeFirst()

    # Text fields: surrounding whitespace is stripped from every extracted value.
    title_in = MapCompose(unicode.strip)
    title_out = TakeFirst()

    developer_in = MapCompose(unicode.strip)
    developer_out = TakeFirst()

    # The app name is parsed out of each "data-channel-name" attribute value.
    app_in = MapCompose(parse_app)
    app_out = TakeFirst()

    # Both version fields come from the same "Version:" line of the extracted
    # APK details; they differ only in which parsed group they keep.
    version_name_in = _version_field_in("version_name")
    version_name_out = TakeFirst()

    version_code_in = _version_field_in("version_code")
    version_code_out = TakeFirst()
它按预期提取字段:

# Scraped Items  ------------------------------------------------------------
[{'app': u'Google Sheets',
  'developer': u'Google Inc.',
  'title': u'Google Sheets 1.7.152.06.30 (arm) (nodpi)',
  'url': 'http://www.apkmirror.com/apk/google-inc/sheets/sheets-1-7-152-06-release/google-sheets-1-7-152-06-30-android-apk-download/',
  'version_code': u'71520630',
  'version_name': u'1.7.152.06.30'}]
然而,我想进一步重构 `apk_details_loader`。请注意调用 `apk_details_loader.add_xpath` 的两行具有相同的 `xpath` 参数 `".//text()"`。为了避免这种代码重复,我想将 `apk_details_loader` 定义为

# Desired form: the nested loader's XPath already ends in //text(), so it selects the text nodes itself.
apk_details_loader = loader.nested_xpath('//*[@title="APK details"]/following-sibling::*[@class="appspec-value"]//text()')
即在末尾包含 `//text()`;然后不再调用 `add_xpath`,而是执行类似于 `add` 的操作:它不会先进一步细化 XPath 表达式再调用输入处理器,而是直接调用输入处理器。这样的事情可能吗?

绝对可能

您需要将此新方法添加到 `scrapy/loader/__init__.py` 中的 `ItemLoader` 类中

这应该做到:

def add(self, field_name, *processors, **kw):
    """Add everything the loader's own selector matches to `field_name`.

    Intended as a method of the ItemLoader class. Unlike add_xpath, it does
    not refine the selection with a further XPath: the current selector is
    extracted as-is and the result is passed to add_value, together with any
    extra processors and keyword arguments.
    """
    values = self.selector.extract()
    self.add_value(field_name, values, *processors, **kw)

说明:
`loader.nested_xpath(xpath)` 不会返回某种特殊的新加载器;它会初始化一个与您之前使用的加载器类型相同的加载器,只是将其 `selector` 设置为 `self.selector.xpath(xpath)`(注意这里的 `self` 指的是调用对象 `loader`)。因此,新创建的 `apk_details_loader` 已经包含了要从中提取数据的选择器。您只需要在 `add` 方法中引用它,也就是下面这一行:
`values = self.selector.extract()`

# The nested loader's selector already targets the text nodes (XPath ends in //text()).
apk_details_loader = loader.nested_xpath('//*[@title="APK details"]/following-sibling::*[@class="appspec-value"]//text()')