Python: Scrapy spider "Crawled 0 pages and scraped 0 items"


I set up a proxy scraper for a site, but I'm not getting anything back.

import scrapy
from scrapy.item import Field, Item
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose

class ProxyServersPro(Item):
    ip = scrapy.Field()
    port = scrapy.Field()
    country = scrapy.Field()
    speed = scrapy.Field()
    protocol = scrapy.Field()
    anon = scrapy.Field()

class ProxyServersPro(CrawlSpider):
    name = "ProxyServersProCrawler"
    start_urls = ["https://es.proxyservers.pro/proxy/list/speed/2/anonymity/elite/order/duration/order_dir/asc/page/1"]
    allowed_domains = ['proxyservers.pro']
    rules = {
        Rule(LinkExtractor(allow=r'page'), callback='parse_item')
    }

    def parse_item(self, response):
        item = ItemLoader(ProxyServersPro(), response=response)
        item.add_xpath('ip', '//*[@id="content-content"]/div/div/div[1]/table/tbody/tr[1]/td[2]/a/text()')
        item.add_xpath('port', '//html/body/div[1]/div/div[2]/div/div/div/div[1]/table/tbody/tr[1]/td[3]/span/text()')
        item.add_xpath('country', '//html/body/div[1]/div/div[2]/div/div/div/div[1]/table/tbody/tr[1]/td[4]/text()')
        item.add_xpath('speed', '//html/body/div[1]/div/div[2]/div/div/div/div[1]/table/tbody/tr[1]/td[5]/div[1]/div/div/text()')
        item.add_xpath('protocol', '//html/body/div[1]/div/div[2]/div/div/div/div[1]/table/tbody/tr[1]/td[7]/text()')
        item.add_xpath('anon', '//html/body/div[1]/div/div[2]/div/div/div/div[1]/table/tbody/tr[1]/td[8]/text()')
        return item.load_item()
This is what the console says:

2019-03-24 04:53:27 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)

Does anyone know what is going on? Thanks.

Here is a working example, please take a look:

# -*- coding: utf-8 -*-
from scrapy import Selector
from scrapy.http import Request, FormRequest, HtmlResponse
from scrapy.spiders import CrawlSpider
from scrapy.conf import settings
import urllib
import json
import re
from urllib.parse import urljoin
from html.parser import HTMLParser
from requests import Session

from scrapy import Item, Field


class ProxyServersPro(Item):
    ip = Field()
    port = Field()
    country = Field()
    speed = Field()
    protocol = Field()
    anon = Field()


class ProxyServers(CrawlSpider):
    name = "ProxyServersProCrawler"

    allowed_domains = ['proxyservers.pro']
    # Browser-like headers; some sites reject Scrapy's default User-Agent
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive'
    }
    # Note: deliberately not the built-in start_urls; start_requests below iterates this list
    start_url = [
        'https://es.proxyservers.pro/proxy/list/speed/2/anonymity/elite/order/duration/order_dir/asc/page/1',
        'https://es.proxyservers.pro/proxy/list/speed/2/anonymity/elite/order/duration/order_dir/asc/page/2',
        'https://es.proxyservers.pro/proxy/list/speed/2/anonymity/elite/order/duration/order_dir/asc/page/3',
        'https://es.proxyservers.pro/proxy/list/speed/2/anonymity/elite/order/duration/order_dir/asc/page/4',
        'https://es.proxyservers.pro/proxy/list/speed/2/anonymity/elite/order/duration/order_dir/asc/page/5',
    ]

    def __init__(self):
        super(ProxyServers, self).__init__()

    def start_requests(self):
        for url in self.start_url:
            yield Request(url, callback=self.parse_companies, headers=self.headers)


    def parse_companies(self, response):
        # Each row of the listing table is one proxy entry
        table = response.xpath('//table[@class="table table-hover"]/tbody/tr')
        for data in table:
            ip = data.xpath('./td[2]/a/text()').extract_first()
            country = data.xpath('./td[4]/text()').extract_first()
            protocol = data.xpath('./td[7]/text()').extract_first()
            anon = data.xpath('./td[8]/text()').extract_first()
            port = data.xpath('./td[3]/text()').extract_first()

            item = ProxyServersPro()
            item['ip'] = ip
            item['country'] = country
            item['protocol'] = protocol
            item['anon'] = anon
            item['port'] = port
            yield item
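
Assuming the spider is saved in a standalone file (the filename proxy_spider.py here is arbitrary), it can be run and its items exported with Scrapy's built-in feed exporter:

scrapy runspider proxy_spider.py -o proxies.json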

Also, the port and the speed values are not in the page content itself; they are loaded dynamically, so we cannot get them via XPath.
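
If the JavaScript-rendered columns such as speed are really needed, one common approach is to render the page in a headless browser before parsing. Below is a minimal sketch using scrapy-splash, replacing start_requests above; it assumes a Splash instance is running on localhost:8050 and that scrapy-splash is configured in settings.py per its README:

from scrapy_splash import SplashRequest

    def start_requests(self):
        for url in self.start_url:
            # Let Splash execute the page's JavaScript before Scrapy parses the response
            yield SplashRequest(url, callback=self.parse_companies, headers=self.headers, args={'wait': 2.0})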

Doesn't work for me... "NameError: name 'Request' is not defined"
I have edited my answer, please check whether it works now.
Now this is happening: "runspider: error: Unable to load '22.py': No module named 'requests'"
Please install the requests module and then try again. You can install it with this command:
pip install requests
OK, now it works, but it only extracts one item. How do I continue? Thanks, by the way.
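
One way to keep crawling beyond the hard-coded list of pages is to follow the site's pagination link from each parsed page instead. A rough sketch (the rel="next" XPath is an assumption about the site's markup and may need adjusting):

    def parse_companies(self, response):
        for data in response.xpath('//table[@class="table table-hover"]/tbody/tr'):
            # ... build and yield the item exactly as above ...
            pass
        # Follow the "next page" link, if present, so the spider paginates by itself
        next_page = response.xpath('//a[@rel="next"]/@href').extract_first()
        if next_page:
            yield response.follow(next_page, callback=self.parse_companies, headers=self.headers)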