Web scraping: 530 error when trying to open an FTP directory


I want to use Scrapy to crawl the files at
ftp://ftp.co.palm-beach.fl.us/Building%20Permits/

Here is my spider:

# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request

class LatestPermitsSpider(scrapy.Spider):
    name = "latest_permits"
    allowed_domains = ["ftp.co.palm-beach.fl.us"]
    handle_httpstatus_list = [404]

    ftpUser = "the_username"
    ftpPW = "the_password"
    permitFilesDir = "ftp://ftp.co.palm-beach.fl.us/Building%20Permits/"

    def start_requests(self):
        yield Request(
            url=self.permitFilesDir,
            meta={
                "ftp_user": self.ftpUser,
                "ftp_password": self.ftpPW
            }
        )

    def parse(self,response):
        print(response.body)
When I run
scrapy crawl latest_permits
I get the following error:

ConnectionLost: ('FTP connection lost', <twisted.python.failure.Failure twisted.protocols.ftp.CommandFailed: ['530 Sorry, no ANONYMOUS access allowed.']>)

Why do I get this error even though I am supplying the correct username and password?

Have a look at the Scrapy source code below.

The problem is not with your username or password. The problem is that Scrapy only supports downloading files over ftp; it does not support listing directories, and the URL you are using is a directory URL.
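
For reference, the relevant method of Scrapy's built-in FTP handler looks roughly like this (an abridged sketch of scrapy/core/downloader/handlers/ftp.py from the Scrapy 1.x era, reproduced from memory; details vary between versions):

def gotClient(self, client, request, filepath):
    self.client = client
    protocol = ReceivedDataProtocol(request.meta.get("ftp_local_filename"))
    # retrieveFile issues an FTP RETR command: it downloads exactly one
    # file. No code path issues LIST, so a directory URL cannot work.
    return client.retrieveFile(filepath, protocol).addCallbacks(
        callback=self._build_response, callbackArgs=(request, protocol),
        errback=self._failed, errbackArgs=(request,))

The workaround below subclasses this handler and swaps retrieveFile for list.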

There is a possible workaround that actually comes from the ftptree package.

Add a handlers.py with the code below:

import json

from twisted.protocols.ftp import FTPFileListProtocol

from scrapy.http import Response
from scrapy.core.downloader.handlers.ftp import FTPDownloadHandler

class FtpListingHandler(FTPDownloadHandler):
    """A download handler that LISTs an FTP directory instead of
    retrieving a single file."""

    def gotClient(self, client, request, filepath):
        self.client = client
        # FTPFileListProtocol parses the server's LIST output into
        # protocol.files, a list of dicts (filename, filetype, size, ...).
        protocol = FTPFileListProtocol()
        return client.list(filepath, protocol).addCallbacks(
            callback=self._build_response, callbackArgs=(request, protocol),
            errback=self._failed, errbackArgs=(request,))

    def _build_response(self, result, request, protocol):
        self.result = result
        # Serialize the listing as JSON so the spider can json.loads() it.
        body = json.dumps(protocol.files)
        return Response(url=request.url, status=200, body=body)
Then in settings.py use

DOWNLOAD_HANDLERS = {'ftp': 'crawlername.handlers.FtpListingHandler'}

(replace crawlername with your own project's package name).

A sample spider:

import os
import json
from urlparse import urlparse

from scrapy import Spider
from scrapy.http.request import Request

from ftptree_crawler.items import FtpTreeLeaf

class AnonFtpRequest(Request):
    # Attach anonymous FTP credentials to every request via meta.
    anon_meta = {'ftp_user': 'anonymous',
                 'ftp_password': 'laserson@cloudera.com'}

    def __init__(self, *args, **kwargs):
        super(AnonFtpRequest, self).__init__(*args, **kwargs)
        self.meta.update(self.anon_meta)


class FtpTreeSpider(Spider):
    name = 'ftptree'

    def __init__(self, config_file, *args, **kwargs):
        super(FtpTreeSpider, self).__init__(*args, **kwargs)
        # The config file is a JSON document with 'host', 'root_path'
        # and 'id' keys.
        with open(config_file, 'r') as ip:
            config = json.loads(ip.read())
        url = 'ftp://%s/%s' % (config['host'], config['root_path'])
        self.start_url = url
        self.site_id = config['id']

    def start_requests(self):
        yield AnonFtpRequest(self.start_url)

    def parse(self, response):
        url = urlparse(response.url)
        basepath = url.path
        # The body is the JSON directory listing produced by
        # FtpListingHandler above.
        files = json.loads(response.body)
        for f in files:
            if f['filetype'] == 'd':
                # Directory: recurse into it.
                path = os.path.join(response.url, f['filename'])
                request = AnonFtpRequest(path)
                yield request
            if f['filetype'] == '-':
                # Regular file: emit an item describing the leaf.
                path = os.path.join(basepath, f['filename'])
                result = FtpTreeLeaf(
                    filename=f['filename'], path=path, size=f['size'])
                yield result
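
The FtpTreeLeaf item imported from ftptree_crawler.items is not shown in the answer; a minimal sketch of what it presumably looks like, with the field names taken from the parse method above:

from scrapy import Item, Field

class FtpTreeLeaf(Item):
    # One regular file ("leaf") found while walking the FTP tree.
    filename = Field()
    path = Field()
    size = Field()

Judging from __init__, a hypothetical config.json could be {"id": "palm-beach", "host": "ftp.co.palm-beach.fl.us", "root_path": "Building%20Permits"}, passed on the command line with scrapy crawl ftptree -a config_file=config.json.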
If you need more information, have a look at the link.


Because that is not how you pass a username and password to an FTP server. Use the old-style connection URL format instead:

protocol://username:password@host:port

I don't know the host and port of the FTP server you want to access, but you certainly do: you are accessing

ftp://ftp.co.palm-beach.fl.us/Building%20Permits/

so the host is ftp.co.palm-beach.fl.us, and the correct way to log in to that FTP server is to use

ftp://the_username:the_password@ftp.co.palm-beach.fl.us/Building%20Permits/
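
Applied to the spider from the question, this suggestion boils down to something like the following (a sketch; note that older Scrapy versions read credentials only from request.meta's ftp_user/ftp_password keys, so verify that your version actually parses them out of the URL):

import scrapy

class LatestPermitsSpider(scrapy.Spider):
    name = "latest_permits"
    allowed_domains = ["ftp.co.palm-beach.fl.us"]
    # Credentials embedded in the URL, as suggested above.
    start_urls = [
        "ftp://the_username:the_password@ftp.co.palm-beach.fl.us/Building%20Permits/"
    ]

    def parse(self, response):
        print(response.body)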