Web scraping 530尝试打开FTP目录时出错
我想使用Scrapy在Web scraping 530尝试打开FTP目录时出错,web-scraping,ftp,scrapy,scrapy-spider,Web Scraping,Ftp,Scrapy,Scrapy Spider,我想使用Scrapy在ftp://ftp.co.palm-beach.fl.us/Building%20Permits/ 这是我的蜘蛛: # -*- coding: utf-8 -*- import scrapy from scrapy.http import Request class LatestPermitsSpider(scrapy.Spider): name= "latest_permits" allowed_domains=["ftp.co.pa
ftp://ftp.co.palm-beach.fl.us/Building%20Permits/
这是我的蜘蛛:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
class LatestPermitsSpider(scrapy.Spider):
    """Spider that tries to fetch the Building Permits directory listing
    from the Palm Beach County FTP server.

    NOTE(review): Scrapy's stock FTP handler only downloads single files;
    requesting a directory URL such as ``permitFilesDir`` fails with a
    530 error as described on this page. This revision only repairs the
    Python 3 syntax error in ``parse``; the directory-listing limitation
    needs the custom download handler shown further below.
    """

    name = "latest_permits"
    allowed_domains = ["ftp.co.palm-beach.fl.us"]
    # Let 404 responses reach parse() instead of being dropped by the
    # HttpError middleware.
    handle_httpstatus_list = [404]

    ftpUser = "the_username"
    ftpPW = "the_password"
    permitFilesDir = "ftp://ftp.co.palm-beach.fl.us/Building%20Permits/"

    def start_requests(self):
        # Credentials travel in request meta; that is how Scrapy's FTP
        # download handler expects to receive them.
        yield Request(
            url=self.permitFilesDir,
            meta={
                "ftp_user": self.ftpUser,
                "ftp_password": self.ftpPW,
            },
        )

    def parse(self, response):
        # Bug fix: ``print response.body`` is Python 2-only statement
        # syntax (SyntaxError on Python 3); the call form works on both.
        print(response.body)
当我运行 scrapy crawl latest_permits
时,出现以下错误:
ConnectionLost: ('FTP connection lost', <twisted.python.failure.Failure twisted.protocols.ftp.CommandFailed: ['530 Sorry, no ANONYMOUS access allowed.']>)
ConnectionLost:('FTP连接丢失',)
为什么即使我提供了正确的用户名和密码,也会出现此错误?请查看下面的scrapy源代码 问题不在于您的
用户名
或密码
。问题是scrapy
只支持使用ftp
下载文件,不支持列出目录。您正在使用的url是目录url
有一种可能的解决方法可以实际使用包名ftptree
使用下面的代码添加handlers.py
import json
from twisted.protocols.ftp import FTPFileListProtocol
from scrapy.http import Response
from scrapy.core.downloader.handlers.ftp import FTPDownloadHandler
class FtpListingHandler(FTPDownloadHandler):
    """Download handler that LISTs an FTP directory instead of RETRieving
    a single file, returning the listing as a JSON response body."""

    def gotClient(self, client, request, filepath):
        # Invoked by FTPDownloadHandler once the FTP client is connected
        # and authenticated; ``filepath`` is the path part of the URL.
        self.client = client
        protocol = FTPFileListProtocol()
        # LIST the directory into ``protocol.files``; on success build a
        # Response, on failure defer to the base class error handling.
        return client.list(filepath, protocol).addCallbacks(
            callback=self._build_response, callbackArgs=(request, protocol),
            errback=self._failed, errbackArgs=(request,))

    def _build_response(self, result, request, protocol):
        # ``protocol.files`` is the list of parsed LIST entries (dicts
        # with filename/filetype/size, per Twisted's FTPFileListProtocol);
        # serialize it so the spider can json.loads() the body.
        self.result = result
        body = json.dumps(protocol.files)
        return Response(url=request.url, status=200, body=body)
然后在设置.py中使用
DOWNLOAD_HANDLERS = {'ftp': 'cralwername.handlers.FtpListingHandler'}
蜘蛛标本
import os
import json
from urlparse import urlparse
from scrapy import Spider
from scrapy.http.request import Request
from ftptree_crawler.items import FtpTreeLeaf
class AnonFtpRequest(Request):
    """Request subclass that always carries anonymous FTP credentials."""

    # Meta entries consumed by Scrapy's FTP download handler.
    anon_meta = {
        'ftp_user': 'anonymous',
        'ftp_password': 'laserson@cloudera.com',
    }

    def __init__(self, *args, **kwargs):
        # Construct the request normally, then overlay the anonymous login.
        super(AnonFtpRequest, self).__init__(*args, **kwargs)
        for key, value in self.anon_meta.items():
            self.meta[key] = value
class FtpTreeSpider(Spider):
    """Walks an FTP directory tree: each subdirectory yields a follow-up
    listing request, each regular file yields a FtpTreeLeaf item."""

    name = 'ftptree'

    def __init__(self, config_file, *args, **kwargs):
        super(FtpTreeSpider, self).__init__(*args, **kwargs)
        # config_file is a JSON document with 'host', 'root_path', 'id'.
        with open(config_file, 'r') as handle:
            settings = json.loads(handle.read())
        self.start_url = 'ftp://%s/%s' % (settings['host'],
                                          settings['root_path'])
        self.site_id = settings['id']

    def start_requests(self):
        yield AnonFtpRequest(self.start_url)

    def parse(self, response):
        # Response body is the JSON listing produced by the custom
        # FTP directory-listing download handler.
        parent = urlparse(response.url).path
        for entry in json.loads(response.body):
            kind = entry['filetype']
            if kind == 'd':
                # Directory: descend with another listing request.
                yield AnonFtpRequest(
                    os.path.join(response.url, entry['filename']))
            if kind == '-':
                # Regular file: emit an item describing the leaf.
                yield FtpTreeLeaf(
                    filename=entry['filename'],
                    path=os.path.join(parent, entry['filename']),
                    size=entry['size'])
如果需要更多信息,请查看链接
查看下面的scrapy源代码
问题不在于您的用户名
或密码
。问题是scrapy
只支持使用ftp
下载文件,不支持列出目录。您正在使用的url是目录url
有一种可能的解决方法可以实际使用包名ftptree
使用下面的代码添加handlers.py
import json
from twisted.protocols.ftp import FTPFileListProtocol
from scrapy.http import Response
from scrapy.core.downloader.handlers.ftp import FTPDownloadHandler
class FtpListingHandler(FTPDownloadHandler):
    """FTP download handler that returns a directory listing (as JSON)
    rather than downloading a single file."""

    def gotClient(self, client, request, filepath):
        # The connected, authenticated FTP client hands control here.
        self.client = client
        listing = FTPFileListProtocol()
        deferred = client.list(filepath, listing)
        # Success -> wrap the parsed listing in a Response;
        # failure -> reuse the base class error path.
        return deferred.addCallbacks(
            callback=self._build_response,
            callbackArgs=(request, listing),
            errback=self._failed,
            errbackArgs=(request,))

    def _build_response(self, result, request, protocol):
        # Serialize the parsed LIST entries so the spider can
        # json.loads() the response body.
        self.result = result
        return Response(url=request.url, status=200,
                        body=json.dumps(protocol.files))
然后在设置.py中使用
DOWNLOAD_HANDLERS = {'ftp': 'cralwername.handlers.FtpListingHandler'}
蜘蛛标本
import os
import json
from urlparse import urlparse
from scrapy import Spider
from scrapy.http.request import Request
from ftptree_crawler.items import FtpTreeLeaf
class AnonFtpRequest(Request):
    """Request that automatically supplies anonymous FTP credentials."""

    # Meta keys read by Scrapy's FTP download handler.
    anon_meta = {'ftp_user': 'anonymous',
                 'ftp_password': 'laserson@cloudera.com'}

    def __init__(self, *args, **kwargs):
        super(AnonFtpRequest, self).__init__(*args, **kwargs)
        # Merge the anonymous credentials into this request's meta.
        self.meta.update(self.anon_meta)
class FtpTreeSpider(Spider):
    """Recursively crawls an FTP tree: directories become new listing
    requests, regular files become FtpTreeLeaf items.

    Expects a download handler (like the FtpListingHandler shown on this
    page) that returns each directory listing as a JSON array in the
    response body.
    """

    name = 'ftptree'

    def __init__(self, config_file, *args, **kwargs):
        super(FtpTreeSpider, self).__init__(*args, **kwargs)
        # config_file is a JSON document with 'host', 'root_path', 'id'.
        with open(config_file, 'r') as ip:
            config = json.loads(ip.read())
        url = 'ftp://%s/%s' % (config['host'], config['root_path'])
        self.start_url = url
        self.site_id = config['id']

    def start_requests(self):
        yield AnonFtpRequest(self.start_url)

    def parse(self, response):
        url = urlparse(response.url)
        basepath = url.path
        # Body is the JSON listing produced by the custom FTP handler.
        files = json.loads(response.body)
        for f in files:
            if f['filetype'] == 'd':
                # Directory entry: queue a listing request for it.
                path = os.path.join(response.url, f['filename'])
                request = AnonFtpRequest(path)
                yield request
            if f['filetype'] == '-':
                # Regular file: emit an item with its path and size.
                path = os.path.join(basepath, f['filename'])
                result = FtpTreeLeaf(
                    filename=f['filename'], path=path, size=f['size'])
                yield result
如果需要更多信息,请查看链接
因为这不是向FTP传递用户名和密码的方式。应改用旧式的 protocol://username:password@host:port URL 格式。“我不知道我要访问的FTP的主机和端口”——主机是 ftp.co.palm-beach.fl.us
,您当然知道。您正在访问ftp://ftp.co.palm-beach.fl.us/Building%20Permits/
因此,登录该FTP的正确方法是使用 ftp://the_username:the_password@ftp.co.palm-beach.fl.us/Building%20Permits/
因为这不是向FTP传递用户名和密码的方式。应改用旧式的 protocol://username:password@host:port URL 格式。“我不知道我要访问的FTP的主机和端口”——主机是 ftp.co.palm-beach.fl.us
,您当然知道。您正在访问ftp://ftp.co.palm-beach.fl.us/Building%20Permits/
因此,登录该FTP的正确方法是使用 ftp://the_username:the_password@ftp.co.palm-beach.fl.us/Building%20Permits/