Python: read a ZIP file from S3 without downloading the entire file


We have ZIP files that are 5-10 GB in size. A typical ZIP file contains 5-10 internal files, each 1-5 GB in size.

I have a nice set of Python tools for reading these files. Basically, I can open a filename and, if it is a ZIP file, the tools search inside the ZIP and open the compressed member. It is all fairly transparent.

I want to store these files on Amazon S3 as compressed files. I can fetch byte ranges of S3 objects, so it should be possible to fetch the ZIP central directory (it is at the end of the file, so I can just read the last 64 KiB), find the component I want, download just that, and stream it directly to the calling process.


So my question is: how do I do that through the standard Python ZipFile API? There is no documentation on replacing the filesystem transport with an arbitrary object that supports POSIX semantics. Is this possible without rewriting the module?
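For concreteness, the range reads mentioned above look roughly like this with boto3 (the bucket and key names are placeholders):

import boto3

s3 = boto3.client('s3')

def fetch_range(bucket, key, start, length):
    # HTTP Range GET: download only bytes [start, start+length-1] of the object
    end = start + length - 1
    resp = s3.get_object(Bucket=bucket, Key=key, Range="bytes={}-{}".format(start, end))
    return resp['Body'].read()

# e.g. grab the last 64 KiB, where the ZIP end-of-central-directory record lives
size = s3.head_object(Bucket="my-bucket", Key="archive.zip")['ContentLength']
tail = fetch_range("my-bucket", "archive.zip", max(0, size - 65536), 65536)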

The code below lets you open a file on Amazon S3 as if it were a normal file. Note that I use the aws command-line tool rather than the boto3 Python module (I do not have access to boto3). You can open the file and seek within it; the file is cached locally. If you open it with the Python ZipFile API and it is a ZIP file, you can then read the individual parts (see the usage sketch after the listing). You cannot write, however, because S3 does not support partial writes.

In addition, I implemented s3open(), which opens an S3 object of any size for streaming reads or writes (but cannot seek).

from urllib.parse import urlparse
from subprocess import run,Popen,PIPE
import copy
import json
import os
import tempfile

# Tools for reading and writing files from Amazon S3 without boto or boto3
# http://boto.cloudhackers.com/en/latest/s3_tut.html
# but it is easier to use the aws cli, since it's configured to work.

def s3open(path, mode="r", encoding=None):
    """
    Open an s3 file for reading or writing. Can handle any size, but cannot seek.
    We could use boto.
    http://boto.cloudhackers.com/en/latest/s3_tut.html
    but it is easier to use the aws cli, since it is present and more likely to work.
    """
    from subprocess import run,PIPE,Popen
    if "b" in mode:
        assert encoding == None
    else:
        if encoding==None:
            encoding="utf-8"
    assert 'a' not in mode
    assert '+' not in mode

    if "r" in mode:
        p = Popen(['aws','s3','cp',path,'-'],stdout=PIPE,encoding=encoding)
        return p.stdout

    elif "w" in mode:
        p = Popen(['aws','s3','cp','-',path],stdin=PIPE,encoding=encoding)
        return p.stdin
    else:
        raise RuntimeError("invalid mode:{}".format(mode))
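
# Example usage of s3open() (hypothetical URL) -- it just wraps "aws s3 cp", so the
# returned pipe can be iterated like any file object:
#   for line in s3open("s3://my-bucket/logs/app.log"):
#       print(line, end="")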




CACHE_SIZE=4096                 # big enough for front and back caches
MAX_READ=65536*16
debug=False
class S3File:
    """Open an S3 file that can be seeked. This is done by caching to the local file system."""
    def __init__(self,name,mode='rb'):
        self.name   = name
        self.url    = urlparse(name)
        if self.url.scheme != 's3':
            raise RuntimeError("url scheme is {}; expecting s3".format(self.url.scheme))
        self.bucket = self.url.netloc
        self.key    = self.url.path[1:]
        self.fpos   = 0
        self.tf     = tempfile.NamedTemporaryFile()
        cmd = ['aws','s3api','list-objects','--bucket',self.bucket,'--prefix',self.key,'--output','json']
        data = json.loads(Popen(cmd,encoding='utf8',stdout=PIPE).communicate()[0])
        file_info = data['Contents'][0]
        self.length = file_info['Size']
        self.ETag   = file_info['ETag']

        # Load the caches

        self.frontcache = self._readrange(0,CACHE_SIZE) # cache the first CACHE_SIZE bytes of the file
        if self.length > CACHE_SIZE:
            self.backcache_start = self.length-CACHE_SIZE
            if debug: print("backcache starts at {}".format(self.backcache_start))
            self.backcache  = self._readrange(self.backcache_start,CACHE_SIZE)
        else:
            self.backcache  = None

    def _readrange(self,start,length):
        # This is gross; we copy everything to the named temporary file, rather than a pipe
        # because the pipes weren't showing up in /dev/fd/?
        # We probably want to cache also... That's coming
        cmd = ['aws','s3api','get-object','--bucket',self.bucket,'--key',self.key,'--output','json',
               '--range','bytes={}-{}'.format(start,start+length-1),self.tf.name]
        if debug:print(cmd)
        data = json.loads(Popen(cmd,encoding='utf8',stdout=PIPE).communicate()[0])
        if debug:print(data)
        self.tf.seek(0)         # go to the beginning of the data just read
        return self.tf.read(length) # and read that much

    def __repr__(self):
        return "FakeFile<name:{} url:{}>".format(self.name,self.url)

    def read(self,length=-1):
        # If length==-1, figure out the max we can read to the end of the file
        if length==-1:
            length = min(MAX_READ, self.length - self.fpos + 1)

        if debug:
            print("read: fpos={}  length={}".format(self.fpos,length))
        # Can we satisfy from the front cache?
        if self.fpos < CACHE_SIZE and self.fpos+length < CACHE_SIZE:
            if debug:print("front cache")
            buf = self.frontcache[self.fpos:self.fpos+length]
            self.fpos += len(buf)
            if debug:print("return 1: buf=",buf)
            return buf

        # Can we satisfy from the back cache?
        if self.backcache and (self.length - CACHE_SIZE < self.fpos):
            if debug:print("back cache")
            buf = self.backcache[self.fpos - self.backcache_start:self.fpos - self.backcache_start + length]
            self.fpos += len(buf)
            if debug:print("return 2: buf=",buf)
            return buf

        buf = self._readrange(self.fpos, length)
        self.fpos += len(buf)
        if debug:print("return 3: buf=",buf)
        return buf

    def seek(self,offset,whence=0):
        if debug:print("seek({},{})".format(offset,whence))
        if whence==0:
            self.fpos = offset
        elif whence==1:
            self.fpos += offset
        elif whence==2:
            self.fpos = self.length + offset
        else:
            raise RuntimeError("whence={}".format(whence))
        if debug:print("   ={}  (self.length={})".format(self.fpos,self.length))

    def tell(self):
        return self.fpos

    def write(self):
        raise RuntimeError("Write not supported")

    def flush(self):
        raise RuntimeError("Flush not supported")

    def close(self):
        return
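
A minimal usage sketch (not part of the original listing; the object URL is a placeholder): because S3File supports read/seek/tell, it can be passed straight to zipfile.ZipFile, which then range-reads only the central directory and the members you actually open.

import zipfile

with zipfile.ZipFile(S3File("s3://my-bucket/big.zip")) as zf:
    print(zf.namelist())                      # typically served from the cached tail of the object
    with zf.open(zf.namelist()[0]) as member:
        chunk = member.read(1024 * 1024)      # streams ranged reads of the first member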
import sys
import zlib
import zipfile
import io

import boto
from boto.s3.connection import OrdinaryCallingFormat


# range-fetches an S3 key
def fetch(key, start, len):
    end = start + len - 1
    return key.get_contents_as_string(headers={"Range": "bytes=%d-%d" % (start, end)})


# parses 2 or 4 little-endian bytes into their corresponding integer value
def parse_int(data):
    val = data[0] + (data[1] << 8)
    if len(data) > 3:
        val += (data[2] << 16) + (data[3] << 24)
    return val


"""
bucket: name of the bucket
key:    path to zipfile inside bucket
entry:  pathname of zip entry to be retrieved (path/to/subdir/file.name)    
"""

# OrdinaryCallingFormat prevents certificate errors on bucket names with dots
# https://stackoverflow.com/questions/51604689/read-zip-files-from-amazon-s3-using-boto3-and-python#51605244
_bucket = boto.connect_s3(calling_format=OrdinaryCallingFormat()).get_bucket(bucket)
_key = _bucket.get_key(key)

# fetch the last 22 bytes (end-of-central-directory record; assuming the comment field is empty)
size = _key.size
eocd = fetch(_key, size - 22, 22)

# start offset and size of the central directory
cd_start = parse_int(eocd[16:20])
cd_size = parse_int(eocd[12:16])

# fetch central directory, append EOCD, and open as zipfile!
cd = fetch(_key, cd_start, cd_size)
zip = zipfile.ZipFile(io.BytesIO(cd + eocd))


for zi in zip.filelist:
    if zi.filename == entry:
        # read the file-name length and extra-field length from the local file header
        # (bytes 26-29) so we can reliably skip past the file name and extra fields

        # in our "mock" zipfile, `header_offset`s are negative (probably because the leading content is missing)
        # so we have to add to it the CD start offset (`cd_start`) to get the actual offset

        file_head = fetch(_key, cd_start + zi.header_offset + 26, 4)
        name_len = parse_int(file_head[0:2])
        extra_len = parse_int(file_head[2:4])

        content = fetch(_key, cd_start + zi.header_offset + 30 + name_len + extra_len, zi.compress_size)

        # now `content` has the file entry you were looking for!
        # you should probably decompress it in context before passing it to some other program

        if zi.compress_type == zipfile.ZIP_DEFLATED:
            print(zlib.decompressobj(-15).decompress(content))
        else:
            print(content)
        break
import boto3
import io
import struct
import zipfile

s3 = boto3.client('s3')

EOCD_RECORD_SIZE = 22
ZIP64_EOCD_RECORD_SIZE = 56
ZIP64_EOCD_LOCATOR_SIZE = 20

MAX_STANDARD_ZIP_SIZE = 4_294_967_295

def lambda_handler(event, context=None):
    bucket = event['bucket']
    key = event['key']
    zip_file = get_zip_file(bucket, key)
    print_zip_content(zip_file)

def get_zip_file(bucket, key):
    file_size = get_file_size(bucket, key)
    eocd_record = fetch(bucket, key, file_size - EOCD_RECORD_SIZE, EOCD_RECORD_SIZE)
    if file_size <= MAX_STANDARD_ZIP_SIZE:
        cd_start, cd_size = get_central_directory_metadata_from_eocd(eocd_record)
        central_directory = fetch(bucket, key, cd_start, cd_size)
        return zipfile.ZipFile(io.BytesIO(central_directory + eocd_record))
    else:
        zip64_eocd_record = fetch(bucket, key,
                                  file_size - (EOCD_RECORD_SIZE + ZIP64_EOCD_LOCATOR_SIZE + ZIP64_EOCD_RECORD_SIZE),
                                  ZIP64_EOCD_RECORD_SIZE)
        zip64_eocd_locator = fetch(bucket, key,
                                   file_size - (EOCD_RECORD_SIZE + ZIP64_EOCD_LOCATOR_SIZE),
                                   ZIP64_EOCD_LOCATOR_SIZE)
        cd_start, cd_size = get_central_directory_metadata_from_eocd64(zip64_eocd_record)
        central_directory = fetch(bucket, key, cd_start, cd_size)
        return zipfile.ZipFile(io.BytesIO(central_directory + zip64_eocd_record + zip64_eocd_locator + eocd_record))


def get_file_size(bucket, key):
    head_response = s3.head_object(Bucket=bucket, Key=key)
    return head_response['ContentLength']

def fetch(bucket, key, start, length):
    end = start + length - 1
    response = s3.get_object(Bucket=bucket, Key=key, Range="bytes=%d-%d" % (start, end))
    return response['Body'].read()

def get_central_directory_metadata_from_eocd(eocd):
    cd_size = parse_little_endian_to_int(eocd[12:16])
    cd_start = parse_little_endian_to_int(eocd[16:20])
    return cd_start, cd_size

def get_central_directory_metadata_from_eocd64(eocd64):
    cd_size = parse_little_endian_to_int(eocd64[40:48])
    cd_start = parse_little_endian_to_int(eocd64[48:56])
    return cd_start, cd_size

def parse_little_endian_to_int(little_endian_bytes):
    # ZIP offsets and sizes are unsigned, so use unsigned struct formats
    format_character = "I" if len(little_endian_bytes) == 4 else "Q"
    return struct.unpack("<" + format_character, little_endian_bytes)[0]

def print_zip_content(zip_file):
    files = [zi.filename for zi in zip_file.filelist]
    print(f"Files: {files}")