Python将文件拆分为多个较小的文件
编写一个名为Python将文件拆分为多个较小的文件,python,file,split,Python,File,Split,编写一个名为file\u split(文件名,文件数)的函数,将输入文件拆分为多个输出文件。文件应尽可能均匀地分割。当文件长度可被要创建的文件数平均整除时(一个10行的文件,分为2个文件,每个输出文件应该有5行。当长度不可均匀分割时,所有输出文件的长度不得大于1。例如,一个10行的文件,分为3,将有长度为3、3和4的输出文件 我已经编写了我的代码,但我无法找出如何处理大于1个部分的差异,我需要帮助修改我的代码以包含该部分。(如果最后一行不是偶数,我的代码将创建一个新文件) 未经测试: 我会使用模
file_split(文件名,文件数)
的函数,将输入文件拆分为多个输出文件。文件应尽可能均匀地分割。当文件长度可被要创建的文件数整除时(例如一个10行的文件分为2个文件),每个输出文件应该有5行。当长度不能被整除时,各输出文件的长度之差不得大于1。例如,一个10行的文件分为3个文件,输出文件的长度将为3、3和4。
我已经编写了我的代码,但我不知道如何处理“长度之差不得大于1”这一要求,我需要帮助修改我的代码以包含该部分。(如果行数不能被整除,我的代码会额外创建一个新文件。)
未经测试: 我会使用模运算
res = len(lines) % number_of_files
for lines in range(0, len(input), base_size):
if at == len(input)+res+1:
outputData = input[lines:-1]
else:
...
也就是说,只需将剩余的行转储到最后一个文件中即可。循环工作非常简单:
# Deal the input lines out round-robin: line i goes to output file
# i % number_of_files, so the output files differ in length by at most one
# line.  Note that each output file receives every number_of_files-th line,
# not a contiguous slice of the input.
# `number_of_files` must already be defined by the surrounding code.
with open('myfile.txt') as infp:
    files = [open('%d.txt' % i, 'w') for i in range(number_of_files)]
    try:
        for i, line in enumerate(infp):
            files[i % number_of_files].write(line)
    finally:
        # Close every output handle even if a write above raised.
        for f in files:
            f.close()
from __future__ import print_function
import boto3
import shutil
import os
import os.path
import urllib
import json
import urllib2
import subprocess
import linecache
import sys
# S3 client created once at import time; lambda_handler uses it for its
# get_object / put_object calls.
s3client = boto3.client('s3')
# S3 resource handle; nothing in the code shown here references it.
s3 = boto3.resource('s3')
def lambda_handler(event, context):
    """Split every S3 object named in *event* into small files.

    For each entry in ``event['Records']`` the object identified by
    ``record['s3']['bucket']['name']`` and ``record['s3']['object']['key']``
    is downloaded, cut into consecutive chunks of ``lines_per_file`` lines
    (the last chunk may be shorter) and each chunk is uploaded to the
    ``bucket-name`` bucket as ``folder-name/<stem>_<idx>.txt``.  ``<stem>`` is
    the key's last path component up to its first ``.`` and ``<idx>`` is
    ``lines_per_file * chunk_number`` (3, 6, 9, ...).

    Each chunk is an exact slice of the downloaded payload: line endings are
    kept as they were, no extra newline is added, and an empty object
    produces no output files.

    Args:
        event: Mapping with a ``Records`` list as described above.
        context: Unused.

    Returns:
        None.  Any exception is caught and printed rather than propagated.
    """
    lines_per_file = 3  # Lines on each small file
    try:
        for record in event['Records']:
            bucket = record['s3']['bucket']['name']
            key = record['s3']['object']['key']
            print(key)
            # [-1] (rather than [1]) also handles keys that contain no '/'.
            keyfile = key.rsplit('/', 1)[-1]
            print("S Object: " + keyfile + " is a FILE")
            inpfilename = keyfile
            outfilename = inpfilename.split('.', 1)[0]
            print("inpfilename :" + inpfilename)

            payload = s3client.get_object(Bucket=bucket, Key=key)["Body"].read()
            # splitlines(True) keeps each line's own terminator and does not
            # yield a phantom empty "line" after a trailing newline.  It also
            # works on the bytes payload without needing a str separator.
            lines = payload.splitlines(True)

            created_files = 0  # How many small files have been created
            for start in range(0, len(lines), lines_per_file):
                # Same numbering as lines_per_file * (created_files + 1).
                idx = start + lines_per_file
                body_contents = b''.join(lines[start:start + lines_per_file])
                file_name = "%s_%s.txt" % (outfilename, idx)
                target_file = "folder-name/" + file_name
                print(target_file)
                # NOTE(review): ACL='public-read' is passed for every chunk;
                # confirm that public access is really intended.
                s3client.put_object(
                    ACL='public-read',
                    ServerSideEncryption='AES256',
                    Bucket='bucket-name',
                    Key=target_file,
                    Body=body_contents,
                )
                created_files += 1

            print('%s small files (with %s lines each) were created.'
                  % (created_files, lines_per_file))
    except Exception as e:
        # Deliberately best-effort: report the failure instead of raising.
        print(e)
检查这个
“编写函数…”的可能重复项?我们是你的奴隶,任你发号施令吗? @Stefan Pochmann 你读了下一段吗? @Stefan Pochmann 这是课本里题目的原文,不是我自己的措辞,lol。 那你为什么不直接这么说,或者以某种方式把它标记为引用?从它的写法来看,我觉得风格和礼貌都很差,这可一点都不“lol”。 这不是他想要的,请看 zero323 的评论。 谢谢,这不完全是我需要的,但仍然很有帮助! 谢谢你,帮助很大。
import boto3
import shutil
import os
import os.path
import urllib
import json
import urllib2
import subprocess
import linecache
import sys
# S3 client created once at import time; lambda_handler uses it for its
# get_object / put_object calls.
s3client = boto3.client('s3')
# S3 resource handle; nothing in the code shown here references it.
s3 = boto3.resource('s3')
def lambda_handler(event, context):
    """Split every S3 object named in *event* into small files.

    For each entry in ``event['Records']`` the object identified by
    ``record['s3']['bucket']['name']`` and ``record['s3']['object']['key']``
    is downloaded, cut into consecutive chunks of ``lines_per_file`` lines
    (the last chunk may be shorter) and each chunk is uploaded to the
    ``bucket-name`` bucket as ``folder-name/<stem>_<idx>.txt``.  ``<stem>`` is
    the key's last path component up to its first ``.`` and ``<idx>`` is
    ``lines_per_file * chunk_number`` (3, 6, 9, ...).

    Each chunk is an exact slice of the downloaded payload: line endings are
    kept as they were, no extra newline is added, and an empty object
    produces no output files.

    Args:
        event: Mapping with a ``Records`` list as described above.
        context: Unused.

    Returns:
        None.  Any exception is caught and printed rather than propagated.
    """
    lines_per_file = 3  # Lines on each small file
    try:
        for record in event['Records']:
            bucket = record['s3']['bucket']['name']
            key = record['s3']['object']['key']
            print(key)
            # [-1] (rather than [1]) also handles keys that contain no '/'.
            keyfile = key.rsplit('/', 1)[-1]
            print("S Object: " + keyfile + " is a FILE")
            inpfilename = keyfile
            outfilename = inpfilename.split('.', 1)[0]
            print("inpfilename :" + inpfilename)

            payload = s3client.get_object(Bucket=bucket, Key=key)["Body"].read()
            # splitlines(True) keeps each line's own terminator and does not
            # yield a phantom empty "line" after a trailing newline.  It also
            # works on the bytes payload without needing a str separator.
            lines = payload.splitlines(True)

            created_files = 0  # How many small files have been created
            for start in range(0, len(lines), lines_per_file):
                # Same numbering as lines_per_file * (created_files + 1).
                idx = start + lines_per_file
                body_contents = b''.join(lines[start:start + lines_per_file])
                file_name = "%s_%s.txt" % (outfilename, idx)
                target_file = "folder-name/" + file_name
                print(target_file)
                # NOTE(review): ACL='public-read' is passed for every chunk;
                # confirm that public access is really intended.
                s3client.put_object(
                    ACL='public-read',
                    ServerSideEncryption='AES256',
                    Bucket='bucket-name',
                    Key=target_file,
                    Body=body_contents,
                )
                created_files += 1

            print('%s small files (with %s lines each) were created.'
                  % (created_files, lines_per_file))
    except Exception as e:
        # Deliberately best-effort: report the failure instead of raising.
        print(e)
--------------------------------------------------------------
Usage: SplitAndCombine.py [-h] [-i INPUT] [-s] [-n CHUNK] [-m]
optional arguments:
-h, --help show this help message and exit
-i INPUT, --input INPUT
Provide the File that needs to be Split
-s, --split To Split the File
-n CHUNK, --chunk CHUNK
[n]: No. of files to be created
[n]kb : Split the file in nKB size
[n]b : Split the file in nb size
[n]mb : Split the file in nmb size
[n]gb : Split the file in ngb size
-m, --merge Merge the Files