Python 将文本文件拆分为较小的文件
我正在尝试将原始文本文件拆分为更小的文件,具体取决于文本块或“记录”的数量。 现在代码输出一个带有最后一条记录的文本文件,但我希望(在本例中)每个文件都有一条记录。 可以使用一些帮助编辑我的函数来循环包含每条记录的列表并写入新文件 input.txtPython 将文本文件拆分为较小的文件,python,text,Python,Text,我正在尝试将原始文本文件拆分为更小的文件,具体取决于文本块或“记录”的数量。 现在代码输出一个带有最后一条记录的文本文件,但我希望(在本例中)每个文件都有一条记录。 可以使用一些帮助编辑我的函数来循环包含每条记录的列表并写入新文件 input.txt "GROUP";"DetailA1";"DetailA2";"DetailA3";"DetailA4" "PRINT";"1" "BodyA1";"BodyA2";"BodyA3" "BodyB1";"BodyB2";"BodyB3" "BodyC
"GROUP";"DetailA1";"DetailA2";"DetailA3";"DetailA4"
"PRINT";"1"
"BodyA1";"BodyA2";"BodyA3"
"BodyB1";"BodyB2";"BodyB3"
"BodyC1";"BodyC2";"BodyC3"
"BodyD1";"BodyD2";"END"
"GROUP";"DetailB1";"DetailB2";"DetailB3";"DetailB4"
"PRINT";"1"
"BodyA1";"BodyA2";"BodyA3"
"BodyB1";"BodyB2";"BodyB3"
"BodyC1";"BodyC2";"BodyC3"
"BodyD1";"BodyD2";"END"
"GROUP";"DetailC1";"DetailC2";"DetailC3";"DetailC4"
"PRINT";"1"
"BodyA1";"BodyA2";"BodyA3"
"BodyB1";"BodyB2";"BodyB3"
"BodyC1";"BodyC2";"BodyC3"
"BodyD1";"BodyD2";"END"
"GROUP";"DetailD1";"DetailD2";"DetailD3";"DetailD4"
"PRINT";"1"
"BodyA1";"BodyA2";"BodyA3"
"BodyB1";"BodyB2";"BodyB3"
"BodyC1";"BodyC2";"BodyC3"
"BodyD1";"BodyD2";"END"
"GROUP";"DetailE1";"DetailE2";"DetailE3";"DetailE4"
"PRINT";"1"
"BodyA1";"BodyA2";"BodyA3"
"BodyB1";"BodyB2";"BodyB3"
"BodyC1";"BodyC2";"BodyC3"
"BodyD1";"BodyD2";"END"
"GROUP";"DetailF1";"DetailF2";"DetailF3";"DetailF4"
"PRINT";"1"
"BodyA1";"BodyA2";"BodyA3"
"BodyB1";"BodyB2";"BodyB3"
"BodyC1";"BodyC2";"BodyC3"
"BodyD1";"BodyD2";"END"
Split.py
import re
import math
# Path to file
input_text = "input.txt"
# Empty list containing parsed lists of text
parsed = []
# Empty list containing single parsed list
lastblock = []
# define the "beginning of new block" pattern
newblockregex = re.compile('^"GROUP.*')
# Create blocks of text as list of lists
with open(input_text) as textfile:
for line in textfile.readlines():
if newblockregex.match(line.rstrip('\n')):
if lastblock:
parsed.append(lastblock)
lastblock = []
lastblock = [line.rstrip('\n')]
else:
lastblock.append(line.rstrip('\n'))
parsed.append(lastblock)
''' End of blocking of text'''
# Get total number of lines
sumlen = sum([len(rec) for leng, rec in enumerate(parsed)])
print(f"Total rows of record: {sumlen}")
# Function to calculate number of resulting files
def maxPrimeFactors (n):
# Initialize the maximum prime factor
# variable with the lowest one
maxPrime = -1
# Print the number of 2s that divide n
while n % 2 == 0:
maxPrime = 2
n >>= 1 # equivalent to n /= 2
# n must be odd at this point,
# thus skip the even numbers and
# iterate only for odd integers
for i in range(3, int(math.sqrt(n)) + 1, 2):
while n % i == 0:
maxPrime = i
n = n / i
# This condition is to handle the
# case when n is a prime number
# greater than 2
if n > 2:
maxPrime = n
return int(maxPrime)
# Count of blocks (forms/records) in the file
# Gets number of blocks in the raw file
formnum = len(parsed)
# Number of resulting files
splitsnum = maxPrimeFactors(formnum)
blocksPerFile = round(formnum/splitsnum)
print(f"There are {formnum} forms.") # Prints to user the number of blocks
print(f"Number of forms per output file: {blocksPerFile}")
# Split records into new file
def slice_per(parsed, blocksPerFile):
for record in parsed:
counter = 1
with open(f'small_file_{counter + blocksPerFile}.txt', 'w+') as output:
for L in record:
output.write(L)
counter+=1
print(f"File number {parsed.index(record)}")
output.close()
print ("Done!")
slice_per(parsed, blocksPerFile)
切片方法计数器中的更改应在循环之外
你是说每行一个文件吗?你是否做过任何测试以确保
解析的和块文件是正确的?您可能应该将其缩减为一个-仅是解析
和块文件
和切片的一个示例。为什么这个问题被否决了?它说关闭作为离题。这个话题应该是什么?还是因为它不被视为最小的可复制示例而被否决?我只是想澄清一下。我将编辑这篇文章,只作为解析
和blocksPerFile
@wwii的一个例子,Thanks@GiovaniSalazar我指的是一定数量的记录块(从文本中的“组”到“组”),因此中间所有有趣的计算。有什么建议可以让这更清楚吗?嗨,二战。不,我还没有测试,尽管我已经通过其他SO问题确认了已解析和blocksPerFile是正确的。最好的测试方法是什么?Try,Assert,除了块?我刚开始编程。举个简单的例子?试试看,那会很好哇。。。非常感谢您的快速回复@RachitKumar!通常在需要计数器时使用。您可能希望包含一个使用它的示例。
def slice_per(parsed, blocksPerFile):
counter = 1
for record in parsed:
with open('small_file_%s.txt'%(counter + blocksPerFile), 'w') as output:
for L in record:
output.write(L)
counter+=1
print("File number %s"%(parsed.index(record)))
output.close()
print ("Done!")