How to cheaply get the line count of a large file in Python?
I need to get the line count of a large file (hundreds of thousands of lines) in Python. What is the most efficient way, both memory- and time-wise?
At the moment I do:
def file_len(fname):
    with open(fname) as f:
        for i, l in enumerate(f):
            pass
    return i + 1
Is it possible to do any better?
You could execute a subprocess and run
wc -l filename
import subprocess

def file_len(fname):
    p = subprocess.Popen(['wc', '-l', fname], stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    result, err = p.communicate()
    if p.returncode != 0:
        raise IOError(err)
    return int(result.strip().split()[0])
You are not going to get any better than that. After all, any solution will have to read the entire file, figure out how many \n characters you have, and return that result.
Do you have a better way of doing it without reading the entire file? Not sure... The best solution will always be I/O-bound; the best you can do is make sure you don't use unnecessary memory, and it looks like you have that covered. For me, this variant will be the fastest:
#!/usr/bin/env python

def main():
    f = open('filename')
    lines = 0
    buf_size = 1024 * 1024
    read_f = f.read  # loop optimization

    buf = read_f(buf_size)
    while buf:
        lines += buf.count('\n')
        buf = read_f(buf_size)
    print lines

if __name__ == '__main__':
    main()
Reason: buffered reading is faster than reading line by line, and string.count is also very fast.
The result of opening a file is an iterator, which can be converted to a sequence, which has a length:
with open(filename) as f:
    return len(list(f))
This is more concise than an explicit loop and avoids enumerate (though note that list(f) holds every line in memory at once).
How about this?
import itertools

def file_len(fname):
    counts = itertools.count()
    with open(fname) as f:
        for _ in f:
            counts.next()
    return counts.next()
Why not read the first 100 and the last 100 lines and estimate the average line length, then divide the total file size by that number? If you don't need an exact value, this could work; a rough sketch of the idea is shown below.
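Not from the original answer: a minimal sketch of this estimation idea, assuming a hypothetical helper named estimate_line_count and, for simplicity, sampling only the first 100 lines:

import os
from itertools import islice

def estimate_line_count(fname, sample=100):
    # Hypothetical illustration: total file size divided by the average
    # length of the first `sample` lines. Gives an estimate, not an
    # exact count.
    total_size = os.path.getsize(fname)
    with open(fname, 'rb') as f:
        head = list(islice(f, sample))
    if not head:
        return 0
    avg_len = sum(len(line) for line in head) / float(len(head))
    return int(total_size / avg_len)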
I believe a memory-mapped file will be the fastest solution. I tried four functions: the function posted by the OP (opcount); simple iteration over the lines in the file (simplecount); readline with a memory-mapped file (mmap) (mapcount); and the buffer-read solution offered by Mykola Kharechko (bufcount).
I ran each function five times and calculated the average runtime for a 1.2-million-line text file.
Windows XP, Python 2.5, 2 GB RAM, 2 GHz AMD processor.
Here are my results:
mapcount : 0.465599966049
simplecount : 0.756399965286
bufcount : 0.546800041199
opcount : 0.718600034714
Edit: numbers for Python 2.6:
mapcount : 0.471799945831
simplecount : 0.634400033951
bufcount : 0.468800067902
opcount : 0.602999973297
So the buffer-read strategy seems to be the fastest for Windows/Python 2.6.
Here is the code:
from __future__ import with_statement
import time
import mmap
import random
from collections import defaultdict

def mapcount(filename):
    f = open(filename, "r+")
    buf = mmap.mmap(f.fileno(), 0)
    lines = 0
    readline = buf.readline
    while readline():
        lines += 1
    return lines

def simplecount(filename):
    lines = 0
    for line in open(filename):
        lines += 1
    return lines

def bufcount(filename):
    f = open(filename)
    lines = 0
    buf_size = 1024 * 1024
    read_f = f.read  # loop optimization

    buf = read_f(buf_size)
    while buf:
        lines += buf.count('\n')
        buf = read_f(buf_size)
    return lines

def opcount(fname):
    with open(fname) as f:
        for i, l in enumerate(f):
            pass
    return i + 1

counts = defaultdict(list)

for i in range(5):
    for func in [mapcount, simplecount, bufcount, opcount]:
        start_time = time.time()
        assert func("big_file.txt") == 1209138
        counts[func].append(time.time() - start_time)

for key, vals in counts.items():
    print key.__name__, ":", sum(vals) / float(len(vals))
One line, and probably quite fast:
num_lines = sum(1 for line in open('myfile.txt'))
To round out the above methods, I tried a variant with the fileinput module:
import fileinput as fi

def filecount(fname):
    for line in fi.input(fname):
        pass
    return fi.lineno()
and passed a 60-million-line file to all of the above methods:
mapcount : 6.1331050396
simplecount : 4.588793993
opcount : 4.42918205261
filecount : 43.2780818939
bufcount : 0.170812129974
It is a little surprising to me that fileinput is that bad, and scales so much worse than all the other methods...

count = max(enumerate(open(filename)))[0]

(Note that enumerate starts at 0, so this gives one less than the actual line count.)
How about this?
import fileinput
import sys

counter = 0
for line in fileinput.input([sys.argv[1]]):
    counter += 1
fileinput.close()
print counter
Here is a Python program that uses the multiprocessing library to distribute the line counting across machines/cores. My test improved counting a 20-million-line file from 26 seconds to 7 seconds using an 8-core Windows 64 server. Note: not using memory mapping makes things much slower.
import multiprocessing, sys, time, os, mmap
import logging, logging.handlers

def init_logger(pid):
    console_format = 'P{0} %(levelname)s %(message)s'.format(pid)
    logger = logging.getLogger()  # new logger at root level
    logger.setLevel(logging.INFO)
    logger.handlers.append(logging.StreamHandler())
    logger.handlers[0].setFormatter(logging.Formatter(console_format, '%d/%m/%y %H:%M:%S'))

def getFileLineCount(queues, pid, processes, file1):
    init_logger(pid)
    logging.info('start')

    physical_file = open(file1, "r")
    # mmap.mmap(fileno, length[, tagname[, access[, offset]]])
    m1 = mmap.mmap(physical_file.fileno(), 0, access=mmap.ACCESS_READ)

    # work out file size to divide up line counting
    fSize = os.stat(file1).st_size
    chunk = (fSize / processes) + 1
    lines = 0

    # work out where I start and stop
    _seedStart = chunk * (pid)
    _seekEnd = chunk * (pid + 1)
    seekStart = int(_seedStart)
    seekEnd = int(_seekEnd)

    if seekEnd < int(_seekEnd + 1):
        seekEnd += 1
    if _seedStart < int(seekStart + 1):
        seekStart += 1
    if seekEnd > fSize:
        seekEnd = fSize

    # find where to start
    if pid > 0:
        m1.seek(seekStart)
        # read the next whole line
        l1 = m1.readline()  # need to use readline with memory-mapped files
        seekStart = m1.tell()

    # tell the previous rank my seek start to make their seek end
    if pid > 0:
        queues[pid - 1].put(seekStart)
    if pid < processes - 1:
        seekEnd = queues[pid].get()

    m1.seek(seekStart)
    l1 = m1.readline()

    while len(l1) > 0:
        lines += 1
        l1 = m1.readline()
        if m1.tell() > seekEnd or len(l1) == 0:
            break

    logging.info('done')

    # add up the results
    if pid == 0:
        for p in range(1, processes):
            lines += queues[0].get()
        queues[0].put(lines)  # the total lines counted
    else:
        queues[0].put(lines)

    m1.close()
    physical_file.close()

if __name__ == '__main__':
    init_logger('main')
    if len(sys.argv) > 1:
        file_name = sys.argv[1]
    else:
        logging.fatal('parameters required: file-name [processes]')
        exit()

    t = time.time()
    processes = multiprocessing.cpu_count()
    if len(sys.argv) > 2:
        processes = int(sys.argv[2])

    queues = []  # a queue for each process
    for pid in range(processes):
        queues.append(multiprocessing.Queue())

    jobs = []
    prev_pipe = 0
    for pid in range(processes):
        p = multiprocessing.Process(target=getFileLineCount, args=(queues, pid, processes, file_name,))
        p.start()
        jobs.append(p)

    jobs[0].join()  # wait for counting to finish
    lines = queues[0].get()
    logging.info('finished {} Lines:{}'.format(time.time() - t, lines))
I modified the buffer-count case like this:
def CountLines(filename):
    f = open(filename)
    try:
        lines = 1
        buf_size = 1024 * 1024
        read_f = f.read  # loop optimization
        buf = read_f(buf_size)

        # Empty file
        if not buf:
            return 0

        while buf:
            lines += buf.count('\n')
            buf = read_f(buf_size)

        return lines
    finally:
        f.close()
Now empty files and the last line (without a trailing \n) are also counted.
I got a small (4-8%) improvement with this version, which reuses a constant buffer, so it should avoid any memory or GC overhead:
lines = 0
buffer = bytearray(2048)
with open(filename, 'rb') as f:  # binary mode, so readinto() and b'\n' work
    n = f.readinto(buffer)
    while n > 0:
        # slice to n so stale bytes from a previous read are not counted
        lines += buffer[:n].count(b'\n')
        n = f.readinto(buffer)
You can play around with the buffer size and maybe see a little improvement; a small timing harness for that is sketched below.
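Not part of the original answer: a minimal harness for comparing buffer sizes, assuming a placeholder file name big_file.txt and reusing the counting loop above:

import timeit

def count_with_buffer(filename, size):
    # Same readinto() counting loop as above, parameterized by buffer size.
    lines = 0
    buf = bytearray(size)
    with open(filename, 'rb') as f:
        n = f.readinto(buf)
        while n > 0:
            lines += buf[:n].count(b'\n')
            n = f.readinto(buf)
    return lines

for size in (2048, 2 ** 16, 2 ** 20):
    t = timeit.timeit(lambda: count_with_buffer('big_file.txt', size), number=3)
    print("%8d bytes: %.3fs" % (size, t))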
How about this one-liner:

file_length = len(open('myfile.txt', 'r').read().split('\n'))

Timing it this way takes 0.003 sec:

def c():
    import time
    s = time.time()
    file_length = len(open('myfile.txt', 'r').read().split('\n'))
    print time.time() - s
with open(input_file) as foo:
    lines = len(foo.readlines())
print open('file.txt', 'r').read().count("\n") + 1
def line_count(path):
    count = 0
    with open(path) as lines:
        for count, l in enumerate(lines, start=1):
            pass
    return count
import os
print os.popen("wc -l file_path").readline().split()[0]
num_lines = sum(1 for line in open('my_file.txt'))
num_lines = len(open('my_file.txt').read().splitlines())
In [20]: timeit sum(1 for line in open('Charts.ipynb'))
100000 loops, best of 3: 9.79 µs per loop
In [21]: timeit len(open('Charts.ipynb').read().splitlines())
100000 loops, best of 3: 12 µs per loop
def rawcount(filename):
    f = open(filename, 'rb')
    lines = 0
    buf_size = 1024 * 1024
    read_f = f.raw.read

    buf = read_f(buf_size)
    while buf:
        lines += buf.count(b'\n')
        buf = read_f(buf_size)
    return lines

def _make_gen(reader):
    b = reader(1024 * 1024)
    while b:
        yield b
        b = reader(1024 * 1024)

def rawgencount(filename):
    f = open(filename, 'rb')
    f_gen = _make_gen(f.raw.read)
    return sum(buf.count(b'\n') for buf in f_gen)

from itertools import (takewhile, repeat)

def rawincount(filename):
    f = open(filename, 'rb')
    bufgen = takewhile(lambda x: x, (f.raw.read(1024 * 1024) for _ in repeat(None)))
    return sum(buf.count(b'\n') for buf in bufgen)
function       average, s   min, s   ratio
rawincount         0.0043   0.0041    1.00
rawgencount        0.0044   0.0042    1.01
rawcount           0.0048   0.0045    1.09
bufcount           0.008    0.0068    1.64
wccount            0.01     0.0097    2.35
itercount          0.014    0.014     3.41
opcount            0.02     0.02      4.83
kylecount          0.021    0.021     5.05
simplecount        0.022    0.022     5.25
mapcount           0.037    0.031     7.46
num_lines = open('yourfile.ext').read().count('\n')
from functools import partial

buffer = 2 ** 16
with open(myfile) as f:
    print sum(x.count('\n') for x in iter(partial(f.read, buffer), ''))
import os
os.system("wc -l filename")
>>> os.system('wc -l *.txt')
0 bar.txt
1000 command.txt
3 test_file.txt
1003 total
import subprocess

def line_count(filename):
    return int(subprocess.check_output(['wc', '-l', filename]).split()[0])
import subprocess

def count_file_lines(file_path):
    """
    Counts the number of lines in a file using the wc utility.
    :param file_path: path to file
    :return: int, number of lines
    """
    num = subprocess.check_output(['wc', '-l', file_path])
    # split() rather than split(' '), so leading whitespace in wc's output is handled
    num = num.split()
    return int(num[0])
def count_text_file_lines(path):
    with open(path, 'rt') as file:
        line_count = sum(1 for _line in file)
    return line_count