How do I get the line count of a large file cheaply in Python?

I need to get the line count of a large file (hundreds of thousands of lines) in Python. What is the most efficient way, both memory- and time-wise?

Currently I do:

def file_len(fname):
    with open(fname) as f:
        for i, l in enumerate(f):
            pass
    return i + 1

Is it possible to do any better?

You can spawn a subprocess and run wc -l filename:

import subprocess

def file_len(fname):
    p = subprocess.Popen(['wc', '-l', fname], stdout=subprocess.PIPE, 
                                              stderr=subprocess.PIPE)
    result, err = p.communicate()
    if p.returncode != 0:
        raise IOError(err)
    return int(result.strip().split()[0])

You can't get any better than that.

After all, any solution will have to read the entire file, figure out how many \n characters you have, and return that result.

Do you have a better way of doing that without reading the entire file? Not sure... The best solution will always be I/O-bound; the best you can do is make sure you don't use unnecessary memory, and it looks like you have that covered.

For me, this variant will be the fastest:

#!/usr/bin/env python

def main():
    f = open('filename')                  
    lines = 0
    buf_size = 1024 * 1024
    read_f = f.read # loop optimization

    buf = read_f(buf_size)
    while buf:
        lines += buf.count('\n')
        buf = read_f(buf_size)

    print lines

if __name__ == '__main__':
    main()

Reason: buffered reading is faster than reading line by line, and string.count is also very fast.

The result of opening a file is an iterator, which can be converted to a sequence, which has a length:

with open(filename) as f:
   return len(list(f))
This is more concise than an explicit loop and avoids enumerate. (Note, though, that it builds a list of every line in memory, so it's not ideal for very large files.)

What about this?

import itertools

def file_len(fname):
    counts = itertools.count()
    with open(fname) as f:
        for _ in f:
            counts.next()  # Python 2; on Python 3 use next(counts)
    return counts.next()

Why not read the first 100 and the last 100 lines and estimate the average line length, then divide the total file size by that number? If you don't need an exact value, this could work.
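
A minimal sketch of that estimation idea, assuming reasonably uniform line lengths (the function name and sample size are illustrative, and for simplicity it samples only from the start of the file):

import os
from itertools import islice

def estimate_line_count(fname, sample_lines=100):
    # Average the length of the first sample_lines lines,
    # then extrapolate from the total file size.
    with open(fname, 'rb') as f:
        sample = [len(line) for line in islice(f, sample_lines)]
    if not sample:
        return 0  # empty file
    avg_len = sum(sample) / float(len(sample))
    return int(os.path.getsize(fname) / avg_len)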

I believe a memory-mapped file will be the fastest solution. I tried four functions: the one posted by the OP (opcount); a simple iteration over the lines in the file (simplecount); readline with a memory-mapped file (mmap) (mapcount); and the buffered-read solution offered by Mykola Kharechko (bufcount).

I ran each function five times and averaged the runtimes for a 1.2-million-line text file.

Windows XP, Python 2.5, 2 GB RAM, 2 GHz AMD processor

Here are my results:

mapcount : 0.465599966049
simplecount : 0.756399965286
bufcount : 0.546800041199
opcount : 0.718600034714
EDIT: numbers for Python 2.6:

mapcount : 0.471799945831
simplecount : 0.634400033951
bufcount : 0.468800067902
opcount : 0.602999973297
So the buffered-read strategy seems to be the fastest for Windows/Python 2.6.

Here's the code:

from __future__ import with_statement
import time
import mmap
import random
from collections import defaultdict

def mapcount(filename):
    f = open(filename, "r+")
    buf = mmap.mmap(f.fileno(), 0)
    lines = 0
    readline = buf.readline
    while readline():
        lines += 1
    return lines

def simplecount(filename):
    lines = 0
    for line in open(filename):
        lines += 1
    return lines

def bufcount(filename):
    f = open(filename)                  
    lines = 0
    buf_size = 1024 * 1024
    read_f = f.read # loop optimization

    buf = read_f(buf_size)
    while buf:
        lines += buf.count('\n')
        buf = read_f(buf_size)

    return lines

def opcount(fname):
    with open(fname) as f:
        for i, l in enumerate(f):
            pass
    return i + 1


counts = defaultdict(list)

for i in range(5):
    for func in [mapcount, simplecount, bufcount, opcount]:
        start_time = time.time()
        assert func("big_file.txt") == 1209138
        counts[func].append(time.time() - start_time)

for key, vals in counts.items():
    print key.__name__, ":", sum(vals) / float(len(vals))

A one-liner, probably quite fast:

num_lines = sum(1 for line in open('myfile.txt'))
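
A small variation, if you want the file closed deterministically instead of leaving that to the garbage collector:

with open('myfile.txt') as f:
    num_lines = sum(1 for line in f)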

To round out the above methods, I tried a variant using the fileinput module:

import fileinput as fi   
def filecount(fname):
        for line in fi.input(fname):
            pass
        return fi.lineno()
and passed a 60-million-line file to all the above methods:

mapcount : 6.1331050396
simplecount : 4.588793993
opcount : 4.42918205261
filecount : 43.2780818939
bufcount : 0.170812129974
It's a bit surprising to me that fileinput is this bad, and that it scales so much worse than all the other methods...

count = max(enumerate(open(filename)))[0] + 1

How about this?

import fileinput
import sys

counter=0
for line in fileinput.input([sys.argv[1]]):
    counter+=1

fileinput.close()
print counter

Here's a Python program that uses the multiprocessing library to distribute the line counting across machines/cores. My test improved counting a 20-million-line file from 26 seconds to 7 seconds using an 8-core Windows 64 server. Note: not using memory mapping makes things much slower.

import multiprocessing, sys, time, os, mmap
import logging, logging.handlers

def init_logger(pid):
    console_format = 'P{0} %(levelname)s %(message)s'.format(pid)
    logger = logging.getLogger()  # New logger at root level
    logger.setLevel( logging.INFO )
    logger.handlers.append( logging.StreamHandler() )
    logger.handlers[0].setFormatter( logging.Formatter( console_format, '%d/%m/%y %H:%M:%S' ) )

def getFileLineCount( queues, pid, processes, file1 ):
    init_logger(pid)
    logging.info( 'start' )

    physical_file = open(file1, "r")
    #  mmap.mmap(fileno, length[, tagname[, access[, offset]]]

    m1 = mmap.mmap( physical_file.fileno(), 0, access=mmap.ACCESS_READ )

    #work out file size to divide up line counting

    fSize = os.stat(file1).st_size
    chunk = (fSize / processes) + 1

    lines = 0

    #get where I start and stop
    _seedStart = chunk * (pid)
    _seekEnd = chunk * (pid+1)
    seekStart = int(_seedStart)
    seekEnd = int(_seekEnd)

    if seekEnd < int(_seekEnd + 1):
        seekEnd += 1

    if _seedStart < int(seekStart + 1):
        seekStart += 1

    if seekEnd > fSize:
        seekEnd = fSize

    #find where to start
    if pid > 0:
        m1.seek( seekStart )
        #read next line
        l1 = m1.readline()  # need to use readline with memory mapped files
        seekStart = m1.tell()

    #tell previous rank my seek start to make their seek end

    if pid > 0:
        queues[pid-1].put( seekStart )
    if pid < processes-1:
        seekEnd = queues[pid].get()

    m1.seek( seekStart )
    l1 = m1.readline()

    while len(l1) > 0:
        lines += 1
        l1 = m1.readline()
        if m1.tell() > seekEnd or len(l1) == 0:
            break

    logging.info( 'done' )
    # add up the results
    if pid == 0:
        for p in range(1,processes):
            lines += queues[0].get()
        queues[0].put(lines) # the total lines counted
    else:
        queues[0].put(lines)

    m1.close()
    physical_file.close()

if __name__ == '__main__':
    init_logger( 'main' )
    if len(sys.argv) > 1:
        file_name = sys.argv[1]
    else:
        logging.fatal( 'parameters required: file-name [processes]' )
        exit()

    t = time.time()
    processes = multiprocessing.cpu_count()
    if len(sys.argv) > 2:
        processes = int(sys.argv[2])
    queues=[] # a queue for each process
    for pid in range(processes):
        queues.append( multiprocessing.Queue() )
    jobs=[]
    prev_pipe = 0
    for pid in range(processes):
        p = multiprocessing.Process( target = getFileLineCount, args=(queues, pid, processes, file_name,) )
        p.start()
        jobs.append(p)

    jobs[0].join() #wait for counting to finish
    lines = queues[0].get()

    logging.info( 'finished {} Lines:{}'.format( time.time() - t, lines ) )
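
Assuming the script above is saved as, say, count_lines.py (the name is just illustrative), it takes the file to count plus an optional process count on the command line:

python count_lines.py big_file.txt 8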
I modified the buffered-read case like this:

def CountLines(filename):
    f = open(filename)
    try:
        lines = 0
        buf_size = 1024 * 1024
        read_f = f.read # loop optimization
        buf = read_f(buf_size)

        # Empty file
        if not buf:
            return 0

        while buf:
            lines += buf.count('\n')
            last = buf
            buf = read_f(buf_size)

        # A final line without a trailing '\n' still counts as a line
        if not last.endswith('\n'):
            lines += 1

        return lines
    finally:
        f.close()
Now empty files and a final line without a trailing \n are counted correctly, too.
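
A quick sanity check of the edge cases mentioned (tmp.txt is just an illustrative name):

with open('tmp.txt', 'w') as f:
    f.write('a\nb')          # two lines, no trailing newline
print CountLines('tmp.txt')  # -> 2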

I got a small (4-8%) improvement with this version, which re-uses a constant buffer, so it should avoid any memory or GC overhead:

lines = 0
buffer = bytearray(2048)
with open(filename, 'rb') as f:  # binary mode is required for readinto
    n = f.readinto(buffer)
    while n > 0:
        # count only within the bytes actually read on this pass
        lines += buffer.count(b'\n', 0, n)
        n = f.readinto(buffer)

You can play with the buffer size and may see some improvement.

How about this one-liner:

file_length = len(open('myfile.txt','r').read().split('\n'))

Timing it with this method takes 0.003 seconds:

def c():
    import time
    s = time.time()
    file_length = len(open('myfile.txt','r').read().split('\n'))
    print time.time() - s
with open(input_file) as foo:
    lines = len(foo.readlines())

print open('file.txt', 'r').read().count("\n") + 1

def line_count(path):
    count = 0
    with open(path) as lines:
        for count, l in enumerate(lines, start=1):
            pass
    return count

import os
print os.popen("wc -l file_path").readline().split()[0]

num_lines = sum(1 for line in open('my_file.txt'))

num_lines = len(open('my_file.txt').read().splitlines())
In [20]: timeit sum(1 for line in open('Charts.ipynb'))
100000 loops, best of 3: 9.79 µs per loop

In [21]: timeit len(open('Charts.ipynb').read().splitlines())
100000 loops, best of 3: 12 µs per loop
def rawcount(filename):
    f = open(filename, 'rb')
    lines = 0
    buf_size = 1024 * 1024
    read_f = f.raw.read

    buf = read_f(buf_size)
    while buf:
        lines += buf.count(b'\n')
        buf = read_f(buf_size)

    return lines

def _make_gen(reader):
    b = reader(1024 * 1024)
    while b:
        yield b
        b = reader(1024*1024)

def rawgencount(filename):
    f = open(filename, 'rb')
    f_gen = _make_gen(f.raw.read)
    return sum(buf.count(b'\n') for buf in f_gen)

from itertools import takewhile, repeat

def rawincount(filename):
    f = open(filename, 'rb')
    bufgen = takewhile(lambda x: x, (f.raw.read(1024*1024) for _ in repeat(None)))
    return sum( buf.count(b'\n') for buf in bufgen )
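
The takewhile/repeat combination plays the same role as the sentinel form of iter: (f.raw.read(1024*1024) for _ in repeat(None)) is an endless stream of reads, and takewhile cuts it off at the first empty bytes object at end of file.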
Timing results (this benchmark also includes a few variants, such as wccount, itercount, and kylecount, whose definitions are not shown here):

function      average, s  min, s   ratio
rawincount        0.0043  0.0041   1.00
rawgencount       0.0044  0.0042   1.01
rawcount          0.0048  0.0045   1.09
bufcount          0.008   0.0068   1.64
wccount           0.01    0.0097   2.35
itercount         0.014   0.014    3.41
opcount           0.02    0.02     4.83
kylecount         0.021   0.021    5.05
simplecount       0.022   0.022    5.25
mapcount          0.037   0.031    7.46

num_lines = open('yourfile.ext').read().count('\n')
from functools import partial

buffer = 2**16
with open(myfile) as f:
    print sum(x.count('\n') for x in iter(partial(f.read, buffer), ''))
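
This relies on the two-argument form of the built-in iter(callable, sentinel): iter(partial(f.read, buffer), '') calls f.read(buffer) repeatedly and stops as soon as a call returns the empty-string sentinel at end of file.
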
import os
os.system("wc -l filename")
>>> os.system('wc -l *.txt')

0 bar.txt
1000 command.txt
3 test_file.txt
1003 total
import subprocess

def line_count(filename):
    return int(subprocess.check_output(['wc', '-l', filename]).split()[0])

import subprocess

def count_file_lines(file_path):
    """
    Counts the number of lines in a file using wc utility.
    :param file_path: path to file
    :return: int, no of lines
    """
    num = subprocess.check_output(['wc', '-l', file_path])
    num = num.split()  # split on whitespace; also works when check_output returns bytes
    return int(num[0])
def count_text_file_lines(path):
    with open(path, 'rt') as file:
        line_count = sum(1 for _line in file)
    return line_count
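
For example (assuming a file named my_file.txt exists):

print(count_text_file_lines('my_file.txt'))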