Python: split a large file into multiple files based on line content


Input from a stream:

tag         id
tag0000001  12312
tag0000002  12
tag0000003  3
tag0000004  8
tag0000005  12312
tag0000006  12312
... ...
Preferred output (can be in a single file or in multiple files): the tags grouped by tag ID, for example one {tag_id}.tsv file per ID containing all of that ID's tags.

Other info:

  • Input is piped in from another program (which reads from a binary file, so the input can be read multiple times)
  • The input has hundreds of millions of lines
  • The number of input lines is known
  • All output files / distinct tag IDs are known (tens of thousands of them)
  • The number of tags per output file varies a lot (from a few to millions)
What I have tried so far:

from queue import Queue, Empty
from threading import Thread
import fileinput
import fcntl
import os

num_threads = 10
total_in_memory = 1000000
single_in_memory_count = int(total_in_memory / num_threads)

def process_tag_queue(q):
    # Each thread gets a line from the queue, saves it into a dict and flushes
    # the dict into a file when single_in_memory_count is reached
    current_in_mem_count = 0
    tag_dict = {}
    while True:
        try:
            tag, tag_id = q.get().rstrip().split("\t")
        except Empty:
            break

        if tag_id in tag_dict:
            tag_dict[tag_id].append(tag)
        else:
            tag_dict[tag_id] = [tag]
        current_in_mem_count +=1
        if current_in_mem_count == single_in_memory_count:
            write_tag_dict_to_file(tag_dict)
            current_in_mem_count = 0
            tag_dict = {}
        q.task_done()
    write_tag_dict_to_file(tag_dict) # Add remaining

def write_tag_dict_to_file(tag_dict):
    for tag_id in tag_dict:
        out_file = f"{tag_id}.tsv"

        f = open(out_file, "a")
        fcntl.flock(f.fileno(), fcntl.LOCK_EX)
        f.write("\t"+"\t".join(tag_dict[tag_id]))
        fcntl.flock(f.fileno(), fcntl.LOCK_UN)
        f.close()

def run_single(): # Save tags into a dict and flush it into a file single_in_memory_count is reached
    tag_dict = {}
    current_in_mem_count = 0
    for line in fileinput.input():
        tag, tag_id = line.rstrip().split("\t")

        if tag_id in tag_dict:
            tag_dict[tag_id].append(tag)
        else:
            tag_dict[tag_id] = [tag]
        current_in_mem_count +=1
        if current_in_mem_count == single_in_memory_count:
            write_tag_dict_to_file(tag_dict)
            current_in_mem_count = 0
            tag_dict = {}
    write_tag_dict_to_file(tag_dict) 

def run_multi(): # Main thread puts lines into queue 
    q = Queue(maxsize=5000) #Don't let the queue get too long
    for i in range(num_threads):
        thread = Thread(target=process_tag_queue, args=(q,))
        thread.setDaemon(True)
        thread.start()
    for line in fileinput.input():
        q.put(line)
    q.join()

if __name__ == '__main__':
    run_single()
    #run_multi()
While both of these approaches work, I'm somewhat disappointed by the performance of the multithreaded approach: the single-threaded version is considerably faster. Using Python multiprocessing was even worse, because putting the elements onto the queue takes a large amount of time.
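
One way the queue overhead could in principle be reduced is to put lines on the queue in batches, so the cost of q.put()/q.get() is paid once per batch instead of once per line. The following is only a minimal sketch of that idea; the batch size, the None sentinel and the per-batch flush are assumptions for illustration, not something taken from the code above.

from queue import Queue
from threading import Thread
import fileinput
import fcntl

BATCH_SIZE = 10000   # assumed; tune against memory use and queue overhead
NUM_THREADS = 10

def batch_worker(q):
    while True:
        batch = q.get()
        if batch is None:                  # sentinel: no more work for this thread
            q.task_done()
            return
        tag_dict = {}
        for line in batch:
            tag, tag_id = line.rstrip().split("\t")
            tag_dict.setdefault(tag_id, []).append(tag)
        for tag_id, tags in tag_dict.items():
            with open(f"{tag_id}.tsv", "a") as f:
                fcntl.flock(f.fileno(), fcntl.LOCK_EX)   # other workers may append too
                f.write("\t" + "\t".join(tags))
                fcntl.flock(f.fileno(), fcntl.LOCK_UN)
        q.task_done()

def run_batched():
    q = Queue(maxsize=50)                  # bounded so the reader cannot run far ahead
    threads = [Thread(target=batch_worker, args=(q,), daemon=True)
               for _ in range(NUM_THREADS)]
    for t in threads:
        t.start()
    batch = []
    for line in fileinput.input():
        batch.append(line)
        if len(batch) >= BATCH_SIZE:
            q.put(batch)
            batch = []
    if batch:
        q.put(batch)
    for _ in threads:
        q.put(None)                        # one sentinel per worker so every thread exits
    q.join()

Whether this actually beats the single-threaded pass still depends on how much work each worker does per line compared with the remaining queue traffic.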

Answers to similar questions suggest keeping the output files open, but that does not seem reasonable with tens of thousands of output files. Because of the size of the input, I have also avoided dumping it to a text file and sorting it.
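
For reference, "keeping the output files open" does not have to mean all of them at once: a bounded pool that closes the least recently used handle is one possible compromise. Below is a minimal sketch assuming a cap of 100 open handles; the class and its names are illustrative, not an existing API.

from collections import OrderedDict

class FileHandlePool:
    # Keep at most max_open append-mode handles; close the least recently
    # used one when the cap is reached.
    def __init__(self, max_open=100):
        self.max_open = max_open
        self.handles = OrderedDict()          # tag_id -> open file object

    def get(self, tag_id):
        if tag_id in self.handles:
            self.handles.move_to_end(tag_id)  # mark as most recently used
            return self.handles[tag_id]
        if len(self.handles) >= self.max_open:
            _, oldest = self.handles.popitem(last=False)
            oldest.close()
        f = open(f"{tag_id}.tsv", "a")
        self.handles[tag_id] = f
        return f

    def close_all(self):
        for f in self.handles.values():
            f.close()
        self.handles.clear()

# usage inside write_tag_dict_to_file, replacing the open/close on every flush:
# pool.get(tag_id).write("\t" + "\t".join(tag_dict[tag_id]))

With tens of thousands of tag IDs the pool will still churn, but it avoids holding tens of thousands of file descriptors open at the same time.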

Can multithreading/multiprocessing even improve performance in a case like this?


Any suggestions or hints are welcome.

Comments:

  • Does this answer your question?
  • @JérômeRichard Unfortunately, no. As mentioned, sending the data to the processes through a queue is really slow. But it did give me the idea of going through the big file with several processes, each with its own set of tag IDs to look for and write out; a rough sketch of that idea follows this list. More reading overall, but at least no GIL.
  • Some thoughts: if this is I/O-bound, threads will not help. total_in_memory also seems low.
  • Just an idea: use multiprocess.manager to create a shared collections.defaultdict, and give each thread its own chunk of the file. defaultdict is quite efficient: it materializes a value automatically when a key is missing, so there is no need to check whether the key exists or to handle a KeyError. defaultdict is also thread-safe.
  • @IODEV Did I get that right: the main process fills the defaultdict while the child processes keep checking whether the dict is "full"? For now I have raised the in-memory count and I keep as many files open as possible (100).
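
Following up on the idea from the comments (several processes, each owning its own subset of tag IDs and re-reading the input on its own), here is a minimal sketch. The input path "input.tsv", the CRC32-based sharding and the flush threshold are assumptions chosen for illustration, not something taken from the question.

import zlib
from multiprocessing import Process

NUM_PROCS = 10
FLUSH_EVERY = 100000          # assumed per-worker buffer size

def flush(tag_dict):
    for tag_id, tags in tag_dict.items():
        with open(f"{tag_id}.tsv", "a") as out:   # no locking needed: one owner per tag_id
            out.write("\t" + "\t".join(tags))
    tag_dict.clear()

def process_shard(input_path, shard, num_shards):
    tag_dict, buffered = {}, 0
    with open(input_path) as fh:                  # each worker re-reads the whole input
        for line in fh:
            tag, tag_id = line.rstrip().split("\t")
            # crc32 is stable across processes, unlike the built-in str hash
            if zlib.crc32(tag_id.encode()) % num_shards != shard:
                continue                          # another worker owns this tag_id
            tag_dict.setdefault(tag_id, []).append(tag)
            buffered += 1
            if buffered >= FLUSH_EVERY:
                flush(tag_dict)
                buffered = 0
    flush(tag_dict)                               # write whatever is left

if __name__ == "__main__":
    procs = [Process(target=process_shard, args=("input.tsv", i, NUM_PROCS))
             for i in range(NUM_PROCS)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()

The obvious cost is that the input is read once per process, which the question says is possible; in exchange, no data travels through a queue and, since each tag ID is owned by exactly one process, no file locking is needed.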