Python 我可以使用对分打印行的内容吗？_Python_Bisect

Python 我可以使用对分打印行的内容吗？

python

Python 我可以使用对分打印行的内容吗？,python,bisect,Python,Bisect,我有一个文件，每行都按字母顺序排列。这个文件是12Gb，这意味着我不能简单地逐行读取它。数据如下所示： brown0112 福克斯3501 跳了20610 每行开头的单词都是唯一的。每行上的单词和数字由制表符分隔。我希望能够查询特定关键字的文件。例如，如果我查询“fox”，程序应该返回“fox3 5 0 0 1” 似乎对分模块是一个很好的候选模块：我发现了一篇文章，它使用对分来找出关键字的行号：这是代码的样子： import bisect import os class Query(ob

我有一个文件，每行都按字母顺序排列。这个文件是12Gb，这意味着我不能简单地逐行读取它。数据如下所示：

brown	0	1	1	2
fox	3	5	0	0	1
jumped	2	0	6	1	0

每行开头的单词都是唯一的。每行上的单词和数字由制表符分隔。我希望能够按特定关键字查询该文件。例如，如果我查询“fox”，程序应该返回“fox 3 5 0 0 1”。

似乎对分模块是一个很好的候选模块：

我发现了一篇文章，它使用对分来找出关键字的行号：

这是代码的样子：

import bisect
import os

class Query(object):
    """Search key wrapper that knows how to compare itself to a raw record.

    ``bisect`` evaluates ``query < record``; this class answers that by
    comparing the wrapped key with the part of the record that starts at
    ``index``.

    Args:
        query: The key to search for.
        index: Offset inside each record at which the comparable part begins.
    """

    def __init__(self, query, index=5):
        self.index = index
        self.query = query

    def __lt__(self, comparable):
        """Return True when the key sorts before ``comparable[index:]``."""
        record_tail = comparable[self.index:]
        return self.query < record_tail

class FileSearcher(object):
    """Sequence-style view over a file made of fixed-width records.

    Supports ``len()`` and integer indexing, which is what ``bisect`` needs
    in order to search the file without loading it into memory.

    Args:
        file_pointer: An open, seekable file object.
        record_size: Width of one record, not counting the line separator.
    """

    def __init__(self, file_pointer, record_size=35):
        self.file_pointer = file_pointer
        # Move to the end of the file so that tell() reports its total size.
        file_pointer.seek(0, os.SEEK_END)
        # Each stored record is followed by the platform line separator.
        self.record_size = len(os.linesep) + record_size
        self.num_bytes = file_pointer.tell()
        # Number of whole records; a trailing partial record is not counted.
        self.file_size = self.num_bytes // self.record_size

    def __len__(self):
        """Return the number of whole records in the file."""
        return self.file_size

    def __getitem__(self, item):
        """Return record number ``item`` (0-based), separator included."""
        handle = self.file_pointer
        width = self.record_size
        handle.seek(width * item)
        return handle.read(width)

# Locate the record for a key in a sorted, fixed-width file and print it.
with open('myfile') as file_to_search:
    query = 'fox\t'  # token to query; the tab keeps e.g. "foxes" from matching
    wrapped_query = Query(query)
    searchable_file = FileSearcher(file_to_search)
    # bisect() returns the index of the first record that the query sorts
    # before; when the key is present that is the record starting with it.
    linepos = bisect.bisect(searchable_file, wrapped_query)
    # %-formatting works as both a Python 2 statement and a Python 3 call;
    # the double space reproduces the output of `print "...: ", linepos`.
    print("Located @ line:  %d" % linepos)
    # FileSearcher is indexable by record number, so the content of the line
    # is simply the record at linepos (if the search did not run off the end).
    record = searchable_file[linepos] if linepos < len(searchable_file) else ''
    if record[wrapped_query.index:].startswith(query):
        print(record.rstrip('\r\n'))
    else:
        # Key absent: linepos is only the insertion point, not a match.
        print("%r not found" % query)

但是，我不知道如何实际打印该行的内容。我至少应该在某个地方添加一个read语句，但我不知道在哪里。

是否可以使用对分模块打印该行的内容？

尝试

查找到相关行并使用readline

# Report the position returned by the bisect search.
print "Located @ line: ", linepos
# Interprets linepos as a byte offset from the start of the file.
file_to_search.seek(linepos)
# Read from that offset through the end of the current line.
line = file_to_search.readline()


这是假设linepos是行的位置，从文件开头开始以字节计。如果它是以行号计算的位置，则在查找之前需要先乘以每行的字节数：
# Report the position returned by the bisect search.
print "Located @ line: ", linepos
# linepos is a record number here, so scale it by the bytes per record
# to obtain the byte offset to seek to.
file_to_search.seek(linepos * searchable_file.record_size)
# Read from that offset through the end of the current line.
line = file_to_search.readline()

尝试seek到相关行，然后使用readline：

# Report the position returned by the bisect search.
print "Located @ line: ", linepos
# Interprets linepos as a byte offset from the start of the file.
file_to_search.seek(linepos)
# Read from that offset through the end of the current line.
line = file_to_search.readline()


这是假设linepos是行的位置，从文件开头开始以字节计。如果它是以行号计算的位置，则在查找之前需要先乘以每行的字节数：
# Report the position returned by the bisect search.
print "Located @ line: ", linepos
# linepos is a record number here, so scale it by the bytes per record
# to obtain the byte offset to seek to.
file_to_search.seek(linepos * searchable_file.record_size)
# Read from that offset through the end of the current line.
line = file_to_search.readline()

如果要使用Python解决方案，可以执行以下操作：

按MAX_LINE
字节的小块读取文件，每次按固定偏移量向前移动
该偏移量决定块大小
对于每次读取，确定关键字（一行中的第一个字）
这些键用作块的分隔符
构造这些键的列表。列表将按键的顺序进行排序
您可以通过pickle/json.dumps/将这样的列表保存到某个地方
查询时，通过对分键所在块的索引进行查找
完全读取该块并找到包含数据的键

下面是示例文件bigfile
：
abc 4
bar 2
baz 3
egg 6
foo 1
god 8
ham 5
sex 7

代码：
import os
from bisect import bisect

# Upper bound, in bytes, on the length of one line including its newline.
MAX_LINE = 7
# Distance in bytes between two consecutive index entries.
BLOCK_SIZE = 10


def parse_chunks(filename, block_size=None, max_line=None):
    """Build a sparse index of the keys found at regular byte offsets.

    The file is expected to be sorted by key, one record per line, with the
    key being the first whitespace-delimited word of the line.  Entry ``i``
    of the result is the key of the first line that starts after the first
    newline found at or beyond byte offset ``i * block_size`` (entry 0 is
    the key of the very first line).  Because the file is sorted, the
    returned list is sorted too and can be searched with ``bisect``.

    The file is read as bytes and only the extracted keys are decoded, so
    the function behaves the same on Python 2 and Python 3 (calling
    ``str()`` on bytes yields the ``"b'...'"`` repr on Python 3, which
    contains no real newline characters).

    Args:
        filename: Path of the sorted file to index.
        block_size: Bytes between index entries; defaults to ``BLOCK_SIZE``.
        max_line: Longest expected line in bytes; defaults to ``MAX_LINE``.
            Twice this many bytes are read at every offset.

    Returns:
        A list of keys (text), one per indexed offset.  Empty for an empty
        file.  Indexing stops early if no complete line can be found inside
        the bytes read at some offset.

    Raises:
        IndexError: If an indexed line contains no key (blank line).
        UnicodeDecodeError: If a key is not valid UTF-8.
    """
    # Resolve the defaults at call time so that rebinding the module-level
    # constants after import is still honoured.
    if block_size is None:
        block_size = BLOCK_SIZE
    if max_line is None:
        max_line = MAX_LINE

    size = os.path.getsize(filename)
    chunks = []
    if size == 0:
        return chunks

    window = max_line * 2
    with open(filename, 'rb') as handle:
        # Offset 0 is always the start of a line, so its key is simply the
        # first word of the file.
        first_line = handle.read(window).partition(b'\n')[0]
        chunks.append(first_line.split()[0].decode('utf-8'))

        pos = block_size
        while pos < size:
            handle.seek(pos)
            block = handle.read(window)

            # pos usually falls in the middle of a line: skip to the end of
            # that line and index the one that follows it.
            first_eol = block.find(b'\n')
            if first_eol == -1:
                break
            second_eol = block.find(b'\n', first_eol + 1)
            if second_eol == -1:
                tail = block[first_eol + 1:]
                # Either the line is longer than the window, or nothing but
                # whitespace is left: no key can be extracted here.
                if pos + len(block) < size or not tail.strip():
                    break
                # Last line of the file without a trailing newline.
                second_eol = len(block)

            line = block[first_eol + 1:second_eol]
            chunks.append(line.split()[0].decode('utf-8'))

            pos += block_size

    return chunks


def lookup(filename, chunks, query, block_size, max_line):
    """Return the line of *filename* whose first word equals *query*.

    Args:
        filename: Path of the sorted file that was indexed.
        chunks: Sorted list of block keys, entry ``i`` describing the block
            that starts at byte offset ``i * block_size``.
        query: The key to look for (exact match on the first word).
        block_size: Byte distance between two index entries.
        max_line: Longest expected line in bytes, newline included.
            Assumes every line respects this bound; a longer line may not
            fit in the bytes that are read.

    Returns:
        The matching line without its newline, or ``None`` if no line in
        the selected block starts with *query*.
    """
    if not chunks:
        return None

    # Last block whose first key is <= query.  A query sorting before every
    # key would give -1 (a negative seek), so clamp it to the first block.
    block_index = max(bisect(chunks, query) - 1, 0)
    wanted = query.encode('utf-8')

    with open(filename, 'rb') as handle:
        handle.seek(block_index * block_size)
        # The extra max_line bytes cover a line that starts inside this
        # block but ends in the next one.
        window = handle.read(block_size + max_line)
        at_eof = not handle.read(1)

    pieces = window.split(b'\n')
    if block_index > 0:
        # The block offset normally lands mid-line: the first piece is the
        # tail of a line belonging to the previous block.
        pieces = pieces[1:]
    if not at_eof:
        # The last piece may have been cut off by the end of the window.
        pieces = pieces[:-1]

    for piece in pieces:
        fields = piece.split(None, 1)
        # Compare whole keys; a substring search would also hit "abc" when
        # asked for "ab", or match inside the data columns.
        if fields and fields[0] == wanted:
            return piece.decode('utf-8')
    return None


if __name__ == '__main__':
    BLOCK_SIZE = 10
    filename = 'bigfile'
    chunks = parse_chunks(filename)

    query = 'abc'
    line = lookup(filename, chunks, query, BLOCK_SIZE, MAX_LINE)
    if line is None:
        print('%r not found' % query)
    else:
        print(line)

导入操作系统
从对分导入对分
最大线=7
块大小=10
def parse_块（文件名）：
size=os.path.getsize（文件名）
块=[]
打开（文件名为“rb”）作为文件：
block=str（file.read（最大行*2））
第一行=block[：block.find（'\n'）+1]
chunks.append（第一行.split（）[0]）
pos=块大小
而pos<尺寸：
文件查找（pos）
block=str（file.read（最大行*2））
first\u eol=block.find（'\n'）
第二个下线=块。查找（'\n'，第一个下线+1）
如果第一个下线==-1或第二个下线==-1：
打破
行=块[第一次下线+1：第二次下线]
key=line.split（）[0]
chunks.append（键）
pos+=块大小
返回块
如果uuuu name uuuuuu='\uuuuuuu main\uuuuuuu'：
块大小=10
文件名='bigfile'
chunks=parse_chunks（文件名）
查询='abc'
pos_before=对分（块，查询）-1
打开（文件名为“rb”）作为文件：
查找文件（位置在*块大小之前）
block=str（file.read（块大小+最大行））
行\u start=block.find（查询）
行\u结束=块。查找（'\n'，行\u开始+1）
行=块[行开始：行结束]
打印（行）

在这个玩具示例中，我使用10字节的块大小，对于12GB文件，我建议您从1M开始。
如果您想使用Python解决方案，可以执行以下操作：

按MAX_LINE
字节的小块读取文件，每次按固定偏移量向前移动
该偏移量决定块大小
对于每次读取，确定关键字（一行中的第一个字）
这些键用作块的分隔符
构造这些键的列表。列表将按键的顺序进行排序
您可以通过pickle/json.dumps/将这样的列表保存到某个地方
查询时，通过对分键所在块的索引进行查找
完全读取该块并找到包含数据的键

下面是示例文件bigfile
：
abc 4
bar 2
baz 3
egg 6
foo 1
god 8
ham 5
sex 7

代码：
import os
from bisect import bisect

# Upper bound, in bytes, on the length of one line including its newline.
MAX_LINE = 7
# Distance in bytes between two consecutive index entries.
BLOCK_SIZE = 10


def parse_chunks(filename, block_size=None, max_line=None):
    """Build a sparse index of the keys found at regular byte offsets.

    The file is expected to be sorted by key, one record per line, with the
    key being the first whitespace-delimited word of the line.  Entry ``i``
    of the result is the key of the first line that starts after the first
    newline found at or beyond byte offset ``i * block_size`` (entry 0 is
    the key of the very first line).  Because the file is sorted, the
    returned list is sorted too and can be searched with ``bisect``.

    The file is read as bytes and only the extracted keys are decoded, so
    the function behaves the same on Python 2 and Python 3 (calling
    ``str()`` on bytes yields the ``"b'...'"`` repr on Python 3, which
    contains no real newline characters).

    Args:
        filename: Path of the sorted file to index.
        block_size: Bytes between index entries; defaults to ``BLOCK_SIZE``.
        max_line: Longest expected line in bytes; defaults to ``MAX_LINE``.
            Twice this many bytes are read at every offset.

    Returns:
        A list of keys (text), one per indexed offset.  Empty for an empty
        file.  Indexing stops early if no complete line can be found inside
        the bytes read at some offset.

    Raises:
        IndexError: If an indexed line contains no key (blank line).
        UnicodeDecodeError: If a key is not valid UTF-8.
    """
    # Resolve the defaults at call time so that rebinding the module-level
    # constants after import is still honoured.
    if block_size is None:
        block_size = BLOCK_SIZE
    if max_line is None:
        max_line = MAX_LINE

    size = os.path.getsize(filename)
    chunks = []
    if size == 0:
        return chunks

    window = max_line * 2
    with open(filename, 'rb') as handle:
        # Offset 0 is always the start of a line, so its key is simply the
        # first word of the file.
        first_line = handle.read(window).partition(b'\n')[0]
        chunks.append(first_line.split()[0].decode('utf-8'))

        pos = block_size
        while pos < size:
            handle.seek(pos)
            block = handle.read(window)

            # pos usually falls in the middle of a line: skip to the end of
            # that line and index the one that follows it.
            first_eol = block.find(b'\n')
            if first_eol == -1:
                break
            second_eol = block.find(b'\n', first_eol + 1)
            if second_eol == -1:
                tail = block[first_eol + 1:]
                # Either the line is longer than the window, or nothing but
                # whitespace is left: no key can be extracted here.
                if pos + len(block) < size or not tail.strip():
                    break
                # Last line of the file without a trailing newline.
                second_eol = len(block)

            line = block[first_eol + 1:second_eol]
            chunks.append(line.split()[0].decode('utf-8'))

            pos += block_size

    return chunks


def lookup(filename, chunks, query, block_size, max_line):
    """Return the line of *filename* whose first word equals *query*.

    Args:
        filename: Path of the sorted file that was indexed.
        chunks: Sorted list of block keys, entry ``i`` describing the block
            that starts at byte offset ``i * block_size``.
        query: The key to look for (exact match on the first word).
        block_size: Byte distance between two index entries.
        max_line: Longest expected line in bytes, newline included.
            Assumes every line respects this bound; a longer line may not
            fit in the bytes that are read.

    Returns:
        The matching line without its newline, or ``None`` if no line in
        the selected block starts with *query*.
    """
    if not chunks:
        return None

    # Last block whose first key is <= query.  A query sorting before every
    # key would give -1 (a negative seek), so clamp it to the first block.
    block_index = max(bisect(chunks, query) - 1, 0)
    wanted = query.encode('utf-8')

    with open(filename, 'rb') as handle:
        handle.seek(block_index * block_size)
        # The extra max_line bytes cover a line that starts inside this
        # block but ends in the next one.
        window = handle.read(block_size + max_line)
        at_eof = not handle.read(1)

    pieces = window.split(b'\n')
    if block_index > 0:
        # The block offset normally lands mid-line: the first piece is the
        # tail of a line belonging to the previous block.
        pieces = pieces[1:]
    if not at_eof:
        # The last piece may have been cut off by the end of the window.
        pieces = pieces[:-1]

    for piece in pieces:
        fields = piece.split(None, 1)
        # Compare whole keys; a substring search would also hit "abc" when
        # asked for "ab", or match inside the data columns.
        if fields and fields[0] == wanted:
            return piece.decode('utf-8')
    return None


if __name__ == '__main__':
    BLOCK_SIZE = 10
    filename = 'bigfile'
    chunks = parse_chunks(filename)

    query = 'abc'
    line = lookup(filename, chunks, query, BLOCK_SIZE, MAX_LINE)
    if line is None:
        print('%r not found' % query)
    else:
        print(line)

导入操作系统
从对分导入对分
最大线=7
块大小=10
def parse_块（文件名）：
size=os.path.getsize（文件名）
块=[]
打开（文件名为“rb”）作为文件：
block=str（file.read（最大行*2））
第一行=block[：block.find（'\n'）+1]
chunks.append（第一行.split（）[0]）
pos=块大小
而pos<尺寸：
文件查找（pos）
集团