Python 如果第1列匹配，则提取第11列的值位于第二个文件第2列和第3列之间的行_Python

Python 如果第1列匹配，则提取第11列的值位于第二个文件第2列和第3列之间的行

python

Python 如果第1列匹配，则具有第11列值的提取行位于第二个文件的第2列和第3列之间,python,Python,你好，我有两个文件文件1： chr5 20311169 20311244 5 20311177 20311251 K00230:40:HNWJLBBXX:4:1101:1002:35936 255 + - 20311210.00 chr5 26610220 26610295 5 26610221 26610296 K00230:40:HNWJLBBXX:4:1101:1022:24155 255 + - 26610258.00 chr5 20311169 20311244 5 20311177

你好，我有两个文件

文件1：

chr5 20311169 20311244 5 20311177 20311251 K00230:40:HNWJLBBXX:4:1101:1002:35936 255 + - 20311210.00 chr5 26610220 26610295 5 26610221 26610296 K00230:40:HNWJLBBXX:4:1101:1022:24155 255 + - 26610258.00 chr5 20311169 20311244 5 20311177 20311251 K00230:40:HNWJLBBXX:4:1101:1002:35936 255 + - 20311210.00 chr5 26610220 26610295 5 26610221 26610296 K00230:40:HNWJLBBXX:4:1101:1022:24155 255 + - 26610258.00 文件2：

chr5 20311200 20311220 Nucleosome:1 110 5.0 39.9 MainPeak 1.43492858 0.68583064 chr5 801 861 Nucleosome:2 70 1.0 5.4 MainPeak 0.17076187 0.806538035 chr5 1021 1091 Nucleosome:3 80 2.0 14.4 MainPeak 0.42430331 0.481579895 chr5 1181 1251 Nucleosome:4 80 1.0 7.5 MainPeak 0.1362587 0.32626102999999995 chr5 1361 1441 Nucleosome:5 90 2.0 14.7 MainPeak 0.34212933 0.291726595 chr5 1621 1801 Nucleosome:6 190 2.0 26.1 MainPeak:doublet 0.37546564 0.353192625 chr5 2011 2071 Nucleosome:7 70 1.0 5.7 MainPeak 0.15091517 0.396369735 chr5 2161 2331 Nucleosome:8 180 1.0 17.2 MainPeak 0.08865312 0.42133046500000004 chr5 2441 2561 Nucleosome:9 130 2.5 25.3 MainPeak 0.7368501 0.48843276 chr5 2781 2851 Nucleosome:10 80 3.0 17.5 MainPeak 0.80818501 1.303005 chr5 3271 3431 Nucleosome:11 170 3.0 34.5 MainPeak+Shoulder 0.72967697 1.348257495 chr5 3521 3571 Nucleosome:12 60 1.0 5.8 MainPeak 0.1880739 0.504429705 chr5 3641 3791 Nucleosome:13 160 1.0 12.5 MainPeak:doublet 0.10098579 0.363148215 chr5 20311200 20311220 Nucleosome:1 110 5.0 39.9 MainPeak 1.43492858 0.68583064 chr5 801 861 Nucleosome:2 70 1.0 5.4 MainPeak 0.17076187 0.806538035 chr5 1021 1091 Nucleosome:3 80 2.0 14.4 MainPeak 0.42430331 0.481579895 chr5 1181 1251 Nucleosome:4 80 1.0 7.5 MainPeak 0.1362587 0.32626102999999995 chr5 1361 1441 Nucleosome:5 90 2.0 14.7 MainPeak 0.34212933 0.291726595 chr5 1621 1801 Nucleosome:6 190 2.0 26.1 MainPeak:doublet 0.37546564 0.353192625 chr5 2011 2071 Nucleosome:7 70 1.0 5.7 MainPeak 0.15091517 0.396369735 chr5 2161 2331 Nucleosome:8 180 1.0 17.2 MainPeak 0.08865312 0.42133046500000004 chr5 2441 2561 Nucleosome:9 130 2.5 25.3 MainPeak 0.7368501 0.48843276 chr5 2781 2851 Nucleosome:10 80 3.0 17.5 MainPeak 0.80818501 1.303005 chr5 3271 3431 Nucleosome:11 170 3.0 34.5 MainPeak+Shoulder 0.72967697 1.348257495 chr5 3521 3571 Nucleosome:12 60 1.0 5.8 MainPeak 0.1880739 0.504429705 chr5 3641 3791 Nucleosome:13 160 1.0 12.5 MainPeak:doublet 0.10098579 0.363148215 如果文件1第11列的值落在第二个文件中给出的start和end（第2列和第3列）范围内，我想用python代码打印文件1中的对应行。由于该位置只在特定染色体（chr）内是唯一的，因此必须首先检查chr是否相同。因此，我期望的输出是

chr5 20311169 20311244 5 20311177 20311251 K00230:40:HNWJLBBXX:4:1101:1002:35936 255 + - 20311210.00 chr5 20311169 20311244 5 20311177 20311251 K00230:40:HNWJLBBXX:4:1101:1002:35936 255 + - 20311210.00 我试过awk代码，它工作得非常好，但速度非常慢。

我正在测试的文件（也就是需要从中打印行的那个文件）大约有4GB。

如果我能有一些python代码，我将不胜感激

谢谢

一个从文本中提取第n列的简单函数使这一点变得相当简单。我假设你说的“第11列”是指从1开始计数的第11列，而不是索引为11的列（索引从0开始，即第一项的索引为0）。

伪代码：

Until there's no data left ~
    Read line1 from file1
    Read line2 from file2
    Extract Col11 from line1 as a real number
    Extract Col2 & Col3 from line2 as real numbers
    IF Col11 is within Col2 & Col3
        do something

Python代码：

import sys

# Given a space-separated row of data, return the Nth column as a real number
def getNthColumn(row, N):
    """Return column N (counted from 1) of a whitespace-separated row as a float.

    Runs of tabs/spaces are first collapsed to single spaces, so leading,
    trailing and repeated whitespace never produce empty columns (except for
    a row that is entirely blank).

    Raises IndexError when the row has fewer than N columns and ValueError
    when the selected column is not a valid number.
    """
    # Normalise the whitespace, then cut the row into its columns.
    columns = ' '.join(row.split()).split(' ')
    # Columns are numbered from 1 by the caller; list indices start at 0.
    return float(columns[N - 1])

# Script body: walk file1 and file2 in lockstep (row i of file1 against row i
# of file2) and report, for every file1 row, whether its column 11 lies within
# [column 2, column 3] of the file2 row on the same line number.
# Processing stops as soon as either file runs out of lines.
if (len(sys.argv) == 3):
    # The context manager closes both files even if a row fails to parse.
    # TODO - handle file-not-found errors, etc.
    with open(sys.argv[1], "rt") as fin1, open(sys.argv[2], "rt") as fin2:
        # zip() ends at the shorter file, matching the "until there's no
        # data left" rule of the pseudo-code.
        for line1, line2 in zip(fin1, fin2):
            # Get the columns from the two lines
            f1_col11 = getNthColumn(line1, 11)
            f2_col2  = getNthColumn(line2,  2)
            f2_col3  = getNthColumn(line2,  3)  ### TODO handle errors
            # work out if it's a keeper (range is inclusive at both ends)
            if (f2_col2 <= f1_col11 <= f2_col3):
                print("MATCH: "+line1)
            else:
                print("NO-MATCH: "+line1)
else:
    print("Give 2 files as arguments")

导入系统 #给定以空格分隔的数据行，将第n列作为实数返回 def GETNTH列（行，N）： #单行空格、删除制表符、双空格等。 row=''.join（row.split（））字段=行分割（“”）结果=浮点（字段[N-1]）#字段编号为0->（N-1） #打印（“从[%s]->%f”%返回列%d（N，行，结果））返回结果如果（len（sys.argv）==3）： fin1=打开（sys.argv[1]，“rt”） fin2=open（sys.argv[2]，“rt”）#TODO-handle file not found错误等。 line1=fin1.readline（） line2=fin2.readline（）而（第1行！=”和第2行！=”）： #从两行中获取列 f1\u col11=GetnTholumn（第1行，第11行） f2_col2=第n列（第2行，第2行） f2_col3=getNthColumn（第2行，第3行）####TODO处理错误 #看看是不是守门员 #打印（“是%f>=%f和%f%f”%（N，行，结果））返回结果如果（len（sys.argv）==3）： fin1=打开（sys.argv[1]，“rt”） fin2=open（sys.argv[2]，“rt”）#TODO-handle file not found错误等。 #加载整个file2，但只加载column2和column3 #注意最小col2和最大c3 line2=fin2.readline（）最小值c2=无最大值c3=无而（第2行！=“”）： col2=getnth列（第2行，第2行） col3=getnth列（第2行，第3行） file2_col23.append（（col2，col3）） #注意最小c2和最大c3，以便我们可以快速知道搜索是否可以 #可能产生结果如果（最小值c2==无或col2<最小值c2）：最小值c2=col2 如果（max_c3==None或col3>max_c3）：最大值c3=col3 #下一行 line2=fin2.readline（）.strip（） #对列进行排序，使我们能够快速搜索 file2_col23.sort（） line1=fin1.readline（）而（第1行！=“”）： col11=getNthColumn（第1行，第11行）匹配=错误 #col11是否在任何file2行col2或col3中

如果（col11>=min_c2和col11=col2和col11欢迎使用StackOverflow。请阅读并遵循帮助文档中的发布指南，如您创建此帐户时所建议的，并在此处应用。StackOverflow不是设计、编码、研究或教程资源。但是，如果您遵循联机找到的任何资源，请嵌套编码尝试，并遇到问题，您将有一个很好的示例发布。您好，非常感谢您的回复！我已复制粘贴了您的代码，因为它位于名为test.py的文件中，并使用以下命令python test.py file1 file2运行它。但是，我得到了如下不匹配的输出：chr5 19776794 19776869 5 19776845 19776920 K00230:40：HNWJLBBXX:4:1127:18467:28112 255+-19776857.00但我确信我的输入文件有一半以上在该范围内（这里我只测试了chr5）。请帮助！我应该更改您发布的代码中的任何参数吗？很抱歉，我是python新手…不需要C/C++…此代码非常快。@Smiley-在您的问题中添加几行测试数据，我可以查看一下。再次感谢您的回复…以下是文件1中的几行：chr5 20311169 20311244 5 20311177 20311251 K00230：40:HNWJLBBXX:4:1101:1002:35936 255+-20311210.00 chr5 26610220 26610295 5 26610221 26610296 K00230:40:HNWJLBBXX:4:1101:1022:24155 255+-2

import sys

# Holds one (column2, column3) tuple of floats for every row loaded from
# file2; it is filled and then sorted by the script body below.
file2_col23 = []

# Given a space-separated row of data, return the Nth column as a real number
def getNthColumn(row, N):
    """Return column N (counted from 1) of a whitespace-separated row as a float.

    Tabs and repeated spaces are collapsed before splitting, so they never
    create empty columns.

    If the row has fewer than N columns, or the column is not a number, an
    error is written to stderr and the program exits with status 1.
    """
    # Single-space the row, removing tabs, double-spaces etc.
    row = ' '.join(row.split())
    fields = row.split(' ')
    try:
        result = float(fields[N-1])   # fields are numbered 0->(N-1)
    except (IndexError, ValueError):
        # Only "column missing" and "not a number" are data errors.  A bare
        # except here would also swallow KeyboardInterrupt and real bugs.
        sys.stderr.write("Failed to fetch number column %d from [%s]" % (N, row))
        sys.exit(1)
    return result

# Script body: load every (column2, column3) range from file2, then print
# each file1 row prefixed with "MATCH: " when its column 11 lies inside at
# least one of those ranges (inclusive), otherwise prefixed "NO-MATCH: ".
# NOTE(review): column 1 (the chromosome) is never compared here, so a
# position can match a range belonging to a different chromosome - confirm
# whether the inputs are pre-filtered to a single chromosome.
if (len(sys.argv) == 3):
    # The context manager closes both files even if a row fails to parse.
    # TODO - handle file-not-found errors, etc.
    with open(sys.argv[1], "rt") as fin1, open(sys.argv[2], "rt") as fin2:

        # Load in the whole of file2, but just the column2 & column3
        # note the minimum col2 and maximum c3
        min_c2 = None
        max_c3 = None
        for line2 in fin2:
            # Skip blank lines; previously a blank line in the middle of
            # file2 silently ended the load and dropped all later ranges.
            if (not line2.strip()):
                continue
            col2 = getNthColumn(line2, 2)
            col3 = getNthColumn(line2, 3)
            file2_col23.append( ( col2, col3 ) )
            # Note the min c2 and max c3 so we can quickly know if a search
            # can possibly produce a result
            if (min_c2 is None or col2 < min_c2):
                min_c2 = col2
            if (max_c3 is None or col3 > max_c3):
                max_c3 = col3

        # Sort by range start so a scan can stop at the first range that
        # begins after the value being looked up.
        file2_col23.sort()

        for line1 in fin1:
            # A blank line carries no column 11; skip it rather than abort.
            if (not line1.strip()):
                continue
            col11 = getNthColumn(line1, 11)

            matched = False
            # min_c2 stays None when file2 held no ranges: nothing can match.
            # Otherwise only search when col11 is inside the overall bounds.
            if (min_c2 is not None and min_c2 <= col11 <= max_c3):
                for (col2, col3) in file2_col23:
                    if (col2 > col11):
                        # Ranges are sorted by start, so every later range
                        # also starts after col11.
                        break
                    if (col11 <= col3):
                        matched = True
                        break

            if (matched):
                print("MATCH: "+str(line1))
            else:
                print("NO-MATCH: "+str(line1))
else:
    print("Give 2 files as arguments")