Python 匹配文本和替换的更好方法_Python_Replace_Match_Performance

Python 匹配文本和替换的更好方法

python replace performance

Python 匹配文本和替换的更好方法,python,replace,match,performance,Python,Replace,Match,Performance,考虑到这一点，我有文件1： SNP_ID GENE_ID # Header not present in original file rs1 TRIML1,TRIML2 rs2 D4S234E rs4 ACCN5,CTSO rs5 ODZ3 rs6 TRIML1 和文件2： SNP1_ID SNP2_ID DRUG # Header not present in original file rs1 rs2 xyz rs1 rs8 abc rs2 rs4 xyz rs2 rs5 abc1 rs

考虑到这一点，我有文件1：

SNP_ID  GENE_ID # Header not present in original file
rs1 TRIML1,TRIML2
rs2 D4S234E
rs4 ACCN5,CTSO
rs5 ODZ3
rs6 TRIML1

和文件2：

SNP1_ID SNP2_ID DRUG # Header not present in original file
rs1 rs2 xyz
rs1 rs8 abc
rs2 rs4 xyz
rs2 rs5 abc1
rs5 rs7 abc2
rs6 rs5 xyz1

我想将文件2中的SNP ID与文件1匹配，并替换为相应的基因ID，同时在输出中包含药物名称。输出应该如下所示：

GENE1_ID    GENE2_ID    SNP1_ID SNP2_ID Drug
TRIML1  D4S234E rs1 rs2 xyz
TRIML2  D4S234E rs1 rs2 xyz
TRIML1  rs8 rs1 rs8 abc
TRIML2  rs8 rs1 rs8 abc
D4S234E ACCN5   rs2 rs4 xyz
D4S234E CTSO    rs2 rs4 xyz
D4S234E ODZ3    rs2 rs5 abc1
ODZ3    rs7 rs5 rs7 abc2
TRIML1  ODZ3    rs6 rs5 xyz1

我编写了下面的代码来进行匹配和替换，但是我不知道如何获得输出中的最后三列。而且，当我必须在大文件上执行此操作时，这需要相当长的时间。有没有更高效地完成这项工作的建议？

 # Map each SNP ID to its list of gene IDs.
 # A File1 line looks like "rs1 TRIML1,TRIML2".
 snp_gene_dict = {}
 with open('File1') as f1:
     for line in f1:
         fields = line.split()
         if len(fields) < 2:
             # Skip blank or incomplete lines instead of raising IndexError.
             continue
         snp_gene_dict[fields[0]] = fields[1].split(',')

 # Stream File2 straight into the output file so that no intermediate
 # per-column lists have to be held in memory for large inputs.
 with open('File2') as f2, open('output-gene-gene', 'w') as out:
     for line in f2:
         fields = line.split()
         if len(fields) < 2:
             continue
         # Only the two SNP columns are used here; any further column
         # (the drug name) is ignored rather than breaking the unpacking.
         snp0, snp1 = fields[:2]
         # dict.get is a single hash lookup; an SNP without a known gene
         # is written out unchanged.
         genes0 = snp_gene_dict.get(snp0, [snp0])
         genes1 = snp_gene_dict.get(snp1, [snp1])
         # One output line per (gene of first SNP, gene of second SNP) pair.
         for gene0 in genes0:
             for gene1 in genes1:
                 out.write('{a}\t{b} \n'.format(a=gene0, b=gene1))

下面是一种使用 SQLite 的方法，思路非常简单：先将 File1 的内容插入数据库，然后在处理 File2 时从数据库中查询。

import logging
INSERT_SPN_STATEMENT = 'INSERT INTO spn_table (spn_id, gene_id) VALUES (?, ?)'
SELECT_SPN_BY_ID_STATEMENT = 'SELECT ID FROM spn_table WHERE spn_id=? and GENE_ID=?'


def dump_file_to_db(File1, connection):
    """Insert the SNP/gene pairs read from File1 into spn_table.

    Args:
        File1: iterable of text lines shaped like "rs1 TRIML1,TRIML2",
            i.e. an SNP ID, whitespace, then comma-separated gene IDs.
        connection: open sqlite3 connection whose database already
            contains spn_table.

    Raises:
        ValueError: if a non-blank line does not have exactly two
            whitespace-separated fields.

    One row is stored per (SNP, gene) pair. Pairs that are already in the
    table are skipped. All inserts are committed once at the end.
    """
    cursor = connection.cursor()
    for line in File1:
        fields = line.split()
        if not fields:
            # Blank lines (e.g. a trailing newline at end of file) carry
            # no data and must not break the two-field unpacking below.
            continue
        spn_id, gene_ids = fields
        for gene in gene_ids.split(','):
            cursor.execute(SELECT_SPN_BY_ID_STATEMENT, (spn_id, gene))
            if cursor.fetchone():
                continue  # this pair is already stored
            cursor.execute(INSERT_SPN_STATEMENT, (spn_id, gene))
    connection.commit()

SELECT_SPN_STATEMENT = 'SELECT ID, spn_id, gene_id FROM spn_table WHERE spn_id=?'


def _genes_for_spn(cursor, spn_id):
    """Return every gene ID stored for spn_id, or [spn_id] if none is stored."""
    cursor.execute(SELECT_SPN_STATEMENT, (spn_id,))
    # Sorting the (ID, spn_id, gene_id) rows orders them by primary key,
    # which gives a deterministic result without changing the SQL.
    genes = [gene_id for _id, _spn_id, gene_id in sorted(cursor.fetchall())]
    return genes or [spn_id]


def read_file(File2, connection):
    """Log one line per gene pair for every SNP pair read from File2.

    Args:
        File2: iterable of text lines shaped like "rs1 rs2 xyz",
            i.e. two SNP IDs followed by a drug name.
        connection: open sqlite3 connection whose spn_table maps SNP IDs
            to gene IDs.

    Raises:
        ValueError: if a non-blank line does not have exactly three
            whitespace-separated fields.

    Each SNP is replaced by each of its genes in turn, so an SNP with
    several genes produces several lines. An SNP that is not in the table
    is kept as-is in the gene column instead of crashing on a None result.
    Every line is logged at INFO level as
    "<gene1> <gene2> <snp1> <snp2> <drug>".
    """
    cursor = connection.cursor()
    for line in File2:
        fields = line.split()
        if not fields:
            continue  # skip blank lines
        spn1, spn2, drug = fields
        genes1 = _genes_for_spn(cursor, spn1)
        genes2 = _genes_for_spn(cursor, spn2)
        for gene1 in genes1:
            for gene2 in genes2:
                logging.info("%s %s %s %s %s", gene1, gene2, spn1, spn2, drug)


def initialize_db(db_path='test.db'):
    """Open the SQLite database and make sure spn_table exists.

    Args:
        db_path: path of the database file; defaults to 'test.db'.
            ':memory:' gives a throwaway in-memory database.

    Returns:
        The open sqlite3 connection. The caller is responsible for
        closing it.
    """
    conn = sqlite3.connect(db_path)
    # IF NOT EXISTS makes this safe to run against an existing database.
    conn.execute('''CREATE TABLE IF NOT EXISTS spn_table
             (Id INTEGER PRIMARY KEY, spn_id text, gene_id text)''')
    return conn

import sqlite3
# Script entry: load File1 into the database, then resolve File2 against it.
connection = initialize_db()
try:
    logging.basicConfig(level=logging.DEBUG)
    logging.info("Started")
    with open('File1.txt') as File1:
        dump_file_to_db(File1, connection)
    with open('File2.txt') as File2:
        read_file(File2, connection)
    logging.info("Done")
finally:
    # Close the database even when an input file is missing or malformed.
    connection.close()

@Nix 我不了解 SQLite，我只是个初学者！ 我只是想确认你是否可以自由使用任何可用的工具；我认为可以先将第一个文件读入 SQLite 数据库，然后再查询它。