2013-01-21 51 views
1

更好的方式來匹配並替換文本

假設我有如下的 File1:

SNP_ID GENE_ID # Header not present in original file 
rs1 TRIML1,TRIML2 
rs2 D4S234E 
rs4 ACCN5,CTSO 
rs5 ODZ3 
rs6 TRIML1 

另外還有 File2,其中每一行包含兩個 SNP ID 和一個藥物名稱。我想把 File2 中的 SNP ID 替換成 File1 中對應的 GENE_ID,並且在輸出中同時保留 SNP ID 和藥物名稱。輸出應該是這樣的:

GENE1_ID GENE2_ID SNP1_ID SNP2_ID Drug 
TRIML1 D4S234E rs1 rs2 xyz 
TRIML2 D4S234E rs1 rs2 xyz 
TRIML1 rs8 rs1 rs8 abc 
TRIML2 rs8 rs1 rs8 abc 
D4S234E ACCN5 rs2 rs4 xyz 
D4S234E CTSO rs2 rs4 xyz 
D4S234E ODZ3 rs2 rs5 abc1 
ODZ3 rs7 rs5 rs7 abc2 
TRIML1 ODZ3 rs6 rs5 xyz1 

我寫了下面的代碼來做匹配和替換,但不知道該如何在輸出中得到最後三列。而且,當我必須在大文件上執行此操作時,這需要相當長的時間。有沒有更有效率的做法?

from itertools import product

# Map every SNP id listed in File1 to the list of gene ids given for it.
snp_gene_dict = {}
with open('File1') as f1:
    for line in f1:
        fields = line.split()
        if not fields:
            continue  # tolerate blank lines such as a trailing newline
        # The second column is a comma-separated gene list, e.g. "TRIML1,TRIML2".
        snp_gene_dict[fields[0]] = fields[1].split(',')

# For each File2 row: the gene lists that replace its first and second SNP id.
snp_first_col = []
snp_second_col = []
with open('File2') as f2:
    for line in f2:
        fields = line.split()
        if not fields:
            continue
        # Only the first two columns are used as SNP ids; further columns
        # on the row are ignored by this script.
        snp0, snp1 = fields[:2]
        # An SNP id that has no entry in File1 stands in for itself.
        snp_first_col.append(snp_gene_dict.get(snp0, [snp0]))
        snp_second_col.append(snp_gene_dict.get(snp1, [snp1]))

with open('output-gene-gene', 'w') as out:
    for genes0, genes1 in zip(snp_first_col, snp_second_col):
        # Emit one row per (first gene, second gene) combination; the first
        # list varies slowest, so rows are grouped by the first gene.
        for gene0, gene1 in product(genes0, genes1):
            out.write('{a}\t{b} \n'.format(a=gene0, b=gene1))
+0

你可以使用sqlite嗎? – Nix

+0

@Nix 我不熟悉 sqlite,我只是一個初學者! – jules

+0

只是想確保你可以自由使用任何可用的工具,我想你可以將第一個文件讀入sqlite數據庫,然後查詢它。 – Nix

回答

0

這裏有一種使用SQLITE的方法,它的概念很簡單。只需將FILE1插入數據庫,然後從中讀取即可。

import logging 
INSERT_SPN_STATEMENT = 'INSERT INTO spn_table (spn_id, gene_id) VALUES (?, ?)'
SELECT_SPN_BY_ID_STATEMENT = 'SELECT ID FROM spn_table WHERE spn_id=? and GENE_ID=?'


def dump_file_to_db(File1, connection):
    """Insert the SNP -> gene pairs read from *File1* into ``spn_table``.

    Args:
        File1: Iterable of text lines, each holding an SNP id and a
            comma-separated list of gene ids separated by whitespace,
            e.g. ``"rs1 TRIML1,TRIML2"``. Blank lines are skipped.
        connection: Open sqlite3 connection whose database already
            contains ``spn_table``.

    Raises:
        ValueError: If a non-blank line does not split into exactly two
            whitespace-separated fields.

    A (spn_id, gene_id) pair that is already stored is not inserted again.
    All inserts are committed once, after the whole input has been read.
    """
    cursor = connection.cursor()
    for line in File1:
        fields = line.split()
        if not fields:
            # A blank line (typically the trailing newline) carries no data.
            continue
        spn_id, genes = fields
        # One table row per gene of the comma-separated list.
        for gene in genes.split(','):
            cursor.execute(SELECT_SPN_BY_ID_STATEMENT, (spn_id, gene))
            if cursor.fetchone():
                continue  # record exists
            cursor.execute(INSERT_SPN_STATEMENT, (spn_id, gene))
    connection.commit()

SELECT_SPN_STATEMENT = 'SELECT ID, spn_id, gene_id FROM spn_table WHERE spn_id=?'


def _genes_for_spn(cursor, spn_id):
    """Return all gene ids stored for *spn_id*, or ``[spn_id]`` if none are."""
    cursor.execute(SELECT_SPN_STATEMENT, (spn_id,))
    genes = [gene_id for _id, _spn_id, gene_id in cursor.fetchall()]
    # An SNP id without any stored gene stands in for itself in the output.
    return genes or [spn_id]


def read_file(File2, connection):
    """Log one ``GENE1 GENE2 SNP1 SNP2 DRUG`` line per gene combination.

    Args:
        File2: Iterable of text lines, each holding two SNP ids and a drug
            name separated by whitespace. Blank lines are skipped.
        connection: Open sqlite3 connection whose ``spn_table`` holds the
            SNP -> gene rows to look up.

    Raises:
        ValueError: If a non-blank line does not split into exactly three
            whitespace-separated fields.

    Every gene of the first SNP is paired with every gene of the second
    SNP; each pair is reported through ``logging.info``.
    """
    cursor = connection.cursor()
    for line in File2:
        fields = line.split()
        if not fields:
            continue
        spn1, spn2, drug = fields
        # Fetch both gene lists up front: the cursor is reused, so the first
        # result set must be fully read before the second query runs.
        genes1 = _genes_for_spn(cursor, spn1)
        genes2 = _genes_for_spn(cursor, spn2)
        for gene1 in genes1:
            for gene2 in genes2:
                logging.info("%s %s %s %s %s", gene1, gene2, spn1, spn2, drug)


def initialize_db():
    """Open ``test.db`` and make sure ``spn_table`` and its lookup index exist.

    Returns:
        The open sqlite3 connection; the caller is responsible for closing it.
    """
    conn = sqlite3.connect('test.db')
    c = conn.cursor()
    # Create table
    c.execute('''CREATE TABLE IF NOT EXISTS spn_table 
      (Id INTEGER PRIMARY KEY, spn_id text, gene_id text)''')
    # Index the lookup columns so that queries filtering on spn_id (alone or
    # together with gene_id) do not have to scan the whole table.
    c.execute('''CREATE INDEX IF NOT EXISTS spn_table_spn_gene_idx
      ON spn_table (spn_id, gene_id)''')
    return conn

import sqlite3 
# Driver: load File1.txt into the database, then resolve and log File2.txt.
connection = initialize_db()
try:
    logging.basicConfig(level=logging.DEBUG)
    logging.info("Started")
    with open('File1.txt') as File1:
        dump_file_to_db(File1, connection)
    with open('File2.txt') as File2:
        read_file(File2, connection)
    logging.info("Done")
finally:
    # Release the database handle even when one of the steps above fails.
    connection.close()