2016-11-21 66 views
2
import distance 
from collections import defaultdict 

my_list = ['ACAA', 'TCAA','TCAT','TGAT','TCGA','TGGA','GCGA','AAAA','GGGG','GGGC'] 
counts = {'ACAA':60, 'TCAA':3,'TCAT':30,'TGAT':8,'TCGA':1,'TGGA':1,'GCGA':8,'AAAA':5,'GGGG':8,'GGGC':1} 

adj_list = defaultdict(list) 
for strng1 in my_list: 
    for strng2 in my_list: 
     if distance.hamming(strng1, strng2) == 1 and counts[strng1] >= (counts[strng2]*2): 
      adj_list[strng1].append(strng2) 

我對得到一個定向鄰接表此實現。預期結果:定向鄰接錶快速實現

ACAA: TCAA 
TCAA: TCGA 
TCAT: TCAA, TGAT 
TGAT 
TCGA: TGGA 
TGGA: TCGA 
GCGA: TCGA 
AAAA 
GGGG: GGGC 
GGGC 

是否有更快的實施?對於大型數據集,這會變得非常緩慢。在cython中重寫它會加速它嗎?如果是的話,有人可以幫我開始使用cython嗎?

回答

1

我不知道用Cython,但你能避免在內部循環訪問字典項:

adj_list = defaultdict(list) 
for strng1 in my_list: 
    a1 = adj_list[strng1] 
    c1 = counts[strng1] 
    for strng2 in my_list: 
     if distance.hamming(strng1, strng2) == 1 and c1 >= (counts[strng2]*2): 
      a1.append(strng2) 

你甚至可以削減更多隻在下半場迭代以及執行對稱追加。這樣你就可以節省距離的50%,因爲它是對稱的。你只在上面的矩陣三角形(對角線除外,我假設一個字符串與它自己的距離爲0)而不是整個矩陣上執行它。

for i,strng1 in enumerate(my_list): 
    ... 
    for j in range(i+1,len(my_list)): 

我嘗試,我不知道,但應該接近:

adj_list = defaultdict(list) 
for i,strng1 in enumerate(my_list): 
    a1 = adj_list[strng1] 
    c1 = counts[strng1] 
    for j in range(i+1,len(my_list)): 
     strng2 = my_list[j] 
     if distance.hamming(strng1, strng2) == 1: 
      c2 = counts[strng2] 
      if c1 >= (c2*2): 
       a1.append(strng2) 
      if c2 >= (c1*2): 
       adj_list[strng1].append(strng2) 

crysis405編輯:

原文:

def adj_lst(my_list, counts): 
    adj_list = defaultdict(list) 
    for strng1 in my_list: 
     a1 = adj_list[strng1] 
     c1 = counts[strng1] 
     for strng2 in my_list: 
      if distance.hamming(strng1, strng2) == 1 and c1 >= (counts[strng2]*2): 
       adj_list[strng1].append(strng2) 

建議改進:

def adj_lst_fast(my_list, counts): 
    adj_list_fast = defaultdict(list) 
    for i,strng1 in enumerate(my_list): 
     a1 = adj_list_fast[strng1] 
     c1 = counts[strng1] 
     for j in range(i+1,len(my_list)): 
      strng2 = my_list[j] 
      if distnace.hamming(strng1, strng2: 
       c2 = counts[strng2] 
       if c1 >= (c2*2): 
        adj_list_fast[strng1].append(strng2) 
       elif c2 >= (c1*2): 
        adj_list_fast[strng2].append(strng1) 

性能:

print(timeit.timeit('adj_lst(my_list, counts)', number = 10000, 
setup="from __main__ import adj_lst, my_list, counts")) 

1.2892486669989012

print(timeit.timeit('adj_lst_fast(my_list, counts)', number = 10000, 
setup="from __main__ import adj_lst_fast, my_list, counts")) 

0.6437049919986748

+0

@ crysis405巨大的豎起大拇指你那證明我的觀點:)我們需要更多的編輯像你這樣優秀的編輯。 –