2011-09-29 18 views
0

我從一個SQL查詢中獲取大量數據,而這個查詢需要很長時間才能運行。正因爲查詢很慢,我以最細粒度的形式從數據庫中取出數據,然後只循環這些數據一次,把它聚合成對我有用的形式。(標題:Python:試圖重構(DRY化)一段很長的控制流程)

我的問題是,我一遍又一遍地重複自己。但是,我不確定重構此控制流的最佳方式。提前致謝!

def processClickOutData(cls, raw_data):
    """Aggregate raw click-out rows into totals keyed by gap and by position.

    Every row in ``raw_data`` is read positionally as
    ``(gap, count, tid, prefered, channel, position)``.  Each row's ``count``
    is added to six running aggregates (per gap, per gap/tid, per gap/channel,
    per position, per position/tid, per position/channel).  Rows whose
    ``prefered`` field equals ``0`` are additionally added to a parallel set
    of ``*_true`` aggregates.

    Args:
        cls: Unused; kept so existing callers keep working.
        raw_data: Iterable of indexable rows with at least six fields.

    Returns:
        A plain ``dict`` with the keys ``singles``, ``singles_true``,
        ``total``, ``total_true``, ``absolute_total``,
        ``absolute_total_true``, ``channel_totals``, ``channel_totals_true``,
        ``list_channels``, ``list_tids``, ``total_position``,
        ``total_position_true``, ``tid_position``, ``tid_position_true``,
        ``channel_position`` and ``channel_position_true``.
    """

    def add_flat(table, key, amount):
        # ``in`` replaces dict.has_key(), which no longer exists in Python 3.
        # The first value is stored as-is (not 0 + amount) so the type of
        # ``count`` is preserved exactly as before.
        if key in table:
            table[key] += amount
        else:
            table[key] = amount

    def add_nested(table, outer, inner, amount):
        # Create the inner dict on first sight of ``outer``, then accumulate.
        add_flat(table.setdefault(outer, {}), inner, amount)

    def new_group():
        return {
            "total": {},
            "singles": {},
            "channels": {},
            "total_position": {},
            "tid_position": {},
            "channel_position": {},
        }

    def accumulate(group, gap, tid, channel, position, count):
        add_flat(group["total"], gap, count)
        add_nested(group["singles"], gap, tid, count)
        add_nested(group["channels"], gap, channel, count)
        add_flat(group["total_position"], position, count)
        add_nested(group["tid_position"], position, tid, count)
        add_nested(group["channel_position"], position, channel, count)

    everything = new_group()
    only_true = new_group()
    absolute_total = 0
    absolute_total_true = 0
    list_channels = set()
    list_tids = set()

    for row in raw_data:
        # Index access (rather than tuple unpacking) tolerates rows that
        # carry extra trailing columns, as the original did.
        gap, count, tid = row[0], row[1], row[2]
        prefered, channel, position = row[3], row[4], row[5]

        list_channels.add(channel)
        list_tids.add(tid)

        # The grand total is coerced to int; the other sums keep count's type.
        absolute_total += int(count)
        accumulate(everything, gap, tid, channel, position, count)

        if prefered == 0:
            absolute_total_true += count
            accumulate(only_true, gap, tid, channel, position, count)

    return {
        "singles": everything["singles"],
        "singles_true": only_true["singles"],
        "total": everything["total"],
        "total_true": only_true["total"],
        "absolute_total": absolute_total,
        "absolute_total_true": absolute_total_true,
        "channel_totals": everything["channels"],
        "list_channels": list_channels,
        "list_tids": list_tids,
        "channel_totals_true": only_true["channels"],
        "total_position": everything["total_position"],
        "total_position_true": only_true["total_position"],
        "tid_position": everything["tid_position"],
        "channel_position": everything["channel_position"],
        "tid_position_true": only_true["tid_position"],
        "channel_position_true": only_true["channel_position"],
    }

回答

1

您用來存儲數據的整個結構可能是錯誤的,但由於我不知道如何使用它,因此我無法幫助您。

您可以通過使用collections.defaultdict來擺脫全部這些has_key()調用。注意thedict.has_key(key)已被棄用,您應該使用key in thedict來代替。

另請注意我對for循環的改動——你可以直接在for語句中把每一行解包並賦值給各個名字,無需再單獨逐一賦值。

from collections import defaultdict 

def processClickOutData(cls, raw_data):
    """Aggregate raw click-out rows into totals keyed by gap and by position.

    Every row in ``raw_data`` must unpack into exactly six fields:
    ``(gap, count, tid, prefered, channel, position)``.  Each row's ``count``
    is added to the general aggregates; rows whose ``prefered`` equals ``0``
    are also added to the parallel ``*_true`` aggregates.

    Args:
        cls: Unused; kept so existing callers keep working.
        raw_data: Iterable of six-field rows.

    Returns:
        A dict of aggregates.  The one-level tables are ``defaultdict(int)``
        and the two-level tables are ``defaultdict`` of ``defaultdict(int)``,
        so reading a missing key yields 0 (or an empty inner table) and
        inserts that key as a side effect.
    """
    absolute_total = 0
    absolute_total_true = 0

    list_channels = set()
    list_tids = set()

    # One-level tables: missing keys start at int() == 0.
    total = defaultdict(int)
    total_true = defaultdict(int)
    total_position = defaultdict(int)
    total_position_true = defaultdict(int)

    def defaultdict_int():
        # Factory for the inner level of the two-level tables.
        return defaultdict(int)

    singles = defaultdict(defaultdict_int)
    singles_true = defaultdict(defaultdict_int)
    channels = defaultdict(defaultdict_int)
    channels_true = defaultdict(defaultdict_int)
    tid_position = defaultdict(defaultdict_int)
    tid_position_true = defaultdict(defaultdict_int)
    channel_position = defaultdict(defaultdict_int)
    channel_position_true = defaultdict(defaultdict_int)

    # ``tid`` must be part of the unpacking: rows carry six fields and the
    # body below uses ``tid`` as a key.
    for gap, count, tid, prefered, channel, position in raw_data:
        list_channels.add(channel)
        list_tids.add(tid)

        absolute_total += count
        total[gap] += count
        singles[gap][tid] += count
        channels[gap][channel] += count
        total_position[position] += count
        tid_position[position][tid] += count
        channel_position[position][channel] += count

        if prefered == 0:
            absolute_total_true += count
            total_true[gap] += count
            singles_true[gap][tid] += count
            channels_true[gap][channel] += count
            total_position_true[position] += count
            tid_position_true[position][tid] += count
            channel_position_true[position][channel] += count

    final_values = {
        "singles": singles,
        "singles_true": singles_true,
        "total": total,
        "total_true": total_true,
        "absolute_total": absolute_total,
        "absolute_total_true": absolute_total_true,
        "channel_totals": channels,
        "list_channels": list_channels,
        "list_tids": list_tids,
        "channel_totals_true": channels_true,
        "total_position": total_position,
        "total_position_true": total_position_true,
        "tid_position": tid_position,
        "channel_position": channel_position,
        "tid_position_true": tid_position_true,
        "channel_position_true": channel_position_true,
    }
    return final_values

這樣做會在鍵不存在時自動填入正確的默認值。這裏有兩種情況:如果你累加的是int,就希望鍵不存在時從0開始——int()返回0,因此使用defaultdict(int);如果你累加的是一個內部存放int的字典,就需要一個返回defaultdict(int)的函數,這正是defaultdict_int的作用。

編輯:建議的替代字典結構:

def defaultdict_int():
    """Return a fresh ``defaultdict(int)`` for the innermost table level."""
    return defaultdict(int)


# Layout of both tables:
#   key -> {'total': int, 'tid': {tid: int}, 'channel': {channel: int}}
position = defaultdict(lambda: defaultdict(defaultdict_int))
gap = defaultdict(lambda: defaultdict(defaultdict_int))
absolute_total = 0

# The loop variables are deliberately NOT named ``gap``/``position``: reusing
# those names would rebind the aggregation tables above on the first row.
# ``tid`` is unpacked too because rows carry six fields and it is used below.
for gap_key, count, tid, prefered, channel, position_key in raw_data:
    absolute_total += count

    posd = position[position_key]
    # 'total' must be seeded with an int; otherwise the default factory
    # would create a defaultdict(int) under that key.
    posd.setdefault('total', 0)
    posd['total'] += count
    posd['tid'][tid] += count
    posd['channel'][channel] += count

    gapd = gap[gap_key]
    gapd.setdefault('total', 0)
    gapd['total'] += count
    gapd['tid'][tid] += count
    gapd['channel'][channel] += count

對_true版本也做同樣的處理,這樣你就從12個dict減少到4個。

+0

你是什麼意思的我用來存儲數據的整個結構是錯誤的?輸出是正確的,我正在檢查數據與SQL查詢執行相同的功能。 – Spencer

+0

@Peter我已經爲我的答案添加了一個例子。基本上,有十二個這樣的字典非常混亂,而四個字典就能完成完全相同的事情。 – agf

+0

謝謝!我對Python和編程一般都很陌生,所以這非常有幫助。我投了票,現在會接受。 – Spencer