# Visit every row (index label `i`) and every position `idx` inside that row,
# writing 1 through a chained `.ix[i][idx]` lookup.
# NOTE(review): `df_mBase` is not defined in this snippet; presumably an
# existing pandas DataFrame -- confirm.
for i, row in df_mBase.iterrows(): 
    for idx, val in enumerate(row): 
     df_mBase.ix[i][idx] = 1 


將數據存入數據框(DataFrame)的最快方式是什麼?無論寫法是否 Pythonic,我只追求速度——我不在乎優雅。


你的數據是從哪裏得到的?CSV? –


你想要什麼?全1的數據幀? –





import numpy as np 
import pandas as pd 
import datetime as dt 
import dateutil as du 

# 36500 calendar dates, stepping back one day at a time from 2017-01-01.
dates = [dt.date(2017, 1, 1) - du.relativedelta.relativedelta(days=i) for i in range(36500)] 
# Zero-initialised 36500 x 36500 uint8 matrix: one row and one column per date.
data = np.zeros((36500,36500), dtype=np.uint8) 

def my_func(i, j):
    """Return a demo cell value in the range 0..254 for the index pair (i, j).

    The value is (quotient + remainder of i / j) minus
    (quotient + remainder of j / i), reduced modulo 255.
    """
    quot_ij, rem_ij = divmod(i, j)
    quot_ji, rem_ji = divmod(j, i)
    forward = quot_ij + rem_ij
    backward = quot_ji + rem_ji
    return (forward - backward) % 255

# Fill the matrix one cell at a time. Both ranges start at 1, so row 0 and
# column 0 keep the zeros from the initial allocation.
for i in range(1, 36500): 
    for j in range(1, 36500): 
     data[i,j] = my_func(i,j) 

# Wrap the filled array in a DataFrame labelled by `dates` on both axes.
df = pd.DataFrame(data, columns=dates, index=dates) 


      2017-08-21 2017-08-20 2017-08-19 2017-08-18 2017-08-17 \ 
2017-08-21   0   0   0   0   0 
2017-08-20   0   0   254   253   252 
2017-08-19   0   1   0   0   0 
2017-08-18   0   2   0   0   1 
2017-08-17   0   3   0   254   0 

       ...  1917-09-19 1917-09-18 1917-09-17 1917-09-16 
2017-08-21  ...    0   0   0   0 
2017-08-20  ...    225   224   223   222 
2017-08-19  ...    114   113   113   112 
2017-08-18  ...    77   76   77   76 
2017-08-17  ...    60   59   58   57 

這方法非常管用。還有其他可以進一步加速的技巧嗎?例如低階代碼之類的?謝謝! – afora377




`.ix` 是一種「魔法」索引器,既可以做基於標籤的索引,也可以做基於位置的索引,但它將被棄用(deprecated),改用更嚴格的 `.loc`(基於標籤)和 `.iloc`(基於位置)。我認爲 `.ix` 在幕後做了很多工作,來判斷需要的是基於標籤還是基於位置的索引。



# One `.loc` call with a (row, column) pair instead of two chained lookups.
# NOTE(review): `.loc` selects by label; if `idx` is a position produced by
# `enumerate`, this only addresses the intended cell when the column labels
# are those same integers -- confirm against the caller.
df_mBase.loc[i, idx] = 1 



import pandas as pd 

import itertools 
import timeit 

def generate_dummy_data(years=1):
    """Build an empty float DataFrame with daily timestamps on both axes.

    Args:
        years: Length of the covered span in 365-day years; fractions are
            accepted (e.g. ``0.03``).

    Returns:
        A DataFrame of dtype float whose index holds daily timestamps from
        1900-01-01 and whose columns hold daily timestamps starting ten
        hours later, both covering ``365 * years`` days. No cell values are
        supplied, so every cell is NaN.
    """
    period = pd.Timedelta(365 * years, unit='D')

    start = pd.Timestamp('19000101')
    offset = pd.Timedelta(10, unit='h')

    # pd.date_range is the supported way to build a regular DatetimeIndex;
    # the DatetimeIndex(start=..., end=..., freq=...) constructor form is
    # rejected by current pandas releases.
    dates1 = pd.date_range(start=start, end=start + period, freq='D')
    dates2 = pd.date_range(start=start + offset, end=start + offset + period, freq='D')

    return pd.DataFrame(index=dates1, columns=dates2, dtype=float)

def assign_original(df_orig): 
    """Benchmark candidate: the question's chained ``.ix[i][idx]`` assignment.

    Operates on a deep copy of ``df_orig`` and returns that copy.

    NOTE(review): the accompanying discussion describes ``.ix`` as slated for
    deprecation; confirm it exists in the pandas version being benchmarked.
    """
    df_new = df_orig.copy(deep=True) 
    for i, row in df_new.iterrows(): 
     for idx, val in enumerate(row): 
      # Chained lookup: select by row label ``i``, then index ``idx`` in it.
      df_new.ix[i][idx] = 1 
    return df_new 

def assign_other(df_orig): 
    """Benchmark candidate: chained ``df[column_label][row_label] = 1``.

    Operates on a deep copy of ``df_orig`` and returns that copy. The
    positional counters ``i`` and ``j`` from ``enumerate`` are not used.

    NOTE(review): the body is the same as ``assign_index`` and this function
    is not listed in ``methods``; possibly a leftover -- confirm.
    """
    df_new = df_orig.copy(deep=True) 
    for (i, idx_i), (j, idx_j) in itertools.product(enumerate(df_new.index), enumerate(df_new.columns)): 
     # Column selected by label first, then the row label inside that column.
     df_new[idx_j][idx_i] = 1 
    return df_new 

def assign_loc(df_orig): 
    """Benchmark candidate: chained ``.loc[i][idx]`` assignment per cell.

    Operates on a deep copy of ``df_orig`` and returns that copy.

    NOTE(review): this is a chained assignment (``.loc[i]`` followed by
    ``[idx]``); whether the write reaches ``df_new`` depends on pandas'
    copy/view behaviour -- confirm for the pandas version in use.
    """
    df_new = df_orig.copy(deep=True) 
    for i, row in df_new.iterrows(): 
     for idx, val in enumerate(row): 
      # Row selected by label ``i``, then ``idx`` (from enumerate) within it.
      df_new.loc[i][idx] = 1 
    return df_new 

def assign_loc_product(df_orig):
    """Benchmark candidate: one label-based ``.loc[row, column]`` write per cell.

    Walks the Cartesian product of the row labels and column labels of a
    deep copy of ``df_orig``, writes 1 into each cell, and returns the copy.
    """
    filled = df_orig.copy(deep=True)
    label_pairs = itertools.product(filled.index, filled.columns)
    for row_label, col_label in label_pairs:
        filled.loc[row_label, col_label] = 1
    return filled

def assign_iloc_product(df_orig):
    """Benchmark candidate: positional ``.iloc[row, column]`` writes.

    Positions come from enumerating the index and the columns of a deep copy
    of ``df_orig``; the labels produced alongside them are ignored. Returns
    the copy with 1 written into every cell.
    """
    filled = df_orig.copy(deep=True)
    numbered_rows = enumerate(filled.index)
    numbered_cols = enumerate(filled.columns)
    for (row_pos, _row_label), (col_pos, _col_label) in itertools.product(numbered_rows, numbered_cols):
        filled.iloc[row_pos, col_pos] = 1
    return filled

def assign_iloc_product_range(df_orig):
    """Benchmark candidate: positional ``.iloc`` writes driven by plain ranges.

    Iterates every (row position, column position) pair of a deep copy of
    ``df_orig``, writes 1 into each cell, and returns the copy.
    """
    filled = df_orig.copy(deep=True)
    row_positions = range(len(filled.index))
    col_positions = range(len(filled.columns))
    for row_pos, col_pos in itertools.product(row_positions, col_positions):
        filled.iloc[row_pos, col_pos] = 1
    return filled

def assign_index(df_orig): 
    """Benchmark candidate: chained ``df[column_label][row_label] = 1``.

    Operates on a deep copy of ``df_orig`` and returns that copy. The
    positional counters ``i`` and ``j`` from ``enumerate`` are not used.

    NOTE(review): this is a chained assignment; whether the write reaches
    ``df_new`` depends on pandas' copy/view behaviour -- confirm for the
    pandas version in use.
    """
    df_new = df_orig.copy(deep=True) 
    for (i, idx_i), (j, idx_j) in itertools.product(enumerate(df_new.index), enumerate(df_new.columns)): 
     # Column selected by label first, then the row label inside that column.
     df_new[idx_j][idx_i] = 1 
    return df_new 

def assign_column(df_orig):
    """Benchmark candidate: iterate columns, then write ``df[c][idx] = 1``.

    Operates on a deep copy of ``df_orig`` and returns that copy.

    NOTE(review): the write is a chained assignment (``df_new[c]`` followed
    by ``[idx]``); whether it reaches ``df_new`` depends on pandas' copy/view
    behaviour -- confirm for the pandas version in use.
    """
    df_new = df_orig.copy(deep=True)
    # DataFrame.items() yields the same (label, Series) pairs as the older
    # iteritems() spelling, which current pandas releases no longer provide.
    for c, column in df_new.items():
        for idx, _ in enumerate(column):
            df_new[c][idx] = 1
    return df_new

def assign_column2(df_orig):
    """Benchmark candidate: iterate columns and write into each yielded Series.

    Operates on a deep copy of ``df_orig`` and returns that copy.

    NOTE(review): the write goes to the Series handed out by the column
    iterator; whether it reaches ``df_new`` depends on pandas' copy/view
    behaviour -- confirm for the pandas version in use.
    """
    df_new = df_orig.copy(deep=True)
    # DataFrame.items() yields the same (label, Series) pairs as the older
    # iteritems() spelling, which current pandas releases no longer provide.
    for c, column in df_new.items():
        for idx, _ in enumerate(column):
            column[idx] = 1
    return df_new

def assign_itertuples(df_orig):
    """Benchmark candidate: drive positional ``.iloc`` writes from ``itertuples``.

    Each record from ``itertuples()`` starts with the row's index value, so
    that first element is skipped when counting column positions. Works on a
    deep copy of ``df_orig`` and returns the copy.
    """
    filled = df_orig.copy(deep=True)
    for row_pos, record in enumerate(filled.itertuples()):
        cell_values = record[1:]
        for col_pos, _ in enumerate(cell_values):
            filled.iloc[row_pos, col_pos] = 1
    return filled

def assign_applymap(df_orig):
    """Benchmark candidate: element-wise mapping of every cell to 1.

    Operates on a deep copy of ``df_orig`` (kept so that every candidate
    pays the same copy cost) and returns the mapped frame.
    """
    df_new = df_orig.copy(deep=True)
    # Newer pandas exposes the element-wise mapper as DataFrame.map and
    # deprecates the applymap spelling; use whichever this version provides.
    if hasattr(pd.DataFrame, 'map'):
        return df_new.map(lambda x: 1)
    return df_new.applymap(lambda x: 1)

def assign_vectorized(df_orig):
    """Benchmark candidate: overwrite each whole column with the scalar 1.

    Operates on a deep copy of ``df_orig`` and returns that copy.
    """
    filled = df_orig.copy(deep=True)
    for column_label in filled.columns:
        filled[column_label] = 1
    return filled

# (display name, function) pairs, in the order they are benchmarked.
methods = [
    ('assign_original', assign_original),
    ('assign_loc', assign_loc),
    ('assign_loc_product', assign_loc_product),
    ('assign_iloc_product', assign_iloc_product),
    ('assign_iloc_product_range', assign_iloc_product_range),
    ('assign_index', assign_index),
    ('assign_column', assign_column),
    ('assign_column2', assign_column2),
    ('assign_itertuples', assign_itertuples),
    ('assign_vectorized', assign_vectorized),
    ('assign_applymap', assign_applymap),
]

def get_timings(period=1, methods=()):
    """Time each assignment method on dummy data covering ``period`` years.

    Args:
        period: Number of years passed to ``generate_dummy_data``.
        methods: Iterable of ``(name, callable)`` pairs; each callable takes
            the generated DataFrame and returns the filled DataFrame.

    Yields:
        ``(method_name, {'time': seconds, 'memory': kib})`` for each method,
        where ``seconds`` is the total reported by ``timeit`` for ``repeats``
        runs and ``kib`` is ``memory_usage(deep=True).sum() / 1024`` of the
        returned frame.

    Raises:
        AssertionError: If cell ``[3, 3]`` of a method's result is not 1.
    """
    print('=' * 10)
    print(f'generating timings for a period of {period} years')
    df_orig = generate_dummy_data(period)
    repeats = 1
    for method_name, method in methods:
        result = pd.DataFrame()

        def my_method():
            """Run ``method`` and keep its return value for inspection.

            This looks a bit icky, but is the best way I found to make sure
            the values are really changed, and not just on a copy of a
            DataFrame.
            """
            nonlocal result
            result = method(df_orig)

        t = timeit.Timer(my_method).timeit(number=repeats)

        # Spot-check one cell so a method that silently wrote nothing fails.
        assert result.iloc[3, 3] == 1

        print(f'{method_name} took {t/repeats} seconds')
        yield (method_name, {'time': t, 'memory': result.memory_usage(deep=True).sum()/1024})

# Span lengths, in years, that each method is timed on.
periods = [0.03, 0.1, 0.3, 1, 3] 

# period -> {method name -> {'time': ..., 'memory': ...}}; running this
# executes every benchmark.
results = {period: dict(get_timings(period, methods)) for period in periods} 


# Keep only the timing figure: period -> {method name -> time}.
timings_dict = {period: {k: v['time'] for k, v in result.items()} for period, result in results.items()} 

# One column per period, one row per method.
df = pd.DataFrame.from_dict(timings_dict) 
       0.03  0.1   0.3   1.0   3.0 
assign_applymap    0.001989 0.009862 0.018018 0.105569 0.549511 
assign_vectorized    0.002974 0.008428 0.035994 0.162565 3.810138 
assign_index     0.013717 0.137134 1.288852 14.190128 111.102662 
assign_column2    0.026260 0.186588 1.664345 19.204453 143.103077 
assign_column     0.016811 0.212158 1.838733 21.053627 153.827845 
assign_itertuples    0.025130 0.249886 2.125968 24.639593 185.975111 
assign_iloc_product_range  0.026982 0.247069 2.199019 23.902244 186.548500 
assign_iloc_product   0.021225 0.233454 2.437183 25.143673 218.849143 
assign_loc_product   0.018743 0.290104 2.515379 32.778794 258.244436 
assign_loc     0.029050 0.349551 2.822797 32.087433 294.052933 
assign_original    0.034315 0.337207 2.714154 30.361072 332.327008 


timing plot


如果無法向量化,`df[column][index] = x` 的寫法最快,使用 `df.iteritems()` 遍歷各列的做法緊隨其後。


用建議代碼的最後一行替換原始代碼後,不知爲何運行速度反而慢了一倍。 – afora377


我發現你的結果很有趣,所以我跑了一些基準 –
