2017-04-20 152 views
1

Pandas:按另一列分組,生成某一列的排列組合

我有這樣一個數據幀:

# Desired result: for the single name 'jon snow', every ordered pair drawn from
# the four variation values {'jon-snow', 'jon+snow', 'jonsnow', NaN} (4 x 4 rows).
df_want = pd.DataFrame(
    [
        ['jon snow', 'jon-snow', 'jon-snow'],
        ['jon snow', 'jon-snow', 'jon+snow'],
        ['jon snow', 'jon-snow', 'jonsnow'],
        ['jon snow', 'jon-snow', np.nan],
        ['jon snow', 'jon+snow', 'jon-snow'],
        ['jon snow', 'jon+snow', 'jon+snow'],
        ['jon snow', 'jon+snow', 'jonsnow'],
        ['jon snow', 'jon+snow', np.nan],
        ['jon snow', 'jonsnow', 'jon-snow'],
        ['jon snow', 'jonsnow', 'jon+snow'],
        # Was a second ('jonsnow', 'jon-snow') row, which duplicated the row two
        # lines up and left the ('jonsnow', 'jonsnow') pair out of the grid.
        ['jon snow', 'jonsnow', 'jonsnow'],
        ['jon snow', 'jonsnow', np.nan],
        ['jon snow', np.nan, 'jon-snow'],
        ['jon snow', np.nan, 'jon+snow'],
        ['jon snow', np.nan, 'jonsnow'],
        ['jon snow', np.nan, np.nan],
    ],
    columns=['name', 'name_variation', 'name_variation_2'],
)

我試過下面這種寫法,雖然可以運行,但感覺過於冗長:

def combinations(df):
    """Return every ordered pair of variations for each brand.

    Exact duplicate rows and rows with a missing value in any column are
    discarded first. For each remaining brand, taken in order of first
    appearance, the distinct ``brand_variation`` values are sorted and
    paired with themselves (self-pairs included).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'brand'`` and ``'brand_variation'``.
        The frame passed in is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``['brand', 'k', 'brand_variation']`` with a fresh
        0..n-1 index; ``k`` and ``brand_variation`` hold the first and
        second member of each pair. Empty (same columns) when no rows
        survive the cleaning step.
    """
    columns = ['brand', 'k', 'brand_variation']
    cleaned = df.drop_duplicates().dropna()

    frames = []
    # sort=False keeps brands in order of first appearance.
    for brand, group in cleaned.groupby('brand', sort=False):
        variations = sorted(group['brand_variation'].unique())
        pairs = pd.MultiIndex.from_product(
            [[brand], variations, variations], names=columns
        )
        frames.append(pairs.to_frame(index=False))

    # pd.concat raises on an empty list, so handle "nothing left" explicitly.
    if not frames:
        return pd.DataFrame(columns=columns)

    # One concat at the end instead of DataFrame.append inside the loop:
    # append was removed in pandas 2.0 and re-copied the frame every pass.
    return pd.concat(frames, ignore_index=True)

有沒有更好的方法來做到這一點?

回答

2

numpy

# Distinct values over every cell of `df`, in order of first appearance.
u = pd.unique(np.ravel(df.values))
n = u.size
r = np.arange(n)
# Index pairs of an n x n grid in row-major order:
# `i` repeats each position n times, `j` cycles through all positions n times.
i = np.repeat(r, n)
j = np.tile(r, n)

pd.DataFrame({
    'name': ['jon snow'] * i.size,
    'name_variation': u[i],
    'name_variation2': u[j],
})

     name name_variation name_variation2 
0 jon snow  jon snow  jon snow 
1 jon snow  jon snow  jon-snow 
2 jon snow  jon snow  jon+snow 
3 jon snow  jon snow   jonsnow 
4 jon snow  jon-snow  jon snow 
5 jon snow  jon-snow  jon-snow 
6 jon snow  jon-snow  jon+snow 
7 jon snow  jon-snow   jonsnow 
8 jon snow  jon+snow  jon snow 
9 jon snow  jon+snow  jon-snow 
10 jon snow  jon+snow  jon+snow 
11 jon snow  jon+snow   jonsnow 
12 jon snow  jonsnow  jon snow 
13 jon snow  jonsnow  jon-snow 
14 jon snow  jonsnow  jon+snow 
15 jon snow  jonsnow   jonsnow 

pandas

# Distinct values over every cell of `df`, in order of first appearance.
u = pd.unique(df.values.ravel())
# Cartesian product of the distinct values with themselves, used as the index
# of a constant-valued Series; reset_index turns the two levels into columns.
pair_index = pd.MultiIndex.from_product(
    [u, u],
    names=['name_variation', 'name_variation2'],
)
pd.Series('jon snow', index=pair_index, name='name').reset_index()

    name_variation name_variation2  name 
0  jon snow  jon snow jon snow 
1  jon snow  jon-snow jon snow 
2  jon snow  jon+snow jon snow 
3  jon snow   jonsnow jon snow 
4  jon-snow  jon snow jon snow 
5  jon-snow  jon-snow jon snow 
6  jon-snow  jon+snow jon snow 
7  jon-snow   jonsnow jon snow 
8  jon+snow  jon snow jon snow 
9  jon+snow  jon-snow jon snow 
10  jon+snow  jon+snow jon snow 
11  jon+snow   jonsnow jon snow 
12  jonsnow  jon snow jon snow 
13  jonsnow  jon-snow jon snow 
14  jonsnow  jon+snow jon snow 
15  jonsnow   jonsnow jon snow 
0

其實有一個很簡單的解決辦法,只是我想到得太晚了:

# Inner self-join on 'brand': each row is matched with every row that has the
# same brand; repeated result rows are then removed.
df = df.merge(df, how='inner', left_on='brand', right_on='brand').drop_duplicates()

arrrrgh!