使用 `pd.get_dummies` 與 `groupby`
# One-hot encode My_CAT, add the indicator columns up within each My_ID,
# then turn My_ID back into an ordinary column.
(
    pd.get_dummies(df['My_CAT'])
    .groupby(df['My_ID'])
    .sum()
    .reset_index()
)
My_ID A B C D
0 1 2 1 0 0
1 2 0 1 0 1
2 3 0 0 1 0
使用 `groupby` 與 `value_counts`
# Count every category within each My_ID, pivot the categories into columns
# (absent combinations become 0), clear the name of the columns axis, and
# turn My_ID back into an ordinary column.
# `axis` is passed by keyword: pandas 2.0+ no longer accepts it positionally.
df.groupby('My_ID').My_CAT.value_counts() \
    .unstack(fill_value=0).rename_axis(None, axis=1).reset_index()
My_ID A B C D
0 1 2 1 0 0
1 2 0 1 0 1
2 3 0 0 1 0
使用 `factorize` 和 `numba`
這是我的實驗方案
from numba import njit
import pandas as pd
import numpy as np
@njit
def xtab_array(f1, f2, m, n):
    """Count how often each (row code, column code) pair occurs.

    Parameters
    ----------
    f1, f2 : 1-D integer arrays
        Row and column codes of each observation; ``f2`` is read at every
        index of ``f1``, so it must be at least as long.
    m, n : int
        Number of rows and columns of the result.

    Returns
    -------
    2-D int64 array of shape ``(m, n)`` where entry ``[r, c]`` is the number
    of positions ``i`` with ``f1[i] == r`` and ``f2[i] == c``.
    """
    v = np.zeros((m, n), dtype=np.int64)
    for i in range(f1.size):
        r = f1[i]
        c = f2[i]
        # A negative code (e.g. the -1 sentinel pd.factorize emits for
        # missing values) would wrap around and be counted in the last
        # row/column, so such observations are skipped.
        if r >= 0 and c >= 0:
            v[r, c] += 1
    return v
def xtab_df(df, c1, c2):
    """Cross-tabulate two columns of ``df`` using ``xtab_array``.

    Parameters
    ----------
    df : pandas.DataFrame
    c1 : column label whose distinct values become the rows.
    c2 : column label whose distinct values become the count columns.

    Returns
    -------
    pandas.DataFrame
        First column is named ``c1`` and holds its distinct values in order
        of first appearance; it is followed by one count column per distinct
        value of ``c2``.
    """
    f1, u1 = pd.factorize(df[c1].values)
    f2, u2 = pd.factorize(df[c2].values)
    v = xtab_array(f1, f2, u1.size, u2.size)
    # Build the count columns first and insert the key column afterwards:
    # stacking keys and counts into a single array would coerce both to one
    # common dtype (object for string keys, float for float keys).
    out = pd.DataFrame(v, columns=u2.tolist())
    # Label the key column with c1 rather than a hard-coded 'My_ID'.
    out.insert(0, c1, u1, allow_duplicates=True)
    return out
xtab_df(df, 'My_ID', 'My_CAT')
My_ID A B C D
0 1 2 1 0 0
1 2 0 1 0 1
2 3 0 0 1 0
純 `numpy`
def xtab(df, c1, c2):
    """Cross-tabulate two columns of ``df`` with NumPy only.

    Parameters
    ----------
    df : pandas.DataFrame
    c1 : column label whose distinct values become the rows.
    c2 : column label whose distinct values become the count columns.

    Returns
    -------
    pandas.DataFrame
        First column is named ``c1`` and holds its distinct values in order
        of first appearance; it is followed by one integer count column per
        distinct value of ``c2``. Rows of ``df`` with a missing value in
        either column are not counted.
    """
    f1, u1 = pd.factorize(df[c1].values)
    f2, u2 = pd.factorize(df[c2].values)
    n, m = u1.size, u2.size
    # pd.factorize marks missing values with -1; a negative flat index would
    # make np.bincount raise, so those observations are dropped.
    keep = (f1 >= 0) & (f2 >= 0)
    flat = f1[keep] * m + f2[keep]
    # minlength pads trailing empty cells with integer zeros, so the counts
    # stay integers and the reshape also works for empty input.
    v = np.bincount(flat, minlength=n * m).reshape(n, m)
    # Insert the key column separately so keys and counts keep their own
    # dtypes, and label it with c1 rather than a hard-coded 'My_ID'.
    out = pd.DataFrame(v, columns=u2.tolist())
    out.insert(0, c1, u1, allow_duplicates=True)
    return out
xtab(df, 'My_ID', 'My_CAT')
My_ID A B C D
0 1 2 1 0 0
1 2 0 1 0 1
2 3 0 0 1 0
計時比較
小數據
%timeit pd.crosstab(df['My_ID'], df['My_CAT'])
%timeit df.groupby(['My_ID','My_CAT']).size().unstack(fill_value=0)
%timeit pd.get_dummies(df.My_CAT).groupby(df.My_ID).sum()
%timeit df.groupby('My_ID').My_CAT.value_counts().unstack(fill_value=0)
%timeit xtab_df(df, 'My_ID', 'My_CAT')
%timeit xtab(df, 'My_ID', 'My_CAT')
100 loops, best of 3: 5.21 ms per loop
1000 loops, best of 3: 1.23 ms per loop
1000 loops, best of 3: 1.2 ms per loop
1000 loops, best of 3: 1.23 ms per loop
1000 loops, best of 3: 280 µs per loop
1000 loops, best of 3: 298 µs per loop
使用 @jezrael 提供的較大數據
# Build the larger benchmark frame. The seed is fixed so the random draws
# below are reproducible.
np.random.seed(123)
# Number of rows in the benchmark frame.
N = 100000
# The 15 single-letter category labels 'a'..'o'.
L = list('abcdefghijklmno')
# My_CAT is drawn from L; My_ID is a random integer in [0, 1000).
df = pd.DataFrame({'My_CAT': np.random.choice(L, N),
'My_ID':np.random.randint(1000,size=N)})
%timeit pd.crosstab(df['My_ID'], df['My_CAT'])
%timeit df.groupby(['My_ID','My_CAT']).size().unstack(fill_value=0)
%timeit pd.get_dummies(df.My_CAT).groupby(df.My_ID).sum()
%timeit df.groupby('My_ID').My_CAT.value_counts().unstack(fill_value=0)
%timeit xtab_df(df, 'My_ID', 'My_CAT')
%timeit xtab(df, 'My_ID', 'My_CAT')
10 loops, best of 3: 82.6 ms per loop
100 loops, best of 3: 10.7 ms per loop
100 loops, best of 3: 15.6 ms per loop
10 loops, best of 3: 19.9 ms per loop
100 loops, best of 3: 3.01 ms per loop
100 loops, best of 3: 3.22 ms per loop