2017-03-27 79 views
-1

我正在實作 Greg Reda 的隊列分析(cohort analysis,http://www.gregreda.com/2015/08/23/cohort-analysis-with-python/)。這種分析按月或按年進行都很容易,但我不知道該如何按季度進行。(標題:使用 pandas 進行每月/每季度的隊列分析)

他的完整代碼如下:

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import matplotlib as mpl 

# Show up to 50 columns when a DataFrame is displayed. The fully qualified
# key is used because the bare 'max_columns' pattern can match more than one
# pandas option, which makes set_option raise OptionError.
pd.set_option('display.max_columns', 50)
# Set matplotlib's default line width to 2 for every plot drawn below.
mpl.rcParams['lines.linewidth'] = 2 

# IPython/Jupyter magic (not plain Python syntax): show figures inline in the notebook.
%matplotlib inline 
def _year_month(ts):
    # Format a timestamp as 'YYYY-MM', e.g. '2009-06'.
    return ts.strftime('%Y-%m')


df = pd.read_excel('/Users/gjreda/Dropbox/datasets/relay-foods.xlsx')
df.head()

# OrderPeriod: the calendar month in which each order was placed
df['OrderPeriod'] = df['OrderDate'].apply(_year_month)
df.head()

# CohortGroup: the month of each user's earliest order. Indexing by UserId
# lets the per-user Series align back onto every order row of that user.
df = df.set_index('UserId')
first_order_date = df.groupby(level=0)['OrderDate'].min()
df['CohortGroup'] = first_order_date.apply(_year_month)
df = df.reset_index()
df.head()

# one group per (cohort, order period) pair
grouped = df.groupby(['CohortGroup', 'OrderPeriod'])

# count the unique users, orders, and total revenue per Group + Period
# (string aggregator names select pandas' built-in implementations and avoid
# the warning newer pandas emits when np.sum is passed as a callable)
cohorts = grouped.agg({'UserId': 'nunique',
                       'OrderId': 'nunique',
                       'TotalCharges': 'sum'})

# make the column names more meaningful
cohorts.rename(columns={'UserId': 'TotalUsers',
                        'OrderId': 'TotalOrders'}, inplace=True)
cohorts.head()

def cohort_period(df):
    """
    Return a copy of `df` with a 1-based `CohortPeriod` column added.

    The Nth row of `df` gets `CohortPeriod == N`, so the rows must already
    be in chronological order for the value to mean "Nth period since the
    first purchase". The input frame is not modified; mutating the group
    passed in by `groupby(...).apply` is not supported by pandas.

    Example
    -------
    Say you want to get the 3rd period for every user:
        df = df.sort_values(['UserId', 'OrderTime'])
        df = df.groupby('UserId', group_keys=False).apply(cohort_period)
        df[df.CohortPeriod == 3]
    """
    # assign() builds a new frame, leaving the caller's object untouched
    return df.assign(CohortPeriod=np.arange(len(df)) + 1)

# Number each cohort's periods 1..N. group_keys=False keeps the original
# (CohortGroup, OrderPeriod) index on the result; without it, newer pandas
# prepends the group key as a duplicate index level and reset_index() fails.
cohorts = cohorts.groupby(level=0, group_keys=False).apply(cohort_period)
cohorts.head()

# reindex the DataFrame by (CohortGroup, CohortPeriod)
cohorts.reset_index(inplace=True)
cohorts.set_index(['CohortGroup', 'CohortPeriod'], inplace=True)

total_users = cohorts['TotalUsers']

# cohort size: the first non-null TotalUsers value within each CohortGroup
cohort_group_size = total_users.groupby(level=0).first()
cohort_group_size.head()

# pivot CohortGroup into the columns, then scale each column by its cohort size
users_by_period = total_users.unstack(0)
user_retention = users_by_period.divide(cohort_group_size, axis=1)
user_retention.head(10)

# retention curves for three selected cohorts
selected_cohorts = ['2009-06', '2009-07', '2009-08']
user_retention[selected_cohorts].plot(figsize=(10, 5))
plt.title('Cohorts: User Retention')

# one tick per cohort period, 1 through 12
period_ticks = np.arange(1, 12.1, 1)
plt.xticks(period_ticks)
plt.xlim(1, 12)
plt.ylabel('% of Cohort Purchasing');

# Creating heatmaps in matplotlib is more difficult than it should be. 
# Thankfully, Seaborn makes them easy for us. 
# http://stanford.edu/~mwaskom/software/seaborn/ 

import seaborn as sns 
sns.set(style='white')

# transpose so each row is a cohort; cells with no data are masked out
retention_by_cohort = user_retention.T
plt.figure(figsize=(12, 8))
plt.title('Cohorts: User Retention')
sns.heatmap(retention_by_cohort, mask=retention_by_cohort.isnull(),
            annot=True, fmt='.0%');

以上是完整的代碼。謝謝!

+0

有什麼問題嗎? –

回答

4

方法是修改現有代碼中的兩行,把時間週期定義為季度。

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import matplotlib as mpl 

# Show up to 50 columns when a DataFrame is displayed. The fully qualified
# key is used because the bare 'max_columns' pattern can match more than one
# pandas option, which makes set_option raise OptionError.
pd.set_option('display.max_columns', 50)
# Set matplotlib's default line width to 2 for every plot drawn below.
mpl.rcParams['lines.linewidth'] = 2 

# IPython/Jupyter magic (not plain Python syntax): show figures inline in the notebook.
%matplotlib inline 
# Load the purchase sheet. The keyword is `sheet_name`; the older `sheetname`
# spelling is no longer accepted by current pandas releases.
df = pd.read_excel('chapter-12-relay-foods.xlsx', sheet_name='Purchase Data - Full Study')
df.head()

把這一行改成按季度來劃分訂單週期:

def _quarter_label(ts):
    # Build '<year>q<quarter>', e.g. '2009q2'.
    return '{}q{}'.format(ts.year, ts.quarter)


# OrderPeriod: the calendar quarter in which each order was placed
df['OrderPeriod'] = df['OrderDate'].apply(_quarter_label)
df

# CohortGroup: the month ('YYYY-MM') of each user's earliest order.
# NOTE(review): cohorts are keyed by month while OrderPeriod is keyed by
# quarter -- confirm this mix is intended for a quarterly analysis.
df = df.set_index('UserId')
first_order_date = df.groupby(level=0)['OrderDate'].min()
df['CohortGroup'] = first_order_date.apply(lambda ts: ts.strftime('%Y-%m'))
df = df.reset_index()
df.head()

# one group per (cohort, order period) pair
grouped = df.groupby(['CohortGroup', 'OrderPeriod'])

# count the unique users, orders, and total revenue per Group + Period
# (string aggregator names select pandas' built-in implementations and avoid
# the warning newer pandas emits when np.sum is passed as a callable)
cohorts = grouped.agg({'UserId': 'nunique',
                       'OrderId': 'nunique',
                       'TotalCharges': 'sum'})

# make the column names more meaningful
cohorts.rename(columns={'UserId': 'TotalUsers',
                        'OrderId': 'TotalOrders'}, inplace=True)
cohorts.head()

def cohort_period(df):
    """
    Return a copy of `df` with a 1-based `CohortPeriod` column added.

    The Nth row of `df` gets `CohortPeriod == N`, so the rows must already
    be in chronological order for the value to mean "Nth period since the
    first purchase". The input frame is not modified; mutating the group
    passed in by `groupby(...).apply` is not supported by pandas.

    Example
    -------
    Say you want to get the 3rd period for every user:
        df = df.sort_values(['UserId', 'OrderTime'])
        df = df.groupby('UserId', group_keys=False).apply(cohort_period)
        df[df.CohortPeriod == 3]
    """
    # assign() builds a new frame, leaving the caller's object untouched
    return df.assign(CohortPeriod=np.arange(len(df)) + 1)

# Number each cohort's periods 1..N. group_keys=False keeps the original
# (CohortGroup, OrderPeriod) index on the result; without it, newer pandas
# prepends the group key as a duplicate index level and reset_index() fails.
cohorts = cohorts.groupby(level=0, group_keys=False).apply(cohort_period)
cohorts.head()

# reindex the DataFrame by (CohortGroup, CohortPeriod)
cohorts.reset_index(inplace=True)
cohorts.set_index(['CohortGroup', 'CohortPeriod'], inplace=True)

total_users = cohorts['TotalUsers']

# cohort size: the first non-null TotalUsers value within each CohortGroup
cohort_group_size = total_users.groupby(level=0).first()
cohort_group_size.head()

# pivot CohortGroup into the columns, then scale each column by its cohort size
users_by_period = total_users.unstack(0)
user_retention = users_by_period.divide(cohort_group_size, axis=1)
user_retention.head(10)

# retention curves for three selected cohorts
selected_cohorts = ['2009-06', '2009-07', '2009-08']
user_retention[selected_cohorts].plot(figsize=(10, 5))
plt.title('Cohorts: User Retention')

# tick positions 1 through 12, one per cohort period
period_ticks = np.arange(1, 12.1, 1)
plt.xticks(period_ticks)

並修改這一行,把 x 軸的範圍從 12 個月改為 4 個季度。

# label the y axis, then restrict the x axis to cohort periods 1 through 4
plt.ylabel('% of Cohort Purchasing')
plt.xlim(1, 4);

# Creating heatmaps in matplotlib is more difficult than it should be. 
# Thankfully, Seaborn makes them easy for us. 
# http://stanford.edu/~mwaskom/software/seaborn/ 

import seaborn as sns 
sns.set(style='white')

# transpose so each row is a cohort; cells with no data are masked out
retention_by_cohort = user_retention.T
plt.figure(figsize=(12, 8))
plt.title('Cohorts: User Retention')
sns.heatmap(retention_by_cohort, mask=retention_by_cohort.isnull(),
            annot=True, fmt='.0%');

CohortGroup Chart 1

CohortGroup Chart 2