2015-12-06 35 views
1

如何通過拆分另一列中每一行的內容,在數據框中創建新列?這是我目前的數據框:

>>>df = {'most_exhibitions' : pd.Series(['USA (1) Netherlands (5)' , 
'United Kingdom (2)','China (3) India (5) Pakistan (8)','USA (11) India (4)'], index=['a', 'b', 'c','d']), 
       'name' : pd.Series(['Bob', 'Joe', 'Alex', 'Bill'], index=['a', 'b', 'c','d'])} 

>>> df 
    name     most_exhibitions 
a Bob     USA (1) India (5) 
b Joe     United Kingdom (2) 
c Alex  China (3) India (5) USA (8) 
d Bill    USA (11) India (4) 

我想弄清楚如何拆分每個單元格的內容,然後在可能的情況下爲每個國家創建一個新列,並把相應的計數放在正確的行中。如果這個國家已經有對應的列,我想把這個計數放在該列的正確行中。

所以,最終的數據幀應該是這樣的:

# name     most_exhibitions   USA United Kingdom China India  
#a Bob     USA (1), India (5)    1         5 
#b Joe     United Kingdom (2)        2 
#c Alex  China (3), India (5), USA (8)    8       3  5 
#d Bill    USA (11), India (4)    11         4 

我想寫一個循環或函數來拆分數據,然後添加新列,但我無法弄清楚如何做到這一點。我最終通過一系列字典來拆分和清理數據,現在我卡在如何把最終的字典轉換成一個獨立的數據框上。我認爲,如果我可以生成這個新的數據框,就可以把它合併到舊的數據框中。我也覺得我把事情做得比實際需要的更複雜,因此對任何更優雅的解決方案都感興趣。

這是我到目前爲止已經完成:

>>>country_rank_df['country_split'] 
= indexed_rankdata['most_exhibitions'].str.split(",").astype(str) 

from collections import defaultdict 
# Parsing pipeline: every stage maps a person's name to a list derived from the
# previous stage.  `head_df` is defined elsewhere; only its 'name' and
# 'most_exhibitions' entries are read here.
total_dict = defaultdict(list)

dict2 = defaultdict(list)
dict3 = defaultdict(list)
dict4 = defaultdict(list)
dict5 = defaultdict(list)
dict6 = defaultdict(list)

# Stage 1: collect the raw "Country (count) ..." string(s) for each name.
for name, country_count in zip(head_df['name'], head_df['most_exhibitions']):
    total_dict[name].append(country_count)

# Stage 2: split every raw string on '('.
# `.items()` is used instead of `.iteritems()` so the loops run on Python 2 and 3.
for key, value in total_dict.items():
    for line in value:
        dict2[key].append(line.split('('))

# Stage 3: split every fragment on ','.
for key, list_outside in dict2.items():
    for list_inside in list_outside:
        for value in list_inside:
            dict3[key].append(value.split(','))

# Stage 4: split every fragment on ')'.
for key, list_outside in dict3.items():
    for list_inside in list_outside:
        for value in list_inside:
            dict4[key].append(value.split(')'))

# Stage 5: flatten and trim whitespace on BOTH sides.
# The original called strip() and then overwrote the result with lstrip(),
# which left trailing spaces in place (e.g. 'USA ' instead of 'USA').
for key, list_outside in dict4.items():
    for list_inside in list_outside:
        for value in list_inside:
            dict5[key].append(value.strip())

# Stage 6: drop the empty strings left over from the splits.
# list(...) keeps the result a real list on Python 3, where filter() is lazy.
for key, list_outside in dict5.items():
    dict6[key].append(list(filter(None, list_outside)))

>>>dict6['Bob'] 

[['USA', 
    '1', 
    'India', 
    '5']] 

回答

1

你可以試試這個方法,它主要使用字符串方法(string methods)。然後我對數據幀依次使用 pivot 和 fillna。這樣會丟失原始的 most_exhibitions 列,但我希望這一列不是必需的。

import pandas as pd 

# Build one column per country from the free-text 'most_exhibitions' column,
# holding that person's count, and index the result by name.
# print(...) with a single argument works on both Python 2 and Python 3.
df = {
    'most_exhibitions': pd.Series(
        ['USA (1) Netherlands (5)', 'United Kingdom (2)',
         'China (3) India (5) Pakistan (8)', 'USA (11) India (4)'],
        index=['a', 'b', 'c', 'd']),
    'name': pd.Series(['Bob', 'Joe', 'Alex', 'Bill'],
                      index=['a', 'b', 'c', 'd']),
}

df = pd.DataFrame(df)
# change ordering of columns
df = df[['name', 'most_exhibitions']]
print(df)
#    name                  most_exhibitions
# a   Bob           USA (1) Netherlands (5)
# b   Joe                United Kingdom (2)
# c  Alex  China (3) India (5) Pakistan (8)
# d  Bill                USA (11) India (4)


# remove every '(' and the last ')', leaving ')' as the separator between
# entries, e.g. 'USA 1) Netherlands 5'
df['most_exhibitions'] = df['most_exhibitions'].str.replace('(', '')
df['most_exhibitions'] = df['most_exhibitions'].str.strip(')')

# http://stackoverflow.com/a/34065937/2901002
# one row per "Country count" entry; rows with fewer entries are padded with
# missing values by expand=True, so drop those after stacking
s = df['most_exhibitions'].str.split(')', expand=True).stack().dropna()
s.index = s.index.droplevel(-1)
s.name = 'most_exhibitions'
# every entry after the first starts with a space (' Netherlands 5'); without
# stripping, the same country could end up in two columns ('USA' and ' USA')
s = s.str.strip()
print(s)
# a             USA 1
# a     Netherlands 5
# b  United Kingdom 2
# c           China 3
# c           India 5
# c        Pakistan 8
# d            USA 11
# d           India 4
# Name: most_exhibitions, dtype: object

df = df.drop(['most_exhibitions'], axis=1)
df = df.join(s)
print(df)
#    name  most_exhibitions
# a   Bob             USA 1
# a   Bob     Netherlands 5
# b   Joe  United Kingdom 2
# c  Alex           China 3
# c  Alex           India 5
# c  Alex        Pakistan 8
# d  Bill            USA 11
# d  Bill           India 4

# split once from the right: the last token is the count, the rest is the
# country name (which may itself contain spaces, e.g. 'United Kingdom')
parts = df['most_exhibitions'].str.rsplit(' ', n=1)
# extract numbers and convert them to integer
df['numbers'] = parts.str[1].astype('int')
# extract text of most_exhibitions
df['most_exhibitions'] = parts.str[0]
print(df)
#    name most_exhibitions  numbers
# a   Bob              USA        1
# a   Bob      Netherlands        5
# b   Joe   United Kingdom        2
# c  Alex            China        3
# c  Alex            India        5
# c  Alex         Pakistan        8
# d  Bill              USA       11
# d  Bill            India        4

# pivot dataframe
df = df.pivot(index='name', columns='most_exhibitions', values='numbers')
# NaN to empty string
df = df.fillna('')
print(df)
# Layout is approximate; counts may print as floats (e.g. 5.0) depending on
# the pandas version.
# most_exhibitions China India Netherlands Pakistan USA United Kingdom
# name
# Alex                 3     5                    8
# Bill                       4                       11
# Bob                                    5            1
# Joe                                                                 2

編輯:

我嘗試通過 merge 函數把所有列加回去,以得到問題中期望的輸出:

import pandas as pd 

# Same transformation as the pivot-only version, but the per-country columns
# are merged back onto a copy of the original frame so that 'name' and the raw
# 'most_exhibitions' text are kept alongside the counts.
# print(...) with a single argument works on both Python 2 and Python 3.
df = {
    'most_exhibitions': pd.Series(
        ['USA (1) Netherlands (5)', 'United Kingdom (2)',
         'China (3) India (5) Pakistan (8)', 'USA (11) India (4)'],
        index=['a', 'b', 'c', 'd']),
    'name': pd.Series(['Bob', 'Joe', 'Alex', 'Bill'],
                      index=['a', 'b', 'c', 'd']),
}

df = pd.DataFrame(df)
# change ordering of columns
df = df[['name', 'most_exhibitions']]
print(df)
#    name                  most_exhibitions
# a   Bob           USA (1) Netherlands (5)
# b   Joe                United Kingdom (2)
# c  Alex  China (3) India (5) Pakistan (8)
# d  Bill                USA (11) India (4)

# copy original to new dataframe for joining original df; reset_index() turns
# the a/b/c/d labels into an 'index' column so they survive the merge
df1 = df.reset_index().copy()

# remove every '(' and the last ')', leaving ')' as the separator between
# entries, e.g. 'USA 1) Netherlands 5'
df['most_exhibitions'] = df['most_exhibitions'].str.replace('(', '')
df['most_exhibitions'] = df['most_exhibitions'].str.strip(')')

# http://stackoverflow.com/a/34065937/2901002
# one row per "Country count" entry; rows with fewer entries are padded with
# missing values by expand=True, so drop those after stacking
s = df['most_exhibitions'].str.split(')', expand=True).stack().dropna()
s.index = s.index.droplevel(-1)
s.name = 'most_exhibitions'
# every entry after the first starts with a space (' Netherlands 5'); without
# stripping, the same country could end up in two columns ('USA' and ' USA')
s = s.str.strip()
print(s)
# a             USA 1
# a     Netherlands 5
# b  United Kingdom 2
# c           China 3
# c           India 5
# c        Pakistan 8
# d            USA 11
# d           India 4
# Name: most_exhibitions, dtype: object

df = df.drop(['most_exhibitions'], axis=1)
df = df.join(s)
print(df)
#    name  most_exhibitions
# a   Bob             USA 1
# a   Bob     Netherlands 5
# b   Joe  United Kingdom 2
# c  Alex           China 3
# c  Alex           India 5
# c  Alex        Pakistan 8
# d  Bill            USA 11
# d  Bill           India 4

# split once from the right: the last token is the count, the rest is the
# country name (which may itself contain spaces, e.g. 'United Kingdom')
parts = df['most_exhibitions'].str.rsplit(' ', n=1)
# extract numbers and convert them to integer
df['numbers'] = parts.str[1].astype('int')
# extract text of most_exhibitions
df['most_exhibitions'] = parts.str[0]
print(df)
#    name most_exhibitions  numbers
# a   Bob              USA        1
# a   Bob      Netherlands        5
# b   Joe   United Kingdom        2
# c  Alex            China        3
# c  Alex            India        5
# c  Alex         Pakistan        8
# d  Bill              USA       11
# d  Bill            India        4

# pivot dataframe
df = df.pivot(index='name', columns='most_exhibitions', values='numbers')
# NaN to empty string
df = df.fillna('')
# move 'name' back into a regular column so it can be used as the merge key
df = df.reset_index()
print(df)
# Layout is approximate; counts may print as floats (e.g. 5.0) depending on
# the pandas version.
# most_exhibitions  name China India Netherlands Pakistan USA United Kingdom
# 0                 Alex     3     5                    8
# 1                 Bill           4                       11
# 2                  Bob                       5            1
# 3                  Joe                                                  2
print(df1)
#   index  name                  most_exhibitions
# 0     a   Bob           USA (1) Netherlands (5)
# 1     b   Joe                United Kingdom (2)
# 2     c  Alex  China (3) India (5) Pakistan (8)
# 3     d  Bill                USA (11) India (4)
df = pd.merge(df1, df, on=['name'])
# restore the original a/b/c/d row labels
df = df.set_index('index')
print(df)
# Layout is approximate (wide frames may wrap when printed).
#        name                  most_exhibitions China India Netherlands Pakistan USA United Kingdom
# index
# a       Bob           USA (1) Netherlands (5)                       5            1
# b       Joe                United Kingdom (2)                                                  2
# c      Alex  China (3) India (5) Pakistan (8)     3     5                    8
# d      Bill                USA (11) India (4)           4                       11
+0

謝謝 @jezrael。當我嘗試以下代碼時:`name_exhibitions_df_2['most_exhibitions'] = name_exhibitions_df_2['most_exhibitions'].str.rsplit(' ', n=1).str[0]`,我得到這個錯誤:`AttributeError: 'StringMethods' object has no attribute 'rsplit'`。你知道這是爲什麼嗎?我正在使用 Python 2.7.8 和 IPython 2.1.0。 –

+0

這個錯誤是在你的示例數據上出現的嗎?你使用的 pandas 是什麼版本?可以用 `print pd.__version__` 查看。 – jezrael

+0

非常感謝您的幫助。是的,在我的示例數據上也會出錯。我使用的 pandas 版本是 0.14.0。 –

相關問題