2016-10-30 38 views
0

我嘗試建立一個管道（pipeline），用「birth」和「date_survey」兩個變量生成新變量「age」，但自定義的轉換類出錯了。

import numpy as np 
import pandas as pd 
from sklearn.base import BaseEstimator, TransformerMixin 
from sklearn import linear_model, pipeline 
from sklearn.pipeline import FeatureUnion 
from sklearn.pipeline import Pipeline 

我的數據框

# Toy survey data: numeric target 'a', survey date as a DD.MM.YYYY string,
# and birth year as a string.
df = pd.DataFrame(
    data={
        'a': [1, 2, 3],
        'date_survey': ['10.01.2013', '20.02.2014', '30.03.2015'],
        'birth': ['1985', '1984', '1986'],
    }
)

用於生成新變量「age」（年齡）的管道代碼：

# Features are the two raw string columns; the target is column 'a'.
X, y = df[['date_survey', 'birth']], df['a']
class MultiColumn:
    """Pipeline step that narrows its input down to a configured set of columns."""

    def __init__(self, columns=None):
        # Column names that transform() will pick out of its input.
        self.columns = columns

    def fit(self, X, y=None):
        """Learn nothing; hand back this instance so calls can be chained."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by the configured column names."""
        wanted = self.columns
        return X[wanted]
class Age(TransformerMixin): 
    # Transformer from the question: meant to derive an "age" feature from the
    # 'date_survey' and 'birth' columns. This is the version whose output makes
    # the pipeline fit fail with "Input contains NaN ...".

    def transform(self, X, y=None, **fit_params): 
     # Overwrites the 'date_survey' column of the passed-in frame with parsed datetimes.
     X['date_survey'] = pd.to_datetime(X['date_survey']) 
     # One-column DataFrame (column 'date_survey') holding the survey year.
     year = pd.DataFrame(X['date_survey'].apply(lambda x: x.year)) 
     # Series minus DataFrame, computed as birth - year (not year - birth).
     # NOTE(review): pandas presumably aligns the Series index with the
     # DataFrame's column labels here, producing an all-NaN frame -- the likely
     # source of the reported ValueError; confirm. convert_objects is also
     # deprecated/removed in newer pandas releases -- verify against the
     # installed version.
     age = X['birth'].convert_objects(convert_numeric=True) - year 
     return age 

    def fit(self, X, y=None, **fit_params): 
     # Nothing is learned; return the instance itself.
     return self 
# Linear model trained with SGD; used as the final pipeline step.
regressor = linear_model.SGDRegressor()

# NOTE(review): this assignment rebinds the name `pipeline`, hiding the
# sklearn `pipeline` module imported under the same name at the top.
pipeline = Pipeline(steps=[
    # Feature extraction: a union that currently holds a single "age" branch.
    ('union', FeatureUnion([
        ('age', Pipeline(steps=[
            ('selector', MultiColumn(columns=['date_survey', 'birth'])),
            ('date', Age()),
        ])),
    ])),
    # Use a regression
    ('model_fitting', regressor),
])

# Fit feature extraction and the regressor in one call.
pipeline.fit(X, y)

運行後，我得到以下錯誤：

ValueError: Input contains NaN, infinity or a value too large for dtype('float64'). 

我認爲錯誤出在 Age 類中，但我不知道該如何改正它。

回答

1
date_survey birth date_survey_in_transform year 
0 10.01.2013 1985    2013-10-01 2013 
1 20.02.2014 1984    2014-02-20 2014 
2 30.03.2015 1986    2015-03-30 2015 

下面這一行計算的是 birth - year，結果爲負數（應該是 year - birth）：

# Operands are in the wrong order: birth year minus survey year gives a negative "age".
age = X['birth'].convert_objects(convert_numeric=True) - year 

我修改了一些代碼讓它運行時沒有錯誤。

import numpy as np 
import pandas as pd 
from sklearn.base import BaseEstimator, TransformerMixin 
from sklearn import linear_model, pipeline 
from sklearn.pipeline import FeatureUnion 
from sklearn.pipeline import Pipeline 
from sklearn.linear_model import SGDRegressor 

# Sample data: target 'a', survey date (DD.MM.YYYY strings) and birth year (strings).
df = pd.DataFrame(
    {
        'a': [1, 2, 3],
        'date_survey': ['10.01.2013', '20.02.2014', '30.03.2015'],
        'birth': ['1985', '1984', '1986'],
    }
)

# Split into the raw feature columns and the regression target.
X = df[['date_survey', 'birth']]
y = df['a']
class MultiColumn:
    """Pipeline step that keeps a chosen subset of DataFrame columns.

    Parameters
    ----------
    columns : list of str or None, default None
        Names of the columns to keep. ``None`` keeps every column.
    """

    def __init__(self, columns=None):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing to learn; return the selector itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` restricted to the configured columns."""
        # Indexing a DataFrame with None raises KeyError, so treat the default
        # as "select everything" instead of failing.
        selected = X if self.columns is None else X[self.columns]
        # Hand out an explicit copy: later steps may add columns, and doing so
        # on a plain selection triggers pandas' SettingWithCopyWarning.
        return selected.copy()

class Age(TransformerMixin):
    """Derive age (survey year minus birth year) as a single feature column.

    Expects a DataFrame with a 'date_survey' column of day-first date strings
    (e.g. '20.02.2014') and a 'birth' column holding the birth year.
    """

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return the ages as an ``(n_samples, 1)`` array without modifying ``X``."""
        # The dates are DD.MM.YYYY. Without dayfirst, pandas reads '10.01.2013'
        # as 1 October and, with strict format inference, rejects '20.02.2014'.
        survey_year = pd.to_datetime(X['date_survey'], dayfirst=True).dt.year
        birth_year = X['birth'].astype('int64')
        age = survey_year - birth_year
        # Series.reshape is not available in current pandas; reshape the
        # underlying NumPy array to the 2-D shape the regressor expects.
        return age.values.reshape(-1, 1)

# End-to-end model: build the age feature, then fit an SGD regressor on it.
# NOTE(review): the name `pipeline` also refers to the sklearn module imported
# at the top; this assignment rebinds it.
pipeline = Pipeline(steps=[
    ('union', FeatureUnion([
        # age
        ('age', Pipeline(steps=[
            ('selector', MultiColumn(columns=['date_survey', 'birth'])),
            ('date', Age()),
        ])),
    ])),
    # Use a regression
    ('model_fitting', SGDRegressor()),
])

# Fit the feature extraction and the regressor together.
pipeline.fit(X, y)
+0

謝謝 @Jarad！我已經搞定了，非常感謝！ – Edward