0
我嘗試從兩個變量「date_birth」和「date_survey」錯誤的類來創建管道
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import linear_model, pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
我的數據框
df = pd.DataFrame({'a':[1,2,3],
'date_survey': ['10.01.2013', '20.02.2014', '30.03.2015'],
'birth': ['1985', '1984', '1986'] })
管道的代碼一個新的變量「年齡」
X = df[['date_survey', 'birth']]
y = df['a']
class MultiColumn:
def __init__(self,columns = None):
self.columns = columns # array of column names to encode
def fit(self,X,y=None):
return self
def transform(self, X):
return X[self.columns]
class Age(TransformerMixin):
def transform(self, X, y=None, **fit_params):
X['date_survey'] = pd.to_datetime(X['date_survey'])
year = pd.DataFrame(X['date_survey'].apply(lambda x: x.year))
age = X['birth'].convert_objects(convert_numeric=True) - year
return age
def fit(self, X, y=None, **fit_params):
return self
regressor = linear_model.SGDRegressor()
pipeline = Pipeline([
('union', FeatureUnion(
transformer_list=[
# age
('age', Pipeline([
('selector', MultiColumn(columns=['date_survey', 'birth'])),
('date', Age())
])),
])),
# Use a regression
('model_fitting', regressor),
])
pipeline.fit(X, y)
,我得到一個錯誤
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
我想,在課堂上時代的錯誤,但我上無法瞭解如何改進它
thnx @Jarad!我已經做了,但是很多! – Edward