2017-09-07 61 views
0

嘗試運行下面的代碼時出現問題。這是房價的機器學習問題。sklearn轉換管道和功能聯合

from sklearn.pipeline import FeatureUnion 
from sklearn.preprocessing import StandardScaler 
from sklearn.pipeline import Pipeline 
from sklearn.base import BaseEstimator,TransformerMixin 

num_attributes=list(housing_num) 
cat_attributes=['ocean_proximity'] 
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6 

class DataFrameSelector(BaseEstimator,TransformerMixin): 
    def __init__(self,attribute_names): 
     self.attribute_names=attribute_names 
    def fit(self,X,y=None): 
     return self 
    def transform(self,X,y=None): 
     return X[self.attribute_names].values 

class CombinedAttributesAdder(BaseEstimator, TransformerMixin): 
    def __init__(self, add_bedrooms_per_room = True): # no *args or **kargs 
     self.add_bedrooms_per_room = add_bedrooms_per_room 
    def fit(self, X,y=None): 
     return self # nothing else to do 
    def transform(self, X,y=None): 
     rooms_per_household = X[:, rooms_ix]/X[:, household_ix] 
     population_per_household = X[:, population_ix]/X[:, household_ix] 
     if self.add_bedrooms_per_room: 
      bedrooms_per_room = X[:, bedrooms_ix]/X[:, rooms_ix] 
      return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room] 
     else: 
      return np.c_[X, rooms_per_household, population_per_household] 


num_pipeline=Pipeline([ 
    ('selector',DataFrameSelector(num_attributes)), 
    ('imputer',Imputer(strategy="median")), 
    ('attribs_adder',CombinedAttributesAdder()), 
    ('std_scalar',StandardScaler()), 
    ]) 
cat_pipeline=Pipeline([ 
    ('selector',DataFrameSelector(cat_attributes)), 
    ('label_binarizer',LabelBinarizer()), 
    ]) 
full_pipeline=FeatureUnion(transformer_list=[ 
    ("num_pipeline",num_pipeline), 
    ("cat_pipeline",cat_pipeline), 
    ]) 

有是當我試圖運行錯誤:

housing_prepared = full_pipeline.fit_transform(housing) 

和錯誤顯示爲:

--------------------------------------------------------------------------- 
TypeError         Traceback (most recent call last) 
<ipython-input-141-acd0fd68117b> in <module>() 
----> 1 housing_prepared = full_pipeline.fit_transform(housing) 

/Users/nieguangtao/ml/env_1/lib/python2.7/site-packages/sklearn/pipeline.pyc in fit_transform(self, X, y, **fit_params) 
    744    delayed(_fit_transform_one)(trans, weight, X, y, 
    745           **fit_params) 
--> 746    for name, trans, weight in self._iter()) 
    747 
    748   if not result: 

/Users/nieguangtao/ml/env_1/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self, iterable) 
    777    # was dispatched. In particular this covers the edge 
    778    # case of Parallel used with an exhausted iterator. 
--> 779    while self.dispatch_one_batch(iterator): 
    780     self._iterating = True 
    781    else: 

/Users/nieguangtao/ml/env_1/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in dispatch_one_batch(self, iterator) 
    623     return False 
    624    else: 
--> 625     self._dispatch(tasks) 
    626     return True 
    627 

/Users/nieguangtao/ml/env_1/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in _dispatch(self, batch) 
    586   dispatch_timestamp = time.time() 
    587   cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self) 
--> 588   job = self._backend.apply_async(batch, callback=cb) 
    589   self._jobs.append(job) 
    590 

/Users/nieguangtao/ml/env_1/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.pyc in apply_async(self, func, callback) 
    109  def apply_async(self, func, callback=None): 
    110   """Schedule a func to be run""" 
--> 111   result = ImmediateResult(func) 
    112   if callback: 
    113    callback(result) 

/Users/nieguangtao/ml/env_1/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.pyc in __init__(self, batch) 
    330   # Don't delay the application, to avoid keeping the input 
    331   # arguments in memory 
--> 332   self.results = batch() 
    333 
    334  def get(self): 

/Users/nieguangtao/ml/env_1/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self) 
    129 
    130  def __call__(self): 
--> 131   return [func(*args, **kwargs) for func, args, kwargs in self.items] 
    132 
    133  def __len__(self): 

/Users/nieguangtao/ml/env_1/lib/python2.7/site-packages/sklearn/pipeline.pyc in _fit_transform_one(transformer, weight, X, y, **fit_params) 
    587      **fit_params): 
    588  if hasattr(transformer, 'fit_transform'): 
--> 589   res = transformer.fit_transform(X, y, **fit_params) 
    590  else: 
    591   res = transformer.fit(X, y, **fit_params).transform(X) 

/Users/nieguangtao/ml/env_1/lib/python2.7/site-packages/sklearn/pipeline.pyc in fit_transform(self, X, y, **fit_params) 
    290   Xt, fit_params = self._fit(X, y, **fit_params) 
    291   if hasattr(last_step, 'fit_transform'): 
--> 292    return last_step.fit_transform(Xt, y, **fit_params) 
    293   elif last_step is None: 
    294    return Xt 

TypeError: fit_transform() takes exactly 2 arguments (3 given) 

所以我的第一questio n是什麼原因導致這種錯誤?

得到這個錯誤後,我試圖找出原因,所以我運行上面的變壓器逐一此:

DFS=DataFrameSelector(num_attributes) 
a1=DFS.fit_transform(housing) 
imputer=Imputer(strategy='median') 
a2=imputer.fit_transform(a1) 
CAA=CombinedAttributesAdder() 
a3=CAA.fit_transform(a2) 
SS=StandardScaler() 
a4=SS.fit_transform(a3) 

DFS2=DataFrameSelector(cat_attributes) 
b1=DFS2.fit_transform(housing) 
LB=LabelBinarizer() 
b2=LB.fit_transform(b1) 

result=np.concatenate((a4,b2),axis=1) 

這些可以被不同的是結果我是正確執行一個numpy.ndarray與大小(16512,16),而預期的結果housing_prepared = full_pipeline.fit_transform(housing)應該是一個規模(16512,17)的顛簸規則。 所以這是我的第二個問題爲什麼導致差異?

房屋是一個大小爲(16512,9)的數據框,只有1個分類特徵和8個數字特徵。

預先感謝您。

+0

第一個錯誤是由於'LabelBinarizer'造成的。它只需要一個輸入y,但由於流水線,X和y都會發送給它。請分享這些數據,我可以提供幫助。 –

+0

@VivekKumar這是鏈接,它是住房的數據:https://drive.google.com/file/d/0B12I2_fMO94pVHZhQlVrSlFtZEk/view?usp=sharing – talentcat

+0

爲什麼你認爲結果應該有17列而不是16? –

回答

0

看起來像sklearn以另一種方式識別數據類型的方式超出了您的預期。確保數字被標識爲int。最簡單的方法:使用'your'發佈編碼的作者提供的數據。 Aurelien Geron Hands on Machine Learning