2017-06-22 84 views
0

使用FeatureUnion擬合管道(Pipeline)時,我不斷收到IndexError

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

當我嘗試用下面的管道擬合我的數據框時,就會出現上述錯誤。train和test是具有相同列的兩個數據框。數據框中有許多不同的列,但我只想通過ItemSelector關注其中的三個。

from sklearn.base import BaseEstimator, TransformerMixin 
from sklearn import preprocessing 
from sklearn.pipeline import FeatureUnion 
from sklearn.preprocessing import OneHotEncoder, LabelEncoder 
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.svm import LinearSVC 
from sklearn.multiclass import OneVsRestClassifier 
from sklearn.pipeline import Pipeline 

    class ItemSelector(BaseEstimator, TransformerMixin): 
     """Stateless transformer that extracts ``X[column]`` from its input.

     Intended as the first step of a sub-pipeline so that each branch of a
     ``FeatureUnion`` sees only the item it cares about.

     Parameters
     ----------
     column
         Key used to index ``X`` in :meth:`transform`.
     """

     def __init__(self, column): 
      # Stored unchanged; only read back in transform().
      self.column = column 

     def fit(self, X, y=None): 
      """Learn nothing and return ``self``; ``X`` and ``y`` are ignored."""
      return self 

     def transform(self, X): 
      """Return ``X[self.column]``; the lookup is delegated entirely to ``X``."""
      # NOTE(review): X must support indexing by ``self.column``. With a
      # string key (e.g. 'TEXT') a plain NumPy array raises IndexError, so X
      # is presumably expected to be a pandas DataFrame - confirm at call site.
      return X[self.column] 


    def predictCases(train, test): 
     """Fit a text + categorical feature pipeline on ``train`` and score it on ``test``.

     ``train`` and ``test`` are indexed with ``['TARGET']`` and read through
     ``.values``; they are presumably pandas DataFrames sharing the columns
     'TARGET', 'TEXT', 'CATEG_FEAT1' and 'CATEG_FEAT2' - confirm with caller.

     Returns the result of
     ``metrics.precision_recall_fscore_support(y_test, predicted)``.

     NOTE(review): ``np`` and ``metrics`` are used below but are not imported
     in the visible snippet; they are assumed to be ``numpy`` and
     ``sklearn.metrics`` imported elsewhere.
     """
     # Sorted distinct labels seen in train; a label's position in this list
     # is used as its integer class id.
     target_names = sorted(list(set(train['TARGET'].values))) 
     y_train = np.array([target_names.index(x) for x in train['TARGET'].values]) 
     # list.index raises ValueError for any test label absent from train.
     y_test = np.array([target_names.index(x) for x in test['TARGET'].values]) 

     # train and predict 
     # Three parallel branches (one per selected column) are concatenated by
     # FeatureUnion and fed to a one-vs-rest linear SVM.
     classifier = Pipeline([ 
        ('union', FeatureUnion([ 

          ('text', Pipeline([ 
           ('selector', ItemSelector(column='TEXT')), 
           ('tfidf_vec', TfidfVectorizer()) 
          ])), 

          # NOTE(review): LabelEncoder is meant for target labels and its
          # fit_transform accepts only ``y``; used as a pipeline step it is
          # likely the source of "fit_transform() takes 2 positional
          # arguments but 3 were given" - confirm.
          ('feature1', Pipeline([ 
           ('selector', ItemSelector(column='CATEG_FEAT1')), 
           ('lbe', LabelEncoder()) 
          ])), 

          ('feature2', Pipeline([ 
           ('selector', ItemSelector(column='CATEG_FEAT2')), 
           ('lbe', LabelEncoder()) 
          ])) 
        ])), 
        ('clf', OneVsRestClassifier(LinearSVC()))]) 
     # NOTE(review): ``.values`` hands the pipeline a NumPy array, so
     # ItemSelector's ``X[self.column]`` lookup with a string column name
     # raises IndexError; the selector needs the frame itself.
     classifier.fit(train.values, y_train) 
     predicted = classifier.predict(test.values) 
     return(metrics.precision_recall_fscore_support(y_test, predicted)) 

完整的錯誤信息:

IndexError        Traceback (most recent call last) 
<ipython-input-19-95d9d0c337f4> in <module>() 
----> 1 tt = predictCases(train_resampled, validate) 

<ipython-input-17-efc951f4192e> in predictCases(train, test) 
    24     ])), 
    25     ('clf', OneVsRestClassifier(LinearSVC()))]) 
---> 26  classifier.fit(train.values, y_train) 
    27  predicted = classifier.predict(test.values) 
    28  return(metrics.precision_recall_fscore_support(y_test, predicted)) 

C:\\Anaconda3\lib\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params) 
    266    This estimator 
    267   """ 
--> 268   Xt, fit_params = self._fit(X, y, **fit_params) 
    269   if self._final_estimator is not None: 
    270    self._final_estimator.fit(Xt, y, **fit_params) 

C:\\Anaconda3\lib\site-packages\sklearn\pipeline.py in _fit(self, X, y, **fit_params) 
    232     pass 
    233    elif hasattr(transform, "fit_transform"): 
--> 234     Xt = transform.fit_transform(Xt, y, **fit_params_steps[name]) 
    235    else: 
    236     Xt = transform.fit(Xt, y, **fit_params_steps[name]) \ 

C:\\Anaconda3\lib\site-packages\sklearn\pipeline.py in fit_transform(self, X, y, **fit_params) 
    732    delayed(_fit_transform_one)(trans, name, weight, X, y, 
    733           **fit_params) 
--> 734    for name, trans, weight in self._iter()) 
    735 
    736   if not result: 

C:\\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable) 
    756    # was dispatched. In particular this covers the edge 
    757    # case of Parallel used with an exhausted iterator. 
--> 758    while self.dispatch_one_batch(iterator): 
    759     self._iterating = True 
    760    else: 

C:\\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator) 
    606     return False 
    607    else: 
--> 608     self._dispatch(tasks) 
    609     return True 
    610 

C:\\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch) 
    569   dispatch_timestamp = time.time() 
    570   cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self) 
--> 571   job = self._backend.apply_async(batch, callback=cb) 
    572   self._jobs.append(job) 
    573 

C:\\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback) 
    107  def apply_async(self, func, callback=None): 
    108   """Schedule a func to be run""" 
--> 109   result = ImmediateResult(func) 
    110   if callback: 
    111    callback(result) 

    C:\\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch) 
     324   # Don't delay the application, to avoid keeping the input 
     325   # arguments in memory 
    --> 326   self.results = batch() 
     327 
     328  def get(self): 

    C:\\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self) 
     129 
     130  def __call__(self): 
    --> 131   return [func(*args, **kwargs) for func, args, kwargs in self.items] 
     132 
     133  def __len__(self): 

    C:\\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0) 
     129 
     130  def __call__(self): 
    --> 131   return [func(*args, **kwargs) for func, args, kwargs in self.items] 
     132 
     133  def __len__(self): 

    C:\\Anaconda3\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, name, weight, X, y, **fit_params) 
     575      **fit_params): 
     576  if hasattr(transformer, 'fit_transform'): 
    --> 577   res = transformer.fit_transform(X, y, **fit_params) 
     578  else: 
     579   res = transformer.fit(X, y, **fit_params).transform(X) 

    C:\\Anaconda3\lib\site-packages\sklearn\pipeline.py in fit_transform(self, X, y, **fit_params) 
     299   """ 
     300   last_step = self._final_estimator 
    --> 301   Xt, fit_params = self._fit(X, y, **fit_params) 
     302   if hasattr(last_step, 'fit_transform'): 
     303    return last_step.fit_transform(Xt, y, **fit_params) 

    C:\\Anaconda3\lib\site-packages\sklearn\pipeline.py in _fit(self, X, y, **fit_params) 
     232     pass 
     233    elif hasattr(transform, "fit_transform"): 
    --> 234     Xt = transform.fit_transform(Xt, y, **fit_params_steps[name]) 
     235    else: 
     236     Xt = transform.fit(Xt, y, **fit_params_steps[name]) \ 

    C:\\Anaconda3\lib\site-packages\sklearn\base.py in fit_transform(self, X, y, **fit_params) 
     495   else: 
     496    # fit method of arity 2 (supervised transformation) 
    --> 497    return self.fit(X, y, **fit_params).transform(X) 
     498 
     499 

    <ipython-input-2-fdc42fd9d831> in transform(self, X) 
     10 
     11  def transform(self, X): 
    ---> 12   return X[self.column] 

    IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices 

編輯:

如果我在fit中用train代替train.values,會得到以下錯誤:

TypeError: fit_transform() takes 2 positional arguments but 3 were given 

回答

0

你將train.values和test.values(即包含原始DataFrame值的numpy數組)傳遞給了classifier.fit和classifier.predict,而你的轉換器需要的是一個DataFrame對象。