2017-09-05 62 views
0

我在嘗試修改this示例以使用Pandas數據框而不是測試數據集。我無法這樣做,因爲ItemSelector似乎無法識別列名稱。自定義FeatureUnion不起作用?

請大家注意數據幀df_resolved.columns收益的列:

Index(['u_category', ... ... 'resolution_time', 'rawtext'], 
     dtype='object') 

所以我明明在我的數據幀都沒有了。

然而,當我嘗試運行的解決方案,我得到的錯誤

"ValueError: no field of name u_category"

另外,我似乎不能夠修改代碼以支持ItemSelector選擇多列,所以在這個解決方案,我必須在每一列中單獨應用變壓器。

我的代碼是:

import numpy as np 

from sklearn.base import BaseEstimator, TransformerMixin 
from sklearn.datasets import fetch_20newsgroups 
from sklearn.datasets.twenty_newsgroups import strip_newsgroup_footer 
from sklearn.datasets.twenty_newsgroups import strip_newsgroup_quoting 
from sklearn.decomposition import TruncatedSVD 
from sklearn.feature_extraction import DictVectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics import classification_report 
from sklearn.pipeline import FeatureUnion 
from sklearn.pipeline import Pipeline 
from sklearn.svm import SVC 


class ItemSelector(BaseEstimator, TransformerMixin): 
    def __init__(self, key): 
     self.key = key 

    def fit(self, x, y=None): 
     return self 

    def transform(self, data_dict): 
     return data_dict[self.key] 


class TextStats(BaseEstimator, TransformerMixin): 
    """Extract features from each document for DictVectorizer""" 

    def fit(self, x, y=None): 
     return self 

    def transform(self, posts): 
     return [{'length': len(text), 
       'num_sentences': text.count('.')} 
       for text in posts] 


class SubjectBodyExtractor(BaseEstimator, TransformerMixin): 
    """Extract the subject & body from a usenet post in a single pass. 

    Takes a sequence of strings and produces a dict of sequences. Keys are 
    `subject` and `body`. 
    """ 
    def fit(self, x, y=None): 
     return self 

    def transform(self, posts): 
     features = np.recarray(shape=(len(posts),), 
           dtype=[('subject', object), ('body', object)]) 
     for i, text in enumerate(posts): 
      headers, _, bod = text.partition('\n\n') 
      bod = strip_newsgroup_footer(bod) 
      bod = strip_newsgroup_quoting(bod) 
      features['body'][i] = bod 

      prefix = 'Subject:' 
      sub = '' 
      for line in headers.split('\n'): 
       if line.startswith(prefix): 
        sub = line[len(prefix):] 
        break 
      features['subject'][i] = sub 

     return features 


pipeline = Pipeline([ 
    # Extract the subject & body 
    ('subjectbody', SubjectBodyExtractor()), 

    # Use FeatureUnion to combine the features from subject and body 
    ('union', FeatureUnion(
     transformer_list=[ 

      # Pipeline for pulling features from the post's subject line 
      ('rawtext', Pipeline([ 
       ('selector', ItemSelector(key='u_category')), 
       ('labelenc', preprocessing.LabelEncoder()), 
      ])), 
      # Pipeline for standard bag-of-words model for body 
      ('features', Pipeline([ 
       ('selector', ItemSelector(key='rawtext')), 
       ('tfidf', TfidfVectorizer(max_df=0.5, min_df=1, 
              stop_words='english', 
              token_pattern=u'(?ui)\\b\\w*[a-z]{2,}\\w*\\b')), 
      ])), 
     ], 

     # weight components in FeatureUnion 
     transformer_weights={ 
      'rawtext': 1.0, 
      'features': 1.0, 
     }, 
    )), 

    # Use a SVC classifier on the combined features 
    ('linear_svc', LinearSVC(penalty="l2")), 
]) 

# limit the list of categories to make running this example faster. 
X_train, X_test, y_train, y_test = train_test_split(df_resolved.ix[:, (df_resolved.columns != 'assignment_group.name')], df_resolved['assignment_group.name'], test_size=0.2, random_state=42) 

pipeline.fit(X_train, y_train) 
print(pipeline.score(X_test, y_test)) 

如何修改這個代碼與我數據幀正常工作,並可能支持立刻將變壓器的多個列?

如果我拿ItemSelector出來,它似乎工作。所以這個工程:

ds = ItemSelector(key='u_category') 
ds.fit(df_resolved) 

labelenc = preprocessing.LabelEncoder() 
labelenc_transformed = labelenc.fit_transform(ds.transform(df_resolved)) 

完整的堆棧跟蹤:

--------------------------------------------------------------------------- 
ValueError        Traceback (most recent call last) 
<ipython-input-93-a4ba29c137ec> in <module>() 
    136 
    137 
--> 138 pipeline.fit(X_train, y_train) 
    139 #y = pipeline.predict(X_test) 
    140 #print(classification_report(y, test.target)) 

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params) 
    266    This estimator 
    267   """ 
--> 268   Xt, fit_params = self._fit(X, y, **fit_params) 
    269   if self._final_estimator is not None: 
    270    self._final_estimator.fit(Xt, y, **fit_params) 

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params) 
    232     pass 
    233    elif hasattr(transform, "fit_transform"): 
--> 234     Xt = transform.fit_transform(Xt, y, **fit_params_steps[name]) 
    235    else: 
    236     Xt = transform.fit(Xt, y, **fit_params_steps[name]) \ 

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params) 
    732    delayed(_fit_transform_one)(trans, name, weight, X, y, 
    733           **fit_params) 
--> 734    for name, trans, weight in self._iter()) 
    735 
    736   if not result: 

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable) 
    756    # was dispatched. In particular this covers the edge 
    757    # case of Parallel used with an exhausted iterator. 
--> 758    while self.dispatch_one_batch(iterator): 
    759     self._iterating = True 
    760    else: 

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator) 
    606     return False 
    607    else: 
--> 608     self._dispatch(tasks) 
    609     return True 
    610 

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch) 
    569   dispatch_timestamp = time.time() 
    570   cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self) 
--> 571   job = self._backend.apply_async(batch, callback=cb) 
    572   self._jobs.append(job) 
    573 

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback) 
    107  def apply_async(self, func, callback=None): 
    108   """Schedule a func to be run""" 
--> 109   result = ImmediateResult(func) 
    110   if callback: 
    111    callback(result) 

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch) 
    324   # Don't delay the application, to avoid keeping the input 
    325   # arguments in memory 
--> 326   self.results = batch() 
    327 
    328  def get(self): 

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self) 
    129 
    130  def __call__(self): 
--> 131   return [func(*args, **kwargs) for func, args, kwargs in self.items] 
    132 
    133  def __len__(self): 

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0) 
    129 
    130  def __call__(self): 
--> 131   return [func(*args, **kwargs) for func, args, kwargs in self.items] 
    132 
    133  def __len__(self): 

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, name, weight, X, y, **fit_params) 
    575      **fit_params): 
    576  if hasattr(transformer, 'fit_transform'): 
--> 577   res = transformer.fit_transform(X, y, **fit_params) 
    578  else: 
    579   res = transformer.fit(X, y, **fit_params).transform(X) 

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params) 
    299   """ 
    300   last_step = self._final_estimator 
--> 301   Xt, fit_params = self._fit(X, y, **fit_params) 
    302   if hasattr(last_step, 'fit_transform'): 
    303    return last_step.fit_transform(Xt, y, **fit_params) 

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params) 
    232     pass 
    233    elif hasattr(transform, "fit_transform"): 
--> 234     Xt = transform.fit_transform(Xt, y, **fit_params_steps[name]) 
    235    else: 
    236     Xt = transform.fit(Xt, y, **fit_params_steps[name]) \ 

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params) 
    495   else: 
    496    # fit method of arity 2 (supervised transformation) 
--> 497    return self.fit(X, y, **fit_params).transform(X) 
    498 
    499 

<ipython-input-93-a4ba29c137ec> in transform(self, data_dict) 
    55 
    56  def transform(self, data_dict): 
---> 57   return data_dict[self.key] 
    58 
    59 

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/numpy/core/records.py in __getitem__(self, indx) 
    497 
    498  def __getitem__(self, indx): 
--> 499   obj = super(recarray, self).__getitem__(indx) 
    500 
    501   # copy behavior of getattr, except that here 

ValueError: no field of name u_category 

UPDATE:

即使我用dataframes(NO train_test_split),問題依然存在:enter image description here

更新2: 行,所以我刪除SubjectBodyExtractor,因爲我不需要那個。現在ValueError: no field of name u_category不見了,但我有一個新的錯誤:TypeError: fit_transform() takes 2 positional arguments but 3 were given

堆棧跟蹤:

--------------------------------------------------------------------------- 
TypeError         Traceback (most recent call last) 
<ipython-input-110-292294015e44> in <module>() 
    129 
    130 
--> 131 pipeline.fit(X_train.ix[:, (X_test.columns != 'assignment_group.name')], X_test['assignment_group.name']) 
    132 #y = pipeline.predict(X_test) 
    133 #print(classification_report(y, test.target)) 

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params) 
    266    This estimator 
    267   """ 
--> 268   Xt, fit_params = self._fit(X, y, **fit_params) 
    269   if self._final_estimator is not None: 
    270    self._final_estimator.fit(Xt, y, **fit_params) 

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params) 
    232     pass 
    233    elif hasattr(transform, "fit_transform"): 
--> 234     Xt = transform.fit_transform(Xt, y, **fit_params_steps[name]) 
    235    else: 
    236     Xt = transform.fit(Xt, y, **fit_params_steps[name]) \ 

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params) 
    732    delayed(_fit_transform_one)(trans, name, weight, X, y, 
    733           **fit_params) 
--> 734    for name, trans, weight in self._iter()) 
    735 
    736   if not result: 

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable) 
    756    # was dispatched. In particular this covers the edge 
    757    # case of Parallel used with an exhausted iterator. 
--> 758    while self.dispatch_one_batch(iterator): 
    759     self._iterating = True 
    760    else: 

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator) 
    606     return False 
    607    else: 
--> 608     self._dispatch(tasks) 
    609     return True 
    610 

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch) 
    569   dispatch_timestamp = time.time() 
    570   cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self) 
--> 571   job = self._backend.apply_async(batch, callback=cb) 
    572   self._jobs.append(job) 
    573 

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback) 
    107  def apply_async(self, func, callback=None): 
    108   """Schedule a func to be run""" 
--> 109   result = ImmediateResult(func) 
    110   if callback: 
    111    callback(result) 

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch) 
    324   # Don't delay the application, to avoid keeping the input 
    325   # arguments in memory 
--> 326   self.results = batch() 
    327 
    328  def get(self): 

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self) 
    129 
    130  def __call__(self): 
--> 131   return [func(*args, **kwargs) for func, args, kwargs in self.items] 
    132 
    133  def __len__(self): 

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0) 
    129 
    130  def __call__(self): 
--> 131   return [func(*args, **kwargs) for func, args, kwargs in self.items] 
    132 
    133  def __len__(self): 

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, name, weight, X, y, **fit_params) 
    575      **fit_params): 
    576  if hasattr(transformer, 'fit_transform'): 
--> 577   res = transformer.fit_transform(X, y, **fit_params) 
    578  else: 
    579   res = transformer.fit(X, y, **fit_params).transform(X) 

/Users/csanadpoda/Documents/Jupyter/anaconda/lib/python3.6/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params) 
    301   Xt, fit_params = self._fit(X, y, **fit_params) 
    302   if hasattr(last_step, 'fit_transform'): 
--> 303    return last_step.fit_transform(Xt, y, **fit_params) 
    304   elif last_step is None: 
    305    return Xt 

TypeError: fit_transform() takes 2 positional arguments but 3 were given 
+0

當你使用'train_test_split'時,數據幀被轉換成一個沒有任何列索引的numpy數組。因此錯誤。 –

+0

即使我不使用火車/測試分割,只是一個數據幀也是如此...... –

+0

哦。你可以將你讀取數據的代碼和一些示例數據一起發佈到df中。 –

回答

0

是的,這是因爲LabelEncoder只需要一個數組y,而FeatureUnion會嘗試向它發送X和y。

看到這個:https://github.com/scikit-learn/scikit-learn/issues/3956

您可以使用一個簡單的解決方法是:

定義自定義labelEncoder這樣的:

class MyLabelEncoder(BaseEstimator, TransformerMixin): 
    def __init__(self): 
     self.le = LabelEncoder() 

    def fit(self, x, y=None): 
     return self.le.fit(x) 

    def transform(self, x, y=None): 
     return self.le.transform(x).reshape(-1,1) 

    def fit_transform(self, x, y=None): 
     self.fit(x) 
     return self.transform(x) 

而在管道,這樣做:

.... 
.... 
       ('selector', ItemSelector(key='u_category')), 
       ('labelenc', MyLabelEncoder()), 

請注意在重塑(-1,1)方法。那是因爲FeatureUnion僅適用於二維數據。 FeatureUnion內的所有獨立變壓器只應返回2-D數據。

-1

您可能需要添加這些功能陣列像這樣,請嘗試添加兩個選擇在這樣的特徵,並告訴我結果

features = np.recarray(shape=(len(posts),), 
           dtype=[('u_category', object), ('rawtext', object)])