2017-03-25 50 views
0

我正在嘗試使用 sklearn 的管道(Pipeline),但我在網上試過的各種教程都沒有幫到我。我想要一個完整的 sklearn 管道示例。以下是我的代碼:

import pandas as pd 
import numpy as np 
import json 
import seaborn as sb 
from sklearn.metrics import log_loss 
from sklearn import linear_model 
from sklearn.model_selection import StratifiedKFold 
from sklearn.svm import SVC 
from scipy.stats import zscore 
from Transformers import TextTransformer 
from sklearn.metrics import confusion_matrix, accuracy_score 
from sklearn.model_selection import GridSearchCV 
%matplotlib inline 
# Load the training data, forcing the 'description' column to be read as str.
df = pd.read_json('data/train.json', encoding='utf-8', dtype={'description': str})
len(df)
# Keep only the text column and the target column.
df = df[['description', 'interest_level']]
from sklearn.pipeline import Pipeline, FeatureUnion
a = TextTransformer('description', max_features=50)
b = TextTransformer('features', max_features=10)
pipeline = Pipeline([
    ('description', a),  # can pass in either a pipeline
    # ('features', b)  # or a transformer
    ('clf', SVC())  # classifier
])
# NOTE: this is the call that raises "TypeError: unhashable type".
# df[:, 'interest_level'] hands the tuple (slice, 'interest_level') to
# DataFrame.__getitem__, which treats it as a single column key, and a slice
# is not hashable.  It also passes only one argument, so the pipeline would
# receive no target.  Pipeline.fit expects the feature frame as X and the
# labels as y, e.g. pipeline.fit(df, df['interest_level']), which in turn
# requires every transformer step's fit() to accept a y argument.
pipeline.fit(df[:,'interest_level'])

我的文本轉換器(TextTransformer)

from sklearn.base import BaseEstimator, TransformerMixin 
from bs4 import BeautifulSoup 
from sklearn.feature_extraction.text import TfidfVectorizer 
import nltk 


class TextTransformer(BaseEstimator, TransformerMixin):
    """Turn one text column of a DataFrame into term-frequency features.

    The selected column is stripped of HTML markup, tokenized with NLTK and
    vectorized by a ``TfidfVectorizer`` configured with ``use_idf=False``.

    Parameters
    ----------
    column : str
        Name of the DataFrame column that holds the text.
    max_features : int, default 5000
        Passed through to ``TfidfVectorizer(max_features=...)``.
    """

    def __init__(self, column, max_features=5000):
        # BaseEstimator.get_params()/clone() look up each __init__ argument
        # as an attribute of the same name, so both must be stored verbatim.
        self.column = column
        self.max_features = max_features
        # Legacy private alias, kept for code that still reads it.
        self._column = column
        self.tfidfVectorizer = self._build_vectorizer()
        self._vectorizer = None

    def _build_vectorizer(self):
        """Create an unfitted vectorizer from the current parameters."""
        return TfidfVectorizer(use_idf=False, stop_words='english',
                               tokenizer=self._custom_tokenizer, analyzer='word',
                               max_features=self.max_features)

    def _custom_tokenizer(self, string):
        """Tokenize *string*, keeping alphabetic tokens and a number marker."""
        # string = re.sub('^[\w]', '', string)
        tokens = nltk.word_tokenize(string)
        # Collapse every all-digit token into one shared placeholder.
        cleaned = [x if not x.isdigit() else '_NUM_' for x in tokens]
        # NOTE(review): str(x.encode('utf-8')) returns the plain byte string
        # on Python 2; on Python 3 it would return the "b'...'" repr instead.
        return [str(x.encode('utf-8')) for x in cleaned if (x.isalpha() or x == '_NUM_')]

    def _clean_html_tags(self, content):
        """Return the text of *content* with HTML markup removed."""
        return BeautifulSoup(content, 'lxml').text

    def _cleaned_text(self, df):
        """Select the configured column of *df* and strip HTML from each row."""
        return df[self.column].apply(self._clean_html_tags)

    def fit(self, df, y=None):
        """Learn the vocabulary from ``df[column]``.

        ``y`` is accepted and ignored so that ``Pipeline`` and
        ``TransformerMixin.fit_transform`` can pass the target through.
        Returns ``self``.
        """
        # Rebuild from the current parameters so that values changed through
        # set_params() (e.g. by GridSearchCV) take effect.
        self._column = self.column
        self.tfidfVectorizer = self._build_vectorizer()
        self._vectorizer = self.tfidfVectorizer.fit(self._cleaned_text(df))
        return self

    def transform(self, df):
        """Vectorize ``df[column]`` with the fitted vocabulary; returns a dense matrix."""
        # Apply the same HTML cleaning that fit() used, so the text seen here
        # matches the text the vocabulary was learned from.
        return self._vectorizer.transform(self._cleaned_text(df)).todense()

但是,我似乎無法把它寫對。它不斷拋出下面這個異常(來自 IPython 筆記本):

--------------------------------------------------------------------------- 
TypeError         Traceback (most recent call last) 
<ipython-input-11-b3788282dc5c> in <module>() 
     8  ('clf', SVC()) # classifier 
     9 ]) 
---> 10 pipeline.fit(df[:,'interest_level']) 

/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/frame.pyc in __getitem__(self, key) 
    2057    return self._getitem_multilevel(key) 
    2058   else: 
-> 2059    return self._getitem_column(key) 
    2060 
    2061  def _getitem_column(self, key): 

/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/frame.pyc in _getitem_column(self, key) 
    2064   # get column 
    2065   if self.columns.is_unique: 
-> 2066    return self._get_item_cache(key) 
    2067 
    2068   # duplicate columns & possible reduce dimensionality 

/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/generic.pyc in _get_item_cache(self, item) 
    1382   """Return the cached item, item represents a label indexer.""" 
    1383   cache = self._item_cache 
-> 1384   res = cache.get(item) 
    1385   if res is None: 
    1386    values = self._data.get(item) 

TypeError: unhashable type 

說明

description interest_level 
10 A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ... medium 
10000  low 
100004 Top Top West Village location, beautiful Pre-w... high 
100007 Building Amenities - Garage - Garden - fitness... low 
100013 Beautifully renovated 3 bedroom flex 4 bedroom... low 

interest_level(感興趣程度)將是我的目標變量

回答

0

你只用一列來擬合(df[:, 'interest_level']),但隨後管道的第一步(轉換器 a:TextTransformer)卻試圖訪問 description 列。

+0

我該怎麼寫呢 – aceminer