2017-09-24 61 views
1
## Load the data ## 

train=pd.read_csv("../kagglehouse/train.csv") 
test=pd.read_csv("../kagglehouse/test.csv") 
all_data=pd.concat((train.loc[:,"MSSubClass":"SaleCondition"],test.loc[:,"MSSubClass":"SaleCondition"])) 

NFOLDS = 5 
SEED = 0 
NROWS = None 

ntrain = train.shape[0] 
ntest = test.shape[0] 

#creating matrices for sklearn 1: 
y_train=train["SalePrice"] 
x_train = np.array(all_data[:train.shape[0]]) 
x_test = np.array(all_data[train.shape[0]:]) 

kf = KFold(ntrain, n_folds=NFOLDS, shuffle=True, random_state=SEED) 

class SklearnWrapper(object): 
    def __init__(self, clf, seed=0, params=None): 
     params['random_state'] = seed 
     self.clf = clf(**params) 


    def train(self, x_train, y_train): 
     self.clf.fit(train_df_munged, label_df) 
     #self.clf.fit(x_train, y_train) 

    def predict(self, x): 
     return self.clf.predict(x) 

def get_oof(clf): 
    oof_train = np.zeros((ntrain,)) 
    oof_test = np.zeros((ntest,)) 
    oof_test_skf = np.empty((NFOLDS, ntest)) 

    for i, (train_index, test_index) in enumerate(kf): 

     x_tr = x_train[train_index] 

     y_tr = y_train[train_index] 

     x_te = x_train[test_index] 

     clf.train(x_tr, y_tr) 

     oof_train[test_index] = clf.predict(x_te) 
     oof_test_skf[i, :] = clf.predict(x_test) 

    oof_test[:] = oof_test_skf.mean(axis=0) 
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1) 


et_params = { 
    'n_jobs': 16, 
} 

rf_params = { 
    'n_jobs': 16, 

} 

xgb_params = { 
    'seed': 0, 
    'colsample_bytree': 0.7, 
    'silent': 1, 
    'subsample': 0.7, 

} 



rd_params={ 
    'alpha': 10 
} 


ls_params={ 
    'alpha': 0.005 
} 


et = SklearnWrapper(clf=ExtraTreesRegressor, seed=SEED, params=et_params) 
rf = SklearnWrapper(clf=RandomForestRegressor, seed=SEED, params=rf_params) 
rd = SklearnWrapper(clf=Ridge, seed=SEED, params=rd_params) 
ls = SklearnWrapper(clf=Lasso, seed=SEED, params=ls_params) 

et_oof_train, et_oof_test = get_oof(et) 
rf_oof_train, rf_oof_test = get_oof(rf) 
rd_oof_train, rd_oof_test = get_oof(rd) 
ls_oof_train, ls_oof_test = get_oof(ls) 

,看來這個遇到錯誤:輸入包含的NaN,無窮大或過大的D型(「float64」)

 
    ValueError        Traceback (most recent call 
     last) 
      in() 
       135 
       136 xg_oof_train, xg_oof_test = get_oof(xg) 
      --> 137 et_oof_train, et_oof_test = get_oof(et) 
       138 rf_oof_train, rf_oof_test = get_oof(rf) 
       139 rd_oof_train, rd_oof_test = get_oof(rd) 

      in get_oof(clf) 
       77   x_te = x_train[test_index] 
       78 
      ---> 79   clf.train(x_tr, y_tr) 
       80 
       81   oof_train[test_index] = clf.predict(x_te) 

      in train(self, x_train, y_train) 
       46  def train(self, x_train, y_train): 
       47   #self.clf.fit(x_train, y_train) 
      ---> 48   self.clf.fit(x_train, y_train) 
       49 
       50  def predict(self, x): 

      E:\graphLab\Anaconda2\lib\site-packages\sklearn\ensemble\forest.pyc 
     in fit(self, X, y, sample_weight) 
       245   # Validate or convert input data 
       246   X = check_array(X, accept_sparse="csc", dtype=DTYPE) 
      --> 247   y = check_array(y, accept_sparse='csc', ensure_2d=False, dtype=None) 
       248   if sample_weight is not None: 
       249    sample_weight = check_array(sample_weight, ensure_2d=False) 

      E:\graphLab\Anaconda2\lib\site-packages\sklearn\utils\validation.pyc 
     in check_array(array, accept_sparse, dtype, order, copy, 
     force_all_finite, ensure_2d, allow_nd, ensure_min_samples, 
     ensure_min_features, warn_on_dtype, estimator) 
       420        % (array.ndim, estimator_name)) 
       421   if force_all_finite: 
      --> 422    _assert_all_finite(array) 
       423 
       424  shape_repr = _shape_repr(array.shape) 

      E:\graphLab\Anaconda2\lib\site-packages\sklearn\utils\validation.pyc 
     in _assert_all_finite(X) 
       41    and not np.isfinite(X).all()): 
       42   raise ValueError("Input contains NaN, infinity" 
      ---> 43       " or a value too large for %r." % X.dtype) 
       44 
       45 

      ValueError: Input contains NaN, infinity or a value too large for dtype('float64'). 

,當我使用np.isnan(all_data.all()),它返回False和np.isfinite(all_data.all()),它的值返回True,所以我很迷惑。爲什麼我得到這個錯誤?

回答

3

您沒有正確檢查all_data

np.isnan(all_data.all()) 
np.isfinite(all_data.all()) 

你應該如何檢查數據。

正在申請np.isnan()np.isfinite()到的all_data.all()其輸出是總是一個布爾值True/False並且因此總是有限和非nan

您應該檢查你的數據爲:

np.isfinite(all_data).all() 
np.isnan(all_data).all() 

注意all()被應用於np.isfinite()np.isnan()輸出,而不是周圍的其他方式。

+0

ok.First,即時通訊真的非常感謝你回答我的問題,我用你的正確方法來檢查all_data和all_data不存在南或無限值,它仍然有ValueError – zengcaifei

+0

@zengcaifei請編輯你的問題,以反映這個新的信息。 – Shai

+1

哦,我剛剛發現,當我使用x_train = np.array(all_data [:train.shape [0]])和x_test = np.array(all_data [train.shape [0]:]),我忘記y_train也需要改變爲numpy,所以我改變y_train = train [「SalePrice」]到y_train = np.array(train [「SalePrice」]),它是正確的,但我仍然不知道它爲什麼發生 – zengcaifei

相關問題