1
## Load the data ##
train=pd.read_csv("../kagglehouse/train.csv")
test=pd.read_csv("../kagglehouse/test.csv")
all_data=pd.concat((train.loc[:,"MSSubClass":"SaleCondition"],test.loc[:,"MSSubClass":"SaleCondition"]))
NFOLDS = 5
SEED = 0
NROWS = None
ntrain = train.shape[0]
ntest = test.shape[0]
#creating matrices for sklearn 1:
y_train=train["SalePrice"]
x_train = np.array(all_data[:train.shape[0]])
x_test = np.array(all_data[train.shape[0]:])
kf = KFold(ntrain, n_folds=NFOLDS, shuffle=True, random_state=SEED)
class SklearnWrapper(object):
def __init__(self, clf, seed=0, params=None):
params['random_state'] = seed
self.clf = clf(**params)
def train(self, x_train, y_train):
self.clf.fit(train_df_munged, label_df)
#self.clf.fit(x_train, y_train)
def predict(self, x):
return self.clf.predict(x)
def get_oof(clf):
oof_train = np.zeros((ntrain,))
oof_test = np.zeros((ntest,))
oof_test_skf = np.empty((NFOLDS, ntest))
for i, (train_index, test_index) in enumerate(kf):
x_tr = x_train[train_index]
y_tr = y_train[train_index]
x_te = x_train[test_index]
clf.train(x_tr, y_tr)
oof_train[test_index] = clf.predict(x_te)
oof_test_skf[i, :] = clf.predict(x_test)
oof_test[:] = oof_test_skf.mean(axis=0)
return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)
et_params = {
'n_jobs': 16,
}
rf_params = {
'n_jobs': 16,
}
xgb_params = {
'seed': 0,
'colsample_bytree': 0.7,
'silent': 1,
'subsample': 0.7,
}
rd_params={
'alpha': 10
}
ls_params={
'alpha': 0.005
}
et = SklearnWrapper(clf=ExtraTreesRegressor, seed=SEED, params=et_params)
rf = SklearnWrapper(clf=RandomForestRegressor, seed=SEED, params=rf_params)
rd = SklearnWrapper(clf=Ridge, seed=SEED, params=rd_params)
ls = SklearnWrapper(clf=Lasso, seed=SEED, params=ls_params)
et_oof_train, et_oof_test = get_oof(et)
rf_oof_train, rf_oof_test = get_oof(rf)
rd_oof_train, rd_oof_test = get_oof(rd)
ls_oof_train, ls_oof_test = get_oof(ls)
,看來這個遇到錯誤:輸入包含的NaN,無窮大或過大的D型(「float64」)
ValueError Traceback (most recent call last) in() 135 136 xg_oof_train, xg_oof_test = get_oof(xg) --> 137 et_oof_train, et_oof_test = get_oof(et) 138 rf_oof_train, rf_oof_test = get_oof(rf) 139 rd_oof_train, rd_oof_test = get_oof(rd) in get_oof(clf) 77 x_te = x_train[test_index] 78 ---> 79 clf.train(x_tr, y_tr) 80 81 oof_train[test_index] = clf.predict(x_te) in train(self, x_train, y_train) 46 def train(self, x_train, y_train): 47 #self.clf.fit(x_train, y_train) ---> 48 self.clf.fit(x_train, y_train) 49 50 def predict(self, x): E:\graphLab\Anaconda2\lib\site-packages\sklearn\ensemble\forest.pyc in fit(self, X, y, sample_weight) 245 # Validate or convert input data 246 X = check_array(X, accept_sparse="csc", dtype=DTYPE) --> 247 y = check_array(y, accept_sparse='csc', ensure_2d=False, dtype=None) 248 if sample_weight is not None: 249 sample_weight = check_array(sample_weight, ensure_2d=False) E:\graphLab\Anaconda2\lib\site-packages\sklearn\utils\validation.pyc in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator) 420 % (array.ndim, estimator_name)) 421 if force_all_finite: --> 422 _assert_all_finite(array) 423 424 shape_repr = _shape_repr(array.shape) E:\graphLab\Anaconda2\lib\site-packages\sklearn\utils\validation.pyc in _assert_all_finite(X) 41 and not np.isfinite(X).all()): 42 raise ValueError("Input contains NaN, infinity" ---> 43 " or a value too large for %r." % X.dtype) 44 45 ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
,當我使用np.isnan(all_data.all())
,它返回False和np.isfinite(all_data.all())
,它的值返回True,所以我很迷惑。爲什麼我得到這個錯誤?
ok.First,即時通訊真的非常感謝你回答我的問題,我用你的正確方法來檢查all_data和all_data不存在南或無限值,它仍然有ValueError – zengcaifei
@zengcaifei請編輯你的問題,以反映這個新的信息。 – Shai
哦,我剛剛發現,當我使用x_train = np.array(all_data [:train.shape [0]])和x_test = np.array(all_data [train.shape [0]:]),我忘記y_train也需要改變爲numpy,所以我改變y_train = train [「SalePrice」]到y_train = np.array(train [「SalePrice」]),它是正確的,但我仍然不知道它爲什麼發生 – zengcaifei