0
我想使用 scikit-learn 的 GridSearchCV 來找出 Lasso 的最佳 alpha,而我想讓它迭代的參數之一是交叉驗證的分割方式。所以,我這樣做(問題標題:scikit 的 ShuffleSplit 搭配 pandas 時引發「IndexError: index N is out of bounds for axis 0 with size M」):
# Reproduction script: grid-search over the `cv` setting of a LassoCV model.
# X_train := Pandas Dataframe with no index (auto numbered index) and 62064 rows
# y_train := Pandas 1-column Dataframe with no index (auto numbered index) and 62064 rows
from sklearn import linear_model as lm
from sklearn import cross_validation as cv
from sklearn import grid_search
# Base estimator whose `cv` parameter is being searched over.
model = lm.LassoCV(eps=0.001, n_alphas=1000)
# Candidate values for the estimator's `cv` parameter. Both splitters are
# built with n=len(X_train), so the indices they yield range over all rows
# of the full training set.
params = {"cv": [cv.ShuffleSplit(n=len(X_train), test_size=0.2),
cv.ShuffleSplit(n=len(X_train), test_size=0.1)]}
# NOTE(review): the reported error is "index 60527 is out of bounds for
# axis 0 with size 41376", and 41376 is exactly 2/3 of 62064. That suggests
# GridSearchCV fits each candidate on a training fold of X_train while the
# ShuffleSplit indices above still refer to all 62064 rows -- TODO confirm.
m_model = grid_search.GridSearchCV(model, params)
m_model.fit(X_train, y_train)
但它引發了以下例外:
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-113-f791cb0644c1> in <module>()
10 m_model = grid_search.GridSearchCV(model, params)
11
---> 12 m_model.fit(X_train.as_matrix(), y_train.as_matrix())
/home/user/Programs/repos/pyenv/versions/3.5.2/envs/work/lib/python3.5/site-packages/sklearn/grid_search.py in fit(self, X, y)
802
803 """
--> 804 return self._fit(X, y, ParameterGrid(self.param_grid))
805
806
/home/user/Programs/repos/pyenv/versions/3.5.2/envs/work/lib/python3.5/site-packages/sklearn/grid_search.py in _fit(self, X, y, parameter_iterable)
551 self.fit_params, return_parameters=True,
552 error_score=self.error_score)
--> 553 for parameters in parameter_iterable
554 for train, test in cv)
555
/home/user/Programs/repos/pyenv/versions/3.5.2/envs/work/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
798 # was dispatched. In particular this covers the edge
799 # case of Parallel used with an exhausted iterator.
--> 800 while self.dispatch_one_batch(iterator):
801 self._iterating = True
802 else:
/home/user/Programs/repos/pyenv/versions/3.5.2/envs/work/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
656 return False
657 else:
--> 658 self._dispatch(tasks)
659 return True
660
/home/user/Programs/repos/pyenv/versions/3.5.2/envs/work/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
564
565 if self._pool is None:
--> 566 job = ImmediateComputeBatch(batch)
567 self._jobs.append(job)
568 self.n_dispatched_batches += 1
/home/user/Programs/repos/pyenv/versions/3.5.2/envs/work/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in __init__(self, batch)
178 # Don't delay the application, to avoid keeping the input
179 # arguments in memory
--> 180 self.results = batch()
181
182 def get(self):
/home/user/Programs/repos/pyenv/versions/3.5.2/envs/work/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
70
71 def __call__(self):
---> 72 return [func(*args, **kwargs) for func, args, kwargs in self.items]
73
74 def __len__(self):
/home/user/Programs/repos/pyenv/versions/3.5.2/envs/work/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
70
71 def __call__(self):
---> 72 return [func(*args, **kwargs) for func, args, kwargs in self.items]
73
74 def __len__(self):
/home/user/Programs/repos/pyenv/versions/3.5.2/envs/work/lib/python3.5/site-packages/sklearn/cross_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, error_score)
1529 estimator.fit(X_train, **fit_params)
1530 else:
-> 1531 estimator.fit(X_train, y_train, **fit_params)
1532
1533 except Exception as e:
/home/user/Programs/repos/pyenv/versions/3.5.2/envs/work/lib/python3.5/site-packages/sklearn/linear_model/coordinate_descent.py in fit(self, X, y)
1146 for train, test in folds)
1147 mse_paths = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
-> 1148 backend="threading")(jobs)
1149 mse_paths = np.reshape(mse_paths, (n_l1_ratio, len(folds), -1))
1150 mean_mse = np.mean(mse_paths, axis=1)
/home/user/Programs/repos/pyenv/versions/3.5.2/envs/work/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
798 # was dispatched. In particular this covers the edge
799 # case of Parallel used with an exhausted iterator.
--> 800 while self.dispatch_one_batch(iterator):
801 self._iterating = True
802 else:
/home/user/Programs/repos/pyenv/versions/3.5.2/envs/work/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
656 return False
657 else:
--> 658 self._dispatch(tasks)
659 return True
660
/home/user/Programs/repos/pyenv/versions/3.5.2/envs/work/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
564
565 if self._pool is None:
--> 566 job = ImmediateComputeBatch(batch)
567 self._jobs.append(job)
568 self.n_dispatched_batches += 1
/home/user/Programs/repos/pyenv/versions/3.5.2/envs/work/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in __init__(self, batch)
178 # Don't delay the application, to avoid keeping the input
179 # arguments in memory
--> 180 self.results = batch()
181
182 def get(self):
/home/user/Programs/repos/pyenv/versions/3.5.2/envs/work/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
70
71 def __call__(self):
---> 72 return [func(*args, **kwargs) for func, args, kwargs in self.items]
73
74 def __len__(self):
/home/user/Programs/repos/pyenv/versions/3.5.2/envs/work/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
70
71 def __call__(self):
---> 72 return [func(*args, **kwargs) for func, args, kwargs in self.items]
73
74 def __len__(self):
/home/user/Programs/repos/pyenv/versions/3.5.2/envs/work/lib/python3.5/site-packages/sklearn/linear_model/coordinate_descent.py in _path_residuals(X, y, train, test, path, path_params, alphas, l1_ratio, X_order, dtype)
931 avoid memory copies
932 """
--> 933 X_train = X[train]
934 y_train = y[train]
935 X_test = X[test]
IndexError: index 60527 is out of bounds for axis 0 with size 41376
我也試過改用 X_train.as_matrix(),但沒有用,還是出現同樣的錯誤。
奇怪的是,我手動使用這個分割器時卻沒有問題:
# Manual check: iterate the splitter directly and index the raw arrays with
# the train indices of each split.
cv_split = cv.ShuffleSplit(n=len(X_train), test_size=0.2)
for tr, te in cv_split:
    # Only the train indices (tr) are printed; te is not used here.
    print(X_train.as_matrix()[tr], y_train.as_matrix()[tr])
[[0 0 0 ..., 0 0 1]
[0 0 0 ..., 0 0 1]
[0 0 0 ..., 0 0 1]
...,
[0 0 0 ..., 0 0 1]
[0 0 0 ..., 0 0 1]
[0 0 0 ..., 0 0 1]] [2 1 1 ..., 1 4 1]
[[ 0 0 0 ..., 0 0 1]
[1720 0 0 ..., 0 0 1]
[ 0 0 0 ..., 0 0 1]
...,
[ 773 0 0 ..., 0 0 1]
[ 0 0 0 ..., 0 0 1]
[ 501 1 0 ..., 0 0 1]] [1 1 1 ..., 1 2 1]
我是不是漏看了什麼?是我做錯了什麼,還是這是 scikit-learn 的 bug?
更新1
剛剛發現 cv 參數似乎不能是 cv.ShuffleSplit 物件。這對我來說很違反直覺,因為文件(the docs)是這麼說的:
cross_validation 裡的類別難道不是「用作交叉驗證產生器的物件」嗎?
謝謝!
感謝您指出這一點,不過這點我已經知道了。我只貼出 cv 的部分,因為有問題的就是這一部分,但我實際上是在為 Lasso 搜尋 alpha。不過,我會試試這個參數設定。 – paulochf