2016-05-03 88 views
0

我對此很新,並看到其他人有相同的錯誤,但未能看到我如何實施解決方案。我正在嘗試使用scikit learn中的隨機網格搜索來編寫隨機森林機器學習方法。它適用於標準網格搜索,但在使用隨機網格搜索時,scikit學習中的擬合函數出現奇怪的錯誤失敗。關於如何解決這個問題的任何建議將會很好Python TypeError:range()整數結束參數預計,得到浮動。與適合功能

下面是一個顯示錯誤的例子。

import scipy 
import math 
import numpy as np 
import pandas as pd 
import plotly.plotly as py 

from time import time 
from sklearn import preprocessing, metrics, cross_validation 
from sklearn.cross_validation import train_test_split 
from sklearn.ensemble import RandomForestRegressor 
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV 
from sklearn.cross_validation import KFold 

data = pd.read_csv("data.csv", sep=",") 
data = SubFeAll.fillna(SubFeAll.mean()) # replace the NA values with the mean of the descriptor 
header = data.columns.values # Ues the column headers as the descriptor labels 
data.head() 

# Set the numpy global random number seed (similar effect to random_state) 
np.random.seed(1) 

# Random Forest results initialised 
RFr2 = [] 
RFmse = [] 
RFrmse = [] 

# Predictions results initialised 
RFpredictions = [] 

metcount = 0 

# Give the array from pandas to numpy 
npArray = np.array(data) 
print header.shape 
npheader = np.array(header[1:-1]) 
print("Array shape X = %d, Y = %d " % (npArray.shape)) 
datax, datay = npArray.shape 

# Split the data into: names labels of the molecules ; y the True results ; X the descriptors for each data point 
names = npArray[:,0] 
X = npArray[:,1:-1].astype(float) 
y = npArray[:,-1] .astype(float) 
X = preprocessing.scale(X) 
print X.shape 

# Open output files 
train_name = "Training.csv" 
fi_name = "Feature_importance.csv" 

with open(train_name,'w') as ftrain: 
     ftrain.write("This file contains the training information for all three models (Random Forest, Support Vector Regression and Partial Least Squares),\n") 
     ftrain.write("The code use a ten fold cross validation 90% training 10% test at each fold so ten training sets are used here,\n") 
     ftrain.write("Fold %d ,\n" %(metcount+1)) 
ftrain.close() 

with open(fi_name,'w') as ffeatimp: 
     ffeatimp.write("This file contains the feature importance information for the Random Forest model,\n") 
ffeatimp.close() 

# Begin the K-fold cross validation over ten folds 
kf = KFold(datax, n_folds=10) 
print "------------------- Begining Ten Fold Cross Validation -------------------" 
for train, test in kf: 
    XTrain, XTest, yTrain, yTest = X[train], X[test], y[train], y[test] 
    ytestdim = yTest.shape[0] 
    i = 0 
    with open (train_name, 'a') as ftrain: 
     while i< ytestdim : 
       ftrain.write(str(round(yTest[i],2))+',\n') 
       i += 1 
    ftrain.close() 

    print "\n" 
    # random forest grid search parameters 
    print "------------------- Begining Random Forest Grid Search -------------------" 
    rfparamgrid = {"n_estimators": scipy.stats.expon(scale=100), "max_features": ["auto", "sqrt", "log2"], "max_depth": scipy.stats.expon(scale=100)} 
    rf = RandomForestRegressor(random_state=0,n_jobs=2) 
    RfGridSearch = RandomizedSearchCV(rf,param_distributions=rfparamgrid,scoring='mean_squared_error',n_iter=20) 
    start = time() 
    RfGridSearch.fit(XTrain,yTrain) 

    # Get best random forest parameters 
    print("GridSearchCV took %.2f seconds for %d candidate parameter settings" %(time() - start,len(RfGridSearch.grid_scores_))) 
    RFtime = time() - start,len(RfGridSearch.grid_scores_) 
    report(RfGridSearch.grid_scores_) 
    print("n_estimators = %d " % RfGridSearch.best_params_['n_estimators']) 
    ne = RfGridSearch.best_params_['n_estimators'] 
    print("max_features = %s " % RfGridSearch.best_params_['max_features']) 
    mf = RfGridSearch.best_params_['max_features'] 
    print("max_depth = %d " % RfGridSearch.best_params_['max_depth']) 
    md = RfGridSearch.best_params_['max_depth'] 
    with open (train_name, 'a') as ftrain: 
      ftrain.write("Random Forest") 
      ftrain.write("RF search time, %s ,\n" % (str(RFtime))) 
      ftrain.write("Number of Trees, %s ,\n" % str(ne)) 
      ftrain.write("Number of feature at split, %s ,\n" % str(mf)) 
      ftrain.write("Max depth of tree, %s ,\n" % str(md)) 
    ftrain.close() 

所給出的誤差小於

Traceback (most recent call last): 
    File "rgscv.py", line 81, in <module> 
    RfGridSearch.fit(XTrain,yTrain) 
    File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/grid_search.py", line 996, in fit 
    return self._fit(X, y, sampled_params) 
    File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/grid_search.py", line 553, in _fit 
    for parameters in parameter_iterable 
    File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 800, in __call__ 
    while self.dispatch_one_batch(iterator): 
    File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 658, in dispatch_one_batch 
    self._dispatch(tasks) 
    File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 566, in _dispatch 
    job = ImmediateComputeBatch(batch) 
    File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 180, in __init__ 
    self.results = batch() 
    File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 72, in __call__ 
    return [func(*args, **kwargs) for func, args, kwargs in self.items] 
    File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/cross_validation.py", line 1531, in _fit_and_score 
    estimator.fit(X_train, y_train, **fit_params) 
    File "/Users/James/anaconda/lib/python2.7/site-packages/sklearn/ensemble/forest.py", line 276, in fit 
    for i in range(n_more_estimators): 
TypeError: range() integer end argument expected, got float. 

起初,我以爲我只是錯過了一個參數,但有一個簡單的網格搜索這個確切的方法似乎工作沒有問題。代碼如下。任何人都可以告訴我是什麼導致這個錯誤?

import scipy 
import math 
import numpy as np 
import pandas as pd 
import plotly.plotly as py 

from time import time 
from sklearn import preprocessing, metrics, cross_validation 
from sklearn.cross_validation import train_test_split 
from sklearn.ensemble import RandomForestRegressor 
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV 
from sklearn.cross_validation import KFold 

data = pd.read_csv("data.csv", sep=",") 
data = data.fillna(SubFeAll.mean()) # replace the NA values with the mean of the descriptor 
header = data.columns.values # Ues the column headers as the descriptor labels 
data.head() 

# Set the numpy global random number seed (similar effect to random_state) 
np.random.seed(1) 

# Random Forest results initialised 
RFr2 = [] 
RFmse = [] 
RFrmse = [] 

# Predictions results initialised 
RFpredictions = [] 

metcount = 0 

# Give the array from pandas to numpy 
npArray = np.array(data) 
print header.shape 
npheader = np.array(header[1:-1]) 
print("Array shape X = %d, Y = %d " % (npArray.shape)) 
datax, datay = npArray.shape 

# Split the data into: names labels of the molecules ; y the True results ; X the descriptors for each data point 
names = npArray[:,0] 
X = npArray[:,1:-1].astype(float) 
y = npArray[:,-1] .astype(float) 
X = preprocessing.scale(X) 
print X.shape 

# Open output files 
train_name = "Training.csv" 
fi_name = "Feature_importance.csv" 

with open(train_name,'w') as ftrain: 
     ftrain.write("This file contains the training information for all three models (Random Forest, Support Vector Regression and Partial Least Squares),\n") 
     ftrain.write("The code use a ten fold cross validation 90% training 10% test at each fold so ten training sets are used here,\n") 
     ftrain.write("Fold %d ,\n" %(metcount+1)) 
ftrain.close() 

with open(fi_name,'w') as ffeatimp: 
     ffeatimp.write("This file contains the feature importance information for the Random Forest model,\n") 
ffeatimp.close() 

# Begin the K-fold cross validation over ten folds 
kf = KFold(datax, n_folds=10) 
print "------------------- Begining Ten Fold Cross Validation -------------------" 
for train, test in kf: 
    XTrain, XTest, yTrain, yTest = X[train], X[test], y[train], y[test] 
    ytestdim = yTest.shape[0] 
    i = 0 
    with open (train_name, 'a') as ftrain: 
     while i< ytestdim : 
       ftrain.write(str(round(yTest[i],2))+',\n') 
       i += 1 
    ftrain.close() 

    print "\n" 
    # random forest grid search parameters 
    print "------------------- Begining Random Forest Grid Search -------------------" 
    #rfparamgrid = {"n_estimators": scipy.stats.expon(scale=100), "max_features": ["auto", "sqrt", "log2"], "max_depth": scipy.stats.expon(scale=100)} 
    rfparamgrid = {"n_estimators": [10, 20, 25, 50, 100, 1000], "max_features": ["auto", "sqrt", "log2"], "max_depth": [1,2,3,5,7,10]} 
    rf = RandomForestRegressor(random_state=0,n_jobs=2) 
    RfGridSearch = GridSearchCV(rf,param_grid=rfparamgrid,scoring='mean_squared_error') 
    start = time() 
    RfGridSearch.fit(XTrain,yTrain) 

    # Get best random forest parameters 
    print("GridSearchCV took %.2f seconds for %d candidate parameter settings" %(time() - start,len(RfGridSearch.grid_scores_))) 
    RFtime = time() - start,len(RfGridSearch.grid_scores_) 
    report(RfGridSearch.grid_scores_) 
    print("n_estimators = %d " % RfGridSearch.best_params_['n_estimators']) 
    ne = RfGridSearch.best_params_['n_estimators'] 
    print("max_features = %s " % RfGridSearch.best_params_['max_features']) 
    mf = RfGridSearch.best_params_['max_features'] 
    print("max_depth = %d " % RfGridSearch.best_params_['max_depth']) 
    md = RfGridSearch.best_params_['max_depth'] 
    with open (train_name, 'a') as ftrain: 
       ftrain.write("Random Forest") 
       ftrain.write("RF search time, %s ,\n" % (str(RFtime))) 
       ftrain.write("Number of Trees, %s ,\n" % str(ne)) 
       ftrain.write("Number of feature at split, %s ,\n" % str(mf)) 
       ftrain.write("Max depth of tree, %s ,\n" % str(md)) 
    ftrain.close() 
+0

@coralv他明顯使用一個庫,它在網站包目錄中。不要問無意義的問題。 – Natecat

+1

該部分代碼是scikit學習庫的標準適配函數。這不是我編輯過的代碼,並且在GridSearchCV版本中成功使用了相同的函數,該函數按預期工作。 – James

+0

[scipy.stats.expon](http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.expon.html#scipy.stats.expon)似乎會返回一個expon對象,它不會看起來它不像第二個例子中的列表那樣起作用。是否將其更改爲列表可修復它? – Natecat

回答

1

估計數必須是整數,你的代碼產生漂浮。創建一個包含整數的n_estimators值的有效列表,它會很好。

+0

謝謝你對它進行排序。 – James

相關問題