2014-11-03 200 views
1

我在 Python 中運行了一個隨機森林模型,並且能夠看到分類表。但我希望得到一份完整的 Python 代碼,涵蓋數據準備、模型運行、模型驗證和準確性檢查等所有環節。另外,我的模型出現了很多誤報(假陽性),任何有助於改善這一點的建議都將非常有幫助。 — python中的隨機森林

+0

您需要在問題中補充更多信息。假如我只回答「你需要修改第五行的代碼並加上一個 if」,您一定會產生一連串疑問——而這些疑問,正是您需要先在問題中爲我們回答的內容。以下鏈接可供參考:http://stackoverflow.com/help/how-to-ask http://stackoverflow.com/help/mcve – User 2014-11-03 16:51:31

回答

2

請看,

import urllib2 
import numpy 
from sklearn import tree 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import accuracy_score 
import random 
from math import sqrt 
import matplotlib.pyplot as plot 


# Confusion-matrix counts for thresholded binary predictions
def confusionMatrix(predicted, actual, threshold):
    """Count the confusion-matrix cells for scored binary predictions.

    An example is actually positive when its label is greater than 0.5 and
    is predicted positive when its score is strictly greater than
    ``threshold``.  A score exactly equal to ``threshold`` is therefore a
    negative prediction whatever the true label is, so the same decision
    rule is applied to every example.

    Args:
        predicted: sequence of numeric scores, one per example.
        actual: sequence of numeric labels aligned with ``predicted``.
        threshold: cut-off above which a score counts as a positive call.

    Returns:
        ``[tp, fn, fp, tn]`` as floats, or ``-1`` when the two sequences
        have different lengths.
    """
    if len(predicted) != len(actual):
        # Legacy error signal kept so existing callers see the same value.
        return -1

    tp = 0.0
    fp = 0.0
    tn = 0.0
    fn = 0.0
    for score, label in zip(predicted, actual):
        calledPositive = score > threshold
        if label > 0.5:  # positive example
            if calledPositive:
                tp += 1.0  # correctly predicted positive
            else:
                fn += 1.0  # incorrectly predicted negative
        else:  # negative example
            if calledPositive:
                fp += 1.0  # incorrectly predicted positive
            else:
                tn += 1.0  # correctly predicted negative
    return [tp, fn, fp, tn]



# Location of the comma-separated data file; the last field of every row is
# used as the label and all preceding fields as numeric features.
target_url = ("https://archive.ics.uci.edu/ml/machine-learning-databases/pendigits/pendigits.tra")
data = urllib2.urlopen(target_url)

xList = []   # feature vectors, one list of floats per row
labels = []  # label of each row, as float
# Not used by the code below; kept in case later code reads them.
names = []
firstline = True

try:
    for line in data:
        stripped = line.strip()
        if not stripped:
            # A blank line would otherwise crash float('') below.
            continue
        row = stripped.split(",")
        # The label is the last column; everything before it is a feature.
        labels.append(float(row.pop()))
        xList.append([float(num) for num in row])
finally:
    # Release the connection even if a row fails to parse.
    data.close()

if not xList:
    raise ValueError("no data rows were read from " + target_url)

nrows = len(xList)
ncols = len(xList[0])

#Split Data for Test and Train
# Fixed seed so the same 30% of the rows is held out on every run.
random.seed(1)
nSample = int(nrows * 0.30)
idxTest = random.sample(range(nrows), nSample)
idxTest.sort()
# Set membership keeps the complement computation linear in nrows.
idxTestSet = set(idxTest)
idxTrain = [idx for idx in range(nrows) if idx not in idxTestSet]

xTrain = [xList[r] for r in idxTrain]
xTest = [xList[r] for r in idxTest]
yTrain = [labels[r] for r in idxTrain]
yTest = [labels[r] for r in idxTest]



# Ensemble settings
numTreesMax = 30  # number of trees to fit
treeDepth = 12    # max_depth given to every DecisionTreeClassifier
nAttr = 4         # number of feature columns drawn at random for each tree

modelList = []  # fitted trees, in training order
indexList = []  # sorted feature-column indices used by each tree
predList = []   # predictions of each tree on xTest
nTrainRows = len(yTrain)
# Each tree is fitted on a sample half the size of the training set.
nBagRows = int(0.5 * nTrainRows)

for iTrees in range(numTreesMax):

    # Random subset of feature columns for this tree.
    idxAttr = random.sample(range(ncols), nAttr)
    idxAttr.sort()
    indexList.append(idxAttr)

    # Row sample drawn with replacement; randrange avoids rebuilding the
    # full index range for every single draw.
    idxRows = [random.randrange(len(xTrain)) for _ in range(nBagRows)]
    idxRows.sort()

    # Restrict the sampled rows to the chosen columns.
    xRFTrain = [[xTrain[r][j] for j in idxAttr] for r in idxRows]
    yRFTrain = [yTrain[r] for r in idxRows]

    modelList.append(DecisionTreeClassifier(max_depth=treeDepth))
    modelList[-1].fit(xRFTrain, yRFTrain)

    # The held-out rows must be projected onto the same columns before
    # this tree can score them.
    xRFTest = [[xx[j] for j in idxAttr] for xx in xTest]

    latestOutSAmplePrediction = modelList[-1].predict(xRFTest)
    predList.append(list(latestOutSAmplePrediction))



# Error of the ensemble as trees are added one at a time: entry k uses the
# average prediction of the first k + 1 trees.
# NOTE(review): averaging predicted labels and thresholding at 0.5 is only
# meaningful for 0/1 labels. If the labels are multi-class (TODO confirm
# against the loaded data) this should become a per-example majority vote
# with a direct label comparison.
classerror = []
allPredictions = []
# Running per-example sum of tree predictions, so each added tree costs one
# pass over the test set instead of re-summing every earlier tree.
runningTotals = [0.0] * len(xTest)
for iModels in range(len(modelList)):
    runningTotals = [total + pred
                     for total, pred in zip(runningTotals, predList[iModels])]
    prediction = [total / (iModels + 1) for total in runningTotals]

    allPredictions.append(prediction)
    # conMatTest is [tp, fn, fp, tn]; correct calls are tp and tn.
    conMatTest = confusionMatrix(prediction, yTest, 0.5)
    nCorrect = conMatTest[0] + conMatTest[3]
    errors = 1.0 - (nCorrect / sum(conMatTest))
    classerror.append(errors)





# x-axis values: ensemble size after each added tree (1 .. number of trees)
nModels = list(range(1, len(modelList) + 1))

# Error curve against ensemble size, with the y-axis running from zero up to
# the largest observed error.
plot.plot(nModels, classerror)
plot.axis('tight')
plot.xlabel('Number of Trees in Ensamble')
plot.ylabel('Class Error')
worstError = max(classerror)
plot.ylim((0.0, worstError))
plot.show()