2017-05-28 53 views
-1

我有火車和測試數據集。我想對我的測試數據集進行預測並將其保存爲CSV。問題是,預測不同類型的數據集,而不需要一個

我無法保存我的測試數據集的結果。每次我保存訓練數據集的結果。

如果你告訴我我失蹤的地方,那將會很棒。

這是我的代碼。

import pandas as pd 
import numpy as np 
from sklearn.ensemble import RandomForestClassifier 
import random 
from sklearn.metrics import roc_curve 
from sklearn.preprocessing import LabelEncoder 
from sklearn import metrics 

Train = pd.read_csv('Dataset/train.csv', delimiter=';') 
Test = pd.read_csv('Dataset/train.csv', delimiter=';') 
Train['Type'] = 'Train' # Create a flag for Train and Test Data set 
Test['Type'] = 'Test' 
FullData = pd.concat([Train, Test], axis=0) # Combined both Train and Test Data set 

ID_Col = ['USER_ID'] # ID Variables 
Target_Col = ["ACTIVITY_DEC_16"] 
Cat_Cols = ['ACT_DATE', 'STATUS', 'TP_CURRENT', 'TP_CHANGES_NUM', 'START_PACK', 'OFFER_GROUP', 'BIRTHDAY', 'GENDER', 
      'MLLS_STATE', 
      'PORTED_IN', 'PORTED_OUT', 'OBLIG_NUM', 'OBLIG_ON_START', 'ASSET_TYPE_LAST', 'DEVICE_TYPE_BUS', 
      'USAGE_AREA', 'REFILL_OCT_16', 'REFILL_NOV_16', 
      'OUTGOING_OCT_16', 'OUTGOING_NOV_16', 'GPRS_OCT_16', 'GPRS_NOV_16', 'REVENUE_OCT_16', 
      'REVENUE_NOV_16'] # Categorical Variables 

Num_Cols = list(set(list(FullData.columns)) - set(Cat_Cols) - set(ID_Col) - set(Target_Col)) # Numerical Variables 

Other_Col = ['Type'] # Test and Train Data Set Identifier 

Num_Cat_Cols = Num_Cols + Cat_Cols # Combined numerical and Categorical variables 

# Create a new variable for each variable having missing value with VariableName_NA 
# and flag missing value with 1 and other with 0 

for var in Num_Cat_Cols: 
    if FullData[var].isnull().any() == True: 
     FullData[var + '_NA'] = FullData[var].isnull() * 1 

# Impute numercial missing values with mean 

FullData[Num_Cols] = FullData[Num_Cols].fillna(FullData[Num_Cols].mean(), inplace=True) 
FullData[Cat_Cols] = FullData[Cat_Cols].fillna(value=-9999) 

# Create a label encoders for categorical variables and split the data set to train & test. Further split the train data set to Train and Validate 

for var in Cat_Cols: 
    number = LabelEncoder() 
    FullData[var] = number.fit_transform(FullData[var].astype('str')) 

# Target Variable is also a categorical so convert it 

FullData["ACTIVITY_DEC_16"] = number.fit_transform((FullData["ACTIVITY_DEC_16"].astype('str'))) 

Train = FullData[FullData['Type'] == 'Train'] 
Test = FullData[FullData['Type'] == 'Test'] 

Train['is_train'] = np.random.uniform(0, 1, len(Train)) <= 0.75 
Train, Validate = Train[Train['is_train'] == True], Train[Train['is_train'] == False] 

Features = list(set(list(FullData.columns)) - set(ID_Col) - set(Target_Col) - set(Other_Col)) 

X_Train = Train[list(Features)] 
Y_Train = Train["ACTIVITY_DEC_16"].values 
X_Validate = Validate[list(Features)].values 
Y_Validate = Validate["ACTIVITY_DEC_16"].values 
X_Test = Test[list(Features)].values 

random.seed(100) 
rf = RandomForestClassifier(n_estimators=1000) 
rf.fit(X_Train, Y_Train) 

Status = rf.predict_proba(X_Validate) 
fpr, tpr, _ = roc_curve(Y_Validate, Status[:, 1]) 
roc_auc = metrics.auc(fpr, tpr) 

Final_Status = rf.predict_proba(X_Test) 
print(Final_Status) 
Test['ACTIVITY_DEC_16_PROB'] = Final_Status[:, 1] 
Test.to_csv('/Users/isozyesil/PycharmProjects/TaskNo2/Dataset/Output.csv', columns=['USER_ID', 'ACTIVITY_DEC_16_PROB']) 

回答

-1

重新達到訓練數據,而不是測試。

Train = pd.read_csv('Dataset/train.csv', delimiter=';') 
Test = pd.read_csv('Dataset/test.csv', delimiter=';') 
相關問題