0
文字分類:我想在 R 中使用「Description」(描述)和「Class」(類別)兩個欄位來訓練文字分類算法
下面是我使用的腳本:它用歷史數據訓練模型,再預測新文檔的類別。但對於新文檔,預測的準確性並不理想。有人可以幫助我瞭解可以使用哪種算法來提高準確性嗎?請指教。
library(plyr)
library(tm)
library(e1071)
# Load the two ticket exports and stack them into one table.
# NOTE(review): setwd() with an absolute path ties the script to one machine.
# It is kept so that any relative paths elsewhere keep resolving; consider
# building paths with file.path() instead.
setwd("C:/Data")
# Empty strings are read as NA. TRUE is spelled out because `T` is an ordinary
# variable that can be reassigned.
past <- read.csv("Past - Copy.csv", header = TRUE, na.strings = "")
future <- read.csv("Future - Copy.csv", header = TRUE, na.strings = "")
# rbind.fill() stacks the rows of both files and fills columns that are
# missing from either one with NA. Rows from `past` come first.
training <- rbind.fill(past, future)
# Keep only the label and the free-text column used for modelling.
Res_Desc_Train <- subset(training, select = c("Class", "Description"))
## Step 1: build the document-term matrix from the ticket descriptions.
## Past and future rows are processed together so they share term columns.
docs <- Corpus(VectorSource(Res_Desc_Train$Description))

# Helper transformations. They are not applied in the pipeline below but stay
# defined in case other code relies on them.
toSpace <- content_transformer(function(x, pattern) {
  gsub(pattern, " ", x)
})
removeSpecialChars <- function(x) gsub("[^a-zA-Z0-9 ]", "", x)

# Clean the text. Order matters:
#  * lower-case first so the (lower-case) stop word list matches;
#  * drop stop words BEFORE punctuation, otherwise contractions such as
#    "don't" turn into "dont" and are no longer recognised as stop words;
#  * collapse whitespace last, because word removal leaves gaps behind.
docs <- tm_map(docs, content_transformer(tolower))
docs <- tm_map(docs, removeWords, stopwords("english"))
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, stripWhitespace)
# inspect(docs[440])

# Cleaned text as a plain data frame, handy for eyeballing the result.
dataframe <- data.frame(
  text = unlist(sapply(docs, `[`, "content")),
  stringsAsFactors = FALSE
)

# Stop words were already removed above; keep terms of two or more characters.
dtm <- DocumentTermMatrix(
  docs,
  control = list(stopwords = FALSE, wordLengths = c(2, Inf))
)
## Drop terms that are absent from roughly 95% or more of the documents.
dtm <- removeSparseTerms(dtm, sparse = 0.95)
Weighteddtm <- weightTfIdf(dtm, normalize = TRUE)

# Dense data frame with one row per document and one column per term.
mat.df <- as.data.frame(as.matrix(Weighteddtm), stringsAsFactors = FALSE)

# Append the label as the last column, in the same row order as the corpus.
mat.df <- cbind(mat.df, Res_Desc_Train$Class)
colnames(mat.df)[ncol(mat.df)] <- "Class"
Res_Desc_Train_Assign <- mat.df$Class
# Number of documents per class.
Assignment.Distribution <- table(mat.df$Class)
### Features have different ranges; rescale each one to the interval [0, 1].
### (Standardising with z-scores would be an alternative.)
#
# Min-max scaling of a numeric vector.
#
# x: numeric vector.
# Returns a numeric vector of the same length with the minimum mapped to 0 and
# the maximum mapped to 1. A vector whose values are all equal has no range to
# scale by and is returned as all zeros instead of NaN. An empty vector gives
# an empty result. NA values are not removed: any NA in `x` makes the whole
# result NA.
normalize <- function(x) {
  if (length(x) == 0L) {
    return(numeric(0))
  }
  lo <- min(x)
  span <- max(x) - lo
  # isTRUE() keeps the test scalar-safe when the range is NA or NaN, which
  # then propagates through the division.
  denom <- if (isTRUE(span == 0)) 1 else span
  (x - lo) / denom
}
# Example: normalize(c(1, 2, 3, 4, 5))

# Rescale every term column; the last column of mat.df is the class label.
# seq_len() and drop = FALSE keep this correct with zero or one term column.
num_col <- ncol(mat.df) - 1
mat.df_normalize <- as.data.frame(
  lapply(mat.df[, seq_len(num_col), drop = FALSE], normalize)
)
mat.df_normalize <- cbind(mat.df_normalize, Res_Desc_Train_Assign)
colnames(mat.df_normalize)[ncol(mat.df_normalize)] <- "Class"
# names(mat.df)
outcomeName <- "Class"

# `training` holds the rows of `past` first and the remaining rows after them.
# Offsetting seq_len() gives an empty test set when there are no remaining
# rows, whereas `(n + 1):n` would count backwards and pick the wrong rows.
n_past <- nrow(past)
train <- mat.df_normalize[seq_len(n_past), ]
test <- mat.df_normalize[n_past + seq_len(nrow(training) - n_past), ]
train$Class <- as.factor(train$Class)

### SVM model
# NOTE(review): svm() uses its default kernel here (radial, per the e1071
# docs). A linear kernel is commonly tried for sparse text features; worth
# evaluating separately.
x <- subset(train, select = -Class)
y <- train$Class
model <- svm(x, y, probability = TRUE)

# Predict the class and the per-class probabilities for the held-out rows.
test1 <- subset(test, select = -Class)
svm.pred <- predict(model, test1, decision.values = TRUE, probability = TRUE)
svm_prob <- attr(svm.pred, "probabilities")
finalresult <- cbind(test, svm.pred, svm_prob)
感謝您的幫助,我們將使用您分享的解決方案,並檢查準確度是否可以提高,實際上我的準確性非常低,約爲52% – user3734568
在這種情況下,您可能還需要擴充訓練數據集,以便模型能夠正常學習。 – Prem
感謝您的建議,我會檢查是否可以獲取更多數據來訓練模型,目前我的訓練數據集中有13383個文檔。 – user3734568