import numpy as np
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# Toy corpus: each sentence is paired with the intent label at the same index.
sentences = [
    "i want take a photo",
    "i go to take a photo",
    "i go to use my camera",
    "i go to eat something",
    "i like my food",
]
labels = ["photo", "photo", "photo", "eat", "eat"]

# Learn the TF-IDF vocabulary from the corpus and vectorise it in one pass.
# (The original fitted on an undefined name, `traindata`.)
tfv = TfidfVectorizer()
X = tfv.fit_transform(sentences)

# Encode the string labels as integers; `lbl.classes_` keeps the label names
# so predictions can be mapped back with `lbl.inverse_transform`.
lbl = LabelEncoder()
y = lbl.fit_transform(labels)

# Stratified hold-out split so both classes appear in the train and test
# parts; the fixed seed makes the split reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    X, y, stratify=y, random_state=42
)

clf = LogisticRegression()
clf.fit(xtrain, ytrain)
predictions = clf.predict(xtest)

# Single-argument print works on both Python 2 and Python 3 and emits the
# same text as the original `print "Accuracy Score = ", value` statement.
accuracy = metrics.accuracy_score(ytest, predictions)
print("%s %s" % ("Accuracy Score = ", accuracy))
對新數據進行預測:
# Score a sentence that was not part of the corpus. It must go through the
# already-fitted vectoriser (`transform`, not `fit`) so its columns line up
# with the features the classifier was trained on.
new_sentence = ["this is a new sentence"]
X_Test = tfv.transform(new_sentence)

# `predict_proba` returns one row per input sentence and one column per class.
# The columns follow the order of `clf.classes_`; the label name for each
# column can be recovered with `lbl.inverse_transform(clf.classes_)`.
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(clf.predict_proba(X_Test))
好的,但我該如何針對所有標籤來檢查一個新的隨機句子? – esemve
請查看更新後的答案 –
非常感謝,但我還有最後一個問題:這段程式可以運作,但是如果我用一個已存在的句子來測試,例如:「i go to eat something」,它回答:0.55 0.44,這是爲什麼?這個句子明明是「eat」類別的訓練數據之一 :\ 難道第一個數字不是「photo」類別、第二個數字不是「eat」類別嗎?或者,如果不是這樣,我要如何得知每個數字對應哪個類別? – esemve