0
我對SVM AUC Python代碼疑問:AUC曲線繪製在python
print(__doc__)
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
tfidf_vect= TfidfVectorizer(use_idf=True, smooth_idf=True, sublinear_tf=False, ngram_range=(2,2))
from sklearn.cross_validation import train_test_split, cross_val_score
import pandas as pd
df = pd.read_csv('merged_quantized_list.csv',
header=0, sep=',', names=['id', 'content', 'label'])
X = tfidf_vect.fit_transform(df['content'].values)
y = df['label'].values
首先懷疑的是,因爲我的CSV文件包含60列和5000行,其中第一行是我的標籤和休息是內容。這個x和y是否包含內容和標籤?
第二件事是:當我運行這段代碼,我得到了錯誤:
X = tfidf_vect.fit_transform(df['content'].values)
File "/home/ubuntu/anaconda2/lib/python2.7/site-packages/sklearn/feature_extraction/text.py", line 1352, in fit_transform
X = super(TfidfVectorizer, self).fit_transform(raw_documents)
File "/home/ubuntu/anaconda2/lib/python2.7/site-packages/sklearn/feature_extraction/text.py", line 839, in fit_transform
self.fixed_vocabulary_)
File "/home/ubuntu/anaconda2/lib/python2.7/site-packages/sklearn/feature_extraction/text.py", line 762, in _count_vocab
for feature in analyze(doc):
File "/home/ubuntu/anaconda2/lib/python2.7/site-packages/sklearn/feature_extraction/text.py", line 241, in <lambda>
tokenize(preprocess(self.decode(doc))), stop_words)
File "/home/ubuntu/anaconda2/lib/python2.7/site-packages/sklearn/feature_extraction/text.py", line 207, in <lambda>
return lambda x: strip_accents(x.lower())
AttributeError: 'numpy.int64' object has no attribute 'lower'
請幫助我。 在此先感謝
對不起,我的CSV文件包含60個colomns和5000行,其中第一colomn是 – Dhara
是否「內容」一欄只包含任何整數或字符串的標籤?這個錯誤是因爲在提供的數據中有整數,所以'lower()'(小寫字符串)不能應用於它。 –