1
我遇到了一個需要對俄語文本進行分類的問題。在特徵提取階段,我使用了 scikit-learn 的 TfidfTransformer 和 CountVectorizer,但運行代碼時出現了如下錯誤(問題標題:俄語文本的分類):
'UnicodeDecodeError: 'utf8' codec can't decode byte 0xc2 in position 0:
invalid continuation byte'.
我該怎樣糾正這個錯誤?以下是我的 Python 代碼:
# -*- coding: utf-8 -*-
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords
import numpy as np
import numpy.linalg as LA
import os
import nltk
import re
import sys
from nltk import NaiveBayesClassifier
import nltk.classify
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import stopwords
import re
# Absolute location of the corpus root directory.
data_path = os.path.abspath(os.path.join('/home/lena/', 'corpus'))

# One path per corpus sub-directory, all resolved relative to the root.
# A generator expression is used so no loop variable leaks into module scope.
(official_path, official2_path,
 talk_path, talk2_path) = (
    os.path.join(data_path, sub_dir)
    for sub_dir in ('official', 'official_2', 'talk', 'talk_2')
)

# The 'fiction' sub-directories are currently disabled:
#fiction_path = os.path.join(data_path,'fiction')
#fiction2_path = os.path.join(data_path,'fiction_2')
def get_text(path, encodings=('utf-8', 'cp1251')):
    """Return the contents of the file at *path* as decoded (unicode) text.

    Decoding happens here, at the I/O boundary. Returning raw bytes would
    leave decoding to downstream consumers, which typically assume UTF-8
    and fail with ``UnicodeDecodeError`` on files stored in another
    encoding.

    Args:
        path: Path of the file to read.
        encodings: Codec names tried in order; the first one that decodes
            the whole file wins. The default tries UTF-8 and then
            Windows-1251. NOTE(review): cp1251 is assumed to be the legacy
            encoding of the Russian corpus files -- confirm against the
            actual data and pass a different tuple if needed.

    Returns:
        The file contents as unicode text, with ``\\r\\n`` and ``\\r`` line
        endings normalised to ``\\n`` (the behaviour the universal-newline
        mode ``'rU'`` used to provide).

    Raises:
        ValueError: If *encodings* is empty.
        UnicodeDecodeError: If none of the candidate encodings can decode
            the file (the error of the last candidate is raised).
    """
    # Read bytes so the choice of codec is explicit and not platform-bound.
    with open(path, 'rb') as handle:
        raw = handle.read()

    last_error = None
    for encoding in encodings:
        try:
            text = raw.decode(encoding)
        except UnicodeDecodeError as exc:
            # Keep the error in a separate name: Python 3 unbinds `exc`
            # when the except clause ends.
            last_error = exc
            continue
        return text.replace(u'\r\n', u'\n').replace(u'\r', u'\n')

    if last_error is None:
        raise ValueError('no candidate encodings given')
    raise last_error
def get_textdir(path):
    """Return the text of every regular file directly inside *path*.

    Entries are processed in sorted name order: ``os.listdir`` returns
    names in arbitrary order, and a stable order keeps the resulting
    document list reproducible between runs. Entries that are not regular
    files (e.g. sub-directories) are skipped, since they cannot be opened
    as text files.

    Args:
        path: Directory whose files are read.

    Returns:
        A list with one element per file, each being the value returned by
        ``get_text`` for that file. Empty if the directory holds no files.
    """
    candidates = (os.path.join(path, entry)
                  for entry in sorted(os.listdir(path)))
    return [get_text(file_path)
            for file_path in candidates
            if os.path.isfile(file_path)]
# Load the documents of every corpus sub-directory.
all_talk = get_textdir(talk_path)
all_official = get_textdir(official_path)
official_2 = get_textdir(official2_path)
talk_2 = get_textdir(talk2_path)

# The vocabulary is learned from train_set; test_set is only transformed.
train_set = all_talk
test_set = talk_2

# NOTE: CountVectorizer decodes byte-string documents as UTF-8 by default
# (encoding='utf-8', decode_error='strict'). Documents must therefore be
# unicode or valid UTF-8, otherwise fit_transform raises UnicodeDecodeError.
stopWords = stopwords.words('russian')
vectorizer = CountVectorizer(stop_words=stopWords)
# Single-argument print(...) parses and prints the same on Python 2 and 3.
print(vectorizer)

# Dense term-count matrices: one row per document, one column per term.
train = vectorizer.fit_transform(train_set).toarray()
test = vectorizer.transform(test_set).toarray()
print('train set %s' % (train,))
print('test set %s' % (test,))

# The TF-IDF transformer has to be instantiated before it is used; without
# this line the first `transformer.fit` call fails with NameError.
transformer = TfidfTransformer()
transformer.fit(train)
print(transformer.transform(train).toarray())

# NOTE(review): refitting here recomputes the IDF weights from the test
# documents. If train and test vectors must be comparable (e.g. as input
# to a classifier), drop this fit and reuse the IDF learned from `train`.
transformer.fit(test)
tfidf = transformer.transform(test)
print(tfidf.todense())