2013-09-23 115 views
7

我已經用sklearn框架實現了LinearSVC和SVC來進行文本分類。 我正在使用TfidfVectorizer獲取由兩個不同類(良性數據和惡意數據)組成的輸入數據的稀疏表示。這部分工作得很好,但現在我想通過使用OneClassSVM分類器來實現某種異常檢測,並僅使用一個類訓練模型(異常值檢測...)。不幸的是,它不適用於稀疏數據。一些開發人員正在開發一個補丁(https://github.com/scikit-learn/scikit-learn/pull/1586),但有一些缺陷,所以目前還沒有使用OneClassSVM實現的解決方案。(標題:[scikit-learn] 異常檢測 - OneClassSVM的替代方案)

在sklearn框架中有沒有其他方法來做類似的事情?我正在查看這些示例,但似乎沒有任何結果。

謝謝!

回答

1

不幸的是,目前scikit-learn只實現了兩種異常檢測方法:One-Class SVM和穩健協方差估計(robust covariance estimator)。

你可以在2D數據上嘗試這兩種方法並比較它們的差異(以下示例來自官方文檔):

import numpy as np 
import pylab as pl 
import matplotlib.font_manager 
from scipy import stats 

from sklearn import svm 
from sklearn.covariance import EllipticEnvelope 

# Compare two outlier detectors (One-Class SVM and a robust covariance
# estimator) on synthetic 2D data, for several cluster separations.
# One figure is produced per separation value, with one subplot per detector.

# Example settings
n_samples = 200
outliers_fraction = 0.25
clusters_separation = [0, 1, 2]

# Define the two outlier detection tools to be compared.
classifiers = {
    "One-Class SVM": svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05,
                                     kernel="rbf", gamma=0.1),
    "robust covariance estimator": EllipticEnvelope(contamination=.1)}

# Grid on which each decision function is evaluated for the contour plots.
xx, yy = np.meshgrid(np.linspace(-7, 7, 500), np.linspace(-7, 7, 500))
n_inliers = int((1. - outliers_fraction) * n_samples)
n_outliers = int(outliers_fraction * n_samples)
# Ground truth labels: 1 for inliers, 0 for outliers. The outliers are the
# last ``n_outliers`` rows of X (they are appended last below).
ground_truth = np.ones(n_samples, dtype=int)
ground_truth[-n_outliers:] = 0

# Fit the problem with varying cluster separation
for offset in clusters_separation:
    # Re-seed so every separation setting draws from the same random stream.
    np.random.seed(42)
    # Data generation: two Gaussian inlier clusters placed at -offset/+offset.
    # Array dimensions must be integers, hence the integer division.
    half_inliers = n_inliers // 2
    X1 = 0.3 * np.random.randn(half_inliers, 2) - offset
    X2 = 0.3 * np.random.randn(half_inliers, 2) + offset
    X = np.r_[X1, X2]
    # Add uniformly distributed outliers
    X = np.r_[X, np.random.uniform(low=-6, high=6, size=(n_outliers, 2))]

    # One figure per separation value, one subplot per classifier.
    pl.figure(figsize=(10, 5))
    # ``.items()`` works on both Python 2 and 3 (``.iteritems()`` is
    # Python 2 only); a dedicated index name avoids shadowing outer variables.
    for plot_idx, (clf_name, clf) in enumerate(classifiers.items()):
        # Fit the data and tag outliers: scores at or below the
        # ``outliers_fraction`` percentile are treated as outliers.
        clf.fit(X)
        scores = clf.decision_function(X).ravel()
        threshold = stats.scoreatpercentile(scores,
                                            100 * outliers_fraction)
        y_pred = scores > threshold
        n_errors = (y_pred != ground_truth).sum()

        # Plot the level lines and the points
        Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        subplot = pl.subplot(1, 2, plot_idx + 1)
        subplot.set_title("Outlier detection")
        subplot.contourf(xx, yy, Z,
                         levels=np.linspace(Z.min(), threshold, 7),
                         cmap=pl.cm.Blues_r)
        a = subplot.contour(xx, yy, Z, levels=[threshold],
                            linewidths=2, colors='red')
        subplot.contourf(xx, yy, Z, levels=[threshold, Z.max()],
                         colors='orange')
        b = subplot.scatter(X[:-n_outliers, 0], X[:-n_outliers, 1],
                            c='white')
        c = subplot.scatter(X[-n_outliers:, 0], X[-n_outliers:, 1],
                            c='black')
        subplot.axis('tight')
        # ``ContourSet.collections`` is deprecated/removed in recent
        # Matplotlib; ``legend_elements()`` returns (artists, labels) and
        # provides the legend handle for the single threshold contour.
        boundary_handle = a.legend_elements()[0][0]
        subplot.legend(
            [boundary_handle, b, c],
            ['learned decision function', 'true inliers', 'true outliers'],
            prop=matplotlib.font_manager.FontProperties(size=11))
        subplot.set_xlabel("%d. %s (errors: %d)"
                           % (plot_idx + 1, clf_name, n_errors))
        subplot.set_xlim((-7, 7))
        subplot.set_ylim((-7, 7))
    pl.subplots_adjust(left=0.04, bottom=0.1, right=0.96, top=0.94,
                       wspace=0.1, hspace=0.26)

pl.show()