
I have worked a lot with sklearn's SGDClassifier and LogisticRegression. Now I have a problem that requires: 1- sparse matrices (sklearn supports that), and 2- multilabel instances, where the label of each instance is a probability distribution over all labels rather than 0/1 indicators. Is there a logistic regression that supports both sparse matrices and this kind of multilabel output?

For example, the labels of sample 1 might be:

0.1 0.0 0.4 0.5 

which means that over the 4 labels we have a probability distribution, and I should not threshold the values into 0/1.
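
For concreteness, a minimal sketch of the data layout I have in mind (the feature matrix and label values below are made up):

import numpy as np
import scipy.sparse as sp

# sparse feature matrix, as produced by e.g. sklearn's vectorizers
X = sp.random(3, 10, density=0.2, format='csr', dtype='float32')
# soft labels: every row is a probability distribution over the 4 labels
Y = np.array([[0.1, 0.0, 0.4, 0.5],
              [0.7, 0.1, 0.1, 0.1],
              [0.0, 0.5, 0.5, 0.0]], dtype='float32')
assert np.allclose(Y.sum(axis=1), 1.0)  # rows sum to 1, no thresholding to 0/1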

Does anyone know how to modify sklearn's linear models to achieve this? Or is there any other library that supports multi-output instances?

Note: I have already implemented a multi-output logistic regression in lasagne/theano, but it does not accept sparse matrices and it is very slow compared to sklearn.


Related: http://stats.stackexchange.com/q/212610/842 – unutbu


A related question asked by myself :D – Ash

Answer


Update 2: It seems that increasing the mini-batch size from 20 to 1000 makes the optimization much faster, so I think I will stick with this solution.
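
For reference, a hedged sketch of what that change looks like at the call site, using the nn_model and load_geolocation_data functions defined further down (the regularization coefficient here is only illustrative):

X_train, Y_train, X_test, Y_test = load_geolocation_data(complete_prob=True)
nn_model(X_train, Y_train, X_test, Y_test,
         batch_size=1000,  # larger mini-batches made the optimization much faster
         complete_prob=True, regul_coef=1e-4)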

Update 1: The solution below turned out to be very slow:

A solution:

In the end I used lasagne to implement a logistic regression model that accepts a sparse matrix as input and supports multi-output instances (i.e. a probability distribution over the labels) as target values. The code is below. I copied the two classes from lasagne.layers.sparse directly into my code; if your lasagne is a newer version, you do not need that part.

The main idea is that instead of DenseLayer and DropoutLayer you use the corresponding classes SparseInputDenseLayer and SparseInputDropoutLayer. Note that the input layer itself stays the same, which is what confused me at first: I expected the input layer to change, but since all the computation involving the input layer actually happens in the next layer (the hidden or output layer), it is that second layer that changes, as described above.
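
To make that concrete, here is a stripped-down sketch of the layer stack; it assumes the SparseInputDenseLayer class defined in the full code below, and the sizes are made up:

import theano.tensor as T
import theano.sparse as S
import lasagne

n_features, n_labels = 10000, 4                       # illustrative sizes
X_sym = S.csr_matrix(name='inputs', dtype='float32')  # sparse symbolic input
y_sym = T.matrix()                                    # soft-label targets

# the input layer itself is unchanged ...
l_in = lasagne.layers.InputLayer(shape=(None, n_features), input_var=X_sym)
# ... only the first layer after it is swapped for its sparse-input variant
l_out = SparseInputDenseLayer(l_in, num_units=n_labels,
                              nonlinearity=lasagne.nonlinearities.softmax)
output = lasagne.layers.get_output(l_out, X_sym)
loss = lasagne.objectives.categorical_crossentropy(output, y_sym).mean()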

Note that the logistic regression model can easily be turned into an MLP by adding one or more dense layers.

Note that the only layers that change are the ones directly after the input layer; the other layers stay the same.

I also experimented with glmnet (python wrapper) and sklearn's MLPClassifier, but as far as I can tell they only support 1-d target labels or 1/0 multi-output labels, and do not accept a probability distribution over the labels as the target.

''' 
Created on 22 Apr 2016 

@author: af 
''' 

import pdb 
import numpy as np 
import sys 
from os import path 
import scipy as sp 
import theano 
import theano.tensor as T 
import lasagne 
from lasagne.regularization import regularize_layer_params_weighted, l2, l1 
from lasagne.regularization import regularize_layer_params 
import theano.sparse as S 
from lasagne.layers import DenseLayer, DropoutLayer 
sys.path.append(path.abspath('../geolocation')) 
import params 
import geolocate 
import logging 
logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO) 

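# The two classes below are copied from lasagne.layers.sparse (see the note above);
# they let a dense/dropout layer consume a sparse CSR input directly.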
class SparseInputDenseLayer(DenseLayer): 
    def get_output_for(self, input, **kwargs): 
     if not isinstance(input, (S.SparseVariable, S.SparseConstant, 
            S.sharedvar.SparseTensorSharedVariable)): 
      raise ValueError("Input for this layer must be sparse") 

     activation = S.dot(input, self.W) 
     if self.b is not None: 
      activation = activation + self.b.dimshuffle('x', 0) 
     return self.nonlinearity(activation) 
class SparseInputDropoutLayer(DropoutLayer): 
    def get_output_for(self, input, deterministic=False, **kwargs): 
     if not isinstance(input, (S.SparseVariable, S.SparseConstant, 
            S.sharedvar.SparseTensorSharedVariable)): 
      raise ValueError("Input for this layer must be sparse") 

     if deterministic or self.p == 0: 
      return input 
     else: 
      # Using Theano constant to prevent upcasting 
      one = T.constant(1, name='one') 
      retain_prob = one - self.p 

      if self.rescale: 
       input = S.mul(input, one/retain_prob) 

      input_shape = self.input_shape 
      if any(s is None for s in input_shape): 
       input_shape = input.shape 

      return input * self._srng.binomial(input_shape, p=retain_prob, 
               dtype=input.dtype) 


def load_data(): 
    from sklearn.datasets import fetch_20newsgroups 
    from sklearn.feature_extraction.text import TfidfVectorizer 
    #categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space'] 
    categories = None 
    #remove = ('headers', 'footers', 'quotes') 
    remove =() 
    train_set = fetch_20newsgroups(subset='train', 
            remove=remove, 
            categories=categories) 
    test_set = fetch_20newsgroups(subset='test', 
            remove=remove, 
            categories=categories) 
    Y_train, Y_test = train_set.target, test_set.target 
    vectorizer = TfidfVectorizer(min_df=2, stop_words='english') 
    X_train = vectorizer.fit_transform(train_set.data) 
    X_test = vectorizer.transform(test_set.data) 
    return X_train, Y_train, X_test, Y_test 

def load_geolocation_data(mindf=10, complete_prob=True): 

    geolocate.initialize(granularity=params.BUCKET_SIZE, write=False, readText=True, reload_init=False, regression=params.do_not_discretize) 
    params.X_train, params.Y_train, params.U_train, params.X_dev, params.Y_dev, params.U_dev, params.X_test, params.Y_test, params.U_test, params.categories, params.feature_names = geolocate.feature_extractor(norm=params.norm, use_mention_dictionary=False, min_df=mindf, max_df=0.2, stop_words='english', binary=True, sublinear_tf=False, vocab=None, use_idf=True, save_vectorizer=False) 

    if complete_prob: 
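     # expand the integer class labels into one-hot rows, i.e. degenerate
     # probability distributions over the categories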
     Y_train = np.zeros((params.X_train.shape[0], len(params.categories)), dtype='int32') 
     Y_dev = np.zeros((params.X_dev.shape[0], len(params.categories)), dtype='int32') 
     for i in range(params.Y_dev.shape[0]): 
      Y_dev[i, params.Y_dev[i]] = 1 
     for i in range(params.Y_train.shape[0]): 
      Y_train[i, params.Y_train[i]] = 1 

     return params.X_train, Y_train, params.X_dev, Y_dev 
    else: 
     return params.X_train, params.Y_train, params.X_dev, params.Y_dev 
# ############################# Batch iterator ############################### 
# This is just a simple helper function iterating over training data in 
# mini-batches of a particular size, optionally in random order. It assumes 
# data is available as numpy arrays. For big datasets, you could load numpy 
# arrays as memory-mapped files (np.load(..., mmap_mode='r')), or write your 
# own custom data iteration function. For small datasets, you can also copy 
# them to GPU at once for slightly improved performance. This would involve 
# several changes in the main program, though, and is not demonstrated here. 

def iterate_minibatches(inputs, targets, batchsize, shuffle=False): 
    assert inputs.shape[0] == targets.shape[0] 
    if shuffle: 
     indices = np.arange(inputs.shape[0]) 
     np.random.shuffle(indices) 
    for start_idx in range(0, inputs.shape[0] - batchsize + 1, batchsize): 
     if shuffle: 
      excerpt = indices[start_idx:start_idx + batchsize] 
     else: 
      excerpt = slice(start_idx, start_idx + batchsize) 
     yield inputs[excerpt], targets[excerpt] 

def nn_model(X_train, Y_train, X_test, Y_test, n_epochs=20, batch_size=50, init_parameters=None, complete_prob=True, add_hidden=False, regul_coef=5e-5): 

    logging.info('building the network...') 
    in_size = X_train.shape[1] 

    if complete_prob: 
     out_size = Y_train.shape[1] 
    else: 
     out_size = len(set(Y_train.tolist())) 
    # Prepare Theano variables for inputs and targets 
    if not sp.sparse.issparse(X_train): 
     X_sym = T.matrix() 
    else: 
     X_sym = S.csr_matrix(name='inputs', dtype='float32') 

    if complete_prob: 
     y_sym = T.matrix() 
    else: 
     y_sym = T.ivector()  
    l_in = lasagne.layers.InputLayer(shape=(None, in_size), 
            input_var=X_sym) 

    drop_input = False 
    if drop_input: 
     l_in = lasagne.layers.dropout(l_in, p=0.2) 

    if add_hidden: 
     if not sp.sparse.issparse(X_train): 
      l_hid1 = lasagne.layers.DenseLayer(
       l_in, num_units=300, 
       nonlinearity=lasagne.nonlinearities.rectify, 
       W=lasagne.init.GlorotUniform()) 
     else: 
      l_hid1 = SparseInputDenseLayer(
       l_in, num_units=300, 
       nonlinearity=lasagne.nonlinearities.rectify, 
       W=lasagne.init.GlorotUniform()) 


     l_out = lasagne.layers.DenseLayer(
     l_hid1, num_units=out_size, 
     nonlinearity=lasagne.nonlinearities.softmax) 
    else: 
     if not sp.sparse.issparse(X_train): 
      l_out = lasagne.layers.DenseLayer(
       l_in, num_units=out_size, 
       nonlinearity=lasagne.nonlinearities.softmax) 
     else: 
      l_out = SparseInputDenseLayer(
       l_in, num_units=out_size, 
       nonlinearity=lasagne.nonlinearities.softmax) 




    if add_hidden: 
     embedding = lasagne.layers.get_output(l_hid1, X_sym) 
     f_get_embeddings = theano.function([X_sym], embedding) 
    output = lasagne.layers.get_output(l_out, X_sym) 
    pred = output.argmax(-1) 
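    # categorical_crossentropy also accepts a full probability distribution per row
    # as the target when y_sym is a float matrix (complete_prob=True)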
    loss = lasagne.objectives.categorical_crossentropy(output, y_sym) 
    #loss = lasagne.objectives.multiclass_hinge_loss(output, y_sym) 

    l1_share = 0.9 
    l1_penalty = lasagne.regularization.regularize_layer_params(l_out, l1) * regul_coef * l1_share 
    l2_penalty = lasagne.regularization.regularize_layer_params(l_out, l2) * regul_coef * (1-l1_share) 
    loss = loss + l1_penalty + l2_penalty 
    loss = loss.mean() 
    if complete_prob: 
     y_sym_one_hot = y_sym.argmax(-1) 
     acc = T.mean(T.eq(pred, y_sym_one_hot)) 
    else: 
     acc = T.mean(T.eq(pred, y_sym)) 
    if init_parameters: 
     lasagne.layers.set_all_param_values(l_out, init_parameters) 
    parameters = lasagne.layers.get_all_params(l_out, trainable=True) 

    #print(params) 
    #updates = lasagne.updates.nesterov_momentum(loss, parameters, learning_rate=0.01, momentum=0.9) 
    #updates = lasagne.updates.sgd(loss, parameters, learning_rate=0.01) 
    #updates = lasagne.updates.adagrad(loss, parameters, learning_rate=0.1, epsilon=1e-6) 
    #updates = lasagne.updates.adadelta(loss, parameters, learning_rate=0.1, rho=0.95, epsilon=1e-6) 
    updates = lasagne.updates.adam(loss, parameters, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8) 

    f_train = theano.function([X_sym, y_sym], [loss, acc], updates=updates) 
    f_val = theano.function([X_sym, y_sym], [loss, acc]) 
    f_predict = theano.function([X_sym], pred) 
    f_predict_proba = theano.function([X_sym], output) 


    do_scale = False 
    #X_train = X_train.todense() 
    #X_test = X_test.todense() 
    if do_scale: 
     from sklearn import preprocessing 
     scaler = preprocessing.StandardScaler().fit(X_train) 
     X_train = scaler.transform(X_train) 
     X_test = scaler.transform(X_test) 
    #X = X_train.todense().astype(theano.config.floatX) 
    #Xt = X_test.todense().astype(theano.config.floatX) 
    X = X_train.astype('float32') 
    Xt = X_test.astype('float32') 
    #X = X_train.astype(theano.config.floatX) 
    #Xt = X_test.astype(theano.config.floatX) 
    if complete_prob: 
     Y = Y_train.astype('float32') 
     Yt = Y_test.astype('float32') 
    else: 
     Y = Y_train.astype('int32') 
     Yt = Y_test.astype('int32') 

    logging.info('training (n_epochs, batch_size) = (' + str(n_epochs) + ', ' + str(batch_size) + ')') 
    for n in xrange(n_epochs): 
     for batch in iterate_minibatches(X, Y, batch_size, shuffle=True): 
      x_batch, y_batch = batch 
      l_train, acc_train = f_train(x_batch, y_batch) 

     l_val, acc_val = f_val(Xt, Yt) 
     logging.info('epoch ' + str(n) + ' ,train_loss ' + str(l_train) + ' ,acc ' + str(acc_train) + ' ,val_loss ' + str(l_val) + ' ,acc ' + str(acc_val)) 
     geolocate.loss(f_predict(Xt), U_eval=params.U_dev) 
    logging.info(str(regul_coef)) 
    if add_hidden: 
     X_embs = f_get_embeddings(X) 
     Xt_embs = f_get_embeddings(Xt) 
    train_probs = f_predict_proba(X) 
    return train_probs 
    #pdb.set_trace() 

if __name__ == '__main__': 
    #X_train, Y_train, X_test, Y_test = load_data() 
    X_train, Y_train, X_test, Y_test = load_geolocation_data(complete_prob=True) 

    if params.DATASET_NUMBER == 1: 
     if params.TEXT_ONLY: 
      _coef = 1e-4 
     else: 
      _coef = 9e-5 
    #for _coef in [1e-7, 2e-7, 4e-7, 8e-7, 1e-6, 8e-6]: 
    #for _coef in [1e-5, 2e-5, 4e-5, 6e-5, 8e-5, 1e-4, 2e-4]: 
    for _coef in [_coef]: 
     logging.info(str(_coef)) 
     nn_model(X_train, Y_train, X_test, Y_test, complete_prob=True, regul_coef=_coef)