
I am porting the PyTorch code from https://github.com/spro/practical-pytorch/blob/master/seq2seq-translation/seq2seq-translation.ipynb to the GPU.

The tutorial has a USE_CUDA flag that is used to switch variables and tensors between CPU types (when False) and GPU types (when True).
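
In other words, every tensor and module gets a conditional .cuda() call. A minimal sketch of that idiom (not from the original post):

import torch 
from torch.autograd import Variable 

USE_CUDA = False 

# Moved to the GPU only when the flag is set; otherwise it stays a CPU type. 
x = Variable(torch.zeros(2, 3)) 
x = x.cuda() if USE_CUDA else x 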

Using the data from en-fr.tsv, the sentences are converted into Variables:

import unicodedata 
import string 
import re 
import random 
import time 
import math 

from gensim.corpora.dictionary import Dictionary 

import torch 
import torch.nn as nn 
from torch.autograd import Variable 
from torch import LongTensor, FloatTensor 
from torch import optim 
import torch.nn.functional as F 

import numpy as np 

MAX_LENGTH = 10 
USE_CUDA = False 

# Turn a Unicode string to plain ASCII, thanks to http://stackoverflow.com/a/518232/2809427 
def unicode_to_ascii(s): 
    return ''.join( 
        c for c in unicodedata.normalize('NFD', s) 
        if unicodedata.category(c) != 'Mn' 
    ) 

# Lowercase, trim, and remove non-letter characters 
def normalize_string(s): 
    s = unicode_to_ascii(s.lower().strip()) 
    s = re.sub(r"([.!?])", r" \1", s) 
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s) 
    return s 
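
# For example (illustrative, not in the original post): 
#   normalize_string("Je suis   fâché!")  ->  "je suis fache !" 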

SOS_IDX, SOS_TOKEN = 0, '<s>' 
EOS_IDX, EOS_TOKEN = 1, '</s>' 
UNK_IDX, UNK_TOKEN = 2, '<unk>' 
PAD_IDX, PAD_TOKEN = 3, '<blank>' 

lines = open('en-fr.tsv').read().strip().split('\n') 
pairs = [[normalize_string(s).split() for s in l.split('\t')] for l in lines] 
src_sents, trg_sents = zip(*pairs) 

src_dict = Dictionary([[SOS_TOKEN, EOS_TOKEN, UNK_TOKEN, PAD_TOKEN]]) 
src_dict.add_documents(src_sents) 

trg_dict = Dictionary([[SOS_TOKEN, EOS_TOKEN, UNK_TOKEN, PAD_TOKEN]]) 
trg_dict.add_documents(trg_sents) 

def variablize_sentences(sentence, dictionary): 
    indices = [dictionary.token2id[tok] for tok in sentence] + [dictionary.token2id[EOS_TOKEN]] 
    var = Variable(LongTensor(indices).view(-1, 1)) 
    return var.cuda() if USE_CUDA else var 

input_variables = [variablize_sentences(sent, src_dict) for sent in src_sents] 
output_variables = [variablize_sentences(sent, trg_dict) for sent in trg_sents] 

and the following encoder-attention-decoder network is used:

class EncoderRNN(nn.Module): 
    def __init__(self, input_size, hidden_size, n_layers=1): 
        super(EncoderRNN, self).__init__() 

        self.input_size = input_size 
        self.hidden_size = hidden_size 
        self.n_layers = n_layers 

        self.embedding = nn.Embedding(input_size, hidden_size) 
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers) 

        self.embedding = self.embedding.cuda() if USE_CUDA else self.embedding 
        self.gru = self.gru.cuda() if USE_CUDA else self.gru 

    def forward(self, word_inputs, hidden): 
        seq_len = len(word_inputs) 

        embedded = self.embedding(word_inputs).view(seq_len, 1, -1) 
        embedded = embedded.cuda() if USE_CUDA else embedded 

        output, hidden = self.gru(embedded, hidden) 
        output = output.cuda() if USE_CUDA else output 
        hidden = hidden.cuda() if USE_CUDA else hidden 

        return output, hidden 

    def init_hidden(self): 
        hidden = Variable(torch.zeros(self.n_layers, 1, self.hidden_size)) 
        return hidden.cuda() if USE_CUDA else hidden 

class Attn(nn.Module): 
    def __init__(self, method, hidden_size, max_length=MAX_LENGTH): 
        super(Attn, self).__init__() 

        self.method = method 
        self.hidden_size = hidden_size 

        if self.method == 'general': 
            self.attn = nn.Linear(self.hidden_size, hidden_size) 

        elif self.method == 'concat': 
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size) 
            self.other = nn.Parameter(FloatTensor(1, hidden_size)) 

    def forward(self, hidden, encoder_outputs): 
        seq_len = len(encoder_outputs) 

        # Create variable to store attention energies 
        attn_energies = Variable(torch.zeros(seq_len)) # B x 1 x S 
        attn_energies = attn_energies.cuda() if USE_CUDA else attn_energies 

        # Calculate energies for each encoder output 
        for i in range(seq_len): 
            attn_energies[i] = self.score(hidden, encoder_outputs[i]) 

        # Normalize energies to weights in range 0 to 1, resize to 1 x 1 x seq_len 
        return F.softmax(attn_energies).unsqueeze(0).unsqueeze(0) 

    def score(self, hidden, encoder_output): 
        if self.method == 'dot': 
            energy = torch.dot(hidden.view(-1), encoder_output.view(-1)) 
        elif self.method == 'general': 
            energy = self.attn(encoder_output) 
            energy = torch.dot(hidden.view(-1), energy.view(-1)) 
        elif self.method == 'concat': 
            energy = self.attn(torch.cat((hidden, encoder_output), 1)) 
            energy = torch.dot(self.other.view(-1), energy.view(-1)) 
        return energy 

class AttnDecoderRNN(nn.Module): 
    def __init__(self, attn_model, hidden_size, output_size, n_layers=1, dropout_p=0.1): 
        super(AttnDecoderRNN, self).__init__() 

        # Keep parameters for reference 
        self.attn_model = attn_model 
        self.hidden_size = hidden_size 
        self.output_size = output_size 
        self.n_layers = n_layers 
        self.dropout_p = dropout_p 

        # Define layers 
        self.embedding = nn.Embedding(output_size, hidden_size) 
        self.gru = nn.GRU(hidden_size * 2, hidden_size, n_layers, dropout=dropout_p) 
        self.out = nn.Linear(hidden_size * 2, output_size) 

        self.embedding = self.embedding.cuda() if USE_CUDA else self.embedding 
        self.gru = self.gru.cuda() if USE_CUDA else self.gru 
        self.out = self.out.cuda() if USE_CUDA else self.out 

        # Choose attention model 
        if attn_model != 'none': 
            self.attn = Attn(attn_model, hidden_size) 
            self.attn = self.attn.cuda() if USE_CUDA else self.attn 

    def forward(self, word_input, last_context, last_hidden, encoder_outputs): 
        # Note: we run this one step at a time 

        # Get the embedding of the current input word (last output word) 
        word_embedded = self.embedding(word_input).view(1, 1, -1) # S=1 x B x N 

        # Combine embedded input word and last context, run through RNN 
        rnn_input = torch.cat((word_embedded, last_context.unsqueeze(0)), 2) 
        rnn_output, hidden = self.gru(rnn_input, last_hidden) 

        # Calculate attention from current RNN state and all encoder outputs; apply to encoder outputs 
        attn_weights = self.attn(rnn_output.squeeze(0), encoder_outputs) 
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1)) # B x 1 x N 

        # Final output layer (next word prediction) using the RNN hidden state and context vector 
        rnn_output = rnn_output.squeeze(0) # S=1 x B x N -> B x N 
        context = context.squeeze(1)  # B x S=1 x N -> B x N 
        output = F.log_softmax(self.out(torch.cat((rnn_output, context), 1))) 

        if USE_CUDA: 
            return output.cuda(), context.cuda(), hidden.cuda(), attn_weights.cuda() 
        else: 
            return output, context, hidden, attn_weights 

and the network is tested:

encoder_test = EncoderRNN(10, 10, 2) # input_size, hidden_size, n_layers 
decoder_test = AttnDecoderRNN('general', 10, 10, 2) # attn_model, hidden_size, output_size, n_layers 

encoder_hidden = encoder_test.init_hidden() 
if USE_CUDA: 
    word_inputs = Variable(torch.LongTensor([1, 2, 3]).cuda()) 
else: 
    word_inputs = Variable(torch.LongTensor([1, 2, 3])) 
encoder_outputs, encoder_hidden = encoder_test(word_inputs, encoder_hidden) 
decoder_attns = torch.zeros(1, 3, 3) 
decoder_hidden = encoder_hidden 
decoder_context = Variable(torch.zeros(1, decoder_test.hidden_size)) 

decoder_output, decoder_context, decoder_hidden, decoder_attn = decoder_test(word_inputs[0], decoder_context, decoder_hidden, encoder_outputs) 
print(decoder_output) 
print(decoder_hidden) 
print(decoder_attn) 

The code works fine on the CPU,

[OUT]:

EncoderRNN (
    (embedding): Embedding(10, 10) 
    (gru): GRU(10, 10, num_layers=2) 
) 
AttnDecoderRNN (
    (embedding): Embedding(10, 10) 
    (gru): GRU(20, 10, num_layers=2, dropout=0.1) 
    (out): Linear (20 -> 10) 
    (attn): Attn (
    (attn): Linear (10 -> 10) 
) 
) 
Variable containing: 
-2.4378 -2.3556 -2.3391 -2.5070 -2.3439 -2.3415 -2.3976 -2.1832 -1.9976 -2.2213 
[torch.FloatTensor of size 1x10] 

Variable containing: 
(0 ,.,.) = 

Columns 0 to 8 
    -0.2325 0.0775 0.5415 0.4876 -0.5771 -0.0687 0.1832 -0.5285 0.2508 

Columns 9 to 9 
    -0.1837 

(1 ,.,.) = 

Columns 0 to 8 
    -0.1389 -0.2605 -0.0518 0.3405 0.0774 0.1815 0.0297 -0.1304 -0.1015 

Columns 9 to 9 
    0.2602 
[torch.FloatTensor of size 2x1x10] 

Variable containing: 
(0 ,.,.) = 
    0.3334 0.3291 0.3374 
[torch.FloatTensor of size 1x1x3] 

but after changing the flag to USE_CUDA=True, calling the decoder_test object raises a TypeError (a minimal reproduction follows the traceback):

--------------------------------------------------------------------------- 
TypeError         Traceback (most recent call last) 
<ipython-input-76-b3c660013934> in <module>() 
    12 decoder_context = Variable(torch.zeros(1, decoder_test.hidden_size)) 
    13 
---> 14 decoder_output, decoder_context, decoder_hidden, decoder_attn = decoder_test(word_inputs[0], decoder_context, decoder_hidden, encoder_outputs) 
    15 print(decoder_output) 
    16 print(decoder_hidden) 

~/.local/lib/python3.5/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs) 
    222   for hook in self._forward_pre_hooks.values(): 
    223    hook(self, input) 
--> 224   result = self.forward(*input, **kwargs) 
    225   for hook in self._forward_hooks.values(): 
    226    hook_result = hook(self, input, result) 

<ipython-input-75-34ecfe9b3112> in forward(self, word_input, last_context, last_hidden, encoder_outputs) 
    32 
    33   # Combine embedded input word and last context, run through RNN 
---> 34   rnn_input = torch.cat((word_embedded, last_context.unsqueeze(0)), 2) 
    35   rnn_output, hidden = self.gru(rnn_input, last_hidden) 
    36 

~/.local/lib/python3.5/site-packages/torch/autograd/variable.py in cat(iterable, dim) 
    895   @staticmethod 
    896   def cat(iterable, dim=0): 
--> 897    return Concat.apply(dim, *iterable) 
    898 
    899   @staticmethod 

~/.local/lib/python3.5/site-packages/torch/autograd/_functions/tensor.py in forward(ctx, dim, *inputs) 
    315   ctx.dim = dim 
    316   ctx.input_sizes = [i.size(dim) for i in inputs] 
--> 317   return torch.cat(inputs, dim) 
    318 
    319  @staticmethod 

TypeError: cat received an invalid combination of arguments - got (tuple, int), but expected one of: 
* (sequence[torch.cuda.FloatTensor] seq) 
* (sequence[torch.cuda.FloatTensor] seq, int dim) 
     didn't match because some of the arguments have invalid types: (tuple, int) 
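
The failure is torch.cat receiving a mix of torch.cuda.FloatTensor and torch.FloatTensor arguments. A minimal reproduction of the mismatch, assuming a CUDA-capable machine (not from the original post):

import torch 
from torch.autograd import Variable 

a = Variable(torch.zeros(1, 1, 10)).cuda()  # CUDA tensor 
b = Variable(torch.zeros(1, 1, 10))         # still a CPU tensor 
torch.cat((a, b), 2)                        # raises the same TypeError 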

The question is: why do those types mismatch under CUDA when everything works on the CPU, and how can this be fixed?

Does PyTorch have a global flag that simply changes all types to CUDA types, rather than fiddling with CPU/GPU types?


Maybe you forgot '.cuda()' on your models 'encoder_test' and 'decoder_test' and on the 'Variable' 'decoder_context' –


Also asked at https://discuss.pytorch.org/t/porting-seq2seq-tutorial-from-spro-practical-pytorh-from-cpu-to-gpu/8604 – alvas


Thanks @MauelLagunas! Indeed, 'encoder_hidden' and 'decoder_context' were missing '.cuda()' – alvas

Answer


Does PyTorch have a global flag that simply changes all types to CUDA types, rather than fiddling with CPU/GPU types?

Nope =(

(Source: https://discuss.pytorch.org/t/porting-seq2seq-tutorial-from-spro-practical-pytorh-from-cpu-to-gpu/8604)


Specific to this example:

The input variables fed to the decoder_test object need to be .cuda() types. More specifically:

encoder_hidden = encoder_test.init_hidden() 
---> encoder_hidden = encoder_test.init_hidden().cuda() 


decoder_context = Variable(torch.zeros(1, decoder_test.hidden_size)) 
---> decoder_context = Variable(torch.zeros(1, decoder_test.hidden_size)).cuda() 
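
To avoid scattering conditional .cuda() calls, one option (a sketch on top of this answer, not part of it; the helper name maybe_cuda is made up) is a small helper that respects the USE_CUDA flag:

def maybe_cuda(x): 
    # hypothetical helper: move x to the GPU only when the flag is set 
    return x.cuda() if USE_CUDA else x 

encoder_hidden = maybe_cuda(encoder_test.init_hidden()) 
decoder_context = maybe_cuda(Variable(torch.zeros(1, decoder_test.hidden_size))) 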

So the code to test the network should be:

encoder_test = EncoderRNN(10, 10, 2) # input_size, hidden_size, n_layers 
decoder_test = AttnDecoderRNN('general', 10, 10, 2) # attn_model, hidden_size, output_size, n_layers 

encoder_hidden = encoder_test.init_hidden().cuda() 
if USE_CUDA: 
    word_inputs = Variable(torch.LongTensor([1, 2, 3]).cuda()) 
else: 
    word_inputs = Variable(torch.LongTensor([1, 2, 3])) 
encoder_outputs, encoder_hidden = encoder_test(word_inputs, encoder_hidden) 
decoder_attns = torch.zeros(1, 3, 3) 
decoder_hidden = encoder_hidden 
decoder_context = Variable(torch.zeros(1, decoder_test.hidden_size)).cuda() 

decoder_output, decoder_context, decoder_hidden, decoder_attn = decoder_test(word_inputs[0], decoder_context, decoder_hidden, encoder_outputs) 
print(decoder_output) 
print(decoder_hidden) 
print(decoder_attn) 

[OUT]:

Variable containing: 
-2.1412 -2.4589 -2.4042 -2.1591 -2.5080 -2.0839 -2.5058 -2.3831 -2.4468 -2.0804 
[torch.cuda.FloatTensor of size 1x10 (GPU 0)] 

Variable containing: 
(0 ,.,.) = 

Columns 0 to 8 
    -0.0264 -0.0689 0.1049 0.0760 0.1017 -0.4585 -0.1273 0.0449 -0.3271 

Columns 9 to 9 
    -0.0104 

(1 ,.,.) = 

Columns 0 to 8 
    -0.0308 -0.0690 -0.0258 -0.2759 0.1403 -0.0468 -0.0205 0.0126 -0.1729 

Columns 9 to 9 
    0.0599 
[torch.cuda.FloatTensor of size 2x1x10 (GPU 0)] 

Variable containing: 
(0 ,.,.) = 
    0.3328 0.3328 0.3344 
[torch.cuda.FloatTensor of size 1x1x3 (GPU 0)]