
Why is the CPU implementation of my custom op being selected?

To learn how to write custom TensorFlow ops, I followed the Adding a New Op tutorial and made an "add_b" op that adds a scalar `b` to each input value.

add_b_op.cc

#define EIGEN_USE_THREADS 

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" 

#include "tensorflow/core/framework/common_shape_fns.h" 
#include "tensorflow/core/framework/op.h" 
#include "tensorflow/core/framework/op_kernel.h" 
#include "tensorflow/core/framework/shape_inference.h" 

using namespace tensorflow; 

REGISTER_OP("AddB") 
    .Attr("T: {float, double}") 
    .Input("input: T") 
    .Input("b: T") 
    .Output("output: T") 
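    // Shape function: require the `b` input (input 1) to be a scalar, then
    // give the output the same shape as `input`.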
    .SetShapeFn([] (shape_inference::InferenceContext* c) -> Status { 
     shape_inference::ShapeHandle out; 
     TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &out)); 
     return shape_inference::UnchangedShape(c); 
    }) 
    .Doc(R"doc(
Adds `b` to each input. 

input: The input values. 
b: A number to add to each input value. 
)doc"); 


template <typename T> 
class AddBCpuOp : public OpKernel { 
public: 
    explicit AddBCpuOp(OpKernelConstruction* context) : OpKernel(context) {} 

    void Compute(OpKernelContext* context) override {
        const Tensor& input_tensor = context->input(0);
        const auto input = input_tensor.flat<T>();

        Tensor* output_tensor = nullptr;
        OP_REQUIRES_OK(context, context->allocate_output(0, input_tensor.shape(),
                                                         &output_tensor));
        auto output = output_tensor->flat<T>();

        const Eigen::ThreadPoolDevice& d = context->eigen_device<Eigen::ThreadPoolDevice>();

        // Note: The mistake of adding 1 instead of `b` is intentional to be able to distinguish
        // the CPU and GPU implementations.
        output.device(d) = input + static_cast<T>(1);
    }
}; 

REGISTER_KERNEL_BUILDER(
    Name("AddB") 
    .Device(DEVICE_CPU) 
    .TypeConstraint<float>("T"), 
    AddBCpuOp<float>); 
REGISTER_KERNEL_BUILDER(
    Name("AddB") 
    .Device(DEVICE_CPU) 
    .TypeConstraint<double>("T"), 
    AddBCpuOp<double>); 


#if GOOGLE_CUDA 

template <typename T> 
bool LaunchAddBKernel(const T *__restrict__ d_input, int n, const T *__restrict__ d_b, T *__restrict__ d_output); 

template <typename T> 
class AddBGpuOp : public OpKernel { 
public: 
    explicit AddBGpuOp(OpKernelConstruction* context) : OpKernel(context) {} 

    void Compute(OpKernelContext* context) override {
        const Tensor& input_tensor = context->input(0);
        const auto input = input_tensor.flat<T>();

        const Tensor& b_tensor = context->input(1);
        OP_REQUIRES(context, TensorShapeUtils::IsScalar(b_tensor.shape()),
                    errors::InvalidArgument("add_b expects a scalar for `b`."));
        const auto b = b_tensor.scalar<T>();

        Tensor* output_tensor = nullptr;
        OP_REQUIRES_OK(context, context->allocate_output(0, input_tensor.shape(),
                                                         &output_tensor));
        auto output = output_tensor->flat<T>();

        OP_REQUIRES(context, LaunchAddBKernel(input.data(), input.dimension(0), b.data(), output.data()),
                    errors::Internal("add_b: LaunchAddBKernel() failed."));
    }
}; 

REGISTER_KERNEL_BUILDER(
    Name("AddB") 
    .Device(DEVICE_GPU) 
    .TypeConstraint<float>("T"), 
    AddBGpuOp<float>); 
REGISTER_KERNEL_BUILDER(
    Name("AddB") 
    .Device(DEVICE_GPU) 
    .TypeConstraint<double>("T"), 
    AddBGpuOp<double>); 

#endif // if GOOGLE_CUDA 

add_b_op.cu.cc

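// Element-wise AddB kernel: each thread adds the scalar at `d_b` to one input
// element; threads whose global index falls past `n` do nothing.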
template <typename T, int BLOCK_DIM_X> 
__global__ void AddBKernel(const T *__restrict__ d_input, int n, const T *__restrict__ d_b, T *__restrict__ d_output) { 
    const int i = blockIdx.x * BLOCK_DIM_X + threadIdx.x; 
    if (i < n) { 
    d_output[i] = d_input[i] + *d_b; 
    } 
} 

template <typename T> 
bool LaunchAddBKernel(const T *__restrict__ d_input, int n, const T *__restrict__ d_b, T *__restrict__ d_output) { 
    if (n <= 0) return true; 

    constexpr int BLOCK_DIM_X = 256; 
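    // Launch with the grid size rounded up so that all n elements are covered.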
    AddBKernel<T, BLOCK_DIM_X><<<n/BLOCK_DIM_X + (n % BLOCK_DIM_X != 0), BLOCK_DIM_X>>>(d_input, n, d_b, d_output); 
    return true; 
} 

// Explicit instantiations. 
template bool LaunchAddBKernel<float>(const float *__restrict__, int, const float *__restrict__, float *__restrict__); 
template bool LaunchAddBKernel<double>(const double *__restrict__, int, const double *__restrict__, double *__restrict__); 

I intentionally introduced the error into the CPU implementation to be able to tell whether the CPU or the GPU implementation is being used.

When I test my custom op with:

from __future__ import print_function 
import tensorflow as tf 

module = tf.load_op_library('custom_ops.so') 
with tf.Session(config = tf.ConfigProto(log_device_placement = True)): 
    print(module.add_b([5., 4., 3., 2., 1.], 8.).eval()) 

I get the following output:

 
I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:892] OS X does not support NUMA - returning NUMA node zero 
I tensorflow/core/common_runtime/gpu/gpu_device.cc:951] Found device 0 with properties: 
name: GeForce GT 750M 
major: 3 minor: 0 memoryClockRate (GHz) 0.9255 
pciBusID 0000:01:00.0 
Total memory: 2.00GiB 
Free memory: 1.80GiB 
I tensorflow/core/common_runtime/gpu/gpu_device.cc:972] DMA: 0 
I tensorflow/core/common_runtime/gpu/gpu_device.cc:982] 0: Y 
I tensorflow/core/common_runtime/gpu/gpu_device.cc:1041] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GeForce GT 750M, pci bus id: 0000:01:00.0) 
Device mapping: 
/job:localhost/replica:0/task:0/gpu:0 -> device: 0, name: GeForce GT 750M, pci bus id: 0000:01:00.0 
I tensorflow/core/common_runtime/direct_session.cc:252] Device mapping: 
/job:localhost/replica:0/task:0/gpu:0 -> device: 0, name: GeForce GT 750M, pci bus id: 0000:01:00.0 

AddB: /job:localhost/replica:0/task:0/gpu:0 
I tensorflow/core/common_runtime/simple_placer.cc:819] AddB: /job:localhost/replica:0/task:0/gpu:0 
AddB/b: /job:localhost/replica:0/task:0/gpu:0 
I tensorflow/core/common_runtime/simple_placer.cc:819] AddB/b: /job:localhost/replica:0/task:0/gpu:0 
AddB/input: /job:localhost/replica:0/task:0/gpu:0 
I tensorflow/core/common_runtime/simple_placer.cc:819] AddB/input: /job:localhost/replica:0/task:0/gpu:0 
[ 6. 5. 4. 3. 2.] 

The device placement log seems to indicate that the op is being executed on the GPU, yet the output shows that the CPU implementation is being used: [ 6. 5. 4. 3. 2.] is the input plus 1 (the CPU kernel's intentional mistake), not the input plus 8.

When I comment out the two REGISTER_KERNEL_BUILDER() registrations for the DEVICE_CPU implementation, recompile, and re-test, I get the expected output of [ 13. 12. 11. 10. 9.], but with an error:

 
E tensorflow/core/common_runtime/executor.cc:334] Executor failed to create kernel. Not found: No registered 'AddB' OpKernel for CPU devices compatible with node AddB = AddB[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/gpu:0"](AddB/input, AddB/b) 
    . Registered: device='GPU'; T in [DT_FLOAT] 
    device='GPU'; T in [DT_DOUBLE] 

    [[Node: AddB = AddB[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/gpu:0"](AddB/input, AddB/b)]] 

The error message looks like a bug: although it says "Executor failed to create kernel", a kernel was evidently created to run the op on the GPU.

Why is the CPU implementation being used rather than the GPU implementation?

In case it matters, here are the details of my development setup:

  • I'm using a MacBook Pro with a built-in NVIDIA GeForce GT 750M (CUDA Compute Capability 3.0).
  • macOS Sierra version 10.12.1 (16B2555)
  • cuda_8.0.47_mac, cudnn-8.0-osx-x64-v5.1
  • TensorFlow 0.11.0rc2 installed via: export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.11.0rc2-py2-none-any.whl

UPDATE: I have found that whether the CPU or the GPU implementation is selected depends on the size of the input. Using this test script:

from __future__ import print_function 
import numpy as np 
import tensorflow as tf 
from time import time 

NUM_VALUES = 1310720 

input = np.arange(0, NUM_VALUES, dtype = float) 

module = tf.load_op_library('custom_ops.so') 
with tf.Session(config = tf.ConfigProto(log_device_placement = True)): 
    start = time(); print(module.add_b(input, 8.).eval()); end = time(); print(end - start) 

... when NUM_VALUES is 1310720 or less, the CPU implementation is used. When NUM_VALUES is 1310721 or more, the GPU implementation is used.

Is there a (1310720 values * 8 bytes per double =) 10 MiB cutoff? If so, how do I override it? The AddB() op is simple enough, but for a more complex custom op, 10 MiB might be too large a threshold for its GPU implementation to be selected.
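A minimal sketch for probing the cutoff empirically, assuming the custom_ops.so built above, relying on the CPU kernel's intentional +1 mistake to tell the two implementations apart, and assuming the placement decision is monotone in the input size:

from __future__ import print_function
import numpy as np
import tensorflow as tf

module = tf.load_op_library('custom_ops.so')

def uses_gpu(n, b = 8.):
    # The GPU kernel really adds `b`; the CPU kernel adds 1 by design.
    values = np.arange(0, n, dtype = float)
    with tf.Session():
        output = module.add_b(values, b).eval()
    return np.allclose(output, values + b)

# Binary-search for the smallest input size that is routed to the GPU.
lo, hi = 1, 4 * 1310720
while lo < hi:
    mid = (lo + hi) // 2
    if uses_gpu(mid):
        hi = mid
    else:
        lo = mid + 1
print('smallest GPU-routed input size:', lo if uses_gpu(lo) else 'not found')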

Answers

1

I just read TensorFlow issue #2054 - Manual placement on GPU of a custom operator with both CPU and GPU implementation will always run the CPU version, and running the CPU implementation appears to be a TensorFlow feature called "constant folding". When TensorFlow optimizes the graph before the first run, ops involving constants are generally evaluated on the CPU, the thinking being that the CPU and GPU implementations should produce the same results. Makes sense.

Two ways to disable this behavior are:

  1. Disable graph optimizations:

    from __future__ import print_function 
    import numpy as np 
    import tensorflow as tf 
    from time import time 
    
    NUM_VALUES = 10 
    
    input = np.arange(0, NUM_VALUES, dtype = float) 
    
    custom_ops_module = tf.load_op_library('custom_ops.so') 
    
    config = tf.ConfigProto(log_device_placement = True) 
    config.graph_options.optimizer_options.opt_level = -1 
    
    with tf.Session(config = config): 
        start = time(); print(custom_ops_module.add_b(input, 8.).eval()); end = time(); print(end - start) 
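    Setting opt_level to -1 selects OptimizerOptions.L0, i.e. no graph optimizations at all, so the constant-folding pass (part of the default L1 level, together with common-subexpression elimination) never runs.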
    
  2. Avoid constants by, for example, feeding the values in through placeholders:

    from __future__ import print_function 
    import numpy as np 
    import tensorflow as tf 
    from time import time 
    
    NUM_VALUES = 10 
    
    custom_ops_module = tf.load_op_library('custom_ops.so') 
    
    graph = tf.Graph() 
    with graph.as_default(): 
        input = tf.placeholder(tf.float64, shape = (NUM_VALUES,)) 
        b = tf.placeholder(tf.float64, shape = ())
        result = custom_ops_module.add_b(input, b) 
    
    with tf.Session(graph = graph, config = tf.ConfigProto(log_device_placement = True)) as session: 
        feed_dict = { 
        input: np.arange(0, NUM_VALUES, dtype = float), 
        b: 8., 
        } 
        start = time(); print(session.run([result], feed_dict = feed_dict)); end = time(); print(end - start) 
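    With placeholders the inputs are no longer graph constants, so the constant folder has nothing to pre-evaluate and the AddB node actually runs on the GPU when session.run() is called.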
    
1

I think the template instantiation may be incorrect:

template <typename Device, typename T> 
class AddBOp : public OpKernel { 
... 
} 

REGISTER_KERNEL_BUILDER(
    Name("AddB") 
    .Device(DEVICE_CPU) 
    .TypeConstraint<float>("T"), 
    AddBOp<CPUDevice, float>); 

Then:

template <typename T> 
class AddBOp<GPUDevice, T> : public OpKernel { 
... 
} 

REGISTER_KERNEL_BUILDER(
    Name("AddB") 
    .Device(DEVICE_GPU) 
    .TypeConstraint<float>("T"), 
    AddBOp<GPUDevice, float>); 

I think the registration of AddB for GPU instantiates the first implementation rather than the second (the first implementation has two template parameters, the second has one).

You can probably fix this by using AddBOp<float> in the second registration, though I would suggest better names to avoid confusion.

+0

Thanks for your answer. As long as 'CPUDevice' and 'GPUDevice' are actually different types, I think the template specialization [is fine](http://ideone.com/0nKxNd). In any case, I just tried changing the class names to 'AddBCpuOp' and 'AddBGpuOp' and observed the same behavior. (I have edited my question to include the revised code.) –

0

According to this, it could be due to memory fragmentation. Try:

with tf.device('/gpu:0'): 

Or adjust the memory fragmentation options using the snippets on the linked page.
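A minimal, complete sketch of that suggestion, assuming the custom_ops.so from the question:

import tensorflow as tf

module = tf.load_op_library('custom_ops.so')
with tf.device('/gpu:0'):
    result = module.add_b([5., 4., 3., 2., 1.], 8.)
with tf.Session(config = tf.ConfigProto(log_device_placement = True)):
    print(result.eval())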

EDIT: To see whether this is the case, try:

from __future__ import print_function 
import numpy as np 
import tensorflow as tf 
from time import time 

NUM_VALUES = 10 

input = np.arange(0, NUM_VALUES, dtype = float) 

custom_ops_module = tf.load_op_library('custom_ops.so') 

config = tf.ConfigProto(log_device_placement = True) 
config.gpu_options.allow_growth = True 

with tf.Session(config = config): 
    start = time(); print(custom_ops_module.add_b(input, 8.).eval()); end = time(); print(end - start) 
+0

Hi Florent, thanks for your answer. I just tried the script with 'config.gpu_options.allow_growth = True' and observed that the CPU implementation is still selected. I also tried placing the code inside 'with tf.device('/gpu:0')', and the GPU implementation was still not selected. –

+0

OK, it was my first guess and I had no way to reproduce the issue yesterday. Glad you solved it and learned something along the way! – florentbuisson
