To learn how to write custom TensorFlow ops, I followed the Adding a New Op tutorial and made an "add_b" op that adds a scalar `b` to every input value.
Why is the CPU implementation of my custom op being selected instead of the GPU implementation?
add_b_op.cc:
#define EIGEN_USE_THREADS
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/common_shape_fns.h"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/shape_inference.h"
using namespace tensorflow;
REGISTER_OP("AddB")
.Attr("T: {float, double}")
.Input("input: T")
.Input("b: T")
.Output("output: T")
.SetShapeFn([] (shape_inference::InferenceContext* c) -> Status {
shape_inference::ShapeHandle out;
TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &out));
return shape_inference::UnchangedShape(c);
})
//----------------------------------------------------------------------
.Doc(R"doc(
Adds `b` to each input.
input: The input values.
b: A number to add to each input value.
)doc");
template <typename T>
class AddBCpuOp : public OpKernel {
 public:
  explicit AddBCpuOp(OpKernelConstruction* context) : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    const Tensor& input_tensor = context->input(0);
    const auto input = input_tensor.flat<T>();

    Tensor* output_tensor = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, input_tensor.shape(),
                                                     &output_tensor));
    auto output = output_tensor->flat<T>();

    const Eigen::ThreadPoolDevice& d = context->eigen_device<Eigen::ThreadPoolDevice>();

    // Note: The mistake of adding 1 instead of `b` is intentional to be able to distinguish
    // the CPU and GPU implementations.
    output.device(d) = input + static_cast<T>(1);
  }
};

REGISTER_KERNEL_BUILDER(
    Name("AddB")
        .Device(DEVICE_CPU)
        .TypeConstraint<float>("T"),
    AddBCpuOp<float>);
REGISTER_KERNEL_BUILDER(
    Name("AddB")
        .Device(DEVICE_CPU)
        .TypeConstraint<double>("T"),
    AddBCpuOp<double>);
#if GOOGLE_CUDA
template <typename T>
bool LaunchAddBKernel(const T *__restrict__ d_input, int n, const T *__restrict__ d_b, T *__restrict__ d_output);

template <typename T>
class AddBGpuOp : public OpKernel {
 public:
  explicit AddBGpuOp(OpKernelConstruction* context) : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    const Tensor& input_tensor = context->input(0);
    const auto input = input_tensor.flat<T>();

    const Tensor& b_tensor = context->input(1);
    OP_REQUIRES(context, TensorShapeUtils::IsScalar(b_tensor.shape()),
                errors::InvalidArgument("add_b expects a scalar for `b`."));
    const auto b = b_tensor.scalar<T>();

    Tensor* output_tensor = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, input_tensor.shape(),
                                                     &output_tensor));
    auto output = output_tensor->flat<T>();

    OP_REQUIRES(context, LaunchAddBKernel(input.data(), input.dimension(0), b.data(), output.data()),
                errors::Internal("add_b: LaunchAddBKernel() failed."));
  }
};

REGISTER_KERNEL_BUILDER(
    Name("AddB")
        .Device(DEVICE_GPU)
        .TypeConstraint<float>("T"),
    AddBGpuOp<float>);
REGISTER_KERNEL_BUILDER(
    Name("AddB")
        .Device(DEVICE_GPU)
        .TypeConstraint<double>("T"),
    AddBGpuOp<double>);
#endif // if GOOGLE_CUDA
add_b_op.cu.cc:
template <typename T, int BLOCK_DIM_X>
__global__ void AddBKernel(const T *__restrict__ d_input, int n, const T *__restrict__ d_b, T *__restrict__ d_output) {
  const int i = blockIdx.x * BLOCK_DIM_X + threadIdx.x;
  if (i < n) {
    d_output[i] = d_input[i] + *d_b;
  }
}
template <typename T>
bool LaunchAddBKernel(const T *__restrict__ d_input, int n, const T *__restrict__ d_b, T *__restrict__ d_output) {
  if (n <= 0) return true;

  constexpr int BLOCK_DIM_X = 256;
  // Launch enough blocks to cover all n elements (rounding up when n is not a
  // multiple of BLOCK_DIM_X).
  AddBKernel<T, BLOCK_DIM_X><<<n / BLOCK_DIM_X + (n % BLOCK_DIM_X != 0), BLOCK_DIM_X>>>(d_input, n, d_b, d_output);
  return true;
}
// Explicit instantiations.
template bool LaunchAddBKernel<float>(const float *__restrict__, int, const float *__restrict__, float *__restrict__);
template bool LaunchAddBKernel<double>(const double *__restrict__, int, const double *__restrict__, double *__restrict__);
I have intentionally introduced an error into the CPU implementation so that I can tell whether the CPU or the GPU implementation is being used.
When I test my custom op with:
from __future__ import print_function
import tensorflow as tf
module = tf.load_op_library('custom_ops.so')
with tf.Session(config = tf.ConfigProto(log_device_placement = True)):
  print(module.add_b([5., 4., 3., 2., 1.], 8.).eval())
I get the following output:
I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:892] OS X does not support NUMA - returning NUMA node zero
I tensorflow/core/common_runtime/gpu/gpu_device.cc:951] Found device 0 with properties:
name: GeForce GT 750M
major: 3 minor: 0 memoryClockRate (GHz) 0.9255
pciBusID 0000:01:00.0
Total memory: 2.00GiB
Free memory: 1.80GiB
I tensorflow/core/common_runtime/gpu/gpu_device.cc:972] DMA: 0
I tensorflow/core/common_runtime/gpu/gpu_device.cc:982] 0:   Y
I tensorflow/core/common_runtime/gpu/gpu_device.cc:1041] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GeForce GT 750M, pci bus id: 0000:01:00.0)
Device mapping:
/job:localhost/replica:0/task:0/gpu:0 -> device: 0, name: GeForce GT 750M, pci bus id: 0000:01:00.0
I tensorflow/core/common_runtime/direct_session.cc:252] Device mapping:
/job:localhost/replica:0/task:0/gpu:0 -> device: 0, name: GeForce GT 750M, pci bus id: 0000:01:00.0
AddB: /job:localhost/replica:0/task:0/gpu:0
I tensorflow/core/common_runtime/simple_placer.cc:819] AddB: /job:localhost/replica:0/task:0/gpu:0
AddB/b: /job:localhost/replica:0/task:0/gpu:0
I tensorflow/core/common_runtime/simple_placer.cc:819] AddB/b: /job:localhost/replica:0/task:0/gpu:0
AddB/input: /job:localhost/replica:0/task:0/gpu:0
I tensorflow/core/common_runtime/simple_placer.cc:819] AddB/input: /job:localhost/replica:0/task:0/gpu:0
[ 6. 5. 4. 3. 2.]
The "device placement log" appears to indicate that the op is being executed on the GPU, yet the output shows that the CPU implementation is being used.
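(For reference, here is a minimal sketch of how I could double-check the placement: pin the op to /gpu:0 explicitly and set allow_soft_placement to False, so the session would fail loudly instead of silently falling back to another device. I have not run this exact variant; it only rearranges the test script above.)

from __future__ import print_function
import tensorflow as tf

module = tf.load_op_library('custom_ops.so')

# allow_soft_placement = False makes TensorFlow raise an error instead of
# quietly moving the op to another device.
config = tf.ConfigProto(log_device_placement = True, allow_soft_placement = False)
with tf.Session(config = config):
  with tf.device('/gpu:0'):
    result = module.add_b([5., 4., 3., 2., 1.], 8.)
  print(result.eval())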
When I comment out the two REGISTER_KERNEL_BUILDER() registrations for DEVICE_CPU, recompile, and retest, I get the expected output of [ 13. 12. 11. 10. 9.], but with an error:
E tensorflow/core/common_runtime/executor.cc:334] Executor failed to create kernel. Not found: No registered 'AddB' OpKernel for CPU devices compatible with node AddB = AddB[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/gpu:0"](AddB/input, AddB/b)
    .  Registered:  device='GPU'; T in [DT_FLOAT]
  device='GPU'; T in [DT_DOUBLE]
     [[Node: AddB = AddB[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/gpu:0"](AddB/input, AddB/b)]]
That error message looks like a bug to me, because although the error says "Executor failed to create kernel", a kernel apparently was created to run the op on the GPU.
Why is the CPU implementation being used rather than the GPU implementation?
In case it matters, here are the details of my development setup:
- I am using a MacBook Pro with a built-in NVIDIA GeForce GT 750M (CUDA Compute Capability 3.0).
- macOS Sierra Version 10.12.1 (16B2555)
- cuda_8.0.47_mac, cudnn-8.0-osx-x64-v5.1
- TensorFlow 0.11.0rc2 installed via:
export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.11.0rc2-py2-none-any.whl
UPDATE: I have found that whether the CPU or GPU implementation is selected depends on the size of the input. Using this test script:
from __future__ import print_function
import numpy as np
import tensorflow as tf
from time import time
NUM_VALUES = 1310720
input = np.arange(0, NUM_VALUES, dtype = float)
module = tf.load_op_library('custom_ops.so')
with tf.Session(config = tf.ConfigProto(log_device_placement = True)):
  start = time(); print(module.add_b(input, 8.).eval()); end = time(); print(end - start)
…I find that when NUM_VALUES is 1310720 or less, the CPU implementation is used. When NUM_VALUES is 1310721 or more, the GPU implementation is used.
Is there a (1310720 * 8 bytes per double =) 10 MiB cutoff? If so, how can I override it? The AddB() op is simple enough, but for a more complex custom op, 10 MiB might be too large a threshold for its GPU implementation to be selected.
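(To make the boundary easier to see, here is a sketch of a variant of the script above that exploits the intentional +1 error in my CPU kernel to report which implementation handled each size; the exact sizes probed are just examples around the observed cutoff.)

from __future__ import print_function
import numpy as np
import tensorflow as tf

module = tf.load_op_library('custom_ops.so')

with tf.Session():
  for num_values in (1310719, 1310720, 1310721, 1310722):
    input = np.arange(0, num_values, dtype = float)
    result = module.add_b(input, 8.).eval()
    # The CPU kernel intentionally adds 1 instead of `b`, so the first output
    # value (input[0] is 0) reveals which implementation actually ran.
    implementation = 'GPU' if result[0] == 8. else 'CPU'
    print(num_values, implementation)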
Thank you for your answer. As long as 'CPUDevice' and 'GPUDevice' really are distinct types, I think the template specialization [is fine](http://ideone.com/0nKxNd). In any case, I just tried changing the class names to 'AddBCpuOp' and 'AddBGpuOp' and observed the same behavior. (I have edited my question to include the revised code.) –