To learn how to write custom TensorFlow ops, I followed the Adding a New Op tutorial and made an "add_b" op that adds a scalar `b` to every input value.
Why is the CPU implementation of my custom op being selected instead of the GPU implementation?
add_b_op.cc:
#define EIGEN_USE_THREADS
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/common_shape_fns.h"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/shape_inference.h"
using namespace tensorflow;
REGISTER_OP("AddB")
.Attr("T: {float, double}")
.Input("input: T")
.Input("b: T")
.Output("output: T")
.SetShapeFn([] (shape_inference::InferenceContext* c) -> Status {
shape_inference::ShapeHandle out;
TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &out));
return shape_inference::UnchangedShape(c);
})
//----------------------------------------------------------------------
.Doc(R"doc(
Adds `b` to each input.
input: The input values.
b: A number to add to each input value.
)doc");
template <typename T>
class AddBCpuOp : public OpKernel {
 public:
  explicit AddBCpuOp(OpKernelConstruction* context) : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    const Tensor& input_tensor = context->input(0);
    const auto input = input_tensor.flat<T>();

    Tensor* output_tensor = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, input_tensor.shape(),
                                                     &output_tensor));
    auto output = output_tensor->flat<T>();

    const Eigen::ThreadPoolDevice& d = context->eigen_device<Eigen::ThreadPoolDevice>();

    // Note: The mistake of adding 1 instead of `b` is intentional to be able to distinguish
    // the CPU and GPU implementations.
    output.device(d) = input + static_cast<T>(1);
  }
};

REGISTER_KERNEL_BUILDER(
    Name("AddB")
        .Device(DEVICE_CPU)
        .TypeConstraint<float>("T"),
    AddBCpuOp<float>);
REGISTER_KERNEL_BUILDER(
    Name("AddB")
        .Device(DEVICE_CPU)
        .TypeConstraint<double>("T"),
    AddBCpuOp<double>);
#if GOOGLE_CUDA
template <typename T>
bool LaunchAddBKernel(const T *__restrict__ d_input, int n, const T *__restrict__ d_b, T *__restrict__ d_output);

template <typename T>
class AddBGpuOp : public OpKernel {
 public:
  explicit AddBGpuOp(OpKernelConstruction* context) : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    const Tensor& input_tensor = context->input(0);
    const auto input = input_tensor.flat<T>();

    const Tensor& b_tensor = context->input(1);
    OP_REQUIRES(context, TensorShapeUtils::IsScalar(b_tensor.shape()),
                errors::InvalidArgument("add_b expects a scalar for `b`."));
    const auto b = b_tensor.scalar<T>();

    Tensor* output_tensor = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, input_tensor.shape(),
                                                     &output_tensor));
    auto output = output_tensor->flat<T>();

    OP_REQUIRES(context, LaunchAddBKernel(input.data(), input.dimension(0), b.data(), output.data()),
                errors::Internal("add_b: LaunchAddBKernel() failed."));
  }
};

REGISTER_KERNEL_BUILDER(
    Name("AddB")
        .Device(DEVICE_GPU)
        .TypeConstraint<float>("T"),
    AddBGpuOp<float>);
REGISTER_KERNEL_BUILDER(
    Name("AddB")
        .Device(DEVICE_GPU)
        .TypeConstraint<double>("T"),
    AddBGpuOp<double>);
#endif // if GOOGLE_CUDA
add_b_op.cu.cc:
template <typename T, int BLOCK_DIM_X>
__global__ void AddBKernel(const T *__restrict__ d_input, int n, const T *__restrict__ d_b, T *__restrict__ d_output) {
  const int i = blockIdx.x * BLOCK_DIM_X + threadIdx.x;
  if (i < n) {
    d_output[i] = d_input[i] + *d_b;
  }
}
template <typename T>
bool LaunchAddBKernel(const T *__restrict__ d_input, int n, const T *__restrict__ d_b, T *__restrict__ d_output) {
  if (n <= 0) return true;

  constexpr int BLOCK_DIM_X = 256;
  // Launch enough blocks to cover all n elements (rounding up when n is not a
  // multiple of BLOCK_DIM_X).
  AddBKernel<T, BLOCK_DIM_X><<<n / BLOCK_DIM_X + (n % BLOCK_DIM_X != 0), BLOCK_DIM_X>>>(d_input, n, d_b, d_output);
  return true;
}
// Explicit instantiations.
template bool LaunchAddBKernel<float>(const float *__restrict__, int, const float *__restrict__, float *__restrict__);
template bool LaunchAddBKernel<double>(const double *__restrict__, int, const double *__restrict__, double *__restrict__);
I have intentionally introduced an error into the CPU implementation so that I can tell whether the CPU or the GPU implementation is being used.
When I test my custom op with:
from __future__ import print_function
import tensorflow as tf
module = tf.load_op_library('custom_ops.so')
with tf.Session(config = tf.ConfigProto(log_device_placement = True)):
  print(module.add_b([5., 4., 3., 2., 1.], 8.).eval())
I get the following output:
I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:892] OS X does not support NUMA - returning NUMA node zero
I tensorflow/core/common_runtime/gpu/gpu_device.cc:951] Found device 0 with properties:
name: GeForce GT 750M
major: 3 minor: 0 memoryClockRate (GHz) 0.9255
pciBusID 0000:01:00.0
Total memory: 2.00GiB
Free memory: 1.80GiB
I tensorflow/core/common_runtime/gpu/gpu_device.cc:972] DMA: 0
I tensorflow/core/common_runtime/gpu/gpu_device.cc:982] 0:   Y
I tensorflow/core/common_runtime/gpu/gpu_device.cc:1041] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GeForce GT 750M, pci bus id: 0000:01:00.0)
Device mapping:
/job:localhost/replica:0/task:0/gpu:0 -> device: 0, name: GeForce GT 750M, pci bus id: 0000:01:00.0
I tensorflow/core/common_runtime/direct_session.cc:252] Device mapping:
/job:localhost/replica:0/task:0/gpu:0 -> device: 0, name: GeForce GT 750M, pci bus id: 0000:01:00.0
AddB: /job:localhost/replica:0/task:0/gpu:0
I tensorflow/core/common_runtime/simple_placer.cc:819] AddB: /job:localhost/replica:0/task:0/gpu:0
AddB/b: /job:localhost/replica:0/task:0/gpu:0
I tensorflow/core/common_runtime/simple_placer.cc:819] AddB/b: /job:localhost/replica:0/task:0/gpu:0
AddB/input: /job:localhost/replica:0/task:0/gpu:0
I tensorflow/core/common_runtime/simple_placer.cc:819] AddB/input: /job:localhost/replica:0/task:0/gpu:0
[ 6. 5. 4. 3. 2.]
The "device placement log" appears to indicate that the op is being executed on the GPU, yet the output shows that the CPU implementation is being used.
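(For reference, here is a minimal sketch of how I could double-check the placement: pin the op to /gpu:0 explicitly and set allow_soft_placement to False, so the session would fail loudly instead of silently falling back to another device. I have not run this exact variant; it only rearranges the test script above.)

from __future__ import print_function
import tensorflow as tf

module = tf.load_op_library('custom_ops.so')

# allow_soft_placement = False makes TensorFlow raise an error instead of
# quietly moving the op to another device.
config = tf.ConfigProto(log_device_placement = True, allow_soft_placement = False)
with tf.Session(config = config):
  with tf.device('/gpu:0'):
    result = module.add_b([5., 4., 3., 2., 1.], 8.)
  print(result.eval())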
When I comment out the two REGISTER_KERNEL_BUILDER() registrations for DEVICE_CPU, recompile, and retest, I get the expected output of [ 13. 12. 11. 10. 9.], but with an error:
E tensorflow/core/common_runtime/executor.cc:334] Executor failed to create kernel. Not found: No registered 'AddB' OpKernel for CPU devices compatible with node AddB = AddB[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/gpu:0"](AddB/input, AddB/b)
    .  Registered:  device='GPU'; T in [DT_FLOAT]
  device='GPU'; T in [DT_DOUBLE]
     [[Node: AddB = AddB[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/gpu:0"](AddB/input, AddB/b)]]
That error message looks like a bug to me, because although the error says "Executor failed to create kernel", a kernel apparently was created to run the op on the GPU.
Why is the CPU implementation being used rather than the GPU implementation?
In case it matters, here are the details of my development setup:
- I am using a MacBook Pro with a built-in NVIDIA GeForce GT 750M (CUDA Compute Capability 3.0).
- macOS Sierra Version 10.12.1 (16B2555)
- cuda_8.0.47_mac, cudnn-8.0-osx-x64-v5.1
- TensorFlow 0.11.0rc2 installed via:
export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.11.0rc2-py2-none-any.whl
UPDATE: I have found that whether the CPU or GPU implementation is selected depends on the size of the input. Using this test script:
from __future__ import print_function
import numpy as np
import tensorflow as tf
from time import time
NUM_VALUES = 1310720
input = np.arange(0, NUM_VALUES, dtype = float)
module = tf.load_op_library('custom_ops.so')
with tf.Session(config = tf.ConfigProto(log_device_placement = True)):
  start = time(); print(module.add_b(input, 8.).eval()); end = time(); print(end - start)
…I find that when NUM_VALUES is 1310720 or less, the CPU implementation is used. When NUM_VALUES is 1310721 or more, the GPU implementation is used.
Is there a (1310720 * 8 bytes per double =) 10 MiB cutoff? If so, how can I override it? The AddB() op is simple enough, but for a more complex custom op, 10 MiB might be too large a threshold for its GPU implementation to be selected.
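(To make the boundary easier to see, here is a sketch of a variant of the script above that exploits the intentional +1 error in my CPU kernel to report which implementation handled each size; the exact sizes probed are just examples around the observed cutoff.)

from __future__ import print_function
import numpy as np
import tensorflow as tf

module = tf.load_op_library('custom_ops.so')

with tf.Session():
  for num_values in (1310719, 1310720, 1310721, 1310722):
    input = np.arange(0, num_values, dtype = float)
    result = module.add_b(input, 8.).eval()
    # The CPU kernel intentionally adds 1 instead of `b`, so the first output
    # value (input[0] is 0) reveals which implementation actually ran.
    implementation = 'GPU' if result[0] == 8. else 'CPU'
    print(num_values, implementation)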
Thank you for your answer. As long as 'CPUDevice' and 'GPUDevice' really are distinct types, I think the template specialization [is fine](http://ideone.com/0nKxNd). In any case, I just tried changing the class names to 'AddBCpuOp' and 'AddBGpuOp' and observed the same behavior. (I have edited my question to include the revised code.) –