Question

Следуя онлайн-примеру, предоставленному TensorFlow, я столкнулся с проблемами при использовании пользовательских операций, которые определяются в ядрах графического процессора. В инструкции по сборке примера перечислены три обязательных файла:

заголовочный файл

// kernel_example.h
#ifndef KERNEL_EXAMPLE_H_
#define KERNEL_EXAMPLE_H_

// Device-dispatched functor used by the Example op.
//
// Template parameters:
//   Device - device type the computation runs on.
//   T      - element type of the input and output buffers.
//
// operator() arguments:
//   d    - device handle of type Device.
//   size - number of elements in `in` and `out`.
//   in   - input buffer (read-only).
//   out  - output buffer.
//
// Only the call operator is declared here; its definition is supplied
// separately for each device.
template <typename Device, typename T>
struct ExampleFunctor {
  void operator()(const Device& d, int size, const T* in, T* out);
};

#if GOOGLE_CUDA
// Partial specialization of the functor for Eigen::GpuDevice.
//
// A partial specialization lists only the parameters that remain free in its
// template header and names the fixed arguments after the struct name.
// Writing `template <typename Eigen::GpuDevice, typename T>` followed by a
// bare `struct ExampleFunctor` does not specialize anything: it redeclares the
// primary template with an ill-formed parameter list.
//
// NOTE(review): Eigen::GpuDevice must already be declared wherever this header
// is included with GOOGLE_CUDA set -- confirm the include order in each
// translation unit.
template <typename T>
struct ExampleFunctor<Eigen::GpuDevice, T> {
  void operator()(const Eigen::GpuDevice& d, int size, const T* in, T* out);
};
#endif

#endif //KERNEL_EXAMPLE_H_ [1] commented out

([1] Здесь я закомментировал KERNEL_EXAMPLE_H_ в последней строке, так как иначе возникает ошибка компиляции.)

.cc file

// kernel_example.cc
#include "kernel_example.h"    <--------[2] replaced example.h
#include "tensorflow/core/framework/op_kernel.h"

using namespace tensorflow;

using CPUDevice = Eigen::ThreadPoolDevice;
using GPUDevice = Eigen::GpuDevice;

// ExampleFunctor specialized for CPUDevice.
//
// Writes 2 * in[k] into out[k] for every k in [0, size) with a plain
// sequential loop. The device argument `d` is accepted to match the functor
// interface but is not used by this implementation.
template <typename T>
struct ExampleFunctor<CPUDevice, T> {
  void operator()(const CPUDevice& d, int size, const T* in, T* out) {
    int k = 0;
    while (k < size) {
      out[k] = 2 * in[k];
      ++k;
    }
  }
};

// OpKernel for the "Example" op.
//
// Template parameters:
//   Device - device type forwarded to ExampleFunctor and eigen_device<>().
//   T      - element type of the input and output tensors.
//
// Compute() reads input 0, allocates output 0 with the same shape, and hands
// the flat element buffers of both tensors to ExampleFunctor<Device, T>.
template <typename Device, typename T>
class ExampleOp : public OpKernel {
 public:
  explicit ExampleOp(OpKernelConstruction* context) : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    // Grab the input tensor.
    const Tensor& input_tensor = context->input(0);

    // The functor takes the element count as an int, so reject inputs whose
    // element count does not fit. This is checked before the output is
    // allocated so that an oversized input is reported as InvalidArgument
    // without first requesting an equally large output buffer.
    // (OP_REQUIRES is expected to return from Compute on failure; see
    // TensorFlow's op_kernel.h.)
    OP_REQUIRES(context, input_tensor.NumElements() <= tensorflow::kint32max,
                errors::InvalidArgument("Too many elements in tensor"));

    // Create an output tensor with the same shape as the input.
    Tensor* output_tensor = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, input_tensor.shape(),
                                                     &output_tensor));

    // Do the computation on the device selected by the Device parameter.
    ExampleFunctor<Device, T>()(
        context->eigen_device<Device>(),
        static_cast<int>(input_tensor.NumElements()),
        input_tensor.flat<T>().data(),
        output_tensor->flat<T>().data());
  }
};

// Register the CPU kernels.
//
// REGISTER_CPU(T) passes ExampleOp<CPUDevice, T> to REGISTER_KERNEL_BUILDER as
// the DEVICE_CPU kernel for the op named "Example", with the type constraint
// on attribute "T" set to T. It is invoked below for float and int32.
//
// NOTE(review): no REGISTER_OP("Example") appears in the visible source.
// Presumably the op itself must also be registered for the library to expose
// it -- confirm that it is registered elsewhere.
#define REGISTER_CPU(T)                                          \
  REGISTER_KERNEL_BUILDER(                                       \
      Name("Example").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
      ExampleOp<CPUDevice, T>);
REGISTER_CPU(float);
REGISTER_CPU(int32);

// Register the GPU kernels.
//
// REGISTER_GPU(T) does two things for element type T:
//   1. Declares (with `extern template`) that ExampleFunctor<GPUDevice, T> is
//      explicitly instantiated in another translation unit
//      (kernel_example.cu.cc), so it is not instantiated here.
//   2. Passes ExampleOp<GPUDevice, T> to REGISTER_KERNEL_BUILDER as the
//      DEVICE_GPU kernel for the op named "Example", constrained on "T".
//
// An explicit instantiation declaration of a class template needs the
// class-key, hence `extern template struct ...`; without `struct` the
// declaration is ill-formed.
#ifdef GOOGLE_CUDA
#define REGISTER_GPU(T)                                          \
  /* Declare explicit instantiations in kernel_example.cu.cc. */ \
  extern template struct ExampleFunctor<GPUDevice, T>;           \
  REGISTER_KERNEL_BUILDER(                                       \
      Name("Example").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
      ExampleOp<GPUDevice, T>);
REGISTER_GPU(float);
REGISTER_GPU(int32);
#endif  // GOOGLE_CUDA

([2] Здесь я изменил имя заголовочного файла, чтобы оно соответствовало имени файла.) и

.cu.cc файл

// kernel_example.cu.cc
#ifdef GOOGLE_CUDA
#define EIGEN_USE_GPU
#include "kernel_example.h"    //[3] replaced example.h
#include "tensorflow/core/util/cuda_kernel_helper.h"

using namespace tensorflow;

using GPUDevice = Eigen::GpuDevice;

// CUDA kernel: writes 2 * in[idx] into out[idx].
//
// Each thread starts at its global index (blockIdx.x * blockDim.x +
// threadIdx.x) and advances by the total number of launched threads
// (blockDim.x * gridDim.x) until it reaches `size`, so any launch
// configuration covers all `size` elements. Input elements are read through
// the ldg() helper.
template <typename T>
__global__ void ExampleCudaKernel(const int size, const T* in, T* out) {
  const unsigned int stride = blockDim.x * gridDim.x;
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  while (idx < size) {
    out[idx] = 2 * ldg(in + idx);
    idx += stride;
  }
}

// GPU implementation of the functor: launches ExampleCudaKernel<T> on the
// stream returned by d.stream(), passing `size`, `in` and `out` through
// unchanged.
template <typename T>
void ExampleFunctor<GPUDevice, T>::operator()(
    const GPUDevice& d, int size, const T* in, T* out) {
  // Fixed launch configuration. core/util/cuda_kernel_helper.h has examples
  // of computing the block count and the threads-per-block count instead.
  const int num_blocks = 1024;
  const int threads_per_block = 20;
  ExampleCudaKernel<T>
      <<<num_blocks, threads_per_block, 0, d.stream()>>>(size, in, out);
}

// Explicitly instantiate functors for the types of OpKernels registered.
// These definitions are what the `extern template` declarations emitted by
// REGISTER_GPU(float) and REGISTER_GPU(int32) in kernel_example.cc refer to,
// so the set of types here has to match the set registered there.
template struct ExampleFunctor<GPUDevice, float>;
template struct ExampleFunctor<GPUDevice, int32>;

#endif  // GOOGLE_CUDA

[3] Здесь я изменил имя файла заголовка в соответствии с именем файла.

Единственные 3 незначительных изменения, которые я сделал, указаны под соответствующими листингами.

Построение библиотеки op с использованием предложенного метода:

# Collect TensorFlow's compile and link flags by asking tf.sysconfig from Python;
# each result is stored as a bash array.
TF_CFLAGS=( $(python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_compile_flags()))') )
TF_LFLAGS=( $(python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_link_flags()))') )
# Build both source files into the shared library gpu_op.so with g++.
# NOTE(review): GOOGLE_CUDA is not defined on this command line, so the
# #if/#ifdef GOOGLE_CUDA sections of the sources are presumably compiled out -- confirm.
g++ -std=c++11 -shared kernel_example.cc kernel_example.cu.cc -o gpu_op.so -fPIC ${TF_CFLAGS[@]} ${TF_LFLAGS[@]} -O2

завершается, по-видимому, успешно, и gpu_op.so генерируется. Но при импорте этой библиотеки операций и попытке ее использовать:

# run_op.py
# Loads the custom-op shared library and runs its `example` op on a tensor of ones.
import tensorflow as tf
import numpy as np
# Load the compiled library; the op is then looked up as an attribute of my_module.
my_module = tf.load_op_library('./gpu_op.so')

# Input: a 20x5x5 array of ones converted to a tensor.
# NOTE(review): dtype=float is Python's built-in float, which presumably maps to a
# 64-bit float dtype, while the kernels are registered only for float and int32 -- confirm.
a = np.ones((20,5,5))
in1 = tf.convert_to_tensor(a, dtype = float)

print("input1: ", in1)

# Evaluate the op in a session and print the result.
with tf.Session() as sess:
    ans = sess.run(my_module.example(in1))
print("output:", ans)

приводит к тому, что операция не найдена:

  File "run_op.py", line 11, in <module>
    ans = sess.run(my_module.example(in1))
AttributeError: module '33c9073b4d33739023b5757fe9acdd79' has no attribute 'example'

Я относительно новичок в C++ и, возможно, компилирую это неправильно. Так что же мне делать, чтобы этот модуль можно было импортировать? И правильно ли я сделал 3 изменения в коде, упомянутые выше?

FinleyGibson · Answer 1 · 03 октября 2018

Оказывается, я упустил из виду, что использование кода CUDA в этом примере требует использования компилятора nvidia nvcc.

Пример может быть скомпилирован с помощью:

# Collect TensorFlow's compile and link flags by asking tf.sysconfig from Python;
# each result is stored as a bash array.
TF_CFLAGS=( $(python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_compile_flags()))') )
TF_LFLAGS=( $(python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_link_flags()))') )
# Build the shared library with nvcc; -Xcompiler forwards -fPIC to the host compiler.
# NOTE(review): the sources here are cuda_op_kernel.cc / cuda_op_kernel.cu.cc, not the
# kernel_example.* files from the question -- confirm which files are intended.
nvcc -std=c++11 cuda_op_kernel.cc cuda_op_kernel.cu.cc -o cuda_op_kernel.so -shared -Xcompiler -fPIC ${TF_CFLAGS[@]} ${TF_LFLAGS[@]} -O2

Пример компоновки Tensor Flow для пользовательской операции на GPU

Пожалуйста, войдите или зарегистрируйтесь, чтобы ответить на этот вопрос.

1 Ответ

Пожалуйста, войдите или зарегистрируйтесь, чтобы добавить комментарий.

Пример компоновки Tensor Flow для пользовательской операции на GPU

Пожалуйста, войдите или зарегистрируйтесь, чтобы ответить на этот вопрос.

1 Ответ

Пожалуйста, войдите или зарегистрируйтесь, чтобы добавить комментарий.

Похожие темы