namespace nbla
-
namespace nbla
Utilities for CUDA.
Base class of unary operations for CUDA.
Base class of binary operations for CUDA.
Unpooling.
Transpose.
TopNError.
Swish.
Sum.
Stack.
Split.
SoftmaxCrossEntropy.
Slice.
Sigmoid cross entropy.
SELU.
Round.
Randn.
Randint.
Rand.
Prod.
PReLU.
NotEqual.
Min.
MeanSubtraction.
Mean.
Max.
MatrixDiagPart.
MatrixDiag.
LogicalXor.
LogicalOr.
LogicalNot.
LogicalAnd.
LessEqual.
Less.
INQConvolution.
INQAffine.
ImageAugmentation.
Identity.
GreaterEqual.
Greater.
Floor.
Flip.
Equal.
EpsilonInsensitiveLoss.
Embed.
Dropout.
Deconvolution.
cReLU
Concatenate.
ClipGradByValue.
ClipGradByNorm.
CELU.
Ceil.
CategoricalCrossEntropy.
Broadcast.
BinaryWeightConvolution.
BinaryWeightAffine.
BinaryError.
Binary cross entropy.
BinaryConnectConvolution.
BinaryConnectAffine.
BatchMatmul.
Inverse.
AbsoluteError.
Abs.
Tanh.
Sigmoid.
MaxPooling.
AveragePooling.
Utilities for CUDA CUDNN.
Cpu resources.
Communicator interface class.
Enums
-
enum [anonymous]
Values:
-
enumerator CUDA_WARP_SIZE
-
enumerator CUDA_WARP_MASK
-
enumerator CUDA_WARP_BITS
-
enumerator CUDA_WARP_SIZE
-
enum CudaStreamId
Enum for nbla global streams.
Values:
-
enumerator CONVOLUTION_BWD
-
enumerator MAX_COUNT
-
enumerator CONVOLUTION_BWD
Functions
- NBLA_CUDA_API void synchronizer_cuda_array_cpu_array (Array *src, Array *dst, const int async_flags=AsyncFlag::NONE)
- NBLA_CUDA_API void synchronizer_cpu_array_cuda_array (Array *src, Array *dst, const int async_flags=AsyncFlag::NONE)
-
inline string cublas_status_to_string(cublasStatus_t status)
-
inline string cusolver_status_to_string(cusolverStatus_t status)
-
inline string curand_status_to_string(curandStatus_t status)
-
CUBLAS_TYPE_T(double, DOUBLE)
-
CUBLAS_TYPE_T(float, FLOAT)
-
CUBLAS_TYPE_T(half, HALF)
-
CUBLAS_TYPE_T(Half, HALF)
-
CUBLAS_TYPE_T(HalfCuda, HALF)
-
inline int cuda_get_blocks_by_size(int size)
Get an appropriate block size given a size of elements.
The kernel is assumed to contain a grid-strided loop.
-
inline Size_t cuda_get_blocks_by_size_with_size_t(const Size_t size)
Get an appropriate block size given a size of elements with nbla::Size_t.
The kernel is assumed to contain a grid-strided loop.
-
int cuda_set_device(int device)
CUDA device setter.
- 戻り値:
index of device before change
-
int cuda_get_device()
Get current CUDA device.
- 戻り値:
index of device
- NBLA_CUDA_API vector< size_t > cuda_mem_get_info ()
Get free and total device memory size.
-
cudaDeviceProp cuda_get_current_device_properties()
Get device properties of current CUDA device.
Note that using `cuda_get_current_device_properties` is much slower than `cuda_get_current_device_attribute`, since some properties require PCIe reads to query. Keep in mind that using this function can sometimes lead to huge slowdowns in your implementation.
-
int cuda_get_current_device_attribute(cudaDeviceAttr attr)
-
template<typename Tc>
ncclDataType_t get_nccl_dtype()
-
template<>
inline ncclDataType_t get_nccl_dtype<float>()
-
template<>
inline ncclDataType_t get_nccl_dtype<Half>()
-
template<>
inline ncclDataType_t get_nccl_dtype<HalfCuda>()
-
template<typename T>
void cublas_gemm(cublasHandle_t handle, cublasOperation_t op_x, cublasOperation_t op_y, int m, int n, int k, float alpha, const T *x, int lda, const T *y, int ldb, float beta, T *z, int ldc)
-
template<typename T>
void cublas_gemv(cublasHandle_t handle, cublasOperation_t trans, int m, int n, float alpha, const T *A, int lda, const T *x, int incx, float beta, T *y, int incy)
-
template<typename T>
void cublas_dot(cublasHandle_t handle, int n, const T *x, int incx, const T *y, int incy, T *out)
-
template<typename T>
void cublas_gemm_batched(cublasHandle_t handle, cublasOperation_t op_x, cublasOperation_t op_y, int m, int n, int k, float alpha, const T **x, int lda, const T **y, int ldb, float beta, T **z, int ldc, int batchCount)
-
template<typename T>
void cublas_gemm_strided_batched(cublasHandle_t handle, cublasOperation_t op_x, cublasOperation_t op_y, int m, int n, int k, float alpha, const T *x, int lda, int stride_a, const T *y, int ldb, int stride_b, float beta, T *z, int ldc, int stride_c, int batchCount)
-
template<typename T>
void cublas_getrf_batched(cublasHandle_t handle, int n, T **x, int lda, int *pivot, int *info, int batchSize)
-
template<typename T>
void cublas_getri_batched(cublasHandle_t handle, int n, const T **x, int lda, int *pivot, T **y, int ldc, int *info, int batchSize)
-
inline dtypes get_dtype_by_cudnn_data_type(cudnnDataType_t dtype)
Convert cuDNN enum dtype to NNabla enum dtype.
-
template<class T>
CudaTypeForceFloat<T>::type get_cudnn_scalar_arg(float val) Return scalar value used in alpha and beta in cuDNN APIs.
This implementation is based on the fact that cuDNN algorithm APIs take alpha and beta as float when storage data type is half.
-
inline string cudnn_status_to_string(cudnnStatus_t status)
-
void cudnn_set_tensor_nd_descriptor_force_dim(cudnnTensorDescriptor_t &desc, cudnnDataType_t dtype, vector<int> dims, size_t force_ndim = 4, bool channel_last = false, bool expand_left = false)
Wrapper function of cudnnSetTensorNdDescriptor that ensures a minimum number of dimensions.
http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#cudnnSetTensorNdDescriptor
According to the doc above, cudnnSetTensorNdDescriptor does not support tensors with fewer than 4 dimensions. This wrapper function adds unused dimensions with a value of 1 at the end.
- パラメータ:
force_ndim --
-
template<typename T>
inline void cudnn_set_tensor_descriptor(cudnnTensorDescriptor_t desc, std::vector<int> shape)
-
std::ostream &operator<<(std::ostream &os, const CudnnConvDesc &desc)
- NBLA_CUDA_API void init_cudnn ()
Initialize CUDNN features.
- NBLA_CUDA_API void set_conv_fwd_algo_blacklist (int id)
Set conv algo to blacklist.
- NBLA_CUDA_API void set_conv_bwd_data_algo_blacklist (int id)
- NBLA_CUDA_API void set_conv_bwd_filter_algo_blacklist (int id)
- NBLA_CUDA_API void unset_conv_fwd_algo_blacklist (int id)
Unset conv algo from blacklist.
- NBLA_CUDA_API void unset_conv_bwd_data_algo_blacklist (int id)
- NBLA_CUDA_API void unset_conv_bwd_filter_algo_blacklist (int id)
-
template<typename T>
void cusolverdn_potrf_batched(cusolverDnHandle_t handle, int n, T **x, int lda, int *info, int batchSize)
-
NBLA_DECLARE_TRANSFORM_BINARY_CUDA(AbsoluteError)
- NBLA_DECLARE_TRANSFORM_UNARY_CUDA_1_INPLACE (AddScalar, double, true)
-
NBLA_DECLARE_TRANSFORM_UNARY_CUDA(ATanh)
- NBLA_DECLARE_TRANSFORM_BINARY_CUDA_INPLACE (BcAdd2, true)
-
NBLA_DECLARE_TRANSFORM_UNARY_CUDA(BinarySigmoid)
-
NBLA_DECLARE_TRANSFORM_UNARY_CUDA(BinaryTanh)
- NBLA_DECLARE_TRANSFORM_BINARY_CUDA_INPLACE (Div2, true)
-
NBLA_DECLARE_TRANSFORM_BINARY_CUDA_1(EpsilonInsensitiveLoss, float)
-
NBLA_DECLARE_TRANSFORM_BINARY_CUDA(Equal)
-
NBLA_DECLARE_TRANSFORM_UNARY_CUDA_1(EqualScalar, double)
-
NBLA_DECLARE_TRANSFORM_UNARY_CUDA(GELU)
-
NBLA_DECLARE_TRANSFORM_BINARY_CUDA(Greater)
-
NBLA_DECLARE_TRANSFORM_BINARY_CUDA(GreaterEqual)
-
NBLA_DECLARE_TRANSFORM_UNARY_CUDA_1(GreaterEqualScalar, double)
-
NBLA_DECLARE_TRANSFORM_UNARY_CUDA_1(GreaterScalar, double)
-
NBLA_DECLARE_TRANSFORM_UNARY_CUDA(HardSigmoid)
-
NBLA_DECLARE_TRANSFORM_UNARY_CUDA(IsInf)
-
NBLA_DECLARE_TRANSFORM_UNARY_CUDA(IsNaN)
-
NBLA_DECLARE_TRANSFORM_BINARY_CUDA(Less)
-
NBLA_DECLARE_TRANSFORM_BINARY_CUDA(LessEqual)
-
NBLA_DECLARE_TRANSFORM_UNARY_CUDA_1(LessEqualScalar, double)
-
NBLA_DECLARE_TRANSFORM_UNARY_CUDA_1(LessScalar, double)
-
NBLA_DECLARE_TRANSFORM_UNARY_CUDA(LogSigmoid)
-
NBLA_DECLARE_TRANSFORM_BINARY_CUDA(LogicalAnd)
-
NBLA_DECLARE_TRANSFORM_UNARY_CUDA_1(LogicalAndScalar, bool)
-
NBLA_DECLARE_TRANSFORM_UNARY_CUDA(LogicalNot)
-
NBLA_DECLARE_TRANSFORM_BINARY_CUDA(LogicalOr)
-
NBLA_DECLARE_TRANSFORM_UNARY_CUDA_1(LogicalOrScalar, bool)
-
NBLA_DECLARE_TRANSFORM_BINARY_CUDA(LogicalXor)
-
NBLA_DECLARE_TRANSFORM_UNARY_CUDA_1(LogicalXorScalar, bool)
-
NBLA_DECLARE_TRANSFORM_UNARY_CUDA_1(MaximumScalar, double)
-
NBLA_DECLARE_TRANSFORM_UNARY_CUDA_1(MinimumScalar, double)
-
NBLA_DECLARE_TRANSFORM_UNARY_CUDA(Mish)
- NBLA_DECLARE_TRANSFORM_BINARY_CUDA_INPLACE (Mul2, true)
- NBLA_DECLARE_TRANSFORM_UNARY_CUDA_1_INPLACE (MulScalar, double, true)
-
void my_cudaMemset(void *devPtr, int value, size_t count)
-
NBLA_DECLARE_TRANSFORM_BINARY_CUDA(NotEqual)
-
NBLA_DECLARE_TRANSFORM_UNARY_CUDA_1(NotEqualScalar, double)
- NBLA_DECLARE_TRANSFORM_BINARY_CUDA_INPLACE (Pow2, true)
- NBLA_DECLARE_TRANSFORM_UNARY_CUDA_1_INPLACE (PowScalar, double, true)
-
NBLA_DECLARE_TRANSFORM_UNARY_CUDA_1(RDivScalar, double)
-
NBLA_DECLARE_TRANSFORM_UNARY_CUDA_1(RPowScalar, double)
-
NBLA_DECLARE_TRANSFORM_UNARY_CUDA_1(RSubScalar, double)
-
NBLA_DECLARE_TRANSFORM_UNARY_CUDA(ReLU6)
-
NBLA_DECLARE_TRANSFORM_UNARY_CUDA_1(ResetInf, double)
-
NBLA_DECLARE_TRANSFORM_UNARY_CUDA_1(ResetNaN, double)
-
NBLA_DECLARE_TRANSFORM_UNARY_CUDA_1(SoftPlus, double)
-
NBLA_DECLARE_TRANSFORM_BINARY_CUDA(SquaredError)
- NBLA_DECLARE_TRANSFORM_BINARY_CUDA_INPLACE (Sub2, true)
-
NBLA_DECLARE_TRANSFORM_UNARY_CUDA(TanhShrink)
-
static const char *cufftGetErrorString(cufftResult error)
- struct NBLA_ALIGN (2) HalfCuda
Operator overloaded class for CUDA half type.
-
template<typename T>
CudaNativeType<T>::type get_native_arg(const T &v) Get a scalar value of CUDA's native type from NNabla's data type.
-
template<>
inline CudaNativeType<HalfCuda>::type get_native_arg<HalfCuda>(const HalfCuda &v)
-
template<>
inline dtypes get_dtype<HalfCuda>() Template specialization for nbla::HalfCuda of a function nbla::get_dtype which gets an enum value of nbla::dtypes.
-
template<class T>
CudaNativeType<T>::type get_cuda_native_scalar(float val) Return CUDA's scalar value from float.
Returns as a type corresponding to the specified template argument. Specifically, Half —> half, HalfCuda —> half.
-
template<>
inline CudaNativeType<Half>::type get_cuda_native_scalar<Half>(float val)
-
template<>
inline CudaNativeType<HalfCuda>::type get_cuda_native_scalar<HalfCuda>(float val)
- NBLA_CUDA_API void init_cuda ()
Initialize CUDA features.
- NBLA_CUDA_API void clear_cuda_memory_cache ()
Clear all CUDA memory from cache.
- NBLA_CUDA_API void print_cuda_memory_cache_map ()
Print cache map for CUDA cached memory.
- NBLA_CUDA_API size_t get_cuda_caching_allocator_fragmentation_bytes (const string &device_id)
APIs to analyse cache map in CUDA CachingAllocator.
- NBLA_CUDA_API size_t get_cuda_caching_allocator_max_available_bytes (const string &device_id)
- NBLA_CUDA_API vector< int > get_cuda_cached_memory_used_counts (const string &device_id)
- NBLA_CUDA_API void print_cuda_virtual_memory_cache_map ()
Print cache map for CUDA virtual memory.
- NBLA_CUDA_API void clear_cuda_virtual_memory_cache ()
Clear all CUDA virtual memory from cache.
- NBLA_CUDA_API size_t get_cuda_virtual_caching_allocator_fragmentation_bytes (const string &device_id)
APIs to analyse cache map in CUDA VirtualCachingAllocator.
- NBLA_CUDA_API size_t get_cuda_virtual_caching_allocator_max_available_bytes (const string &device_id)
- NBLA_CUDA_API vector< int > get_cuda_virtual_memory_used_counts (const string &device_id)
- NBLA_CUDA_API bool is_cuda_tf32_enabled ()
Check if tf32 is enabled or not.
- NBLA_CUDA_API vector< string > cuda_array_classes ()
Get CUDA array classes.
- NBLA_CUDA_API void _cuda_set_array_classes (const vector< string > &a)
Set CUDA array classes.
- NBLA_CUDA_API void cuda_device_synchronize (const string &device)
Wrapper of cudaDeviceSynchronize.
- NBLA_CUDA_API int cuda_get_device_count ()
Wrapper of cudaGetDeviceCount.
- NBLA_CUDA_API vector< string > cuda_get_devices ()
Get available devices.
- NBLA_CUDA_API shared_ptr< void > cuda_create_stream (int device_id=-1)
cudaStream wrapper functions.
- NBLA_CUDA_API void * cuda_stream_shared_to_void (shared_ptr< void > stream)
- NBLA_CUDA_API void print_stream_flag (shared_ptr< void > stream)
- NBLA_CUDA_API void print_stream_priority (shared_ptr< void > stream)
- NBLA_CUDA_API void cuda_stream_synchronize (shared_ptr< void > stream)
- NBLA_CUDA_API void cuda_nullstream_synchronize ()
- NBLA_CUDA_API void cuda_stream_destroy (shared_ptr< void > stream)
- NBLA_CUDA_API shared_ptr< void > cuda_create_event (int device_id=-1, unsigned int flags=0x02)
cudaEvent wrapper functions.
- NBLA_CUDA_API void cuda_default_stream_event (shared_ptr< void > event)
- NBLA_CUDA_API void cuda_stream_wait_event (shared_ptr< void > stream, shared_ptr< void > event)
- NBLA_CUDA_API void cuda_event_synchronize (shared_ptr< void > event)
- NBLA_CUDA_API void cuda_event_record (shared_ptr< void > event)
- NBLA_CUDA_API float cuda_event_elapsed_time (shared_ptr< void > event_s, shared_ptr< void > event_e)
- NBLA_CUDA_API void set_cuda_vma_chunk_size (size_t size)
Utils for Virtual memory allocator.
-
template<typename T>
void cuda_gemm(int device, T *z, bool transpose_z, const T *x, int row_x, int col_x, bool transpose_x, const T *y, int row_y, int col_y, bool transpose_y, float alpha, float beta)
-
template<typename T>
void cuda_gemv(int device, T *z, const T *x, int row_x, int col_x, bool transpose_x, const T *y, int row_y, float alpha, float beta, int incy = 1, int incz = 1)
-
template<>
inline void cuda_gemv<HalfCuda>(int device, HalfCuda *z, const HalfCuda *x, int row_x, int col_x, bool transpose_x, const HalfCuda *y, int row_y, float alpha, float beta, int incy, int incz)
-
template<typename T>
void cuda_dot(int device, T *z, const T *x, int n, const T *y, int incx = 1, int incy = 1)
-
template<typename T>
void cuda_gemm_batched(int device, T **z, bool transpose_z, const T **x, int row_x, int col_x, bool transpose_x, const T **y, int row_y, int col_y, bool transpose_y, float alpha, float beta, int batch_count)
-
template<typename T>
void cuda_getrf_batched(int device, int n, T **x, int *pivot, int *info, int batchSize)
-
template<typename T>
void cuda_getri_batched(int device, int n, const T **x, int *pivot, T **y, int *info, int batchSize)
-
static Size_t next_pow2_floor(Size_t n)
This function calculates the largest power of 2 less than or equal to n.
n must not be negative.
-
int dl_nvtx_init(void)
-
int dl_nvtx_finish(void)
- NBLA_CUDA_API void nvtx_mark_A (string msg)
- NBLA_CUDA_API void nvtx_range_push_A (string msg)
- NBLA_CUDA_API void nvtx_range_push_with_C (string msg)
- NBLA_CUDA_API void nvtx_range_pop ()
- template<typename T> __global__ void col2im_kernel (const int n, const T *col, const int height, const int width, const int channels, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int height_col, const int width_col, T *img)
-
template<typename T>
void col2im_cuda(const T *col, const int c_i, const int *shape, const int *k, const int *p, const int *s, const int *d, T *img)
-
template<typename T>
void col2im_nd_cuda(const T *col, const int c, const int spatial_dims, const int *spatial_shape, const int *kernel, const int *pad, const int *stride, const int *dilation, T *img)
- template<typename T> __device__ T im2col_bilinear_cuda (const T *bottom_data, const int data_width, const int height, const int width, T h, T w)
- template<typename T> __device__ T get_gradient_weight_cuda (T argmax_h, T argmax_w, const int h, const int w, const int height, const int width)
- template<typename T> __device__ T get_coordinate_weight_cuda (T argmax_h, T argmax_w, const int height, const int width, const T *im_data, const int data_width, const int bp_dir)
- template<typename T, bool MODULATED> __global__ void modulated_deformable_im2col_gpu_kernel (const int n, const T *data_im, const T *data_offset, const T *data_mask, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int channel_per_deformable_group, const int num_channels, const int deformable_group, const int height_col, const int width_col, T *data_col)
- template<typename T, bool MODULATED> __global__ void modulated_deformable_col2im_gpu_kernel (const int n, const T *data_col, const T *data_offset, const T *data_mask, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int channel_per_deformable_group, const int deformable_group, const int height_col, const int width_col, T *grad_im)
- template<typename T, bool MODULATED> __global__ void modulated_deformable_col2im_coord_gpu_kernel (const int n, const T *data_col, const T *data_im, const T *data_offset, const T *data_mask, const int channels, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int channel_per_deformable_group, const int deformable_group, const int height_col, const int width_col, T *grad_offset, T *grad_mask)
-
template<typename T, bool MODULATED>
void modulated_deformable_im2col_cuda(const T *data_im, const T *data_offset, const T *data_mask, const int c_i, const int *shape, const int *k, const int *p, const int *s, const int *d, const int deformable_group, T *data_col)
-
template<typename T, bool MODULATED>
void modulated_deformable_col2im_cuda(const T *data_col, const T *data_offset, const T *data_mask, const int c_i, const int *shape, const int *k, const int *p, const int *s, const int *d, const int deformable_group, T *grad_im)
-
template<typename T, bool MODULATED>
void modulated_deformable_col2im_coord_cuda(const T *data_col, const T *data_im, const T *data_offset, const T *data_mask, const int c_i, const int *shape, const int *k, const int *p, const int *s, const int *d, const int deformable_group, T *grad_offset, T *grad_mask)
- template<typename T> __global__ void im2col_kernel (const int n, const T *img, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int h_o, const int w_o, T *col)
-
template<typename T>
void im2col_cuda(const T *img, const int c_i, const int *shape, const int *k, const int *p, const int *s, const int *d, T *col)
-
template<typename T>
void im2col_nd_cuda(const T *img, const int c, const int spatial_dims, const int *spatial_shape, const int *kernel, const int *pad, const int *stride, const int *dilation, T *col)
-
NBLA_DIAG_SUPPRESS(inline_qualifier_ignored)
- template<> inline __global__ void im2col_kernel (const int col_size, const HalfCuda *img, const int height, const int width, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, const int stride_h, const int stride_w, const int dilation_h, const int dilation_w, const int h_o, const int w_o, HalfCuda *col)
-
NBLA_DIAG_DEFAULT(inline_qualifier_ignored)
- template<int NDIM> __device__ NdIndex< NDIM > device_flat2nd (int64_t idx, const int64_t *stride)
- template<int NDIM> __device__ int64_t device_nd2flat (NdIndex< NDIM > &nd_index, const int64_t *stride)
- template<int NDIM> __device__ NdIndex< NDIM > device_flat_to_nd (int64_t idx, const NdIndex< NDIM > strides)
- template<int NDIM> __device__ int64_t device_nd_to_flat (NdIndex< NDIM > &nd_index, const NdIndex< NDIM > strides)
- template<int NDIM, typename T> __host__ NdIndex< NDIM > to_nd_index (const T &vec)
-
curandGenerator_t curand_create_generator(int seed = -1)
Returns a new cuRand generator initialized with a given seed.
- パラメータ:
seed -- [in] Seed of rng. When -1 is given, the generator is initialized without a seed.
- 戻り値:
cuRand generator.
-
void curand_destroy_generator(curandGenerator_t gen)
Destroy a cuRand generator object.
- パラメータ:
gen -- [inout] cuRand generator.
-
void curand_set_seed(curandGenerator_t gen, int seed)
Set random seed to cuRand generator object.
- パラメータ:
gen -- [inout] cuRand generator.
seed -- [in] Seed.
-
template<typename T>
void curand_generate_rand(curandGenerator_t gen, T low, T high, T *dev_ptr, size_t size) Generate random values from uniform distribution in [low, high).
注釈
T=int generates random integers in the range of [low, high).
- パラメータ:
gen -- [inout] cuRand generator.
low -- [in] Minimum value of uniform dist.
high -- [in] Upper bound of uniform dist.
dev_ptr -- [out] Device array pointer.
size -- [in] Array size.
-
template<typename T>
void curand_generate_randn(curandGenerator_t gen, T mu, T sigma, T *dev_ptr, size_t size) Generate random values from normal distribution with mean and stddev.
- パラメータ:
gen -- [inout] cuRand generator.
mu -- [in] Mean of normal dist.
sigma -- [in] Standard deviation of normal dist.
dev_ptr -- [out] Device array pointer.
size -- [in] Array size.
-
void curand_initialize(const int size, const int seed, const int offset, curandState *state)
-
class CudaArray : public Array
- #include <cuda_array.hpp>
Array on CUDA devices.
Subclassed by nbla::CudaCachedArray, nbla::CudaCachedUnifiedArray
-
class CudaCachedArray : public nbla::CudaArray
- #include <cuda_array.hpp>
Array allocated on CUDA device with a CudaMemory obtained by Cuda::caching_allocator().
Public Functions
-
explicit CudaCachedArray(const Size_t size, dtypes dtype, const Context &ctx, const AllocatorMemoryPtr mem = nullptr, const Size_t offset = 0)
Constructor.
- パラメータ:
size -- Length of array.
dtype -- Data type.
ctx -- Context specifies device ID.
-
explicit CudaCachedArray(const Size_t size, dtypes dtype, const Context &ctx, const AllocatorMemoryPtr mem = nullptr, const Size_t offset = 0)
-
class CudaCachedUnifiedArray : public nbla::CudaArray
- #include <cuda_array.hpp>
Array allocated on unified memory with a CudaUnifiedMemory obtained by Cuda::unified_allocator().
Public Functions
-
explicit CudaCachedUnifiedArray(const Size_t size, dtypes dtype, const Context &ctx, const AllocatorMemoryPtr mem = nullptr, const Size_t offset = 0)
Constructor.
- パラメータ:
size -- Length of array.
dtype -- Data type.
ctx -- Context specifies device ID.
-
explicit CudaCachedUnifiedArray(const Size_t size, dtypes dtype, const Context &ctx, const AllocatorMemoryPtr mem = nullptr, const Size_t offset = 0)
-
class CudaCachedHostArray : public CpuArray
- #include <cuda_array.hpp>
Array allocated on host with a CudaHostMemory obtained by Cuda::pinned_allocator().
Public Functions
-
explicit CudaCachedHostArray(const Size_t size, dtypes dtype, const Context &ctx, const AllocatorMemoryPtr mem = nullptr, const Size_t offset = 0)
Constructor.
- パラメータ:
size -- Length of array.
dtype -- Data type.
ctx -- Context.
-
explicit CudaCachedHostArray(const Size_t size, dtypes dtype, const Context &ctx, const AllocatorMemoryPtr mem = nullptr, const Size_t offset = 0)
-
class CudaDlpackArray : public DlpackArray
- #include <cuda_dlpack_array.hpp>
Array allocated on CUDA device, which memory is borrowed from other frameworks via DLPack.
-
template<typename T>
struct cuda_data_type Data type.
-
template<typename T>
class DataParallelCommunicatorNccl : public DataParallelCommunicator<T> - #include <data_parallel_communicator.hpp>
Communicator interface which is extended to implement a new Communicator class.
The communicator exchanges the gradients of parameters or the parameters themselves.
Public Functions
-
void add_context_and_parameters(const pair<Context, vector<pair<string, VariablePtr>>> &ctx_params)
Adding context and parameters communicated via this class.
- パラメータ:
ctx_params -- pair<Context, vector of pair<name, VariablePtr>>
-
void remove_context_parameters(const pair<Context, vector<string>> &ctx_keys)
Remove previously registered parameters by keys.
-
void clear_context_parameters()
Clear all parameters.
-
virtual void init()
Initall or initrank, depending on multi-threads or multi-processes.
This function MUST be called after all parameters communicated are added by `add_context_and_parameters` method.
-
void sync_all_params()
Sync all parameters added in this communicator based on `Context`.
Coerce to copy all parameters to the device specified by `Context`.
-
vector<string> allowed_array_classes()
Get array classes that are allowed to be specified by Context.
-
void add_context_and_parameters(const pair<Context, vector<pair<string, VariablePtr>>> &ctx_params)
-
template<typename T>
class MultiProcessDataParallelCommunicatorNccl : public MultiProcessDataParallelCommunicator<T> - #include <multi_process_data_parallel_communicator.hpp>
Communicator interface which is extended to implement a new Communicator class.
The communicator exchanges the gradients of parameters or the parameters themselves.
Public Functions
-
void add_context_and_parameters(const pair<Context, vector<pair<string, VariablePtr>>> &ctx_params)
Adding context and parameters communicated via this class.
- パラメータ:
ctx_params -- pair<Context, vector of pair<name, VariablePtr>>
-
void remove_context_parameters(const pair<Context, vector<string>> &ctx_keys)
Remove previously registered parameters by keys.
-
void clear_context_parameters()
Clear all parameters.
-
virtual void init()
Initall or initrank, depending on multi-threads or multi-processes.
This function MUST be called after all parameters communicated are added by `add_context_and_parameters` method.
-
void sync_all_params()
Sync all parameters added in this communicator based on `Context`.
Coerce to copy all parameters to the device specified by `Context`.
-
vector<string> allowed_array_classes()
Get array classes that are allowed to be specified by Context.
-
void add_context_and_parameters(const pair<Context, vector<pair<string, VariablePtr>>> &ctx_params)
-
class Cuda : public BackendBase
- #include <cuda.hpp>
Singleton class for storing some handles or configs for CUDA Computation.
Public Functions
-
cublasHandle_t cublas_handle(int device = -1)
Get cuBLAS handle of a specified device.
-
cusolverDnHandle_t cusolverdn_handle(int device = -1)
Get cuSOLVER Dn handle of a specified device.
-
cutensorHandle_t cutensor_handle(int device = -1)
Get cuTENSOR handle of a specified device.
-
bool cutensor_available(int device = -1)
Get cuTENSOR availability.
-
std::shared_ptr<cudaEvent_t> cuda_event(unsigned int flags, int device = -1)
Get or create cuda event.
-
curandGenerator_t &curand_generator()
Get cuRAND global generator.
-
vector<string> array_classes() const
Available array class list used in CUDA Function implementations.
-
void _set_array_classes(const vector<string> &a)
Set array class list.
注釈
Dangerous to call. End users shouldn't call.
-
void register_array_class(const string &name)
Register array class to available list by name.
-
shared_ptr<Allocator> caching_allocator()
Get a caching allocator.
-
shared_ptr<Allocator> naive_allocator()
Get a no-cache allocator.
-
shared_ptr<Allocator> unified_allocator()
Get a caching unified-memory allocator.
-
shared_ptr<Allocator> pinned_allocator()
Get a caching pinned-host-memory allocator.
-
shared_ptr<Allocator> virtual_caching_allocator()
Get a caching virtual-memory allocator.
-
void free_unused_host_caches()
Free all unused host memory caches.
-
void device_synchronize(const string &device)
Synchronize host to device.
-
void default_stream_synchronize(const string &device)
Synchronize host to default stream of device.
-
shared_ptr<cudaStream_t> get_stream(unsigned int flag, CudaStreamId streamId, int device = -1)
Get auxiliary stream.
-
void create_lms_streams(int device = -1)
Create non-blocking streams for data transfer.
-
void set_vma_chunk_size(size_t size)
Change a chunk size of physical memory used in virtual memory allocator.
-
shared_ptr<cudaDeviceProp> get_device_properties(int device = -1)
Get CUDA device properties.
Public Members
-
cudaStream_t stream_HtoD = 0
Non-blocking streams for data transfer.
-
cublasHandle_t cublas_handle(int device = -1)
-
template<class T>
class cudnn_data_type Convert template type to cudnnDataType_t.
-
template<>
class cudnn_data_type<float>
-
template<>
class cudnn_data_type<double>
-
template<>
class cudnn_data_type<half>
-
template<>
class cudnn_data_type<Half>
-
template<>
class cudnn_data_type<HalfCuda>
-
struct CudnnConvDesc
- #include <cudnn.hpp>
cuDNN Convolution Descriptor used as a key to find previously used (cached) config.
Public Functions
-
bool operator==(const CudnnConvDesc &right) const
Operator == compares all elements.
Public Members
-
int ndim
Dimension of spatial samples.
-
int device
Device ID.
-
cudnnDataType_t dtype
Data type.
-
cudnnConvolutionMode_t mode
CUDNN_CONVOLUTION or CUDNN_CROSS_CORRELATION.
-
int n
Batch size.
-
int c
Channels of input.
-
int o
Channels of output.
-
int group
Number of groups.
-
bool channel_last
Channels at last dimension (NHWC).
-
vector<int> sample
Sample size of each dimension.
-
vector<int> kernel
Kernel size of each dimension.
-
vector<int> pad
Padding size of each dimension.
-
vector<int> stride
Stride size of each dimension.
-
vector<int> dilation
Dilation size of each dimension.
-
class Hash
- #include <cudnn.hpp>
Custom hash function for CudnnConvDesc.
-
bool operator==(const CudnnConvDesc &right) const
-
struct CudnnConvolutionDescriptor
- #include <cudnn.hpp>
CUDNN Convolution descriptor wrapper.
-
struct CudnnPoolingDescriptor
- #include <cudnn.hpp>
CUDNN Pooling descriptor wrapper.
-
struct CudnnTensorDescriptor
- #include <cudnn.hpp>
CUDNN tensor descriptor wrapper.
-
struct CudnnActivationDescriptor
- #include <cudnn.hpp>
CUDNN activation descriptor wrapper.
-
class CudnnPooling
- #include <cudnn.hpp>
Common CUDNN pooling function wrapper.
-
class CudnnSoftmax
- #include <cudnn.hpp>
CUDNN softmax function wrapper.
-
struct CudnnConvResource
- #include <cudnn.hpp>
cuDNN Convolution resource cache.
Public Functions
-
size_t max_workspace_size() const
Get maximum workspace size.
-
size_t fwd_workspace_size() const
Get forward workspace size.
-
size_t bwd_filter_workspace_size() const
Get backward-filter workspace size.
-
size_t bwd_data_workspace_size() const
Get backward-data workspace size.
Public Members
-
int device
Device ID.
-
cudnnTensorDescriptor_t x_desc
Input desc.
-
cudnnTensorDescriptor_t y_desc
Output desc.
-
cudnnTensorDescriptor_t b_desc
Bias desc.
-
cudnnTensorDescriptor_t b_desc_deconv
Bias desc for deconvolution.
-
cudnnFilterDescriptor_t w_desc
Weight desc.
-
CudnnConvolutionDescriptor conv_desc
Conv desc.
-
CudnnConvolutionDescriptor conv_dgrad_desc
Conv backward data desc.
-
CudnnConvolutionDescriptor conv_wgrad_desc
Conv backward filter desc.
-
cudnnConvolutionFwdAlgo_t fwd_algo
Best forward algorithm found.
-
cudnnConvolutionBwdFilterAlgo_t bwd_filter_algo
Best backward filter algorithm found.
-
cudnnConvolutionBwdDataAlgo_t bwd_data_algo
Best backward data algorithm found.
-
size_t max_workspace_size() const
-
class CudnnHandleManager
- #include <cudnn.hpp>
Singleton class for storing cudnn handle for CUDA CUDNN Computation.
Public Functions
-
cudnnHandle_t handle(int device = -1, cudaStream_t stream = 0)
Get cuDNN handle for device.
-
Size_t get_workspace_limit_in_bytes()
Get a workspace limit.
The negative value means no limit of workspace size.
注釈
The default value is -1. The default value is overwritten if an environment variable NNABLA_CUDNN_WORKSPACE_LIMIT is specified.
-
void set_workspace_limit_in_bytes(Size_t bytes)
Set a workspace limit.
The negative value means no limit of workspace size.
- パラメータ:
bytes -- [in] Limit in bytes.
Public Members
-
unordered_map<CudnnConvDesc, shared_ptr<CudnnConvResource>, typename CudnnConvDesc::Hash> conv_resource
Hash map for CudnnConvResource.
-
cudnnHandle_t handle(int device = -1, cudaStream_t stream = 0)
-
template<typename T>
class AffineGridCudaCudnn : public nbla::AffineGridCuda<T>
-
template<typename T>
class AveragePoolingCudaCudnn : public nbla::BasePoolingCudaCudnn<AveragePooling<T>>
-
template<typename T>
class BatchNormalizationCudaCudnn : public nbla::BatchNormalizationCuda<T>
-
template<typename T>
class ConvolutionCudaCudnn : public Convolution<T>
-
template<typename T>
class DeconvolutionCudaCudnn : public Deconvolution<T> - #include <deconvolution.hpp>
-
template<typename T>
class FusedBatchNormalizationCudaCudnn : public nbla::FusedBatchNormalizationCuda<T>
-
template<typename T>
class GroupNormalizationCudaCudnn : public nbla::GroupNormalizationCuda<T>
-
template<typename T>
class InstanceNormalizationCudaCudnn : public nbla::InstanceNormalizationCuda<T>
-
template<typename T>
class LayerNormalizationCudaCudnn : public nbla::LayerNormalizationCuda<T>
-
template<typename T>
class LogSoftmaxCudaCudnn : public LogSoftmax<T> - #include <log_softmax.hpp>
注釈
The default algorithm is set as ACCURATE. TODO: Set an algorithm by context.
-
template<typename T>
class MaxPoolingCudaCudnn : public nbla::BasePoolingCudaCudnn<MaxPooling<T>::base_pooling_type>
-
struct WCudnnTensorDescArray
-
struct WCudnnTensorDesc
-
struct WCudnnFilterDesc
-
struct WCudnnDropoutDesc
-
struct WCudnnRNNDesc
-
template<typename T>
class SoftmaxCudaCudnn : public Softmax<T> - #include <softmax.hpp>
注釈
The default algorithm is set as ACCURATE.
-
template<typename T>
class SumPoolingCudaCudnn : public SumPooling<T>
-
template<typename T>
class SyncBatchNormalizationCudaCudnn : public nbla::SyncBatchNormalizationCuda<T>
-
template<typename T>
class TensorNormalizationCudaCudnn : public TensorNormalization<T>
-
template<typename BasePoolingType>
class BasePoolingCudaCudnn : public BasePoolingType - #include <base_pooling.hpp>
Base class of CUDNN Pooling functions.
-
template<typename T>
class WarpByGridCudaCudnn : public nbla::WarpByGridCuda<T>
-
template<typename T>
class WeightStandardizationCudaCudnn : public WeightStandardization<T>
-
template<typename T>
class AdaptiveSeparableConvolutionCuda : public AdaptiveSeparableConvolution<T>
-
template<typename T>
class Add2Cuda : public Add2<T> Subclassed by nbla::Add2CudaCudnn< T >
-
template<typename T>
class AffineGridCuda : public AffineGrid<T> Subclassed by nbla::AffineGridCudaCudnn< T >
-
template<typename T>
class BatchCholeskyCuda : public BatchCholesky<T>
-
template<typename T>
class BatchLogdetCuda : public BatchLogdet<T>
-
template<typename T>
class BatchMatmulCuda : public BatchMatmul<T> - #include <batch_matmul.hpp>
-
template<typename T>
class BatchNormalizationCuda : public BatchNormalization<T> Subclassed by nbla::BatchNormalizationCudaCudnn< T >
-
template<typename T>
class BinaryConnectAffineCuda : public BinaryConnectAffine<T> - #include <binary_connect_affine.hpp>
-
template<typename T>
class BinaryConnectConvolutionCuda : public BinaryConnectConvolution<T> - #include <binary_connect_convolution.hpp>
-
template<typename T>
class BinaryCrossEntropyCuda : public BinaryCrossEntropy<T>
-
template<typename T>
class BinaryErrorCuda : public BinaryError<T> - #include <binary_error.hpp>
-
template<typename T>
class BinaryWeightAffineCuda : public BinaryWeightAffine<T>
-
template<typename T>
class BinaryWeightConvolutionCuda : public BinaryWeightConvolution<T>
-
template<typename T>
class BoolGatherCuda : public BoolGather<T>
-
template<typename T>
class BoolScatterCuda : public BoolScatter<T>
-
template<typename T, typename Tl = int>
class CategoricalCrossEntropyCuda : public CategoricalCrossEntropy<T>
-
template<typename T>
class ClipGradByNormCuda : public ClipGradByNorm<T> - #include <clip_grad_by_norm.hpp>
-
template<typename T>
class ClipGradByValueCuda : public ClipGradByValue<T> - #include <clip_grad_by_value.hpp>
-
template<typename T>
class ConcatenateCuda : public Concatenate<T>
-
template<typename T>
class ConvolutionCuda : public Convolution<T>
-
template<typename T>
class DeconvolutionCuda : public Deconvolution<T> - #include <deconvolution.hpp>
-
template<typename T>
class DeformableConvolutionCuda : public DeformableConvolution<T>
-
template<typename T>
class DepthwiseConvolutionCuda : public DepthwiseConvolution<T>
-
template<typename T>
class DepthwiseDeconvolutionCuda : public DepthwiseDeconvolution<T>
-
template<typename T>
class DequantizeLinearCuda : public DequantizeLinear<T>
-
template<typename T>
class FixedPointQuantizeCuda : public FixedPointQuantize<T> - #include <fixed_point_quantize.hpp>
- Todo:
PLACE HERE FUNCTION DOCUMENTATION.
-
template<typename T>
class FusedBatchNormalizationCuda : public FusedBatchNormalization<T> Subclassed by nbla::FusedBatchNormalizationCudaCudnn< T >
-
template<typename T>
class GroupNormalizationCuda : public GroupNormalization<T> Subclassed by nbla::GroupNormalizationCudaCudnn< T >
-
template<typename T>
class ImageAugmentationCuda : public ImageAugmentation<T> - #include <image_augmentation.hpp>
-
template<typename T, typename T1>
class INQAffineCuda : public INQAffine<T, T1> - #include <inq_affine.hpp>
-
template<typename T, typename T1>
class INQConvolutionCuda : public INQConvolution<T, T1> - #include <inq_convolution.hpp>
-
template<typename T>
class InstanceNormalizationCuda : public InstanceNormalization<T> Subclassed by nbla::InstanceNormalizationCudaCudnn< T >
-
template<typename T>
class InterpolateCuda : public Interpolate<T>
-
template<typename T>
class LayerNormalizationCuda : public LayerNormalization<T> Subclassed by nbla::LayerNormalizationCudaCudnn< T >
-
template<typename T>
class MatrixDiagCuda : public MatrixDiag<T> - #include <matrix_diag.hpp>
-
template<typename T>
class MatrixDiagPartCuda : public MatrixDiagPart<T> - #include <matrix_diag_part.hpp>
-
template<typename T>
class MaxPoolingBackwardCuda : public MaxPoolingBackward<T>
-
template<typename T>
class MeanCuda : public Mean<T> - #include <mean.hpp>
Subclassed by nbla::MeanCudaCudnn< T >
-
template<typename T>
class MeanSubtractionCuda : public MeanSubtraction<T> - #include <mean_subtraction.hpp>
-
template<typename T>
class MinMaxQuantizeCuda : public MinMaxQuantize<T>
-
template<typename T>
class NormNormalizationCuda : public NormNormalization<T>
-
template<typename T>
class PackPaddedSequenceCuda : public PackPaddedSequence<T>
-
template<typename T>
class PadPackedSequenceCuda : public PadPackedSequence<T>
-
template<typename T>
class PatchCorrelationCuda : public PatchCorrelation<T>
-
template<typename T>
class Pow2QuantizeCuda : public Pow2Quantize<T> - #include <pow2_quantize.hpp>
- Todo:
PLACE HERE FUNCTION DOCUMENTATION.
-
template<typename T>
class ProdCuda : public Prod<T> - #include <prod.hpp>
Subclassed by nbla::ProdCudaCudnn< T >
-
template<typename T>
class QuantizeLinearCuda : public QuantizeLinear<T>
-
template<typename T>
class RandomChoiceCuda : public RandomChoice<T>
-
template<typename T>
class RandomCropCuda : public RandomCrop<T>
-
template<typename T>
class RandomEraseCuda : public RandomErase<T>
-
template<typename T>
class RandomFlipCuda : public RandomFlip<T>
-
template<typename T>
class ReduceMeanCuda : public ReduceMean<T>
-
template<typename T>
class ScatterAddCuda : public ScatterAdd<T>
-
template<typename T>
class SearchSortedCuda : public SearchSorted<T>
-
template<typename T, typename Tl>
class SigmoidCrossEntropyCuda : public SigmoidCrossEntropy<T, Tl>
-
template<typename T, typename Tl = int>
class SoftmaxCrossEntropyCuda : public SoftmaxCrossEntropy<T>
-
template<typename T>
class SpectralNormCuda : public SpectralNorm<T>
-
template<typename T>
class SumCuda : public Sum<T> - #include <sum.hpp>
Subclassed by nbla::SumCudaCudnn< T >
-
template<typename T>
class SyncBatchNormalizationCuda : public SyncBatchNormalization<T> Subclassed by nbla::SyncBatchNormalizationCudaCudnn< T >
-
template<typename T>
class TensorNormalizationCuda : public TensorNormalization<T>
-
template<typename T, typename T1>
class TopNErrorCuda : public TopNError<T, T1> - #include <top_n_error.hpp>
-
template<typename T, typename ...Args>
class TransformBinaryCuda : public BaseTransformBinary<Args...>
-
template<typename T, typename ...Args>
class TransformUnaryCuda : public BaseTransformUnary<Args...>
-
template<typename T>
class WarpByFlowCuda : public WarpByFlow<T>
-
template<typename T>
class WarpByGridCuda : public WarpByGrid<T> Subclassed by nbla::WarpByGridCudaCudnn< T >
-
template<typename T>
class WeightNormalizationCuda : public WeightNormalization<T>
-
template<typename T>
class WeightStandardizationCuda : public WeightStandardization<T>
-
template<typename T>
struct CudaType - #include <half.hpp>
Infer NNabla's CUDA type.
When nbla::Half is passed, it's converted to nbla::HalfCuda which can be used in kernel functions as if it's a built-in scalar type with overloaded operators.
-
template<>
struct CudaType<Half>
-
template<typename T>
struct CudaTypeForceFloat - #include <half.hpp>
Infer NNabla's CUDA type while forcing half to float.
This is used when a particular operation doesn't support fp16 computation (e.g. GEMV in cuBLAS, at least until ver 9.1).
-
template<>
struct CudaTypeForceFloat<Half>
-
template<>
struct CudaTypeForceFloat<HalfCuda>
-
template<>
struct CudaTypeForceFloat<half>
-
template<typename T>
struct CudaNativeType - #include <half.hpp>
Infer CUDA's native data type from NNabla's data type.
In particular, nbla::Half and nbla::HalfCuda are converted to CUDA's half. Otherwise passed through.
-
template<>
struct CudaNativeType<Half>
-
template<>
struct CudaNativeType<HalfCuda>
-
template<class T>
class numeric_limits_cuda
-
template<>
class numeric_limits_cuda<HalfCuda>
-
template<>
class numeric_limits_cuda<float>
-
class CudaMemory : public Memory
- #include <cuda_memory.hpp>
CUDA memory implementation.
A CUDA device memory block allocated by cudaMalloc function is managed by this.
The device passed to the constructor is a device id as a string, such as "0" or "1".
Subclassed by nbla::CudaUnifiedMemory
-
class CudaUnifiedMemory : public nbla::CudaMemory
- #include <cuda_memory.hpp>
CUDA memory implementation using unified memory.
-
class CudaPinnedHostMemory : public CpuMemory
- #include <cuda_memory.hpp>
Pinned host memory implementation.
-
template<typename T>
class RMSpropGravesCuda : public RMSpropGraves<T>
-
template<int NDIM>
struct NdIndex
-
class ReduceSetup
- #include <reduce.hpp>
This class can be used in Function::setup to prepare for the reduction CUDA kernel called in Function::forward or Function::backward.
The input shape is integrated into two dimensions (size_y, size_x) according to the reduction axes; one part is the collection of reduction axes and the other part is the collection of the other axes. The terms x and y are defined as follows for easier understanding of the implemented algorithm.
x: the integrated dimensions including memory continuous dimension.
y: otherwise
For example, consider an input shape (2, 3, 4, 5) and reduction axes (0, 2). The dimensional part of x is (3, 5). That of y is (2, 4). Then
(ndim_y, ndim_x) = (2, 2)
(size_y, size_x) = (8, 15).
The original strides are (60, 20, 5, 1). Then
strides_x_input = (20, 1)
strides_y_input = (60, 5)
strides_x = (5, 1), which is the strides of the x-part shape (3, 5)
strides_y = (4, 1), which is the strides of the y-part shape (2, 4)
-
class ScanSetup
- #include <scan_setup.hpp>
Scan configuration class.
size_input: Total size of the input. size_outer: Total product of input_shape[0:axis-1]. size_scan: input_shape[axis]. size_inner: Total product of input_shape[axis+1:]. e.g. For input_shape == (2, 3, 5, 7) and axis == 2: size_input: 210, size_outer: 6, size_scan: 5, size_inner: 7.
axis: Axis scan will be performed.
exclusive: Whether output[i] excludes input[i] (true) or includes it (false). e.g.) exclusive == false: [1, 2, 3, 4] -> [1, 3, 6, 10] e.g.) exclusive == true: [1, 2, 3, 4] -> [0, 1, 3, 6]
reverse: Scan direction. e.g.) reverse == false: [1, 2, 3, 4] -> [1, 3, 6, 10] e.g.) reverse == true: [1, 2, 3, 4] -> [10, 9, 7, 4]
require_64bit_index: Whether 64bit integer is needed for indexing or size description.
Public Functions
-
void operator()(const Shape_t &shape_input, const int axis, const bool exclusive, const bool reverse)
Initialize member variables from input_shape and other scan conditions.
-
void operator()(const Shape_t &shape_input, const int axis, const bool exclusive, const bool reverse)
-
namespace warp_by_grid
Functions
-
inline bool cudnn_condition(const int size, const std::string mode, const PADDING_MODE padding_mode_t, const bool align_corners, const bool channel_last)
-
inline bool cudnn_condition(const int size, const std::string mode, const PADDING_MODE padding_mode_t, const bool align_corners, const bool channel_last)
-
enum [anonymous]