cuDNN —— 深度神经网络加速库

cuDNN(CUDA Deep Neural Network library)是 NVIDIA 专为深度学习设计的 GPU 加速原语库,是 PyTorch、TensorFlow、MXNet 等框架的底层计算引擎。
cuDNN 的定位

PyTorch / TensorFlow / JAX
↓
cuDNN API
↓
卷积 / 池化 / 归一化 / 激活 / RNN / Attention
↓
Tensor Core / CUDA Core
↓
GPU 硬件

cuDNN 提供的不只是算法实现,更重要的是**自动选择最优算法**(Algorithm Selection)和**融合优化**(Kernel Fusion)。
初始化与句柄
cpp
// Create a cuDNN library handle; it owns library state and must be
// passed to every subsequent cuDNN call.
#include <cudnn.h>
cudnnHandle_t handle;
cudnnCreate(&handle);
// Bind the handle to a CUDA stream so cuDNN work is ordered on it
cudaStream_t stream;
cudaStreamCreate(&stream);
cudnnSetStream(handle, stream);
// Cleanup
cudnnDestroy(handle);

Tensor 描述符
cuDNN 使用描述符(Descriptor)来描述张量的形状和数据类型。
cpp
// Describe a 4D tensor: memory layout, element type, and N/C/H/W extents.
cudnnTensorDescriptor_t inputDesc;
cudnnCreateTensorDescriptor(&inputDesc);
// NCHW layout: Batch × Channel × Height × Width
cudnnSetTensor4dDescriptor(
inputDesc,
CUDNN_TENSOR_NCHW, // memory layout
CUDNN_DATA_FLOAT, // element data type
batch_size, // N
channels, // C
height, // H
width // W
);
cudnnDestroyTensorDescriptor(inputDesc);

内存布局格式
| 格式 | 说明 | 适用场景 |
|---|---|---|
| NCHW | 标准格式,Channel 优先 | 训练(NVIDIA GPU 默认) |
| NHWC | Channel 最后 | 推理优化,Tensor Core 友好 |
| NCHW_VECT_C | 向量化 Channel | INT8 推理 |
卷积(Convolution)

卷积是 CNN 的核心操作,cuDNN 提供多种算法实现。
前向传播
cpp
// Create the filter and convolution descriptors
cudnnFilterDescriptor_t filterDesc;
cudnnCreateFilterDescriptor(&filterDesc);
cudnnSetFilter4dDescriptor(filterDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW,
out_channels, in_channels, kernel_h, kernel_w);
cudnnConvolutionDescriptor_t convDesc;
cudnnCreateConvolutionDescriptor(&convDesc);
cudnnSetConvolution2dDescriptor(convDesc,
pad_h, pad_w, // padding
stride_h, stride_w, // stride
dilation_h, dilation_w,
CUDNN_CROSS_CORRELATION, // convolution mode (cross-correlation, as in DL frameworks)
CUDNN_DATA_FLOAT); // compute precision
// Enable Tensor Cores (used automatically for FP16/BF16 data)
cudnnSetConvolutionMathType(convDesc, CUDNN_TENSOR_OP_MATH);
// Query the output dimensions implied by input/filter/conv settings
int n, c, h, w;
cudnnGetConvolution2dForwardOutputDim(convDesc, inputDesc, filterDesc, &n, &c, &h, &w);
// Algorithm selection: benchmark candidates, returned sorted fastest-first
cudnnConvolutionFwdAlgoPerf_t perfResults[10];
int returnedAlgoCount;
cudnnFindConvolutionForwardAlgorithm(handle,
inputDesc, filterDesc, convDesc, outputDesc,
10, &returnedAlgoCount, perfResults);
cudnnConvolutionFwdAlgo_t algo = perfResults[0].algo; // fastest algorithm
// Workspace: scratch memory required by the chosen algorithm
size_t workspaceSize;
cudnnGetConvolutionForwardWorkspaceSize(handle,
inputDesc, filterDesc, convDesc, outputDesc, algo, &workspaceSize);
void* workspace;
cudaMalloc(&workspace, workspaceSize);
// Run the convolution: output = alpha * conv(input, filter) + beta * output
float alpha = 1.0f, beta = 0.0f;
cudnnConvolutionForward(handle,
&alpha, inputDesc, d_input,
filterDesc, d_filter,
convDesc, algo,
workspace, workspaceSize,
&beta, outputDesc, d_output);

卷积算法对比
| 算法 | 原理 | 适用场景 |
|---|---|---|
| IMPLICIT_GEMM | 隐式 im2col + GEMM | 通用 |
| IMPLICIT_PRECOMP_GEMM | 预计算索引 | 大 kernel |
| GEMM | 显式 im2col + GEMM | 小 batch |
| DIRECT | 直接卷积 | 小 kernel,小 batch |
| FFT | 频域卷积 | 大 kernel |
| FFT_TILING | 分块 FFT | 大输入 |
| WINOGRAD | Winograd 算法 | 3×3 kernel(最常用) |
Winograd 算法
对于 3×3 卷积(ResNet、VGG 等最常见的 kernel 大小),Winograd 算法将乘法次数从 9 减少到 4,速度提升约 2.25x。

批归一化(Batch Normalization)
cpp
// SPATIAL mode: statistics are computed per channel (standard for conv layers)
cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;
// Forward (training mode: also updates the running mean/variance and
// saves per-batch statistics for the backward pass)
cudnnBatchNormalizationForwardTraining(handle,
mode,
&alpha, &beta,
inputDesc, d_input,
outputDesc, d_output,
bnScaleBiasMeanVarDesc,
d_scale, d_bias,
exponentialAverageFactor,
d_runningMean, d_runningVariance,
epsilon,
d_savedMean, d_savedInvVariance);
// Forward (inference mode: uses the precomputed running statistics)
cudnnBatchNormalizationForwardInference(handle,
mode, &alpha, &beta,
inputDesc, d_input,
outputDesc, d_output,
bnScaleBiasMeanVarDesc,
d_scale, d_bias,
d_estimatedMean, d_estimatedVariance,
epsilon);

激活函数
cpp
// Elementwise activation: output = alpha * act(input) + beta * output
cudnnActivationDescriptor_t activDesc;
cudnnCreateActivationDescriptor(&activDesc);
cudnnSetActivationDescriptor(activDesc,
CUDNN_ACTIVATION_RELU, // ReLU / SIGMOID / TANH / ELU / SWISH
CUDNN_PROPAGATE_NAN,
0.0); // coef (used by CLIPPED_RELU etc.)
cudnnActivationForward(handle,
activDesc,
&alpha, inputDesc, d_input,
&beta, outputDesc, d_output);

池化(Pooling)
cpp
// 2D pooling over each channel's spatial window
cudnnPoolingDescriptor_t poolDesc;
cudnnCreatePoolingDescriptor(&poolDesc);
cudnnSetPooling2dDescriptor(poolDesc,
CUDNN_POOLING_MAX, // MAX / AVERAGE_COUNT_INCLUDE_PADDING
CUDNN_PROPAGATE_NAN,
window_h, window_w,
pad_h, pad_w,
stride_h, stride_w);
cudnnPoolingForward(handle,
poolDesc,
&alpha, inputDesc, d_input,
&beta, outputDesc, d_output);

Multi-Head Attention(cuDNN 8+)
cpp
// Configure and run cuDNN's fused multi-head attention (cuDNN 8+).
cudnnAttnDescriptor_t attnDesc;
cudnnCreateAttnDescriptor(&attnDesc);
cudnnSetAttnDescriptor(attnDesc,
CUDNN_ATTN_QUERYMAP_ALL_TO_ALL,
nHeads,
smScaler, // softmax scaling factor 1/√d_k
CUDNN_DATA_FLOAT,
CUDNN_DATA_FLOAT,
CUDNN_DEFAULT_MATH,
NULL, // dropout descriptor (NULL = no dropout)
NULL,
qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize,
seqLenQ, seqLenK,
batchSize, beamSize);
cudnnMultiHeadAttnForward(handle,
attnDesc,
currIdx,
loWinIdx, hiWinIdx,
devSeqLengthsQO, devSeqLengthsKV,
qDesc, queries,
NULL, residuals,
kDesc, keys,
vDesc, values,
oDesc, output,
weightSizeInBytes, weights,
workSpaceSizeInBytes, workSpace,
reserveSpaceSizeInBytes, reserveSpace);

cuDNN 8.x 新特性:Graph API
cuDNN 8 引入了基于计算图的 API,支持**自动 Kernel 融合**。
cpp
// Build the computation graph (cuDNN frontend Graph API)
cudnn_frontend::graph::Graph graph;
graph.set_io_data_type(cudnn_frontend::DataType_t::HALF)
.set_intermediate_data_type(cudnn_frontend::DataType_t::FLOAT)
.set_compute_data_type(cudnn_frontend::DataType_t::FLOAT);
// Add operation nodes
auto X = graph.tensor(cudnn_frontend::graph::Tensor_attributes()
.set_name("X").set_dim({N, C, H, W}).set_stride({C*H*W, H*W, W, 1}));
auto W = graph.tensor(...);
auto conv_output = graph.conv_fprop(X, W,
cudnn_frontend::graph::Conv_fprop_attributes()
.set_padding({1, 1}).set_stride({1, 1}).set_dilation({1, 1}));
auto Y = graph.pointwise(conv_output,
cudnn_frontend::graph::Pointwise_attributes()
.set_mode(cudnn_frontend::PointwiseMode_t::RELU_FWD));
// cuDNN automatically fuses Conv + ReLU into a single kernel
graph.validate();
graph.build_operation_graph(handle);
graph.create_execution_plans({cudnn_frontend::HeurMode_t::A});
graph.build_plans(handle);
graph.execute(handle, variant_pack, workspace);

性能对比(ResNet-50,A100)
| 操作 | 手写 CUDA | cuDNN | 提升 |
|---|---|---|---|
| Conv 3×3 | ~85% 峰值 | ~95% 峰值 | 1.1x |
| BN + ReLU | 分开执行 | 融合执行 | 1.8x |
| 完整前向 | — | — | 2-3x vs CPU |