Skip to content

cuDNN — 深度神经网络加速库

cuDNN(CUDA Deep Neural Network library)是 NVIDIA 专为深度学习设计的 GPU 加速原语库,是 PyTorch、TensorFlow、MXNet 等框架的底层计算引擎。

cuDNN 的定位

PyTorch / TensorFlow / JAX
         ↓
      cuDNN API
         ↓
  卷积 / 池化 / 归一化 / 激活 / RNN / Attention
         ↓
  Tensor Core / CUDA Core
         ↓
      GPU 硬件

cuDNN 提供的不只是算法实现,更重要的是**自动选择最优算法**(Algorithm Selection)和**融合优化**(Kernel Fusion)。

初始化与句柄

cpp
#include <cudnn.h>

// Create the library context. One handle per thread/GPU is typical;
// all subsequent cuDNN calls take it as their first argument.
cudnnHandle_t handle;
cudnnCreate(&handle);

// Bind the handle to a CUDA stream so cuDNN work is issued on that
// stream instead of the legacy default stream.
cudaStream_t stream;
cudaStreamCreate(&stream);
cudnnSetStream(handle, stream);

// Cleanup — destroy the handle first, then the stream it was bound to.
cudnnDestroy(handle);
cudaStreamDestroy(stream);  // fix: the original example leaked the stream

Tensor 描述符

cuDNN 使用描述符(Descriptor)来描述张量的形状和数据类型。

cpp
// A tensor descriptor is a lightweight host-side object describing the
// shape, memory layout, and element type of a device tensor.
cudnnTensorDescriptor_t inputDesc;
cudnnCreateTensorDescriptor(&inputDesc);

// NCHW format: Batch × Channel × Height × Width
cudnnSetTensor4dDescriptor(
    inputDesc,
    CUDNN_TENSOR_NCHW,    // memory layout
    CUDNN_DATA_FLOAT,     // data type
    batch_size,           // N
    channels,             // C
    height,               // H
    width                 // W
);

// Destroy the descriptor when it is no longer needed.
cudnnDestroyTensorDescriptor(inputDesc);

内存布局格式

格式说明适用场景
NCHW标准格式,Channel 优先训练(NVIDIA GPU 默认)
NHWCChannel 最后推理优化,Tensor Core 友好
NCHW_VECT_C向量化 ChannelINT8 推理

卷积(Convolution)

卷积是 CNN 的核心操作,cuDNN 提供多种算法实现。

前向传播

cpp
// Create the filter (weight) descriptor: out_channels × in_channels × kH × kW
cudnnFilterDescriptor_t filterDesc;
cudnnCreateFilterDescriptor(&filterDesc);
cudnnSetFilter4dDescriptor(filterDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW,
    out_channels, in_channels, kernel_h, kernel_w);

// Convolution descriptor: geometry plus compute precision.
cudnnConvolutionDescriptor_t convDesc;
cudnnCreateConvolutionDescriptor(&convDesc);
cudnnSetConvolution2dDescriptor(convDesc,
    pad_h, pad_w,       // padding
    stride_h, stride_w, // stride
    dilation_h, dilation_w,
    CUDNN_CROSS_CORRELATION,  // convolution mode
    CUDNN_DATA_FLOAT);        // compute precision

// Enable Tensor Cores (used automatically for FP16/BF16)
cudnnSetConvolutionMathType(convDesc, CUDNN_TENSOR_OP_MATH);

// Query the output dimensions implied by input/filter/conv descriptors.
int n, c, h, w;
cudnnGetConvolution2dForwardOutputDim(convDesc, inputDesc, filterDesc, &n, &c, &h, &w);

// Algorithm selection: benchmark candidates, returned sorted by speed.
cudnnConvolutionFwdAlgoPerf_t perfResults[10];
int returnedAlgoCount;
cudnnFindConvolutionForwardAlgorithm(handle,
    inputDesc, filterDesc, convDesc, outputDesc,
    10, &returnedAlgoCount, perfResults);

cudnnConvolutionFwdAlgo_t algo = perfResults[0].algo;  // fastest algorithm

// Scratch workspace required by the chosen algorithm (may be 0 bytes).
size_t workspaceSize;
cudnnGetConvolutionForwardWorkspaceSize(handle,
    inputDesc, filterDesc, convDesc, outputDesc, algo, &workspaceSize);

void* workspace = nullptr;
cudaMalloc(&workspace, workspaceSize);

// Run the convolution: output = alpha * conv(input, filter) + beta * output
float alpha = 1.0f, beta = 0.0f;
cudnnConvolutionForward(handle,
    &alpha, inputDesc, d_input,
    filterDesc, d_filter,
    convDesc, algo,
    workspace, workspaceSize,
    &beta, outputDesc, d_output);

// Cleanup — fix: the original example leaked the workspace and descriptors.
cudaFree(workspace);
cudnnDestroyConvolutionDescriptor(convDesc);
cudnnDestroyFilterDescriptor(filterDesc);

卷积算法对比

算法原理适用场景
IMPLICIT_GEMM隐式 im2col + GEMM通用
IMPLICIT_PRECOMP_GEMM预计算索引小 kernel
GEMM显式 im2col + GEMM大 batch
DIRECT直接卷积小 kernel,小 batch
FFT频域卷积大 kernel
FFT_TILING分块 FFT大输入
WINOGRADWinograd 算法3×3 kernel(最常用)

Winograd 算法

对于 3×3 卷积(ResNet、VGG 等最常见的 kernel 大小),Winograd 算法将乘法次数从 9 减少到 4,速度提升约 2.25x。

批归一化(Batch Normalization)

cpp
// SPATIAL mode: one scale/bias/mean/variance per channel (standard for CNNs).
cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;

// Forward pass (training mode): computes batch statistics, blends them into
// the running mean/variance using exponentialAverageFactor, and saves the
// batch mean / inverse variance (d_savedMean / d_savedInvVariance) for the
// backward pass.
cudnnBatchNormalizationForwardTraining(handle,
    mode,
    &alpha, &beta,
    inputDesc, d_input,
    outputDesc, d_output,
    bnScaleBiasMeanVarDesc,
    d_scale, d_bias,
    exponentialAverageFactor,
    d_runningMean, d_runningVariance,
    epsilon,
    d_savedMean, d_savedInvVariance);

// Forward pass (inference mode: uses the precomputed running statistics)
cudnnBatchNormalizationForwardInference(handle,
    mode, &alpha, &beta,
    inputDesc, d_input,
    outputDesc, d_output,
    bnScaleBiasMeanVarDesc,
    d_scale, d_bias,
    d_estimatedMean, d_estimatedVariance,
    epsilon);

激活函数

cpp
// Activation descriptor: selects the element-wise nonlinearity.
cudnnActivationDescriptor_t activDesc;
cudnnCreateActivationDescriptor(&activDesc);
cudnnSetActivationDescriptor(activDesc,
    CUDNN_ACTIVATION_RELU,  // ReLU / SIGMOID / TANH / ELU / SWISH
    CUDNN_PROPAGATE_NAN,
    0.0);  // coef (used by CLIPPED_RELU etc.)

// output = alpha * act(input) + beta * output
cudnnActivationForward(handle,
    activDesc,
    &alpha, inputDesc, d_input,
    &beta, outputDesc, d_output);

// Fix: release the descriptor — the original example leaked it.
cudnnDestroyActivationDescriptor(activDesc);

池化(Pooling)

cpp
// Pooling descriptor: mode, window, padding, and stride.
cudnnPoolingDescriptor_t poolDesc;
cudnnCreatePoolingDescriptor(&poolDesc);
cudnnSetPooling2dDescriptor(poolDesc,
    CUDNN_POOLING_MAX,  // MAX / AVERAGE_COUNT_INCLUDE_PADDING
    CUDNN_PROPAGATE_NAN,
    window_h, window_w,
    pad_h, pad_w,
    stride_h, stride_w);

// output = alpha * pool(input) + beta * output
cudnnPoolingForward(handle,
    poolDesc,
    &alpha, inputDesc, d_input,
    &beta, outputDesc, d_output);

// Fix: release the descriptor — the original example leaked it.
cudnnDestroyPoolingDescriptor(poolDesc);

Multi-Head Attention(cuDNN 8+)

cpp
// Attention descriptor: head count, data/compute types, per-tensor sizes,
// projection sizes, sequence lengths, batch and beam dimensions.
cudnnAttnDescriptor_t attnDesc;
cudnnCreateAttnDescriptor(&attnDesc);
cudnnSetAttnDescriptor(attnDesc,
    CUDNN_ATTN_QUERYMAP_ALL_TO_ALL,
    nHeads,
    smScaler,           // softmax scaling factor 1/sqrt(d_k)
    CUDNN_DATA_FLOAT,
    CUDNN_DATA_FLOAT,
    CUDNN_DEFAULT_MATH,
    NULL,               // attention dropout descriptor (none here)
    NULL,               // post-projection dropout descriptor (none here)
    qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize,
    seqLenQ, seqLenK,
    batchSize, beamSize);

// Forward attention pass. devSeqLengths* are device arrays of per-example
// sequence lengths; reserveSpace is only needed when training.
// NOTE(review): loWinIdx/hiWinIdx bound the attended key window per query
// time step — confirm their construction against the cuDNN docs.
cudnnMultiHeadAttnForward(handle,
    attnDesc,
    currIdx,
    loWinIdx, hiWinIdx,
    devSeqLengthsQO, devSeqLengthsKV,
    qDesc, queries,
    NULL, residuals,
    kDesc, keys,
    vDesc, values,
    oDesc, output,
    weightSizeInBytes, weights,
    workSpaceSizeInBytes, workSpace,
    reserveSpaceSizeInBytes, reserveSpace);

cuDNN 8.x 新特性:Graph API

cuDNN 8 引入了基于计算图的 API,支持**自动 Kernel 融合**。

cpp
// Build the computation graph (cuDNN 8 frontend Graph API)
cudnn_frontend::graph::Graph graph;
graph.set_io_data_type(cudnn_frontend::DataType_t::HALF)
     .set_intermediate_data_type(cudnn_frontend::DataType_t::FLOAT)
     .set_compute_data_type(cudnn_frontend::DataType_t::FLOAT);

// Add operation nodes: input tensor X with NCHW dims and packed strides
auto X = graph.tensor(cudnn_frontend::graph::Tensor_attributes()
    .set_name("X").set_dim({N, C, H, W}).set_stride({C*H*W, H*W, W, 1}));

auto W = graph.tensor(...);  // filter tensor (attributes elided in this example)

// Convolution forward node
auto conv_output = graph.conv_fprop(X, W,
    cudnn_frontend::graph::Conv_fprop_attributes()
        .set_padding({1, 1}).set_stride({1, 1}).set_dilation({1, 1}));

// Pointwise ReLU applied to the convolution output
auto Y = graph.pointwise(conv_output,
    cudnn_frontend::graph::Pointwise_attributes()
        .set_mode(cudnn_frontend::PointwiseMode_t::RELU_FWD));

// cuDNN automatically fuses Conv + ReLU into a single kernel
graph.validate();
graph.build_operation_graph(handle);
graph.create_execution_plans({cudnn_frontend::HeurMode_t::A});
graph.build_plans(handle);
graph.execute(handle, variant_pack, workspace);

性能对比(ResNet-50,A100)

操作手写 CUDAcuDNN提升
Conv 3×3~85% 峰值~95% 峰值1.1x
BN + ReLU分开执行融合执行1.8x
完整前向传播——2-3x vs CPU

下一篇:NCCL — 多GPU集合通信 →

基于 NVIDIA CUDA 官方文档整理