Thrust �?GPU 并行算法�?

Thrust �?NVIDIA 提供的类 C++ STL �?GPU 并行算法库，让开发者无需编写 CUDA Kernel 即可使用高性能�?GPU 并行算法�?

Thrust 的设计哲�?

Thrust �?API �?C++ STL 高度相似，降低了 GPU 编程门槛�?

cpp

// STL（CPU�?
std::sort(vec.begin(), vec.end());
std::reduce(vec.begin(), vec.end(), 0);

// Thrust（GPU�?
thrust::sort(d_vec.begin(), d_vec.end());
thrust::reduce(d_vec.begin(), d_vec.end(), 0);

Thrust 支持多种执行后端�?

thrust::device �?CUDA GPU（默认）
thrust::host �?CPU（串行）
thrust::omp �?OpenMP 多线�?
thrust::tbb �?Intel TBB

核心容器

device_vector �?host_vector

cpp

#include <thrust/device_vector.h>
#include <thrust/host_vector.h>

// 主机向量（CPU 内存�?
thrust::host_vector<float> h_vec(1000, 1.0f);  // 1000�?.0

// 设备向量（GPU 内存�?
thrust::device_vector<float> d_vec(1000);

// 自动内存传输（赋值触�?H2D 拷贝�?
d_vec = h_vec;

// �?STL vector 初始�?
std::vector<int> stl_vec = {1, 2, 3, 4, 5};
thrust::device_vector<int> d_stl(stl_vec);

// 访问元素（触�?D2H 拷贝，慎用）
float val = d_vec[0];

// 获取原始指针（传�?CUDA Kernel�?
float* raw_ptr = thrust::raw_pointer_cast(d_vec.data());

排序算法

cpp

#include <thrust/sort.h>

thrust::device_vector<int> d_keys = {5, 3, 1, 4, 2};
thrust::device_vector<float> d_vals = {0.5, 0.3, 0.1, 0.4, 0.2};

// 简单排�?
thrust::sort(d_keys.begin(), d_keys.end());
// 结果：[1, 2, 3, 4, 5]

// 降序排序
thrust::sort(d_keys.begin(), d_keys.end(), thrust::greater<int>());

// 按键排序（同时重排值）
thrust::sort_by_key(d_keys.begin(), d_keys.end(), d_vals.begin());

// 稳定排序
thrust::stable_sort(d_keys.begin(), d_keys.end());

排序性能（A100，FP32�?亿元素）

算法	时间	吞吐�?
thrust::sort	~180 ms	~2.2 GB/s
thrust::sort_by_key	~350 ms	~1.1 GB/s
std::sort (CPU)	~8000 ms	~50 MB/s

归约算法

cpp

#include <thrust/reduce.h>

thrust::device_vector<float> d_vec(N, 1.0f);

// 求和
float sum = thrust::reduce(d_vec.begin(), d_vec.end(), 0.0f);

// 求最大�?
float maxVal = thrust::reduce(d_vec.begin(), d_vec.end(),
    -FLT_MAX, thrust::maximum<float>());

// 自定义归约操�?
struct AbsMax {
    __host__ __device__
    float operator()(float a, float b) {
        return fabs(a) > fabs(b) ? fabs(a) : fabs(b);
    }
};
float absMax = thrust::reduce(d_vec.begin(), d_vec.end(), 0.0f, AbsMax());

前缀扫描（Scan�?

cpp

#include <thrust/scan.h>

thrust::device_vector<int> d_in  = {1, 2, 3, 4, 5};
thrust::device_vector<int> d_out(5);

// 包含扫描（inclusive scan）：每个位置包含自身
thrust::inclusive_scan(d_in.begin(), d_in.end(), d_out.begin());
// 结果：[1, 3, 6, 10, 15]

// 排除扫描（exclusive scan）：每个位置不包含自�?
thrust::exclusive_scan(d_in.begin(), d_in.end(), d_out.begin(), 0);
// 结果：[0, 1, 3, 6, 10]

// 分段扫描（按 key 分组�?
thrust::device_vector<int> d_keys = {0, 0, 1, 1, 2};
thrust::inclusive_scan_by_key(d_keys.begin(), d_keys.end(),
    d_in.begin(), d_out.begin());
// 结果：[1, 3, 3, 7, 5]

变换（Transform�?

cpp

#include <thrust/transform.h>
#include <thrust/functional.h>

thrust::device_vector<float> d_x(N, 2.0f);
thrust::device_vector<float> d_y(N, 3.0f);
thrust::device_vector<float> d_z(N);

// 元素级加�?
thrust::transform(d_x.begin(), d_x.end(),
                  d_y.begin(),
                  d_z.begin(),
                  thrust::plus<float>());

// 自定义变换（Lambda �?Functor�?
thrust::transform(d_x.begin(), d_x.end(), d_z.begin(),
    [] __host__ __device__ (float x) { return x * x + 1.0f; });

// SAXPY: z = alpha * x + y
float alpha = 2.0f;
thrust::transform(d_x.begin(), d_x.end(), d_y.begin(), d_z.begin(),
    [alpha] __host__ __device__ (float x, float y) {
        return alpha * x + y;
    });

过滤（Copy If / Remove If�?

cpp

#include <thrust/copy.h>
#include <thrust/remove.h>

thrust::device_vector<int> d_in = {1, -2, 3, -4, 5, -6};
thrust::device_vector<int> d_out(6);

// 只保留正�?
auto end = thrust::copy_if(d_in.begin(), d_in.end(), d_out.begin(),
    [] __host__ __device__ (int x) { return x > 0; });
// d_out: [1, 3, 5, ...]，end 指向有效数据末尾

// 移除负数（原地）
auto new_end = thrust::remove_if(d_in.begin(), d_in.end(),
    [] __host__ __device__ (int x) { return x < 0; });
d_in.erase(new_end, d_in.end());
// d_in: [1, 3, 5]

计数与查�?

cpp

#include <thrust/count.h>
#include <thrust/find.h>

// 计数满足条件的元�?
int count = thrust::count_if(d_vec.begin(), d_vec.end(),
    [] __host__ __device__ (int x) { return x > 0; });

// 查找第一个满足条件的元素
auto it = thrust::find_if(d_vec.begin(), d_vec.end(),
    [] __host__ __device__ (int x) { return x == 42; });

if (it != d_vec.end()) {
    int pos = it - d_vec.begin();
    printf("Found at position %d\n", pos);
}

去重与集合操�?

cpp

#include <thrust/unique.h>
#include <thrust/set_operations.h>

// 去重（需要先排序�?
thrust::sort(d_vec.begin(), d_vec.end());
auto new_end = thrust::unique(d_vec.begin(), d_vec.end());
d_vec.erase(new_end, d_vec.end());

// 集合操作
thrust::device_vector<int> A = {1, 2, 3, 4};
thrust::device_vector<int> B = {2, 4, 6};
thrust::device_vector<int> C(7);

// 并集
auto end = thrust::set_union(A.begin(), A.end(), B.begin(), B.end(), C.begin());
// C: [1, 2, 3, 4, 6]

// 交集
end = thrust::set_intersection(A.begin(), A.end(), B.begin(), B.end(), C.begin());
// C: [2, 4]

�?CUDA Kernel 混合使用

cpp

// 获取原始指针传给自定�?Kernel
thrust::device_vector<float> d_data(N);

// �?Thrust 初始�?
thrust::sequence(d_data.begin(), d_data.end(), 0.0f);

// 传给自定�?Kernel
float* raw = thrust::raw_pointer_cast(d_data.data());
myCustomKernel<<<grid, block>>>(raw, N);

// 再用 Thrust 后处�?
float sum = thrust::reduce(d_data.begin(), d_data.end(), 0.0f);

执行策略

cpp

#include <thrust/execution_policy.h>

// 在指�?Stream 上执行（避免同步�?
cudaStream_t stream;
cudaStreamCreate(&stream);

thrust::sort(thrust::cuda::par.on(stream),
    d_vec.begin(), d_vec.end());

// CPU 执行（调试用�?
thrust::sort(thrust::host, h_vec.begin(), h_vec.end());

实战：Top-K 选取

cpp

// �?N 个元素中选出最大的 K �?
void topK(thrust::device_vector<float>& d_scores,
          thrust::device_vector<int>& d_indices, int K) {
    int N = d_scores.size();
    
    // 创建索引序列
    thrust::sequence(d_indices.begin(), d_indices.end());
    
    // 按分数降序排列索�?
    thrust::sort_by_key(d_scores.begin(), d_scores.end(),
                        d_indices.begin(),
                        thrust::greater<float>());
    
    // 截取�?K �?
    d_scores.resize(K);
    d_indices.resize(K);
}

下一篇：CUDA Streams 与异步执�?→

Thrust �?GPU 并行算法�? ​

Thrust 的设计哲�? ​

核心容器 ​

device_vector �?host_vector ​

排序算法 ​

排序性能（A100，FP32�?亿元素） ​

归约算法 ​

前缀扫描（Scan�? ​

变换（Transform�? ​

过滤（Copy If / Remove If�? ​

计数与查�? ​

去重与集合操�? ​

�?CUDA Kernel 混合使用 ​

执行策略 ​

实战：Top-K 选取 ​

Thrust �?GPU 并行算法�?

Thrust 的设计哲�?

核心容器

device_vector �?host_vector

排序算法

排序性能（A100，FP32�?亿元素）

归约算法

前缀扫描（Scan�?

变换（Transform�?

过滤（Copy If / Remove If�?

计数与查�?

去重与集合操�?

�?CUDA Kernel 混合使用

执行策略

实战：Top-K 选取