Thrust �?GPU 并行算法�?
Thrust �?NVIDIA 提供的类 C++ STL �?GPU 并行算法库,让开发者无需编写 CUDA Kernel 即可使用高性能�?GPU 并行算法�?
Thrust 的设计哲�?
Thrust �?API �?C++ STL 高度相似,降低了 GPU 编程门槛�?
cpp
// STL(CPU�?
std::sort(vec.begin(), vec.end());
std::reduce(vec.begin(), vec.end(), 0);
// Thrust(GPU�?
thrust::sort(d_vec.begin(), d_vec.end());
thrust::reduce(d_vec.begin(), d_vec.end(), 0);Thrust 支持多种执行后端�?
thrust::device�?CUDA GPU(默认)thrust::host�?CPU(串行)thrust::omp�?OpenMP 多线�?thrust::tbb�?Intel TBB
核心容器
device_vector �?host_vector
cpp
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
// 主机向量(CPU 内存�?
thrust::host_vector<float> h_vec(1000, 1.0f); // 1000�?.0
// 设备向量(GPU 内存�?
thrust::device_vector<float> d_vec(1000);
// 自动内存传输(赋值触�?H2D 拷贝�?
d_vec = h_vec;
// �?STL vector 初始�?
std::vector<int> stl_vec = {1, 2, 3, 4, 5};
thrust::device_vector<int> d_stl(stl_vec);
// 访问元素(触�?D2H 拷贝,慎用)
float val = d_vec[0];
// 获取原始指针(传�?CUDA Kernel�?
float* raw_ptr = thrust::raw_pointer_cast(d_vec.data());排序算法
cpp
#include <thrust/sort.h>
thrust::device_vector<int> d_keys = {5, 3, 1, 4, 2};
thrust::device_vector<float> d_vals = {0.5, 0.3, 0.1, 0.4, 0.2};
// 简单排�?
thrust::sort(d_keys.begin(), d_keys.end());
// 结果:[1, 2, 3, 4, 5]
// 降序排序
thrust::sort(d_keys.begin(), d_keys.end(), thrust::greater<int>());
// 按键排序(同时重排值)
thrust::sort_by_key(d_keys.begin(), d_keys.end(), d_vals.begin());
// 稳定排序
thrust::stable_sort(d_keys.begin(), d_keys.end());排序性能(A100,FP32�?亿元素)
| 算法 | 时间 | 吞吐�? |
|---|---|---|
| thrust::sort | ~180 ms | ~2.2 GB/s |
| thrust::sort_by_key | ~350 ms | ~1.1 GB/s |
| std::sort (CPU) | ~8000 ms | ~50 MB/s |
归约算法
cpp
#include <thrust/reduce.h>
thrust::device_vector<float> d_vec(N, 1.0f);
// 求和
float sum = thrust::reduce(d_vec.begin(), d_vec.end(), 0.0f);
// 求最大�?
float maxVal = thrust::reduce(d_vec.begin(), d_vec.end(),
-FLT_MAX, thrust::maximum<float>());
// 自定义归约操�?
struct AbsMax {
__host__ __device__
float operator()(float a, float b) {
return fabs(a) > fabs(b) ? fabs(a) : fabs(b);
}
};
float absMax = thrust::reduce(d_vec.begin(), d_vec.end(), 0.0f, AbsMax());前缀扫描(Scan�?
cpp
#include <thrust/scan.h>
thrust::device_vector<int> d_in = {1, 2, 3, 4, 5};
thrust::device_vector<int> d_out(5);
// 包含扫描(inclusive scan):每个位置包含自身
thrust::inclusive_scan(d_in.begin(), d_in.end(), d_out.begin());
// 结果:[1, 3, 6, 10, 15]
// 排除扫描(exclusive scan):每个位置不包含自�?
thrust::exclusive_scan(d_in.begin(), d_in.end(), d_out.begin(), 0);
// 结果:[0, 1, 3, 6, 10]
// 分段扫描(按 key 分组�?
thrust::device_vector<int> d_keys = {0, 0, 1, 1, 2};
thrust::inclusive_scan_by_key(d_keys.begin(), d_keys.end(),
d_in.begin(), d_out.begin());
// 结果:[1, 3, 3, 7, 5]变换(Transform�?
cpp
#include <thrust/transform.h>
#include <thrust/functional.h>
thrust::device_vector<float> d_x(N, 2.0f);
thrust::device_vector<float> d_y(N, 3.0f);
thrust::device_vector<float> d_z(N);
// 元素级加�?
thrust::transform(d_x.begin(), d_x.end(),
d_y.begin(),
d_z.begin(),
thrust::plus<float>());
// 自定义变换(Lambda �?Functor�?
thrust::transform(d_x.begin(), d_x.end(), d_z.begin(),
[] __host__ __device__ (float x) { return x * x + 1.0f; });
// SAXPY: z = alpha * x + y
float alpha = 2.0f;
thrust::transform(d_x.begin(), d_x.end(), d_y.begin(), d_z.begin(),
[alpha] __host__ __device__ (float x, float y) {
return alpha * x + y;
});过滤(Copy If / Remove If�?
cpp
#include <thrust/copy.h>
#include <thrust/remove.h>
thrust::device_vector<int> d_in = {1, -2, 3, -4, 5, -6};
thrust::device_vector<int> d_out(6);
// 只保留正�?
auto end = thrust::copy_if(d_in.begin(), d_in.end(), d_out.begin(),
[] __host__ __device__ (int x) { return x > 0; });
// d_out: [1, 3, 5, ...],end 指向有效数据末尾
// 移除负数(原地)
auto new_end = thrust::remove_if(d_in.begin(), d_in.end(),
[] __host__ __device__ (int x) { return x < 0; });
d_in.erase(new_end, d_in.end());
// d_in: [1, 3, 5]计数与查�?
cpp
#include <thrust/count.h>
#include <thrust/find.h>
// 计数满足条件的元�?
int count = thrust::count_if(d_vec.begin(), d_vec.end(),
[] __host__ __device__ (int x) { return x > 0; });
// 查找第一个满足条件的元素
auto it = thrust::find_if(d_vec.begin(), d_vec.end(),
[] __host__ __device__ (int x) { return x == 42; });
if (it != d_vec.end()) {
int pos = it - d_vec.begin();
printf("Found at position %d\n", pos);
}去重与集合操�?
cpp
#include <thrust/unique.h>
#include <thrust/set_operations.h>
// 去重(需要先排序�?
thrust::sort(d_vec.begin(), d_vec.end());
auto new_end = thrust::unique(d_vec.begin(), d_vec.end());
d_vec.erase(new_end, d_vec.end());
// 集合操作
thrust::device_vector<int> A = {1, 2, 3, 4};
thrust::device_vector<int> B = {2, 4, 6};
thrust::device_vector<int> C(7);
// 并集
auto end = thrust::set_union(A.begin(), A.end(), B.begin(), B.end(), C.begin());
// C: [1, 2, 3, 4, 6]
// 交集
end = thrust::set_intersection(A.begin(), A.end(), B.begin(), B.end(), C.begin());
// C: [2, 4]�?CUDA Kernel 混合使用
cpp
// 获取原始指针传给自定�?Kernel
thrust::device_vector<float> d_data(N);
// �?Thrust 初始�?
thrust::sequence(d_data.begin(), d_data.end(), 0.0f);
// 传给自定�?Kernel
float* raw = thrust::raw_pointer_cast(d_data.data());
myCustomKernel<<<grid, block>>>(raw, N);
// 再用 Thrust 后处�?
float sum = thrust::reduce(d_data.begin(), d_data.end(), 0.0f);执行策略
cpp
#include <thrust/execution_policy.h>
// 在指�?Stream 上执行(避免同步�?
cudaStream_t stream;
cudaStreamCreate(&stream);
thrust::sort(thrust::cuda::par.on(stream),
d_vec.begin(), d_vec.end());
// CPU 执行(调试用�?
thrust::sort(thrust::host, h_vec.begin(), h_vec.end());实战:Top-K 选取
cpp
// �?N 个元素中选出最大的 K �?
void topK(thrust::device_vector<float>& d_scores,
thrust::device_vector<int>& d_indices, int K) {
int N = d_scores.size();
// 创建索引序列
thrust::sequence(d_indices.begin(), d_indices.end());
// 按分数降序排列索�?
thrust::sort_by_key(d_scores.begin(), d_scores.end(),
d_indices.begin(),
thrust::greater<float>());
// 截取�?K �?
d_scores.resize(K);
d_indices.resize(K);
}