CUDA C/C++: calculating the average of the inverse distance per point (interaction energy, perhaps?)
I've been trying to write a kernel that calculates the sum of the inverse of the distances between N given points. The serial code in C goes like this:
```c
average = 0;
for(int i = 0; i < Np; i++){
    for(int j = i + 1; j < Np; j++){
        average += 1.0e0f/sqrtf((rx[i]-rx[j])*(rx[i]-rx[j]) + (ry[i]-ry[j])*(ry[i]-ry[j]));
    }
}
average = average/(float)Np;
```

where rx and ry are the x and y coordinates, respectively.

I generate the points with a kernel that uses a random number generator. For the kernels I use 128 (256) threads per block, for 4k (8k) points. In the main kernel, each thread executes the inner loop above and then passes its result to a reduce-sum function, as follows.

Generating the points:

```c
__global__ void InitRNG(curandState *state, const int seed){
    int tIdx = blockIdx.x*blockDim.x + threadIdx.x;
    curand_init(seed, tIdx, 0, &state[tIdx]);
}

__global__ void SortPoints(float *X, float *Y, const int N, curandState *state){
    float rdmn1, rdmn2;
    unsigned int tIdx = blockIdx.x*blockDim.x + threadIdx.x;
    float range;
    if(tIdx < N){
        rdmn1 = curand_uniform(&state[tIdx]);
        rdmn2 = curand_uniform(&state[tIdx]);
        range = sqrtf(0.25e0f*N*rdmn1);
        X[tIdx] = range*cosf(2.0e0f*pi*rdmn2);
        Y[tIdx] = range*sinf(2.0e0f*pi*rdmn2);
    }
}
```

The reduction:

```c
__device__ float ReduceSum2(float In){
    __shared__ float data[BlockSize];
    unsigned int tIdx = threadIdx.x;
    data[tIdx] = In;
    __syncthreads();
    for(unsigned int i = blockDim.x/2; i > 0; i >>= 1){
        if(tIdx < i){
            data[tIdx] += data[tIdx + i];
        }
        __syncthreads();
    }
    return data[0];
}
```

The kernel:

```c
__global__ void AvgDistance(float *X, float *Y, float *Avg, const int N){
    int tIdx = blockIdx.x*blockDim.x + threadIdx.x;
    int bIdx = blockIdx.x;
    float x, y;
    float d = 0.0f;
    if(tIdx < N){
        for(int i = tIdx + 1; i < N; i++){
            x = X[tIdx] - X[i];
            y = Y[tIdx] - Y[i];
            d += 1.0e0f/(sqrtf(x*x + y*y));
        }
        __syncthreads();
        Avg[bIdx] = ReduceSum2(d);
    }
}
```

The kernels are configured and launched like this:

```c
dim3 threads(BlockSize, BlockSize);
dim3 blocks(ceil(Np/threads.x), ceil(Np/threads.y));

InitRNG<<<blocks.x, threads.x>>>(d_state, seed);
SortPoints<<<blocks.x, threads.x>>>(d_rx, d_ry, Np, d_state);
AvgDistance<<<blocks.x, threads.x, threads.x*sizeof(float)>>>(d_rx, d_ry, d_Avg, Np);
```

Finally, I copy the data back to the host and perform the remaining sum:

```c
Avg = new float[blocks.x];
CHECK(cudaMemcpy(Avg, d_Avg, blocks.x*sizeof(float), cudaMemcpyDeviceToHost), ERROR_CPY_DEVTOH);

float average = 0;
for(int i = 0; i < blocks.x; i++){
    average += Avg[i];
}
average = average/(float)Np;
```

For 4k points, everything is fine! The results are:

```
Average distance between points (via Kernel) = 108.615
Average distance between points (via CPU)    = 110.191
```

In this case the sums may be performed in different orders, causing the two results to diverge from each other; I don't know...

But for 8k points the results are quite different:

```
Average distance between points (via Kernel) = 153.63
Average distance between points (via CPU)    = 131.471
```

To me, the kernel and the serial code look like they are written the same way, which leaves me doubting the precision of CUDA's floating-point arithmetic. Does that make sense? Or could the access to global memory cause conflicts when several threads load the same data from X and Y at the same time? Or is the way I wrote the kernel somehow "wrong" (I mean, am I doing something that causes the two results to diverge from each other)?
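A cheap way to probe this kind of divergence on the serial side is compensated (Kahan) summation: if the compensated float result moves toward the kernel result, plain float accumulation is the culprit. Below is a minimal sketch under that assumption; kahan_avg is a hypothetical helper, not part of the code above:

```c
#include <math.h>

/* Kahan summation keeps a correction term that carries the low-order bits
 * lost by each float addition into the next one. */
float kahan_avg(const float *rx, const float *ry, int Np)
{
    float sum = 0.0f;
    float c   = 0.0f;                       /* running compensation */
    for (int i = 0; i < Np; i++){
        for (int j = i + 1; j < Np; j++){
            float dx = rx[i] - rx[j];
            float dy = ry[i] - ry[j];
            float term = 1.0e0f / sqrtf(dx*dx + dy*dy);
            float y = term - c;             /* apply the stored correction */
            float t = sum + y;
            c = (t - sum) - y;              /* recover bits lost in this add */
            sum = t;
        }
    }
    return sum / (float)Np;
}
```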
Solution

Actually, from what I can tell, the problem appears to be on the CPU side. I created a sample code based on yours.
I was able to reproduce your results.

First I switched all instances of sinf, cosf, and sqrtf to the corresponding double versions. This made no difference in the results.

Next I included a typedef, so I could easily switch the precision from float to double and back, replacing every relevant instance of float in the code with mytype, my typedef.

When I run the code with the typedef of float and a data size of 4096, I get these results:

```
GPU average = 108.294922
CPU average = 109.925285
```

When I run the code with the typedef of double and a data size of 4096, I get these results:

```
GPU average = 108.294903
CPU average = 108.294903
```

When I run the code with the typedef of float and a data size of 8192, I get these results:

```
GPU average = 153.447327
CPU average = 131.473526
```

When I run the code with the typedef of double and a data size of 8192, I get these results:

```
GPU average = 153.447380
CPU average = 153.447380
```

There are at least two observations:

1. The GPU results don't vary between float and double, except in the 5th decimal place.
2. The CPU and GPU results match each other whenever double is used.

Based on this, I believe the CPU is providing the variable, questionable behavior.

Here is my code, for reference:

```c
#include <stdio.h>
#include <curand.h>
#include <curand_kernel.h>
#define DSIZE 8192
#define BlockSize 32
#define pi 3.14159f

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

typedef double mytype;

__global__ void InitRNG(curandState *state, const int seed){
    int tIdx = blockIdx.x*blockDim.x + threadIdx.x;
    curand_init(seed, tIdx, 0, &state[tIdx]);
}

__global__ void SortPoints(mytype *X, mytype *Y, const int N, curandState *state){
    mytype rdmn1, rdmn2;
    unsigned int tIdx = blockIdx.x*blockDim.x + threadIdx.x;
    mytype range;
    if(tIdx < N){
        rdmn1 = curand_uniform(&state[tIdx]);
        rdmn2 = curand_uniform(&state[tIdx]);
        range = sqrt(0.25e0f*N*rdmn1);
        X[tIdx] = range*cos(2.0e0f*pi*rdmn2);
        Y[tIdx] = range*sin(2.0e0f*pi*rdmn2);
    }
}

__device__ mytype ReduceSum2(mytype In){
    __shared__ mytype data[BlockSize];
    unsigned int tIdx = threadIdx.x;
    data[tIdx] = In;
    __syncthreads();
    for(unsigned int i = blockDim.x/2; i > 0; i >>= 1){
        if(tIdx < i){
            data[tIdx] += data[tIdx + i];
        }
        __syncthreads();
    }
    return data[0];
}

__global__ void AvgDistance(mytype *X, mytype *Y, mytype *Avg, const int N){
    int tIdx = blockIdx.x*blockDim.x + threadIdx.x;
    int bIdx = blockIdx.x;
    mytype x, y;
    mytype d = 0.0f;
    if(tIdx < N){
        for(int i = tIdx + 1; i < N; i++){
            x = X[tIdx] - X[i];
            y = Y[tIdx] - Y[i];
            d += 1.0e0f/(sqrt(x*x + y*y));
        }
        __syncthreads();
        Avg[bIdx] = ReduceSum2(d);
    }
}

mytype cpu_avg(const mytype *rx, const mytype *ry, const int size){
    mytype average = 0.0f;
    for(int i = 0; i < size; i++){
        for(int j = i + 1; j < size; j++){
            average += 1.0e0f/sqrt((rx[i]-rx[j])*(rx[i]-rx[j]) + (ry[i]-ry[j])*(ry[i]-ry[j]));
        }
    }
    average = average/(mytype)size;
    return average;
}

int main() {
    int Np = DSIZE;
    mytype *rx, *ry, *d_rx, *d_ry, *d_Avg, *Avg;
    curandState *d_state;
    int seed = 1;

    dim3 threads(BlockSize, BlockSize);
    dim3 blocks((int)ceilf(Np/(float)threads.x), (int)ceilf(Np/(float)threads.y));
    printf("number of blocks = %d\n", blocks.x);
    printf("number of threads= %d\n", threads.x);

    rx = (mytype *)malloc(DSIZE*sizeof(mytype));
    if (rx == 0) {printf("malloc fail\n"); return 1;}
    ry = (mytype *)malloc(DSIZE*sizeof(mytype));
    if (ry == 0) {printf("malloc fail\n"); return 1;}

    cudaMalloc((void**)&d_rx, DSIZE * sizeof(mytype));
    cudaMalloc((void**)&d_ry, DSIZE * sizeof(mytype));
    cudaMalloc((void**)&d_Avg, blocks.x * sizeof(mytype));
    cudaMalloc((void**)&d_state, DSIZE * sizeof(curandState));
    cudaCheckErrors("cudamalloc");

    InitRNG<<<blocks.x, threads.x>>>(d_state, seed);
    SortPoints<<<blocks.x, threads.x>>>(d_rx, d_ry, Np, d_state);
    AvgDistance<<<blocks.x, threads.x, threads.x*sizeof(mytype)>>>(d_rx, d_ry, d_Avg, Np);
    cudaCheckErrors("kernels");

    Avg = new mytype[blocks.x];
    cudaMemcpy(Avg, d_Avg, blocks.x*sizeof(mytype), cudaMemcpyDeviceToHost);
    cudaMemcpy(rx, d_rx, DSIZE*sizeof(mytype), cudaMemcpyDeviceToHost);
    cudaMemcpy(ry, d_ry, DSIZE*sizeof(mytype), cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudamemcpy");

    mytype average = 0;
    for(int i = 0; i < blocks.x; i++){
        average += Avg[i];
    }
    average = average/(mytype)Np;

    printf("GPU average = %f\n", average);
    average = cpu_avg(rx, ry, DSIZE);
    printf("CPU average = %f\n", average);
    return 0;
}
```
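A side check worth adding here (my assumption, not part of the code above): sum the per-block partials in double regardless of mytype, so the final host-side stage cannot contribute single-precision rounding error of its own.

```c
/* Hypothetical helper, not in the answer's code: accumulate the per-block
 * partial sums in double, then apply the final division in double too. */
double sum_partials_double(const mytype *Avg, int nblocks, int Np)
{
    double s = 0.0;
    for (int i = 0; i < nblocks; i++)
        s += (double)Avg[i];
    return s / (double)Np;
}
```

Calling sum_partials_double(Avg, blocks.x, Np) right after the partials are copied back would isolate the kernels as the only possible source of float rounding in the GPU path.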
I ran this on RHEL 5.5, CUDA 5.0, on an Intel Xeon X5560, compiling with:

```
nvcc -O3 -arch=sm_20 -lcurand -lm -o t93 t93.cu
```

EDIT: reworking cpu_avg so that each outer-loop iteration flushes its partial sum into the accumulator, scaled down by the size, changes the CPU behavior:

```c
mytype cpu_avg(const mytype *rx, const mytype *ry, const int size){
    mytype average = 0.0f;
    mytype temp = 0.0f;
    for(int i = 0; i < size; i++){
        for(int j = i + 1; j < size; j++){
            temp += 1.0e0f/sqrt((rx[i]-rx[j])*(rx[i]-rx[j]) + (ry[i]-ry[j])*(ry[i]-ry[j]));
        }
        average += temp/(mytype)size;
        temp = 0.0f;
    }
    return average;
}
```

So I would say there is a problem with intermediate results on the CPU side. It's interesting that it doesn't show up in the GPU results. I suspect the reason is that the final summation of the GPU averages is done on the CPU (so each individual GPU block result is already scaled down by the size, e.g. 8192), and those partials may have enough intermediate precision to survive the final division. If you inlined the CPU average computation, you might observe something different again.
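That a long single-accumulator float sum loses accuracy can be demonstrated in isolation. A minimal stand-alone sketch (my construction, not from the answer; 0.01 is an arbitrary stand-in for a typical 1/r term):

```c
#include <stdio.h>

int main(void)
{
    /* Number of pair terms for 8192 points: 8192*8191/2, about 33.5 million. */
    const long n = 8192L * 8191L / 2;
    float  fsum = 0.0f;
    double dsum = 0.0;
    for (long i = 0; i < n; i++){
        fsum += 0.01f;   /* arbitrary stand-in for one 1/r term */
        dsum += 0.01;
    }
    /* Once fsum is large enough, fsum + 0.01f rounds back to fsum, so the
     * float total stalls far below the true value n*0.01, while the double
     * accumulator stays close to it. */
    printf("float  sum = %f\n", fsum);
    printf("double sum = %f\n", dsum);
    return 0;
}
```

Each GPU thread, by contrast, accumulates at most N terms before handing its partial to the block reduction, so no single float accumulator on the device ever grows as long as the N*(N-1)/2-term CPU sum.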