// Element-wise float vector addition on the CPU and on the GPU, with
// wall-clock timing of both paths and a comparison of the two results.
//
// NOTE(review): the header names of the original angle-bracket #include
// directives were lost when this file was extracted. The four below are the
// ones the visible code requires; the original listed two more that could
// not be recovered - confirm against the original source.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <windows.h>  // LARGE_INTEGER, QueryPerformanceCounter/Frequency

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

// Evaluates a CUDA runtime call once; on any status other than cudaSuccess it
// prints the source location, numeric code and error string, then exits.
// Wrapped in do/while(0) so that `CHECK(x);` behaves as a single statement.
#define CHECK(call)                                                \
    do {                                                           \
        const cudaError_t error = (call);                          \
        if (error != cudaSuccess) {                                \
            printf("Error: %s:%d, ", __FILE__, __LINE__);          \
            printf("code:%d, reason: %s\n", (int)error,            \
                   cudaGetErrorString(error));                     \
            exit(1);                                               \
        }                                                          \
    } while (0)

// CPU reference: C[i] = A[i] + B[i] for every i in [0, N).
void sumArraysOnHost(float* A, float* B, float* C, const int N) {
    for (int idx = 0; idx < N; idx++) {
        C[idx] = A[idx] + B[idx];
    }
}

// GPU kernel: each thread adds one element, C[i] = A[i] + B[i].
// Expects a 1D grid of 1D blocks with gridDim.x * blockDim.x >= N; threads
// whose flat index falls at or beyond N do nothing. Uses no shared memory.
// A, B and C must be device pointers to at least N floats each.
__global__ void sumArraysOnGPU(float* A, float* B, float* C, const int N) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N) {
        C[i] = A[i] + B[i];
    }
}

// Fills ip[0..size) with pseudo-random values in [0.0, 25.5] in steps of 0.1.
// The generator is seeded from the clock only on the first call: reseeding on
// every call (as the code did before) gives two arrays filled within the same
// second identical contents.
void initialData(float* ip, int size) {
    static int seeded = 0;
    if (!seeded) {
        time_t t;
        srand((unsigned int)time(&t));
        seeded = 1;
    }
    for (int i = 0; i < size; i++) {
        ip[i] = (float)(rand() & 0xFF) / 10.0f;
    }
}

// Debug kernel: every thread prints its thread/block index and the block/grid
// dimensions. Device-side printf is slow; intended for tiny launches only.
__global__ void checkIndex(void) {
    printf("threadIdx:(%u, %u, %u) blockIdx:(%u, %u, %u) blockDim:(%u, %u, %u) "
           "gridDim:(%u, %u, %u)\n",
           threadIdx.x, threadIdx.y, threadIdx.z,
           blockIdx.x, blockIdx.y, blockIdx.z,
           blockDim.x, blockDim.y, blockDim.z,
           gridDim.x, gridDim.y, gridDim.z);
}

// Compares hostRef and gpuRef element by element with an absolute tolerance
// of 1e-8. Prints the first mismatching pair and its index, or a success
// message when all N elements agree.
void checkResult(float* hostRef, float* gpuRef, const int N) {
    double epsilon = 1.0E-8;
    int match = 1;
    for (int i = 0; i < N; i++) {
        const float diff = hostRef[i] - gpuRef[i];
        if (diff > epsilon || diff < -epsilon) {
            match = 0;
            printf("Arrays do not match!\n");
            printf("host %5.2f gpu %5.2f at current %d\n", hostRef[i],
                   gpuRef[i], i);
            break;
        }
    }
    if (match) {
        printf("Arrays match.\n\n");
    }
}

// Allocates nBytes of host memory for floats; reports and exits on failure.
static float* allocHostArray(size_t nBytes) {
    float* p = (float*)malloc(nBytes);
    if (p == NULL) {
        fprintf(stderr, "Error: host allocation of %llu bytes failed\n",
                (unsigned long long)nBytes);
        exit(1);
    }
    return p;
}

// Converts a pair of performance-counter readings into elapsed seconds.
static double elapsedSeconds(const LARGE_INTEGER& begin,
                             const LARGE_INTEGER& end,
                             const LARGE_INTEGER& freq) {
    return (double)(end.QuadPart - begin.QuadPart) / (double)freq.QuadPart;
}

// Runs the CPU and GPU vector additions on 65536 * 1024 floats, prints the
// wall-clock cost of each path and verifies that the results agree.
int main(int argc, char** argv) {
    (void)argc;
    (void)argv;

    const int nElem = 65536 * 1024;
    const size_t nBytes = (size_t)nElem * sizeof(float);

    CHECK(cudaDeviceReset());

    LARGE_INTEGER freq, tBegin, tEnd;
    QueryPerformanceFrequency(&freq);

    // CPU path. The timed region covers host allocation, input generation
    // and the addition itself, as in the original measurement.
    QueryPerformanceCounter(&tBegin);
    float* h_A = allocHostArray(nBytes);
    float* h_B = allocHostArray(nBytes);
    float* h_C = allocHostArray(nBytes);
    initialData(h_A, nElem);
    initialData(h_B, nElem);
    sumArraysOnHost(h_A, h_B, h_C, nElem);
    QueryPerformanceCounter(&tEnd);
    const double cpuSeconds = elapsedSeconds(tBegin, tEnd, freq);
    printf("cpu cost %lf s\n", cpuSeconds);

    // GPU path. The timed region covers device allocation, both host-to-device
    // copies, the kernel and the device-to-host copy.
    float* d_A = NULL;
    float* d_B = NULL;
    float* d_C = NULL;
    float* g_C = allocHostArray(nBytes);

    QueryPerformanceCounter(&tBegin);
    CHECK(cudaMalloc((float**)&d_A, nBytes));
    CHECK(cudaMalloc((float**)&d_B, nBytes));
    CHECK(cudaMalloc((float**)&d_C, nBytes));
    CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice));

    // Ceil-divide so the grid always covers nElem; for the current size this
    // yields the same 65536 x 1024 launch that was previously hard-coded.
    const int threadsPerBlock = 1024;
    const int blocksPerGrid = (nElem + threadsPerBlock - 1) / threadsPerBlock;
    sumArraysOnGPU<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, nElem);
    CHECK(cudaGetLastError());  // a bad launch configuration is reported here

    // No explicit cudaDeviceSynchronize(): this blocking copy is issued on the
    // same default stream and therefore runs after the kernel has finished.
    CHECK(cudaMemcpy(g_C, d_C, nBytes, cudaMemcpyDeviceToHost));
    QueryPerformanceCounter(&tEnd);
    const double gpuSeconds = elapsedSeconds(tBegin, tEnd, freq);
    printf("gpu cost %lf s\n", gpuSeconds);

    // Host reference first, GPU result second, so the labels printed on a
    // mismatch name the right array.
    checkResult(h_C, g_C, nElem);

    CHECK(cudaFree(d_A));
    CHECK(cudaFree(d_B));
    CHECK(cudaFree(d_C));
    free(h_A);
    free(h_B);
    free(h_C);
    free(g_C);
    return 0;
}
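// (Web page footer, not part of the program.) Sharing is welcome; when reposting, please credit the source: 内存溢出.
// Comments (0)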