cuda 入门求和_随笔

cuda 入门求和
// NOTE(review): the header names on the bare "#include" lines were lost when
// this listing was extracted. The four system headers below are reconstructed
// from the identifiers the program uses; the original had one more header
// whose name cannot be recovered from the code - confirm against the source.
#include <stdio.h>    // printf, fprintf
#include <stdlib.h>   // malloc, free, exit, rand, srand
#include <time.h>     // time, time_t
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <windows.h>  // LARGE_INTEGER, QueryPerformanceCounter, QueryPerformanceFrequency
// CHECK(call): evaluates `call` (an expression yielding cudaError_t) exactly
// once. If the result is not cudaSuccess, prints the source file and line, the
// numeric error code and the cudaGetErrorString() text, then terminates the
// process with exit(1).
//
// Every line of the definition ends in a backslash so the whole body belongs
// to the macro, and the body is wrapped in do { ... } while (0) so that
// `CHECK(x);` expands to a single statement (safe inside an unbraced if/else).
#define CHECK(call)                                                            \
	do {                                                                       \
		const cudaError_t check_err_ = (call);                                 \
		if (check_err_ != cudaSuccess)                                         \
		{                                                                      \
			printf("Error: %s:%d, ", __FILE__, __LINE__);                      \
			printf("code:%d, reason: %s\n", (int)check_err_,                   \
				cudaGetErrorString(check_err_));                               \
			exit(1);                                                           \
		}                                                                      \
	} while (0)


// CPU reference implementation of the element-wise sum.
// Stores A[k] + B[k] into C[k] for every k in [0, N); elements at index N and
// beyond are not touched, and nothing is written when N <= 0.
void sumArraysOnHost(float* A, float* B, float* C, const int N) {
	int k = 0;
	while (k < N) {
		const float lhs = A[k];
		const float rhs = B[k];
		C[k] = lhs + rhs;
		++k;
	}
}

// Element-wise vector sum kernel: C[i] = A[i] + B[i] for every i in [0, N).
//
// Only the x components of blockIdx/blockDim/threadIdx/gridDim are used, so
// the kernel expects a one-dimensional launch. Each thread starts at its
// global x index and then advances by the total number of launched threads
// (gridDim.x * blockDim.x). With a grid of at least N threads every thread
// handles at most one element, exactly as before; with a smaller grid the
// remaining elements are still processed instead of being silently skipped.
//
// The index is computed in size_t so that blockIdx.x * blockDim.x cannot wrap
// into a negative int and slip past the bounds check.
//
// A, B and C are dereferenced inside the kernel and must each refer to at
// least N floats. No shared memory is used.
__global__ void sumArraysOnGPU(float* A, float* B, float* C,const int N) {
	// A non-positive N would become a huge value once converted to size_t.
	if (N <= 0) return;

	const size_t count = (size_t)N;
	const size_t stride = (size_t)gridDim.x * blockDim.x;
	for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < count; i += stride) {
		C[i] = A[i] + B[i];
	}
}
// Fills ip[0 .. size-1] with pseudo-random values in the range 0.0 .. 25.5
// (the low 8 bits of rand(), divided by 10). Writes nothing when size <= 0.
//
// The generator is seeded from the wall clock only on the first call. The
// previous version re-seeded on every call, so two calls made within the same
// second (as main does for its two input arrays) restarted rand() from the
// same seed and produced identical arrays.
void initialData(float* ip, int size) {
	static int seeded = 0;  // becomes 1 once srand() has been called
	if (!seeded) {
		srand((unsigned int)time(NULL));
		seeded = 1;
	}
	for (int i = 0; i < size; i++) {
		ip[i] = (float)(rand() & 0xFF) / 10.0f;
	}
}
// Debug kernel: every launched thread prints one line containing its
// threadIdx, blockIdx, blockDim and gridDim (x, y, z of each).
// Device-side printf output is slow, so launch this with a small grid only.
__global__ void checkIndex(void) {
	// The format string must end in "\n" so each thread's report is on its own
	// line; the backslash had been lost, which printed a literal 'n' instead.
	printf("threadIdx:(%d, %d, %d) blockIdx:(%d, %d, %d) blockDim:(%d, %d, %d) "
		"gridDim:(%d, %d, %d)\n", threadIdx.x, threadIdx.y, threadIdx.z,
		blockIdx.x, blockIdx.y, blockIdx.z, blockDim.x, blockDim.y, blockDim.z,
		gridDim.x, gridDim.y, gridDim.z);
}

// Compares the first N elements of hostRef and gpuRef and prints the verdict.
//
// Two elements match when their difference lies within +/- 1.0E-8. On the
// first mismatch the function prints both values and the index, then stops
// scanning. If no mismatch is found (including N <= 0) it prints
// "Arrays match.".
//
// The test is written as "not within tolerance" so that a NaN on either side
// is reported as a mismatch; with the previous two-sided ">" / "<" test every
// comparison against NaN was false and NaN results were accepted as matching.
void checkResult(float* hostRef, float* gpuRef, const int N) {
	const double epsilon = 1.0E-8;
	int match = 1;
	for (int i = 0; i < N; i++) {
		const double diff = (double)hostRef[i] - (double)gpuRef[i];
		const int withinTolerance = (diff <= epsilon) && (diff >= -epsilon);
		if (!withinTolerance) {
			match = 0;
			printf("Arrays do not match!\n");
			printf("host %5.2f gpu %5.2f at current %d\n",
				hostRef[i], gpuRef[i], i);
			break;
		}
	}
	if (match) printf("Arrays match.\n\n");
	return;
}
// Adds two float vectors of 65536 * 1024 elements on the CPU and on the GPU,
// prints the wall-clock time of each path and checks that the results agree.
//
// What the two timings cover (unchanged from the original program):
//   - "cpu cost": host allocation, random initialisation and the host sum.
//   - "gpu cost": device allocation, both host-to-device copies, the kernel
//     and the device-to-host copy of the result.
//
// Returns 0 on success and 1 if a host allocation fails; any failing CUDA call
// terminates the process through CHECK.
int main(int argc, char** argv) {
	(void)argc;
	(void)argv;

	const int nElem = 65536 * 1024;
	// Widen before multiplying so the byte count is computed in size_t.
	const size_t nBytes = (size_t)nElem * sizeof(float);

	// One thread per element; the block count is rounded up so the launch
	// still covers every element if nElem is changed to a non-multiple of the
	// block size. For the values above this is 65536 blocks of 1024 threads.
	const int threadsPerBlock = 1024;
	const int blocksPerGrid = (nElem + threadsPerBlock - 1) / threadsPerBlock;

	CHECK(cudaDeviceReset());

	LARGE_INTEGER freq, tBegin, tEnd;
	QueryPerformanceFrequency(&freq);

	// ---- CPU path -------------------------------------------------------
	QueryPerformanceCounter(&tBegin);
	float* h_A = (float*)malloc(nBytes);
	float* h_B = (float*)malloc(nBytes);
	float* h_C = (float*)malloc(nBytes);
	// Each buffer is 256 MiB, so allocation failure is a realistic outcome.
	if (h_A == NULL || h_B == NULL || h_C == NULL) {
		fprintf(stderr, "Error: host allocation of %llu bytes failed\n",
			(unsigned long long)nBytes);
		free(h_A);
		free(h_B);
		free(h_C);
		return 1;
	}

	initialData(h_A, nElem);
	initialData(h_B, nElem);

	sumArraysOnHost(h_A, h_B, h_C, nElem);
	QueryPerformanceCounter(&tEnd);
	const double cpuSeconds =
		(double)(tEnd.QuadPart - tBegin.QuadPart) / (double)freq.QuadPart;
	printf("cpu cost %lf s\n", cpuSeconds);

	// Host buffer that receives the GPU result.
	float* g_C = (float*)malloc(nBytes);
	if (g_C == NULL) {
		fprintf(stderr, "Error: host allocation of %llu bytes failed\n",
			(unsigned long long)nBytes);
		free(h_A);
		free(h_B);
		free(h_C);
		return 1;
	}

	// ---- GPU path -------------------------------------------------------
	float* d_A = NULL;
	float* d_B = NULL;
	float* d_C = NULL;
	QueryPerformanceCounter(&tBegin);
	CHECK(cudaMalloc((void**)&d_A, nBytes));
	CHECK(cudaMalloc((void**)&d_B, nBytes));
	CHECK(cudaMalloc((void**)&d_C, nBytes));

	CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));
	CHECK(cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice));

	sumArraysOnGPU<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, nElem);
	// A kernel launch returns no status itself; an invalid launch
	// configuration is only reported through cudaGetLastError().
	CHECK(cudaGetLastError());

	// This blocking copy is issued after the kernel on the same (default)
	// stream, so it also waits for the kernel to finish; no separate
	// cudaDeviceSynchronize() is needed before stopping the timer.
	CHECK(cudaMemcpy(g_C, d_C, nBytes, cudaMemcpyDeviceToHost));
	QueryPerformanceCounter(&tEnd);
	const double gpuSeconds =
		(double)(tEnd.QuadPart - tBegin.QuadPart) / (double)freq.QuadPart;
	printf("gpu cost %lf s\n", gpuSeconds);

	// Host reference first, GPU result second, so the "host"/"gpu" labels in
	// a mismatch report refer to the right buffers.
	checkResult(h_C, g_C, nElem);

	CHECK(cudaFree(d_A));
	CHECK(cudaFree(d_B));
	CHECK(cudaFree(d_C));
	free(h_A);
	free(h_B);
	free(h_C);
	free(g_C);
	return(0);
}
欢迎分享，转载请注明来源：内存溢出
原文地址: http://outofmemory.cn/zaji/5658568.html
cuda 入门求和

发表评论

评论列表（0条）