// HPC Magazine mars 2013 - L'Atelier CUDA - Listing 2.
// Test de performances en mémoire globale.
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Checks the result of a CUDA runtime API call.
//
// When DEBUG or _DEBUG is defined, a failing result is printed to stderr and
// then trips an assert. In all other builds the check is compiled out and the
// result is simply passed through, so callers that cannot continue after a
// failure must inspect the returned value themselves.
//
// Returns `result` unchanged.
inline cudaError_t checkCuda(cudaError_t result)
{
#if defined(DEBUG) || defined(_DEBUG)
  if (result != cudaSuccess) {
    fprintf(stderr, "Erreur (runtime) CUDA: %s\n", cudaGetErrorString(result));
    assert(result == cudaSuccess);
  }
#endif
  return result;
}

// Increments one element per thread, at (global thread index + s).
//
// Expects a 1D grid of 1D blocks. There is no bounds check: the caller must
// provide at least gridDim.x * blockDim.x + s elements in `a`.
template <typename T>
__global__ void offset(T* a, int s)
{
  int i = blockDim.x * blockIdx.x + threadIdx.x + s;
  a[i] = a[i] + 1;
}

// Increments one element per thread, at (global thread index * s).
//
// Expects a 1D grid of 1D blocks. There is no bounds check: the caller must
// provide at least (gridDim.x * blockDim.x - 1) * s + 1 elements in `a`.
template <typename T>
__global__ void stride(T* a, int s)
{
  int i = (blockDim.x * blockIdx.x + threadIdx.x) * s;
  a[i] = a[i] + 1;
}

// Times the `offset` kernel for offsets 0..32 and the `stride` kernel for
// strides 1..32 on elements of type T, printing one "<value>, <rate>" line per
// run, where rate = 2 * nMB / elapsed milliseconds.
//
// deviceId: not used by this function (kept for the existing call signature).
// nMB:      nominal working-set size in MiB; nMB * 2^20 / sizeof(T) must be a
//           multiple of the block size (256), because the grid is n / 256
//           blocks with no tail handling.
template <typename T>
void runTest(int deviceId, int nMB)
{
  (void)deviceId;

  const int blockSize = 256;
  float ms;

  T *d_a = NULL;
  cudaEvent_t startEvent, stopEvent;

  const int n = (int)((size_t)nMB * 1024 * 1024 / sizeof(T));
  const int gridSize = n / blockSize;
  const size_t bytes = (size_t)n * sizeof(T);

  // NB: the buffer holds 33 * n elements so that the highest index touched by
  // either kernel stays in range: n - 1 + 32 for offset, (n - 1) * 32 for
  // stride.
  cudaError_t err = checkCuda( cudaMalloc(&d_a, 33 * bytes) );
  if (err != cudaSuccess) {
    // checkCuda is a no-op outside debug builds; without this guard the
    // kernels below would run on an invalid pointer and the printed figures
    // would be meaningless.
    fprintf(stderr, "Erreur (runtime) CUDA: %s\n", cudaGetErrorString(err));
    return;
  }

  checkCuda( cudaEventCreate(&startEvent) );
  checkCuda( cudaEventCreate(&stopEvent) );

  printf("Bande passante en mode offset (Go/s):\n");

  // Untimed launch ahead of the measured loop (presumably a warm-up).
  offset<<<gridSize, blockSize>>>(d_a, 0);
  checkCuda( cudaGetLastError() );

  for (int i = 0; i <= 32; i++) {
    checkCuda( cudaMemset(d_a, 0, bytes) );

    checkCuda( cudaEventRecord(startEvent, 0) );
    offset<<<gridSize, blockSize>>>(d_a, i);
    checkCuda( cudaEventRecord(stopEvent, 0) );
    checkCuda( cudaEventSynchronize(stopEvent) );
    // Launch errors are not reported by the launch itself; query them after
    // the synchronisation so the host-side check stays out of the timed span.
    checkCuda( cudaGetLastError() );

    checkCuda( cudaEventElapsedTime(&ms, startEvent, stopEvent) );
    // Factor 2: each element is read once and written once.
    printf("%d, %f\n", i, 2*nMB/ms);
  }

  printf("\n");
  printf("Bande passante en mode alignement (Go/s):\n");

  // Untimed launch ahead of the measured loop (presumably a warm-up).
  stride<<<gridSize, blockSize>>>(d_a, 1);
  checkCuda( cudaGetLastError() );

  for (int i = 1; i <= 32; i++) {
    checkCuda( cudaMemset(d_a, 0, bytes) );

    checkCuda( cudaEventRecord(startEvent, 0) );
    stride<<<gridSize, blockSize>>>(d_a, i);
    checkCuda( cudaEventRecord(stopEvent, 0) );
    checkCuda( cudaEventSynchronize(stopEvent) );
    checkCuda( cudaGetLastError() );

    checkCuda( cudaEventElapsedTime(&ms, startEvent, stopEvent) );
    printf("%d, %f\n", i, 2*nMB/ms);
  }

  checkCuda( cudaEventDestroy(startEvent) );
  checkCuda( cudaEventDestroy(stopEvent) );
  checkCuda( cudaFree(d_a) );
}

// Entry point.
//
// Recognised command-line arguments:
//   dev=<N>  select CUDA device N (default 0)
//   fp64     run the test with double instead of float elements
//
// Returns 0 on success, EXIT_FAILURE if the device cannot be selected or
// queried.
int main(int argc, char **argv)
{
  int nMB = 4;
  int deviceId = 0;
  bool bFp64 = false;

  for (int i = 1; i < argc; i++) {
    if (!strncmp(argv[i], "dev=", 4))
      deviceId = atoi(argv[i] + 4);
    else if (!strcmp(argv[i], "fp64"))
      bFp64 = true;
  }

  cudaDeviceProp prop;

  cudaError_t err = checkCuda( cudaSetDevice(deviceId) );
  if (err == cudaSuccess)
    err = checkCuda( cudaGetDeviceProperties(&prop, deviceId) );
  if (err != cudaSuccess) {
    // Reached only when checkCuda is compiled out; avoids printing an
    // uninitialised `prop` and benchmarking on an unusable device.
    fprintf(stderr, "Erreur (runtime) CUDA: %s\n", cudaGetErrorString(err));
    return EXIT_FAILURE;
  }

  printf("GPU: %s\n", prop.name);
  printf("Taille des transferts (Mo): %d\n", nMB);

  printf("%s Precision\n", bFp64 ? "Double" : "Simple");

  if (bFp64)
    runTest<double>(deviceId, nMB);
  else
    runTest<float>(deviceId, nMB);

  return 0;
}