HPC Magazine mars 2013 - L'Atelier CUDA - Listing 3.
Exemple type de synchronisation en mémoire partagée.
#include ‹stdio.h› __global__ void staticReverse(int *d, int n) { __shared__ int s[64]; int t = threadIdx.x; int tr = n-t-1; s[t] = d[t]; __syncthreads(); d[t] = s[tr]; } __global__ void dynamicReverse(int *d, int n) { extern __shared__ int s[]; int t = threadIdx.x; int tr = n-t-1; s[t] = d[t]; __syncthreads(); d[t] = s[tr]; } int main(void) { const int n = 64; int a[n], r[n], d[n]; for (int i = 0; i ‹ n; i++) { a[i] = i; r[i] = n-i-1; d[i] = 0; } int *d_d; cudaMalloc(&d_d, n * sizeof(int)); // version memoire partagee statique cudaMemcpy(d_d, a, n*sizeof(int), cudaMemcpyHostToDevice); staticReverse‹‹‹1,n›››(d_d, n); cudaMemcpy(d, d_d, n*sizeof(int), cudaMemcpyDeviceToHost); for (int i = 0; i ‹ n; i++) if (d[i] != r[i]) printf(“Erreur: d[%d]!=r[%d] (%d, %d)\n”, i, i, d[i], r[i]); // version memoire partagee dynamique cudaMemcpy(d_d, a, n*sizeof(int), cudaMemcpyHostToDevice); dynamicReverse‹‹‹1,n,n*sizeof(int)›››(d_d, n); cudaMemcpy(d, d_d, n * sizeof(int), cudaMemcpyDeviceToHost); for (int i = 0; i ‹ n; i++) if (d[i] != r[i]) printf(“Erreur: d[%d]!=r[%d] (%d, %d)\n”, i, i, d[i], r[i]); }