<meta charset="utf-8">

<div class="hpc-listing" style="width:640px; max-height:600px">
  <p class="hpc-listing-title">HPC Magazine mars 2013 - L'Atelier CUDA - Listing 2.</p>
  <p class="hpc-listing-legend">Test de performances en mémoire globale.</p>
  <pre class="prettyprint linenums lang-c">

#include &lt;stdio.h&gt;
#include &lt;assert.h&gt;
&nbsp;
// Debug-build status check for CUDA runtime calls.
// When DEBUG or _DEBUG is defined and `result` is not cudaSuccess, the
// runtime's error string is printed to stderr and an assert is raised.
// In all other builds the status is passed through without inspection.
// Returns `result` unchanged so the call can be wrapped inline.
inline
cudaError_t checkCuda(cudaError_t result) {
#if defined(DEBUG) || defined(_DEBUG)
  if (result == cudaSuccess) {
    return result;
  }
  const char* description = cudaGetErrorString(result);
  fprintf(stderr, "Erreur (runtime) CUDA: %s\n", description);
  assert(result == cudaSuccess);
#endif
  return result;
}
&nbsp;
// Offset-access kernel: each thread increments exactly one element of `a`,
// located at its global thread index shifted by `s` elements.
//   a : device array of T, read and written in place
//   s : offset, in elements, added to every thread's index
// There is no bounds check: the caller must size `a` so that
// (total launched threads + s) elements are addressable.
template &lt;typename T&gt;
__global__ void offset(T* a, int s) {
  int i = blockDim.x * blockIdx.x + threadIdx.x + s;
  a[i] = a[i] + 1;
}
&nbsp;
// Strided-access kernel: each thread increments exactly one element of `a`,
// located at its global thread index multiplied by `s`.
//   a : device array of T, read and written in place
//   s : stride, in elements, between the elements touched by adjacent threads
// There is no bounds check: the caller must size `a` so that
// (total launched threads * s) elements are addressable.
template &lt;typename T&gt;
__global__ void stride(T* a, int s) {
  int i = (blockDim.x * blockIdx.x + threadIdx.x) * s;
  a[i] = a[i] + 1;
}
&nbsp;
// Times the offset and stride kernels on a device buffer of element type T
// and prints one "parameter, bandwidth" line per run.
//   deviceId : accepted for the caller's convenience; not used in this function
//   nMB      : amount of data touched by each kernel launch, in megabytes
// Each launch uses n/blockSize blocks of blockSize threads, one element per
// thread, where n is the number of T elements that fit in nMB megabytes.
// The printed figure is 2*nMB/ms (one read plus one write per element), i.e.
// megabytes per millisecond, which is approximately gigabytes per second.
template &lt;typename T&gt;
void runTest(int deviceId, int nMB) {
  int blockSize = 256;
  float ms;
&nbsp;
  T *d_a;
  cudaEvent_t startEvent, stopEvent;
&nbsp;
  int n = nMB*1024*1024/sizeof(T);
&nbsp;
  // The buffer holds 33*n elements so that the largest offset (n-1+32) and
  // the largest strided index ((n-1)*32) both stay in bounds.
  // The size is computed in size_t to avoid int overflow for large nMB.
  checkCuda( cudaMalloc(&amp;d_a, (size_t)n * 33 * sizeof(T)) );
&nbsp;
  checkCuda( cudaEventCreate(&amp;startEvent) );
  checkCuda( cudaEventCreate(&amp;stopEvent) );
&nbsp;
  printf("Bande passante en mode offset (Go/s):\n");
&nbsp;
  // Untimed warm-up launch.
  offset&lt;&lt;&lt;n/blockSize, blockSize&gt;&gt;&gt;(d_a, 0);
  checkCuda( cudaGetLastError() );
&nbsp;
  for (int i = 0; i &lt;= 32; i++) {
    checkCuda( cudaMemset(d_a, 0, n * sizeof(T)) );
&nbsp;
    checkCuda( cudaEventRecord(startEvent,0) );
    offset&lt;&lt;&lt;n/blockSize, blockSize&gt;&gt;&gt;(d_a, i);
    checkCuda( cudaEventRecord(stopEvent,0) );
    // Launch errors are queried only after the stop event has been queued,
    // so the check sits outside the timed region.
    checkCuda( cudaGetLastError() );
    checkCuda( cudaEventSynchronize(stopEvent) );
&nbsp;
    checkCuda( cudaEventElapsedTime(&amp;ms, startEvent, stopEvent) );
    printf("%d, %f\n", i, 2*nMB/ms);
  }
&nbsp;
  printf("\n");
  printf("Bande passante en mode alignement (Go/s):\n");
&nbsp;
  // Untimed warm-up launch.
  stride&lt;&lt;&lt;n/blockSize, blockSize&gt;&gt;&gt;(d_a, 1);
  checkCuda( cudaGetLastError() );
&nbsp;
  for (int i = 1; i &lt;= 32; i++) {
    checkCuda( cudaMemset(d_a, 0, n * sizeof(T)) );
&nbsp;
    checkCuda( cudaEventRecord(startEvent,0) );
    stride&lt;&lt;&lt;n/blockSize, blockSize&gt;&gt;&gt;(d_a, i);
    checkCuda( cudaEventRecord(stopEvent,0) );
    checkCuda( cudaGetLastError() );
    checkCuda( cudaEventSynchronize(stopEvent) );
&nbsp;
    checkCuda( cudaEventElapsedTime(&amp;ms, startEvent, stopEvent) );
    printf("%d, %f\n", i, 2*nMB/ms);
  }
&nbsp;
  checkCuda( cudaEventDestroy(startEvent) );
  checkCuda( cudaEventDestroy(stopEvent) );
  checkCuda( cudaFree(d_a) );
}
&nbsp;
// Entry point. Recognised command-line arguments:
//   dev=N : use CUDA device N (default 0)
//   fp64  : run the benchmark on double instead of float
// Prints the device name, the transfer size and the precision, then runs
// runTest for the selected element type.
int main(int argc, char **argv)
{
  int nMB = 4;
  int deviceId = 0;
  bool bFp64 = false;
&nbsp;
  for (int i = 1; i &lt; argc; i++) {
    if (!strncmp(argv[i], "dev=", 4))
      deviceId = atoi(argv[i] + 4);  // number following the "dev=" prefix
    else if (!strcmp(argv[i], "fp64"))
      bFp64 = true;
  }
&nbsp;
  cudaDeviceProp prop;
&nbsp;
  checkCuda( cudaSetDevice(deviceId) );
  checkCuda( cudaGetDeviceProperties(&amp;prop, deviceId) );
  printf("GPU: %s\n", prop.name);
  printf("Taille des transferts (Mo): %d\n", nMB);
&nbsp;
  printf("%s Precision\n", bFp64 ? "Double" : "Simple");
&nbsp;
  // The element type cannot be deduced from runTest's arguments, so it is
  // given explicitly for each precision.
  if (bFp64) runTest&lt;double&gt;(deviceId, nMB);
  else       runTest&lt;float&gt;(deviceId, nMB);
}

  </pre> 
</div>

<script src="http://actionablecode.com/__shared/js/vendor/google-prettify/run_prettify.js"></script>