HPC Magazine mai 2013 - Programmer Xeon Phi (3ème partie) - Listing 3.
Implémentation simplifiée de l'algorithme IS.

#include <stdio.h>
#include <mkl.h>
#include <omp.h>

int main(){
  const int Size = 10000;
  const char transa='N'; 
  const char transb='N'; 
  const double alpha=1.0;
  const double beta=1.0;
  int Iteration = 10;

  int  ldab = Size;
  int matrix_element = Size*ldab;

/* We allocate memory on a 64-byte aligned boundary, using mkl_malloc, 
	 to improve memory access. */
  double* A = (double*)mkl_malloc(sizeof(double)*matrix_element, 64);
  double* B = (double*)mkl_malloc(sizeof(double)*matrix_element,64);
  double* C = (double*)mkl_malloc(sizeof(double)*matrix_element,64);

  if (A == NULL || B == NULL || C == NULL) {
    printf( "\n ERROR: Can't allocate memory for matrices. Aborting... \n\n");
    mkl_free(A);
    mkl_free(B);
    mkl_free(C);
    return 1;
  }

  printf("Initializing matrix data\n");
  fflush(0);
  /* Init of Matrices */
  #pragma omp parallel for
  for (int i=0; i<matrix_element; i++) 
    A[i] = B[i] = C[i] = (double)(i+1);
 
  printf("Computing matrix product using Intel MKL dgemm function\n");
  fflush(0);
  /* First call which does the thread/buffer initialization */    
  dgemm(&transa, &transb, &Size, &Size, &Size, &alpha, A, &ldab, B, 
    &ldab, &beta, C, &Size);

  /* t1 = starting time */    
  double t1 = omp_get_wtime();
  for (int k=0; k<Iteration; k++){
    dgemm(&transa, &transb, &Size, &Size, &Size, &alpha, A, &ldab, B, 
      &ldab, &beta, C, &Size);
  }
  /* t2 = end time */
  double t2 = omp_get_wtime();

  /* We compute the average number of flops */
  double flops = (2.0*Size*Size*Size) * 1e-9/((t2-t1)/Iteration);
  printf("Average =  %.1f GFLOP/s\n", flops);
  printf ("\n Deallocating memory \n\n");
  fflush(0);

  mkl_free(A); 
  mkl_free(B); 
  mkl_free(C);

  printf (" Example completed. \n\n");
  return 0;
}