HPC Magazine mai 2013 - Programmer Xeon Phi (3ème partie) - Listing 3.
Implémentation simplifiée de l'algorithme IS.
#include <stdio.h> #include <mkl.h> #include <omp.h> int main(){ const int Size = 10000; const char transa='N'; const char transb='N'; const double alpha=1.0; const double beta=1.0; int Iteration = 10; int ldab = Size; int matrix_element = Size*ldab; /* We allocate memory on a 64-byte aligned boundary, using mkl_malloc, to improve memory access. */ double* A = (double*)mkl_malloc(sizeof(double)*matrix_element, 64); double* B = (double*)mkl_malloc(sizeof(double)*matrix_element,64); double* C = (double*)mkl_malloc(sizeof(double)*matrix_element,64); if (A == NULL || B == NULL || C == NULL) { printf( "\n ERROR: Can't allocate memory for matrices. Aborting... \n\n"); mkl_free(A); mkl_free(B); mkl_free(C); return 1; } printf("Initializing matrix data\n"); fflush(0); /* Init of Matrices */ #pragma omp parallel for for (int i=0; i<matrix_element; i++) A[i] = B[i] = C[i] = (double)(i+1); printf("Computing matrix product using Intel MKL dgemm function\n"); fflush(0); /* First call which does the thread/buffer initialization */ dgemm(&transa, &transb, &Size, &Size, &Size, &alpha, A, &ldab, B, &ldab, &beta, C, &Size); /* t1 = starting time */ double t1 = omp_get_wtime(); for (int k=0; k<Iteration; k++){ dgemm(&transa, &transb, &Size, &Size, &Size, &alpha, A, &ldab, B, &ldab, &beta, C, &Size); } /* t2 = end time */ double t2 = omp_get_wtime(); /* We compute the average number of flops */ double flops = (2.0*Size*Size*Size) * 1e-9/((t2-t1)/Iteration); printf("Average = %.1f GFLOP/s\n", flops); printf ("\n Deallocating memory \n\n"); fflush(0); mkl_free(A); mkl_free(B); mkl_free(C); printf (" Example completed. \n\n"); return 0; }