/* LU decomposiotion parallelization using CUDA SJ */ /* Unified memory version */ #include #include #include #include // includes CUDA #include // includes, project #include #include // helper functions for SDK examples // element type #define TYPE double // input size #ifndef N #define N 2048 #endif // matrix typedef TYPE matrix[N][N]; #define NEW(type) ( (type*)malloc(sizeof(type)) ) double ltime(); void crout(matrix *A, matrix *L, matrix *U, int n); void crout_cuda(matrix *A, matrix *L, matrix *U, int n); void matmul(matrix *A, matrix *B, matrix *C); int main(int argc, char **argv) { int i, j; double starttime, endtime, seqtime = 0, partime; srand(42); matrix *A = NULL, *L = NULL, *U = NULL, *B = NULL; int msize = sizeof(matrix); /* allocate space for unified memory */ checkCudaErrors(cudaMallocManaged(&A, msize)); checkCudaErrors(cudaMallocManaged(&L, msize)); checkCudaErrors(cudaMallocManaged(&U, msize)); // B is host only B = NEW(matrix); /* create random matrix */ for (i = 0; i < N; i++) { for (j = 0; j < N; j++) { (*A)[i][j] = (TYPE) (rand()%10); } } if (N <= 10) { printf("A:\n"); for (i = 0; i < N; i++) { for (j = 0; j < N; j++) { printf("%6.2f ", (*A)[i][j]); } printf("\n"); } } // sequential starttime = ltime(); if (N <= 2048) crout(A, L, U, N); endtime = ltime(); seqtime = endtime-starttime; printf("Sequential %.3f s, %6.2lf MFLOPS, U(6, 9) = %lf \n", seqtime, ((double)N*N*N*2)/(1000000*seqtime), (*U)[6][7]); // parallel starttime = ltime(); crout_cuda(A, L, U, N); endtime = ltime(); partime = endtime-starttime; // cudaDeviceSynchronize(); // wait for results to be ready printf("Parallel %.3f s, %6.2lf MFLOPS, U(6, 9) = %lf \n", partime, ((double)N*N*N*2)/(1000000*partime), (*U)[6][7]); printf("Speedup %.3f\n", seqtime/partime); if (N <= 10) { printf("L:\n"); for (i = 0; i < N; i++) { for (j = 0; j < N; j++) { printf("%6.2f ", (*L)[i][j]); } printf("\n"); } printf("U:\n"); for (i = 0; i < N; i++) { for (j = 0; j < N; j++) { printf("%6.2f ", (*U)[i][j]); } printf("\n"); } matmul(L, U, B); printf("B:\n"); for (i = 0; i < N; i++) { for (j = 0; j < N; j++) { printf("%6.2f ", (*B)[i][j]); } printf("\n"); } printf("A-B:\n"); for (i = 0; i < N; i++) { for (j = 0; j < N; j++) { printf("%6.2f ", (*A)[i][j] - (*B)[i][j]); } printf("\n"); } } checkCudaErrors(cudaFree(A)); checkCudaErrors(cudaFree(U)); checkCudaErrors(cudaFree(L)); free(B); } // main() // LU decomposition in Crout's method // Source: Numerical Reciepes / Wikipedia void crout(matrix *A, matrix *L, matrix *U, int n) { int i, j, k; TYPE sum = 0; for (i = 0; i < n; i++) { (*U)[i][i] = 1; } for (j = 0; j < n; j++) { for (i = j; i < n; i++) { sum = 0; for (k = 0; k < j; k++) { sum = sum + (*L)[i][k] * (*U)[k][j]; } (*L)[i][j] = (*A)[i][j] - sum; } for (i = j; i < n; i++) { sum = 0; for(k = 0; k < j; k++) { sum = sum + (*L)[j][k] * (*U)[k][i]; } if ((*L)[j][j] == 0) { printf("det(L) close to 0!\n Can't divide by 0...\n"); exit(EXIT_FAILURE); } (*U)[j][i] = ((*A)[j][i] - sum) / (*L)[j][j]; } } } /* // these versions work even if N > blockDim.x * gridDim.x // zero matrix __global__ void crout_kernel0(matrix *A, matrix *L, matrix *U, int n) { int nThreads = blockDim.x * gridDim.x; int treadId = blockDim.x * blockIdx.x + threadIdx.x; int i, j; for (i = 0; i < n; i++) { for (j = treadId; j < n; j += nThreads) { (*U)[i][j] = 0; (*L)[i][j] = 0; } } } __global__ void crout_kernel1(matrix *A, matrix *L, matrix *U, int n) { int nThreads = blockDim.x * gridDim.x; int treadId = blockDim.x * blockIdx.x + threadIdx.x; int i; for (i = treadId; i < n; i += nThreads) { (*U)[i][i] = 1; } } __global__ void crout_kernel2(matrix *A, matrix *L, matrix *U, int n, int j) { int nThreads = blockDim.x * gridDim.x; int treadId = blockDim.x * blockIdx.x + threadIdx.x; int i, k; TYPE sum = 0; for (i = j+treadId; i < n; i+=nThreads) { sum = 0; for (k = 0; k < j; k++) { sum = sum + (*L)[i][k] * (*U)[k][j]; } (*L)[i][j] = (*A)[i][j] - sum; } } __global__ void crout_kernel3(matrix *A, matrix *L, matrix *U, int n, int j) { int nThreads = blockDim.x * gridDim.x; int treadId = blockDim.x * blockIdx.x + threadIdx.x; int i, k; TYPE sum = 0; for (i = j+treadId; i < n; i +=nThreads) { sum = 0; for(k = 0; k < j; k++) { sum = sum + (*L)[j][k] * (*U)[k][i]; } (*U)[j][i] = ((*A)[j][i] - sum) / (*L)[j][j]; } } */ // these versions require N <= blockDim.x * gridDim.x; // zero matrix, not needes of upper part of L and lower part of U are // not used __global__ void crout_kernel0(matrix *A, matrix *L, matrix *U, int n) { int j = blockDim.x * blockIdx.x + threadIdx.x; int i; for (i = 0; i < n; i++) { if (j < n) { (*U)[i][j] = 0; (*L)[i][j] = 0; } } } // phase 1: init diagonal of U to 1 __global__ void crout_kernel1(matrix *A, matrix *L, matrix *U, int n) { int i = blockDim.x * blockIdx.x + threadIdx.x; if (i < n) { (*U)[i][i] = 1; } } // phase 2: update partial row of L // read global memory __global__ void crout_kernel2(matrix *A, matrix *L, matrix *U, int n, int j) { int i = j + blockDim.x * blockIdx.x + threadIdx.x; int k; TYPE sum = 0; if (i < n) { sum = 0; for (k = 0; k < j; k++) { sum = sum + (*L)[i][k] * (*U)[k][j]; } (*L)[i][j] = (*A)[i][j] - sum; } } // use share memory to keep a shared copy of a column of ready U __global__ void crout_kernel2s(matrix *A, matrix *L, matrix *U, int n, int j) { int i = j + blockDim.x * blockIdx.x + threadIdx.x; int k; __shared__ TYPE ucol[N]; for (k = threadIdx.x ; k < j; k += blockDim.x) ucol[k] = (*U)[k][j]; __syncthreads(); TYPE sum = 0; if (i < n) { sum = 0; for (k = 0; k < j; k++) { sum = sum + (*L)[i][k] * ucol[k]; } (*L)[i][j] = (*A)[i][j] - sum; } } // interleave the access, not a benerif as shared memory access is not the bottleneck __global__ void crout_kernel2si(matrix *A, matrix *L, matrix *U, int n, int j) { int i = j + blockDim.x * blockIdx.x + threadIdx.x; int k; __shared__ TYPE ucol[N]; for (k = threadIdx.x ; k < j; k += blockDim.x) ucol[k] = (*U)[k][j]; __syncthreads(); TYPE sum = 0; if (i < n) { sum = 0; for (k = threadIdx.x; k < j; k++) { sum = sum + (*L)[i][k] * ucol[k]; } for (k = 0; k < j && k < threadIdx.x; k++) { sum = sum + (*L)[i][k] * ucol[k]; } (*L)[i][j] = (*A)[i][j] - sum; } } // phase 3: update partial column of U // read global memory __global__ void crout_kernel3(matrix *A, matrix *L, matrix *U, int n, int j) { int i = j + blockDim.x * blockIdx.x + threadIdx.x; int k; TYPE sum = 0; if (i < n) { sum = 0; for(k = 0; k < j; k++) { sum = sum + (*L)[j][k] * (*U)[k][i]; } (*U)[j][i] = ((*A)[j][i] - sum) / (*L)[j][j]; } } // use block-local copy of a row of L, not really useful __global__ void crout_kernel3s(matrix *A, matrix *L, matrix *U, int n, int j) { int i = j + blockDim.x * blockIdx.x + threadIdx.x; int k; TYPE sum = 0; __shared__ TYPE lrow[N]; for (k = threadIdx.x ; k < j; k += blockDim.x) lrow[k] = (*L)[j][k]; __syncthreads(); if (i < n) { sum = 0; for(k = 0; k < j; k++) { sum = sum + lrow[k] * (*U)[k][i]; } (*U)[j][i] = ((*A)[j][i] - sum) / (*L)[j][j]; } } void crout_cuda(matrix *A, matrix *L, matrix *U, int n) { int j; // no need to allocate // no need to copy input matrix to device // size of grid and block int threads = 128; int grid = (int)ceil(1.0*n/threads); printf("n=%d t=%d b=%d\n", n, threads, grid); // zero matrices // this is needed only if unused parts of L and U are used crout_kernel0<<>>(A, L, U, n); crout_kernel1<<>>(A, L, U, n); // proceed along rows/columns of L and U for (j = 0; j < n; j++) { // works also without crout_kernel2s<<>>(A, L, U, n, j); // implicit synchronization crout_kernel3<<>>(A, L, U, n, j); } // extra sync is needed here before host reads the results // could be later, here only to make benchmarking fair cudaDeviceSynchronize(); // no need to retrive results from device to host // unified memory is freed at host } // crout_cuda() /* basic algorithm according to the defition */ void matmul(matrix *A, matrix *B, matrix *C) { int i, j, k; TYPE c; for (i = 0; i < N; i++) { for (j = 0; j < N; j++) { c = 0.0; for (k = 0; k < N; k++) { c += (*A)[i][k] * (*B)[k][j]; } (*C)[i][j] = c; } } } double ltime() { struct timeval tv; struct timezone tz; gettimeofday(&tv, &tz); return tv.tv_sec + (double)tv.tv_usec/1000000; }