#include #define MATRIX_WIDTH 4096 #define MATRIX_HEIGHT 4096 #define TILE_SIZE 16 __global__ void naive_matmul(int *a, int *b, int *c) { int row = blockIdx.y * blockDim.y + threadIdx.y; int col = blockIdx.x * blockDim.x + threadIdx.x; if (row < MATRIX_HEIGHT && col < MATRIX_WIDTH) { int sum = 0; for (int i = 0; i < MATRIX_WIDTH; i++) { sum += a[row * MATRIX_WIDTH + i] * b[i * MATRIX_WIDTH + col]; } c[row * MATRIX_WIDTH + col] = sum; } } __global__ void tiled_matmul(int *a, int *b, int *c) { __shared__ int tile_a[TILE_SIZE][TILE_SIZE]; __shared__ int tile_b[TILE_SIZE][TILE_SIZE]; int tx = threadIdx.x; int ty = threadIdx.y; int bx = blockIdx.x; int by = blockIdx.y; int row = by * TILE_SIZE + ty; int col = bx * TILE_SIZE + tx; int value = 0; for (int i = 0; i < (MATRIX_WIDTH + TILE_SIZE - 1) / TILE_SIZE; i++) { if (row < MATRIX_HEIGHT && i * TILE_SIZE + tx < MATRIX_WIDTH) { tile_a[ty][tx] = a[row * MATRIX_WIDTH + i * TILE_SIZE + tx]; } else { tile_a[ty][tx] = 0; } if (col < MATRIX_WIDTH && i * TILE_SIZE + ty < MATRIX_HEIGHT) { tile_b[ty][tx] = b[(i * TILE_SIZE + ty) * MATRIX_WIDTH + col]; } else { tile_b[ty][tx] = 0; } __syncthreads(); for (int j = 0; j < TILE_SIZE; j++) { value += tile_a[ty][j] * tile_b[j][tx]; } __syncthreads(); } if (row < MATRIX_HEIGHT && col < MATRIX_WIDTH) { c[row * MATRIX_WIDTH + col] = value; } } int main() { int *h_a, *h_b, *h_naive_c, *h_tiled_c; int *d_a, *d_b, *d_naive_c, *d_tiled_c; int n = MATRIX_WIDTH * MATRIX_HEIGHT; int size = n * sizeof(int); dim3 threadPerBlock(TILE_SIZE, TILE_SIZE); dim3 blockPerGrid((MATRIX_WIDTH + threadPerBlock.x - 1) / threadPerBlock.x, (MATRIX_HEIGHT + threadPerBlock.y - 1) / threadPerBlock.y); h_a = (int *)malloc(size); h_b = (int *)malloc(size); h_naive_c = (int *)malloc(size); h_tiled_c = (int *)malloc(size); cudaMalloc(&d_a, size); cudaMalloc(&d_b, size); cudaMalloc(&d_naive_c, size); cudaMalloc(&d_tiled_c, size); for (int i = 0; i < n; ++i) { h_a[i] = rand() % (100); h_b[i] = rand() % (100); } cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice); cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice); // Naive version cudaEvent_t naive_start, naive_end; cudaEventCreate(&naive_start); cudaEventCreate(&naive_end); cudaEventRecord(naive_start); for (int i = 0; i < 10; i++) { naive_matmul<<>>(d_a, d_b, d_naive_c); } cudaEventRecord(naive_end); cudaDeviceSynchronize(); cudaMemcpy(h_naive_c, d_naive_c, size, cudaMemcpyDeviceToHost); float naive_elapsed_time; cudaEventElapsedTime(&naive_elapsed_time, naive_start, naive_end); printf("Time taken: %f ms for 10 trials by the naive method\n" , naive_elapsed_time); cudaEventDestroy(naive_start); cudaEventDestroy(naive_end); // Tiled version cudaEvent_t tiled_start, tiled_end; cudaEventCreate(&tiled_start); cudaEventCreate(&tiled_end); cudaEventRecord(tiled_start); for (int i = 0; i < 10; i++) { tiled_matmul<<>>(d_a, d_b, d_tiled_c); } cudaEventRecord(tiled_end); cudaDeviceSynchronize(); cudaMemcpy(h_tiled_c, d_tiled_c, size, cudaMemcpyDeviceToHost); float tiled_elapsed_time; cudaEventElapsedTime(&tiled_elapsed_time, tiled_start, tiled_end); printf("Time taken: %f ms for 10 trials by the tiled method\n" , tiled_elapsed_time); cudaEventDestroy(tiled_start); cudaEventDestroy(tiled_end); // Comparison int errors = 0; for (int i = 0; i < n; i++) { if (h_naive_c[i] != h_tiled_c[i]) { errors++; } } printf("Number of errors: %d\n", errors); cudaFree(d_a); cudaFree(d_b); cudaFree(d_naive_c); cudaFree(d_tiled_c); free(h_a); free(h_b); free(h_naive_c); free(h_tiled_c); return 0; }