#include <stdio.h>

#define MATRIX_WIDTH  4096
#define MATRIX_HEIGHT 4096
#define TILE_SIZE 16


__global__ void naive_matmul(int *a, int *b, int *c)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < MATRIX_HEIGHT && col < MATRIX_WIDTH) {
        int sum = 0;
        for (int i = 0; i < MATRIX_WIDTH; i++) {
            sum += a[row * MATRIX_WIDTH + i] * b[i * MATRIX_WIDTH + col];
        }
        c[row * MATRIX_WIDTH + col] = sum;
    }
}


__global__ void tiled_matmul(int *a, int *b, int *c)
{
    __shared__ int tile_a[TILE_SIZE][TILE_SIZE];
    __shared__ int tile_b[TILE_SIZE][TILE_SIZE];

    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int bx = blockIdx.x; 
    int by = blockIdx.y;
    int row = by * TILE_SIZE + ty;
    int col = bx * TILE_SIZE + tx;

    int value = 0;
    for (int i = 0; i < (MATRIX_WIDTH + TILE_SIZE - 1) / TILE_SIZE; i++) {
        if (row < MATRIX_HEIGHT && i * TILE_SIZE + tx < MATRIX_WIDTH) {
            tile_a[ty][tx] = a[row * MATRIX_WIDTH + i * TILE_SIZE + tx];
        } else {
            tile_a[ty][tx] = 0;
        }

        if (col < MATRIX_WIDTH && i * TILE_SIZE + ty < MATRIX_HEIGHT) {
            tile_b[ty][tx] = b[(i * TILE_SIZE + ty) * MATRIX_WIDTH + col];
        } else {
            tile_b[ty][tx] = 0;
        }

        __syncthreads();

        for (int j = 0; j < TILE_SIZE; j++) {
            value += tile_a[ty][j] * tile_b[j][tx];
        }

        __syncthreads();
    }

    if (row < MATRIX_HEIGHT && col < MATRIX_WIDTH) {
        c[row * MATRIX_WIDTH + col] = value;
    }
}


int main()
{
    int *h_a, *h_b, *h_naive_c, *h_tiled_c;
    int *d_a, *d_b, *d_naive_c, *d_tiled_c;

    int n = MATRIX_WIDTH * MATRIX_HEIGHT;
    int size = n * sizeof(int);
    dim3 threadPerBlock(TILE_SIZE, TILE_SIZE);
    dim3 blockPerGrid((MATRIX_WIDTH + threadPerBlock.x - 1) / threadPerBlock.x,
                      (MATRIX_HEIGHT + threadPerBlock.y - 1) / threadPerBlock.y);

    h_a = (int *)malloc(size);
    h_b = (int *)malloc(size);
    h_naive_c = (int *)malloc(size);
    h_tiled_c = (int *)malloc(size);

    cudaMalloc(&d_a, size);
    cudaMalloc(&d_b, size);
    cudaMalloc(&d_naive_c, size);
    cudaMalloc(&d_tiled_c, size);

    for (int i = 0; i < n; ++i) {
        h_a[i] = rand() % (100);
        h_b[i] = rand() % (100);
    }

    cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);

    // Naive version
    cudaEvent_t naive_start, naive_end;
    cudaEventCreate(&naive_start);
    cudaEventCreate(&naive_end);

    cudaEventRecord(naive_start);
    for (int i = 0; i < 10; i++) {
        naive_matmul<<<blockPerGrid, threadPerBlock>>>(d_a, d_b, d_naive_c);
    }
    cudaEventRecord(naive_end);
    cudaDeviceSynchronize();
    cudaMemcpy(h_naive_c, d_naive_c, size, cudaMemcpyDeviceToHost);

    float naive_elapsed_time;
    cudaEventElapsedTime(&naive_elapsed_time, naive_start, naive_end);
    printf("Time taken: %f ms for 10 trials by the naive method\n" , naive_elapsed_time);

    cudaEventDestroy(naive_start);
    cudaEventDestroy(naive_end);

    // Tiled version
    cudaEvent_t tiled_start, tiled_end;
    cudaEventCreate(&tiled_start);
    cudaEventCreate(&tiled_end);

    cudaEventRecord(tiled_start);
    for (int i = 0; i < 10; i++) {
        tiled_matmul<<<blockPerGrid, threadPerBlock>>>(d_a, d_b, d_tiled_c);
    }
    cudaEventRecord(tiled_end);
    cudaDeviceSynchronize();
    cudaMemcpy(h_tiled_c, d_tiled_c, size, cudaMemcpyDeviceToHost);

    float tiled_elapsed_time;
    cudaEventElapsedTime(&tiled_elapsed_time, tiled_start, tiled_end);
    printf("Time taken: %f ms for 10 trials by the tiled method\n" , tiled_elapsed_time);

    cudaEventDestroy(tiled_start);
    cudaEventDestroy(tiled_end);

    // Comparison
    int errors = 0;
    for (int i = 0; i < n; i++) {
        if (h_naive_c[i] != h_tiled_c[i]) {
            errors++;
        }
    }
    printf("Number of errors: %d\n", errors);
    
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_naive_c);
    cudaFree(d_tiled_c);
    free(h_a);
    free(h_b);
    free(h_naive_c);
    free(h_tiled_c);

    return 0;
}