/**
 * Question 1: Run the program with the problem size of 1000 and 10 threads, what is the approximate speedup you are achieving?
 *
 * Answer 1: On my home machine single threading took 0.640239 seconds and multithreading took 0.194644 seconds.
 * On the lab machine single threading took 0.697472 seconds and multithreading took 0.114415 seconds.
 * This means that multithreading was about 3.29 times faster on my home machine and about 6.10 times faster on the lab machine.
 *
 * Question 2: Is there a problem size / number of threads combination that slows down the computation process? Why do you think it is happening?
 *
 * Answer 2: There is a problem with having too many threads for the number of hardware threads available. Increasing past this only
 * increases the overhead of creating and managing the threads. This is because the threads are not running in parallel and are instead
 * being switched between by the OS. At lower matrix sizes the cost of creating threads and managing them is greater than the cost of
 * just doing the computation in a single thread, so any combination where threads > the number of hardware threads will be slower, and
 * the closer the matrix size gets to 1, the greater the effect thread creation and management will have on the speed of the program.
 *
 * Question 3: What is the minimum size of the problem that benefits from creating an extra thread?
 *
 * Answer 3: The lowest size on my home machine that consistently benefited from an extra thread was 150, but this can change depending
 * on the specifications of the machine.
 *
 * Question 4: Does using the threads always improve execution duration?
 *
 * Answer 4: No. Once the number of threads exceeds the number of hardware threads available, the execution duration increases
 * because of the overhead of managing the threads. Threads also do not help for small matrices, where the cost of creating and
 * managing threads is greater than the cost of just doing the computation in a single thread.
 *
 * Question 5: Guesstimate and comment on the nature of growth of the speedup with the number of threads – is it linear, exponential, are there any limits?
 *
 * Answer 5: The speedup is linear up to the number of hardware threads available, given a large enough matrix size.
 * Given a matrix of size x, the execution time with y threads is approximately 1/y of the single-threaded time (a speedup of
 * about y), up to the number of hardware threads available.
 * After this the speedup will decrease as the number of threads increases, due to the overhead of creating and managing threads.
 **/

#include <errno.h>
#include <limits.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <time.h>

/* Exclusive upper bound for generated matrix entries: values are rand() % MAXN, i.e. 0 .. MAXN-1. */
#define MAXN 5

/**
 * Allocate a size x size matrix of ints with every element set to zero.
 *
 * The matrix is an array of `size` row pointers, each pointing to its own
 * separately allocated row of `size` ints.
 *
 * @param size number of rows and columns
 * @return the new matrix, or NULL if size is negative or an allocation fails
 *         (rows allocated before the failure are released). The caller owns
 *         the result and must free every row and then the row-pointer array.
 */
int **generate_square_matrix(int size) {
    if (size < 0) {
        return NULL;
    }
    size_t n = (size_t) size;
    /* Request at least one slot so a 0x0 matrix is still a non-NULL pointer,
     * which keeps "empty" distinguishable from "allocation failed". */
    int **array = calloc(n > 0 ? n : 1, sizeof *array);
    if (array == NULL) {
        return NULL;
    }
    for (size_t i = 0; i < n; i++) {
        /* calloc zero-fills the row and checks n * sizeof(int) for overflow. */
        array[i] = calloc(n, sizeof **array);
        if (array[i] == NULL) {
            /* Unwind the rows allocated so far so nothing leaks. */
            while (i > 0) {
                free(array[--i]);
            }
            free(array);
            return NULL;
        }
    }
    return array;
}

/**
 * Allocate a size x size matrix and fill it with pseudo-random values.
 *
 * Every element is set to rand() % MAXN, so the contents depend on the
 * current state of the rand() generator (seed it with srand() beforehand
 * if a particular sequence is wanted).
 *
 * @param size number of rows and columns
 * @return the filled matrix, or NULL if generate_square_matrix() returned
 *         NULL. The caller owns the result.
 */
int **generate_square_matrix_and_fill_it(int size) {
    int **array = generate_square_matrix(size);
    if (array == NULL) {
        /* Nothing to fill: hand the allocation failure back to the caller. */
        return NULL;
    }
    for (int i = 0; i < size; i++) {
        for (int j = 0; j < size; j++) {
            array[i][j] = rand() % MAXN;
        }
    }
    return array;
}

/**
 * Write a square matrix to standard output, one matrix row per line.
 *
 * Each element is printed as a decimal integer followed by a single space,
 * and every row is terminated by a newline.
 *
 * @param array matrix to print, addressed as array[row][column]
 * @param size  number of rows and columns
 */
void print_square_matrix(int **array, int size) {
    int row = 0;
    while (row < size) {
        int *values = array[row];
        for (int col = 0; col < size; col++) {
            printf("%i ", values[col]);
        }
        printf("\n");
        row++;
    }
}

/**
 * Compare two size x size matrices element by element in row-major order.
 *
 * @param array  first matrix
 * @param array2 second matrix
 * @param size   number of rows and columns
 * @return 0 when all elements are equal; otherwise the difference
 *         array[i][j] - array2[i][j] at the first position where they differ.
 */
int check_if_matrices_differ(int **array, int **array2, int size) {
    for (int row = 0; row < size; row++) {
        const int *lhs = array[row];
        const int *rhs = array2[row];
        for (int col = 0; col < size; col++) {
            int delta = lhs[col] - rhs[col];
            if (delta != 0) {
                return delta;
            }
        }
    }
    return 0;
}

/*
 * Argument bundle handed to multiply_matrices() and
 * multiply_matrices_threaded() through their void pointer parameter.
 */
typedef struct _params {
    // left operand of the product; read as first_array[row][i]
    int **first_array;
    // right operand of the product; read as second_array[i][column]
    int **second_array;
    // output matrix; result[row][column] receives each computed element
    int **result;
    // step between consecutive rows handled by multiply_matrices_threaded()
    int max_threads;
    // first row the multiplication routine processes
    int row_index;
    // dimension of the square matrices (rows == columns == size)
    int size;
} ThreadParams;

/**
 * Sequential matrix product: result = first_array x second_array.
 *
 * Processes every row from row_index up to size - 1 in order, computing each
 * output element as the dot product of a row of first_array and a column of
 * second_array. The max_threads field is not used here.
 *
 * @param threadParams pointer to a ThreadParams describing the operands,
 *                     the output matrix, the starting row and the dimension
 */
void multiply_matrices(void *threadParams) {
    ThreadParams *args = (ThreadParams *) threadParams;
    const int dim = args->size;

    for (int r = args->row_index; r < dim; r++) {
        for (int c = 0; c < dim; c++) {
            int dot = 0;
            for (int k = 0; k < dim; k++) {
                dot += args->first_array[r][k] * args->second_array[k][c];
            }
            args->result[r][c] = dot;
        }
    }
}

/**
 * Thread entry point computing an interleaved subset of the rows of
 * result = first_array x second_array.
 *
 * Starting at row_index, the routine computes one full output row and then
 * advances by max_threads rows, so workers started with row_index values
 * 0 .. max_threads - 1 write disjoint rows of the result and together cover
 * the whole matrix.
 *
 * @param threadParams pointer to a ThreadParams; it is only read, and its
 *                     ownership stays with the caller
 * @return always NULL. Nothing is computed when max_threads is not positive
 *         or row_index is negative.
 */
void *multiply_matrices_threaded(void *threadParams) {
    ThreadParams *t = (ThreadParams *) threadParams;
    const int N = t->size;
    const int stride = t->max_threads;
    int row = t->row_index;

    /* A zero stride would recompute the same row forever and a negative
     * stride or start row would index before the matrix, so refuse both. */
    if (stride <= 0 || row < 0) {
        return NULL;
    }

    while (row < N) {
        for (int column = 0; column < N; column++) {
            int temp_result = 0;
            for (int i = 0; i < N; i++) {
                temp_result += t->first_array[row][i] * t->second_array[i][column];
            }
            t->result[row][column] = temp_result;
        }
        /* Stop before stepping when the next row lies past the end; testing
         * against the remaining distance also keeps row + stride from
         * overflowing a signed int. */
        if (stride >= N - row) {
            break;
        }
        row += stride;
    }
    return NULL;
}

int main(int argc, char **argv) {
    if (argc != 3) {
        printf("Please provide size of the matrix and the number of threads to execute\n");
        exit(0);
    }
    int size = atoi(argv[1]);
    int max_threads = atoi(argv[2]);
    // The value you pass to srand determines the random sequence
    srand(time(NULL)); // Line to initialize the random number generator.
    int **array1 = generate_square_matrix_and_fill_it(size);
    int **array2 = generate_square_matrix_and_fill_it(size);
    int **result = generate_square_matrix(size); // generate an empty matrix
    struct timeval begin;
    struct timeval end;
    gettimeofday(&begin, NULL); // fills the contents with time since the beginning of epoch
    ThreadParams *thr = (ThreadParams *) malloc(
            sizeof(ThreadParams)); // allocate a structure for holding function parameters
    thr->first_array = array1; // first matrix to multiply
    thr->second_array = array2; // the second matrix to multiply
    thr->result = result; // where to store the results - note it needs to be generated
    thr->row_index = 0; // this variable, in combination with max_threads can be used for parallelization
    thr->size = size;
    thr->max_threads = max_threads;

    multiply_matrices((void *) thr);

    gettimeofday(&end, NULL); // fills the contents with time since the beginning of epoch
    //The next line is inspired by https://linuxhint.com/gettimeofday_c_language/
    long long microseconds = (end.tv_sec * 1000000 + end.tv_usec) - (begin.tv_sec * 1000000 + begin.tv_usec);
    double duration = (1.0 * microseconds) / 1000000;
    printf("Single threaded took %lf seconds to execute \n", duration);
    int **threaded_result = generate_square_matrix(size);
    gettimeofday(&begin, NULL);
    /**
     * Write your code to create and use max_threads here, such that the threaded_result 
     * is populated with the result of the computation.
     */
    thr->result = threaded_result;
    pthread_t threads[max_threads];
    for (int i = 0; i < max_threads; i++) {
        ThreadParams *params = (ThreadParams *) malloc(sizeof(ThreadParams));
        params->first_array = array1;
        params->second_array = array2;
        params->result = threaded_result;
        params->row_index = i;
        params->size = size;
        params->max_threads = max_threads;
        pthread_create(&threads[i], NULL, &multiply_matrices_threaded, (void *) params);
    }
    for (int i = 0; i < max_threads; i++) {
        pthread_join(threads[i], NULL);
    }

    gettimeofday(&end, NULL);
    //The next line is inspired by https://linuxhint.com/gettimeofday_c_language/
    microseconds = (end.tv_sec * 1000000 + end.tv_usec) - (begin.tv_sec * 1000000 + begin.tv_usec);
    duration = (1.0 * microseconds) / 1000000;
    printf("Multi-threaded took %lf seconds to execute \n", duration);

    if (check_if_matrices_differ(result, threaded_result, size) != 0) {
        printf("Threaded result differ from single core computation, error\n");
        exit(1);
    }
    return 0;
}