Reports

Here is an update: I have written an updated version of the code that uses dynamic allocation for all the matrices. It also works quite well in parallel (I have tested it up to 4096x4096); the only minor issue is that, at the largest size tested, I had to disable the call to the "print" function because it stalled the program.

Inside the block-multiplication function there is now a condition on all three inner loops to handle the case where the row and column counts are not evenly divisible by the block size, using the fmin() function with this syntax:

for(int i=ii; i<fmin(ii+blockSize, rowsA); ++i)
                                {
                                    for(int j=jj; j<fmin(jj+blockSize, colsB); ++j)
                                    {
                                        for(int k=kk;k<fmin(kk+blockSize, rowsA); ++k)
                                        {
                                        matC[i][j] += matA[i][k]*matB[k][j];

I also tried this approach in an early version of the serial code, but for some reason it didn't work, probably because I made some logic mistakes.

Anyway, this code does not work on rectangular matrices: if you try to run it with two rectangular matrices you will get an error, because the pointers write outside the memory areas they are supposed to work within.
I tried to work out how to turn all the checks and mathematical conditions required for rectangular matrices into working code, but I had no success; I admit it's beyond my skills. If anyone has code that could be used (maybe from past examples or from some source on the net), it could be an extra addition to the algorithm. I searched a lot, both here and on the internet, but found nothing.

Here is the updated full code:

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <omp.h>

/* run this program using the console pauser or add your own getch, system("pause") or input loop */


    /*
     * Returns a timestamp in seconds used to measure elapsed time.
     * With OpenMP enabled this is the OpenMP wall clock; otherwise it falls
     * back to the standard clock(), which reports processor time.
     */
    static double matMultTimestamp(void)
    {
    #ifdef _OPENMP
        return omp_get_wtime();
    #else
        return (double)clock() / CLOCKS_PER_SEC;
    #endif
    }

    /*
     * Blocked matrix product: accumulates matA * matB into matC.
     *
     * rowsA, colsA  dimensions of matA (rowsA row pointers of colsA ints each)
     * rowsB, colsB  dimensions of matB; rowsB is expected to equal colsA and
     *               is not otherwise used
     * blockSize     edge length of the square tiles; values below 1 are
     *               treated as 1 so the tile loops always advance
     * matA, matB    input matrices, read only
     * matC          rowsA x colsB output; the products are ADDED to its
     *               current contents, so the caller must zero it beforehand
     *
     * One OpenMP task is created per (ii, jj, kk) tile triple. Tasks that update
     * the same tile of matC name the same inout dependence and are therefore
     * ordered with respect to each other. Without OpenMP the pragmas are ignored
     * and the loops run serially.
     *
     * Prints the elapsed time of the product to stdout.
     */
    void matMultDyn(int rowsA, int colsA, int rowsB, int colsB, int blockSize, int **matA, int **matB, int **matC)
    {
        (void)rowsB; /* the inner dimension is taken from colsA */

        /* A non-positive step would never advance the tile loops. */
        if (blockSize < 1)
        {
            blockSize = 1;
        }

        double total_time_prod = matMultTimestamp();
        #pragma omp parallel
        {
            #pragma omp single
            {
                for(int ii=0; ii<rowsA; ii+=blockSize)
                {
                    for(int jj=0; jj<colsB; jj+=blockSize)
                    {
                        /* The shared (inner) dimension is colsA, not rowsA. */
                        for(int kk=0; kk<colsA; kk+=blockSize)
                        {
                            #pragma omp task depend(in: matA[ii:blockSize][kk:blockSize], matB[kk:blockSize][jj:blockSize]) depend(inout: matC[ii:blockSize][jj:blockSize])
                            {
                                /* Clamp each tile to the matrix edge so sizes that are
                                   not a multiple of blockSize are handled. */
                                const int iEnd = (rowsA - ii < blockSize) ? rowsA : ii + blockSize;
                                const int jEnd = (colsB - jj < blockSize) ? colsB : jj + blockSize;
                                const int kEnd = (colsA - kk < blockSize) ? colsA : kk + blockSize;

                                for(int i=ii; i<iEnd; ++i)
                                {
                                    for(int j=jj; j<jEnd; ++j)
                                    {
                                        for(int k=kk; k<kEnd; ++k)
                                        {
                                            matC[i][j] += matA[i][k]*matB[k][j];
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
        total_time_prod = matMultTimestamp() - total_time_prod;
        printf("Total product execution time by parallel threads (in seconds): %f\n", total_time_prod);
    }
    
    
    /*
     * Prints a header line and then the matrix to stdout: one output line per
     * row, every value followed by a single space.
     *
     * product  matrix stored as an array of row pointers
     * rows     number of rows to print
     * cols     number of values to print from each row
     */
    void printMatrix(int **product, int rows, int cols)
    {
        int r = 0;

        printf("Resultant Product Matrix:\n");
        while (r < rows)
        {
            int c = 0;

            while (c < cols)
            {
                printf("%d ", product[r][c]);
                ++c;
            }
            printf("\n");
            ++r;
        }
    }

/*
 * Allocates a rows x cols matrix of int as an array of row pointers and sets
 * every element to `fill`.
 * Returns NULL if any allocation fails; rows already allocated are released.
 */
static int **allocMatrix(int rows, int cols, int fill)
{
    int **mat = malloc((size_t)rows * sizeof *mat);
    if (mat == NULL)
    {
        return NULL;
    }

    for (int i = 0; i < rows; i++)
    {
        mat[i] = malloc((size_t)cols * sizeof *mat[i]);
        if (mat[i] == NULL)
        {
            /* Undo the rows allocated so far before reporting failure. */
            while (i-- > 0)
            {
                free(mat[i]);
            }
            free(mat);
            return NULL;
        }
        for (int j = 0; j < cols; j++)
        {
            mat[i][j] = fill;
        }
    }
    return mat;
}

/* Releases a matrix obtained from allocMatrix(); a NULL matrix is ignored. */
static void freeMatrix(int **mat, int rows)
{
    if (mat == NULL)
    {
        return;
    }
    for (int i = 0; i < rows; i++)
    {
        free(mat[i]);
    }
    free(mat);
}

/*
 * Builds two constant-filled matrices, multiplies them with matMultDyn(),
 * prints the product and reports the time spent in each phase.
 * Returns 0 on success, EXIT_FAILURE on a dimension mismatch or when memory
 * cannot be allocated.
 */
int main(int argc, char *argv[]) {
    (void)argc;
    (void)argv;

    //variable to calculate total program runtime
    double program_runtime = omp_get_wtime();
    //matrices and blocksize dimensions
    int rowsA = 256, colsA = 256;
    int rowsB = 256, colsB = 256;
    int blockSize = 24;

    if (colsA != rowsB)
    {
        fprintf(stderr, "No. of columns of first matrix must match no. of rows of the second matrix, program terminated\n");
        return EXIT_FAILURE;
    }

    //variable to calculate total time for initialization procedures
    double init_runtime = omp_get_wtime();

    //------------------------------------ Matrices initialization ------------------------------------------
    int **matA = allocMatrix(rowsA, colsA, 3);
    int **matB = allocMatrix(rowsB, colsB, 1);
    // matC is the product matrix; it must start at zero because the product is accumulated into it
    int **matC = allocMatrix(rowsA, colsB, 0);

    if (matA == NULL || matB == NULL || matC == NULL)
    {
        fprintf(stderr, "out of memory\n");
        freeMatrix(matA, rowsA);
        freeMatrix(matB, rowsB);
        freeMatrix(matC, rowsA);
        return EXIT_FAILURE;
    }

    init_runtime = omp_get_wtime() - init_runtime;
    printf("Total time for matrix initialization (in seconds): %f\n", init_runtime);
    //omp_set_num_threads(8);

    // function call for block matrix product between A and B
    matMultDyn(rowsA, colsA, rowsB, colsB, blockSize, matA, matB, matC);

    // function call to print the resultant Product matrix C
    printMatrix(matC, rowsA, colsB);

    // --------------------------------------- Dynamic matrices cleanup -------------------------------------------
    // each matrix is released using its own row count (matB has rowsB rows)
    freeMatrix(matA, rowsA);
    freeMatrix(matB, rowsB);
    freeMatrix(matC, rowsA);

    //Program total runtime calculation
    program_runtime = omp_get_wtime() - program_runtime;
    printf("Program total runtime (in seconds): %f\n", program_runtime);
    return 0;

}

To complete the testing and comparison of the code, I will create a machine on Google Cloud equipped with 32 cores, so I can see how the code runs on an actual 16-core machine and then with 32 cores.
For reference, I'm running this code on my MSI notebook, which is equipped with an Intel i7-11800 (8 cores at 3.2 GHz) and can manage up to 16 threads concurrently. The reason for testing on Google Cloud is that I want the software to run on a "real" 16-core machine, where one thread runs on each core, and then scale further up to 32 cores.
With the collected data I will then draw some graphs for comparison.

79745987