#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include "mympi.h"

typedef int Element_type;

#define MPI_TYPE  MPI_INT
#define REORDER   1

int main( int argc, char *argv[] )
{
    Element_type **A;          /* matrix to be multiplied        */
    Element_type *A_storage;   /* backing storage for matrix     */
    Element_type *X_local;     /* local portion of vector X      */
    Element_type *B_sums;      /* result vector A*X = B          */
    Element_type *B_partial;   /* subblock of result vector      */
    Element_type *input_vec;   /* input vector read from file    */
    int    nrows;          // total number of rows in input matrix
    int    ncols;          // total number of columns in input matrix
    int    id;             // process rank in MPI_COMM_WORLD
    int    p;              // number of processes in MPI_COMM_WORLD
    int    i, j;           // loop indices
    int    error;          // error exit value of calls
    int    nlocal_rows;    // number of rows belonging to this process
    int    nlocal_cols;    // number of cols belonging to this process
    int    grid_id;        // rank of process within Cartesian grid
    int    row_id;         // rank of process within its row subgroup
    int    grid_size[2];   // grid dimensions
    int    grid_coords[2]; // process's coordinates in Cartesian grid
    MPI_Comm grid_comm;    // Cartesian communicator for grid of procs
    MPI_Comm row_comm;     // communicator for all processes in a grid row
    MPI_Comm col_comm;     // communicator for all processes in a grid col
    int   *send_count;     // array of counts for MPI_Scatterv function
    int   *send_offset;    // array of offsets for MPI_Scatterv function
    int    periodic[2];    // array of wraparound flags when creating grid
    int    nrows_X;        // number of rows of X vector local to process
    FILE  *file;           // for opening input files

    MPI_Init( &argc, &argv );
    MPI_Comm_rank( MPI_COMM_WORLD, &id );
    MPI_Comm_size( MPI_COMM_WORLD, &p );

    grid_size[0] = 0;      // Let MPI choose the grid dimensions
    grid_size[1] = 0;
    MPI_Dims_create( p, 2, grid_size );

    periodic[0] = 0;       // No wraparound
    periodic[1] = 0;       // No wraparound
    MPI_Cart_create( MPI_COMM_WORLD, 2, grid_size, periodic, REORDER, &grid_comm );

    MPI_Comm_rank( grid_comm, &grid_id );                    // rank in Cartesian grid
    MPI_Cart_coords( grid_comm, grid_id, 2, grid_coords );   // coordinates in grid

    MPI_Comm_split( grid_comm, grid_coords[0], grid_coords[1], &row_comm );
    MPI_Comm_split( grid_comm, grid_coords[1], grid_coords[0], &col_comm );

    read_and_distribute_2dblock_matrix( argv[1], (void*) &A, (void*) &A_storage,
            MPI_TYPE, &nrows, &ncols, &error, grid_comm );
    if ( 0 != error )
        MPI_Abort( MPI_COMM_WORLD, OPEN_FILE_ERROR );

    collect_and_print_2dblock_matrix( (void**) A, MPI_TYPE, nrows, ncols, grid_comm );

    nlocal_rows = size_of_block( grid_coords[0], nrows, grid_size[0] );
    nlocal_cols = size_of_block( grid_coords[1], ncols, grid_size[1] );

    /* Processes in the first grid row handle the input vector; only the
       process in grid column 0 actually opens and reads the file. */
    if ( 0 == grid_coords[0] ) {
        MPI_Comm_rank( row_comm, &row_id );
        if ( 0 == row_id ) {   /* we could also have just checked the column coordinate */
            file = fopen( argv[2], "r" );
            if ( NULL == file ) {
                MPI_Abort( MPI_COMM_WORLD, MPI_ERR_FILE );
            }
            else {
                int temp;
                fread( &temp, sizeof(int), 1, file );
                /* Make sure vector length matches matrix width. */
                if ( temp != ncols ) {
                    fprintf( stderr, "Vector size and matrix size unmatched.\n" );
                    MPI_Abort( MPI_COMM_WORLD, OPEN_FILE_ERROR );
                }
                input_vec = (Element_type*) malloc( ncols * sizeof(Element_type) );
                if ( NULL == input_vec ) {
                    fprintf( stderr, "Could not allocate more storage.\n" );
                    MPI_Abort( MPI_COMM_WORLD, MPI_ERR_NO_MEM );
                }
                fread( input_vec, sizeof(Element_type), ncols, file );
            }
        }
    }

    nrows_X = size_of_block( grid_coords[1], ncols, grid_size[1] );
    X_local = (Element_type*) malloc( nrows_X * sizeof(Element_type) );
    if ( NULL == X_local ) {
        fprintf( stderr, "Could not allocate more storage.\n" );
        MPI_Abort( MPI_COMM_WORLD, MPI_ERR_NO_MEM );
    }

    /* Scatter the input vector across the processes of the first grid row. */
    if ( 0 == grid_coords[0] ) {
        if ( grid_size[1] > 1 ) {
            send_count  = (int*) malloc( grid_size[1] * sizeof(int) );
            send_offset = (int*) malloc( grid_size[1] * sizeof(int) );
            if ( NULL == send_count || NULL == send_offset ) {
                fprintf( stderr, "Could not allocate more storage.\n" );
                MPI_Abort( MPI_COMM_WORLD, MPI_ERR_NO_MEM );
            }
            init_communication_arrays( grid_size[1], ncols, send_count, send_offset );
            MPI_Scatterv( input_vec, send_count, send_offset, MPI_TYPE,
                          X_local, nrows_X, MPI_TYPE, 0, row_comm );
        }
        else
            for ( i = 0; i < ncols; i++ )
                X_local[i] = input_vec[i];
    }

    /* Broadcast each piece of the vector down its grid column. */
    MPI_Bcast( X_local, nrows_X, MPI_TYPE, 0, col_comm );

    B_partial = (Element_type*) malloc( nlocal_rows * sizeof(Element_type) );
    B_sums    = (Element_type*) malloc( nlocal_rows * sizeof(Element_type) );
    if ( NULL == B_sums || NULL == B_partial ) {
        fprintf( stderr, "Could not allocate more storage.\n" );
        MPI_Abort( MPI_COMM_WORLD, MPI_ERR_NO_MEM );
    }

    /* Each process multiplies its block of A by its piece of X. */
    for ( i = 0; i < nlocal_rows; i++ ) {
        B_partial[i] = 0;
        for ( j = 0; j < nlocal_cols; j++ )
            B_partial[i] += A[i][j] * X_local[j];
    }

    /* Sum the partial products across each grid row; the results end up
       on the processes in grid column 0. */
    MPI_Reduce( B_partial, B_sums, nlocal_rows, MPI_TYPE, MPI_SUM, 0, row_comm );

    if ( grid_coords[1] == 0 )
        print_block_vector( B_sums, MPI_TYPE, nrows, col_comm );

    MPI_Finalize();
    return 0;
}
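The helper functions size_of_block and init_communication_arrays are declared in mympi.h and are not shown in this listing. The sketch below is only a guess at their behavior, inferred from the call sites above: it assumes the common block decomposition in which process id of p owns elements floor(id*n/p) through floor((id+1)*n/p) - 1, so the counts and offsets it produces match what the MPI_Scatterv call expects. The actual implementations in mympi.h may differ.

/* Sketch (not the actual mympi.h code): number of elements owned by process
   'id' when 'ntotal' elements are divided in blocks among 'p' processes. */
int size_of_block( int id, int ntotal, int p )
{
    return ( (id + 1) * ntotal ) / p - ( id * ntotal ) / p;
}

/* Sketch: fill the count and offset arrays for MPI_Scatterv so that process
   'id' receives size_of_block(id, ntotal, p) elements starting at element
   (id * ntotal) / p. */
void init_communication_arrays( int p, int ntotal, int *count, int *offset )
{
    int id;
    for ( id = 0; id < p; id++ ) {
        count[id]  = size_of_block( id, ntotal, p );
        offset[id] = ( id * ntotal ) / p;
    }
}

With this decomposition the blocks are contiguous (offset[id+1] equals offset[id] + count[id]), which is what the MPI_Scatterv call in the listing relies on when it distributes input_vec across the first grid row.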