/* * Matrix-vector multiplication, Version 2 * * This program multiplies a matrix and a vector. * The matrix and vector are input from files. * The answer is printed to standard output. * * Data distribution of matrix: columnwise block striped * Data distribution of vector: block * * Programmed by Michael J. Quinn * * Last modification: 9 September 2002 */ #include #include #include "mympi.h" /* Change these two definitions when the matrix and vector element types change */ typedef double dtype; #define mpitype MPI_DOUBLE int main (int argc, char *argv[]) { dtype **a; /* The first factor, a matrix */ dtype *b; /* The second factor, a vector */ dtype *c; /* The product, a vector */ dtype *c_part_out; /* Partial sums, sent */ dtype *c_part_in; /* Partial sums, received */ int *cnt_out; /* Elements sent to each proc */ int *cnt_in; /* Elements received per proc */ int *disp_out; /* Indices of sent elements */ int *disp_in; /* Indices of received elements */ int i, j; /* Loop indices */ int id; /* Process ID number */ int local_els; /* Cols of 'a' and elements of 'b' held by this process */ int m; /* Rows in the matrix */ int n; /* Columns in the matrix */ int nprime; /* Size of the vector */ int p; /* Number of processes */ double max_seconds; double seconds; /* Elapsed time */ dtype *storage; /* This process's portion of 'a' */ MPI_Init (&argc, &argv); MPI_Comm_rank (MPI_COMM_WORLD, &id); MPI_Comm_size (MPI_COMM_WORLD, &p); read_col_striped_matrix (argv[1], (void ***) &a, (void **) &storage, mpitype, &m, &n, MPI_COMM_WORLD); print_col_striped_matrix ((void **) a, mpitype, m, n, MPI_COMM_WORLD); read_block_vector (argv[2], (void **) &b, mpitype, &nprime, MPI_COMM_WORLD); print_block_vector ((void *) b, mpitype, nprime, MPI_COMM_WORLD); /* Each process multiplies its columns of 'a' and vector 'b', resulting in a partial sum of product 'c'. */ c_part_out = (dtype *) my_malloc (id, n * sizeof(dtype)); local_els = BLOCK_SIZE(id,p,n); MPI_Barrier (MPI_COMM_WORLD); seconds = -MPI_Wtime(); for (i = 0; i < n; i++) { c_part_out[i] = 0.0; for (j = 0; j < local_els; j++) c_part_out[i] += a[i][j] * b[j]; } create_mixed_xfer_arrays (id, p, n, &cnt_out, &disp_out); create_uniform_xfer_arrays (id, p, n, &cnt_in, &disp_in); c_part_in = (dtype*) my_malloc (id, p*local_els*sizeof(dtype)); MPI_Alltoallv (c_part_out, cnt_out, disp_out, mpitype, c_part_in, cnt_in, disp_in, mpitype, MPI_COMM_WORLD); c = (dtype*) my_malloc (id, local_els * sizeof(dtype)); for (i = 0; i < local_els; i++) { c[i] = 0.0; for (j = 0; j < p; j++) c[i] += c_part_in[i + j*local_els]; } MPI_Barrier (MPI_COMM_WORLD); seconds += MPI_Wtime(); MPI_Allreduce (&seconds, &max_seconds, 1, mpitype, MPI_MAX, MPI_COMM_WORLD); if (!id) { printf ("MV3) N = %d, Processes = %d, Time = %12.6f sec,", n, p, max_seconds); printf ("Mflop = %6.2f\n", 2*n*n/(1000000.0*max_seconds)); } print_block_vector ((void *) c, mpitype, n, MPI_COMM_WORLD); MPI_Finalize(); return 0; }