MC logo

Matrix Multiply

  Some MPI Examples

mmult.c
/* 
   Matrix Multiply

   From PVM: Parallel Virtual Machine
   A Users' Guide and Tutorial for Networked Parallel Computing

   Geist et al

   Hauled over to mpi by t. bennet.
*/

/* defines and prototypes for the MPI library */
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>     /* malloc, free, atoi, exit, rand, srand */
#include <string.h>     /* memcpy */

/* Maximum number of children this program will spawn */
#define MAXNTIDS    100
#define MAXROW      10

/* Message tags */
#define ATAG        2
#define BTAG        3
#define DIMTAG      5

int myid;

/* Fill this task's three blk-by-blk blocks: A gets pseudo-random
   values seeded by our rank (reproducible, different per task), C is
   cleared to zero, and B becomes one block of the distributed identity
   matrix -- the identity on diagonal blocks (row == col), zeros
   elsewhere.  All blocks are stored row-major. */
void
InitBlock(float *a, float *b, float *c, int blk, int row, int col)
{
        int nelem = blk*blk;
        int k, i, j;

        /* Seed with the rank so each task draws a distinct A block. */
        srand(myid);
        for (k = 0; k < nelem; k++) {
                a[k] = (float)(rand()%1000)/100.0;
                c[k] = 0.0;
        }
        for (j = 0; j < blk; j++)
                for (i = 0; i < blk; i++)
                        b[j*blk+i] = (row == col && i == j) ? 1.0 : 0.0;
}

/* Accumulate the product of two blk-by-blk row-major blocks:
   c += a * b.  Callers zero c first if a plain product is wanted. */
void
BlockMult(float* c, float* a, float* b, int blk)
{
        int r, col, k;

        for (r = 0; r < blk; r++) {
                for (col = 0; col < blk; col++) {
                        float sum = c[r*blk+col];
                        for (k = 0; k < blk; k++)
                                sum += a[r*blk+k] * b[k*blk+col];
                        c[r*blk+col] = sum;
                }
        }
}

int
main(int argc, char* argv[])
{
        /* Get set up. */
        MPI_Init(&argc,&argv);
        MPI_Comm_rank(MPI_COMM_WORLD,&myid);

        /* number of tasks.*/
        int ntask;
        MPI_Comm_size(MPI_COMM_WORLD,&ntask);

        /* Parameters. */
        int i, m, blksize;

        /* array of the tids in my row */
        float *a, *b, *c, *atmp;
        int row, col, up, down, left;

        /* Get the array dimensions. */
        if (argc == 3) {
                m = atoi(argv[1]);
                blksize = atoi(argv[2]);
        }
        if (argc < 3) {
                fprintf(stderr, "usage: mmult dim\n");
                MPI_Finalize();
                exit(3);
        }
        if (ntask != m*m) {
                fprintf(stderr, 
                        "number of tasks must equal number of regions.");
                MPI_Finalize();
                exit(4);
        }

        /* find the tids in my row */
        //for (i = 0; i < m; i++) 
        //      myrow[i] = pvm_gettid("mmult", (myid/m)*m + i);

        /* allocate the memory for the local blocks */
        a = (float*)malloc(sizeof(float)*blksize*blksize);
        b = (float*)malloc(sizeof(float)*blksize*blksize);
        c = (float*)malloc(sizeof(float)*blksize*blksize);
        atmp = (float*)malloc(sizeof(float)*blksize*blksize);

        /* check for valid pointers */
        if (!(a && b && c && atmp)) { 
                fprintf(stderr, "%s: out of memory!\n", argv[0]);
                free(a); free(b); free(c); free(atmp);
                MPI_Finalize();
                exit(5);
        }

        /* find my block's row and column, and the left end of my row. */
        row = myid/m; 
        col = myid % m;
        left = row*m;

        /* Make a communicator for our row. (This is gross.  Surely there's
           a reasonable way to do this.) */
        MPI_Group all;
        MPI_Comm_group(MPI_COMM_WORLD, &all);
        int range[3] = { left, left + m - 1, 1 };
        MPI_Group rowgrp;
        MPI_Group_range_incl(all, 1, &range, &rowgrp);
        MPI_Comm rowcom;
        MPI_Comm_create(MPI_COMM_WORLD, rowgrp, &rowcom);

        /* calculate the neighbors above and below. */
        up = (row ? row-1 : m-1)*m+col;
        down = row == m-1 ? col : (row+1)*m+col;

        /* initialize the blocks */
        InitBlock(a, b, c, blksize, row, col);

        /* do the matrix multiply */
        for (i = 0; i < m; i++) {
                /* Mcast the block of matrix A.  The communicator row 
                   determines the set of receivers, which was set earlier
                   to be our row. */
                if (col == (row + i)%m) 
                        memcpy(atmp, a, sizeof(float)*blksize*blksize);
                MPI_Bcast(atmp, blksize*blksize, MPI_FLOAT, (row + i)%m, 
                          rowcom);
                BlockMult(c,atmp,b,blksize);

                /* rotate the columns of B */
                MPI_Send(b, blksize*blksize, MPI_FLOAT, up, (i+1)*BTAG,
                         MPI_COMM_WORLD);
                MPI_Status st;
                MPI_Recv(b, blksize*blksize, MPI_FLOAT, down, (i+1)*BTAG,
                         MPI_COMM_WORLD, &st);
        }

        /* check it */
        for (i = 0 ; i < blksize*blksize; i++) 
                if (a[i] != c[i]) 
                        printf("Error a[%d] (%g) != c[%d] (%g) \n", i, a[i],i,
                               c[i]);

        if(myid == 0) printf("Done.\n");
        free(a); free(b); free(c); free(atmp);
        MPI_Finalize();
        return 0;
}



Run this beast with something like:
  mpiexec -hosts 2 bennet.mc.edu 4 sandbox.mc.edu 5 mmult 3 100

Here, I've made the total number of tasks (4 + 5 = 9) agree with the number of blocks in the grid (3 × 3 = 9). The program exits with an error message if the two don't agree. Specifying the count in both places is redundant, and I should fix the program to derive one from the other, but I hope no one is holding their breath. (Or you could fix it.)