/* matmul.shmem.c SJ */
/* an example of SysV shmem usage */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <unistd.h>
#include <wait.h>
#include <sys/ipc.h>
#include <sys/shm.h>


/* dimension of the square matrices (see the matrix typedef below) */
#define N 400
/* blocks per dimension: main forks PP*PP children, so PP*PP+1 processes
   exist in total when the parent is counted */
#define PP 2
/* SysV IPC key that main passes to shmget() for the shared segment */
#define KEY 123

/* malloc storage for one object of the given type; the storage is not
   initialised and the expression is NULL when malloc fails */
#define NEW(type)  ( (type*)malloc(sizeof(type)) )

/* an N x N array of doubles */
typedef double matrix[N][N];

/* full product: C = A * B over all N x N entries */
void matmul(matrix *A, matrix *B, matrix *C);
/* partial product: the worker for block (row, col) of a blocks x blocks
   grid laid over a size x size result */
void partmatmul(matrix *A, matrix *B, matrix *C,
				int size, int blocks, int row, int col);
/* time of day in seconds (with microsecond fraction) as a double */
double dtime();

/*
 * Multiplies two N x N matrices twice -- first sequentially in this
 * process, then split over PP*PP forked children that share A, B and C
 * through one SysV shared memory segment -- and prints run time, MFLOPS
 * and the resulting speedup.
 *
 * Exit status: 0 on success, 2 when the matrices cannot be allocated,
 * 1 when a shared memory or process operation fails.
 */
int main(int argc, char **argv)
{
  pid_t child;
  int i, j;
  double starttime, endtime, seqtime, partime;
  /* 2*N^3 floating point operations per product; evaluated in double so
     the count cannot overflow int when N is raised */
  const double flops = 2.0 * N * N * N;

  int row, col;
  int spawned = 0;	/* number of children successfully forked */
  int failed = 0;	/* becomes 1 as soon as any step fails */

  int smid;	/* shared memory block id */
  char *smaddr; /* local mapping address of shared memory */

  pid_t childs[PP][PP];

  matrix *A, *B, *C;
  matrix *sA, *sB, *sC;

  (void)argc;
  (void)argv;

  /* space allocation */
  A = NEW(matrix);
  B = NEW(matrix);
  C = NEW(matrix);

  if (! (A && B && C)) {
	  perror("malloc");
	  free(A);
	  free(B);
	  free(C);
	  exit(2);
  }

  /* fill the input matrices with a fixed, reproducible pattern */
  for (i = 0; i < N; i++) {
	  for (j = 0; j < N; j++) {
		  (*A)[i][j] = (double)(i + j + 1);
		  (*B)[i][j] = (double)(i + j);
	  }
  }

  /* sequential */
  starttime = dtime();
  matmul(A, B, C);
  endtime = dtime();

  seqtime = endtime-starttime;
  printf("Sequential  %6.2f s, %6.2lf MFLOPS, C(6, 9) = %lf \n",
		 seqtime, flops/(1000000*seqtime), (*C)[6][9]);


  /* parallel with shared memory */

  starttime = dtime();

  /* allocate one shared memory segment holding A, B and C back to back */
  smid = shmget(KEY, 3*sizeof(matrix), IPC_CREAT | IPC_EXCL | 0600);
  if (smid == -1) {
	  perror("shmget");
	  failed = 1;
	  goto release;
  }

  /* attach shared memory for local use; shmat reports failure with
     (void *)-1, never with NULL */
  smaddr = shmat(smid, NULL, 0);
  if (smaddr == (void *)-1) {
	  perror("shmat");
	  failed = 1;
	  goto remove_segment;
  }

  /* create addresses to matrices */
  sA = (matrix*)smaddr;
  sB = (matrix*)(smaddr + sizeof(matrix));
  sC = (matrix*)(smaddr + 2*sizeof(matrix));

  /* copy A and B to shared memory */
  memcpy(sA, A, sizeof(matrix));
  memcpy(sB, B, sizeof(matrix));

  /* empty the stdio buffer first: otherwise every child inherits a copy
     of the pending output */
  fflush(stdout);

  /* create PP*PP child processes, one per result block */
  for (row = 0; row < PP && !failed; row++) {
	  for (col = 0; col < PP; col++) {
		  child = fork();
		  if (child == 0) {
			  /* child process: _exit skips the parent's stdio
			     flushing and atexit handlers */
			  partmatmul(sA, sB, sC, N, PP, row, col);
			  _exit(0);
		  }
		  if (child == -1) {
			  perror("fork");
			  failed = 1;
			  break;
		  }
		  childs[row][col] = child;
		  spawned++;
	  }
  }

  /* reap every child that was started (row-major creation order), even
     when a later fork failed */
  for (i = 0; i < spawned; i++) {
	  if (waitpid(childs[i / PP][i % PP], NULL, 0) == -1) {
		  perror("waitpid");
		  failed = 1;
	  }
  }

  endtime = dtime();

  if (!failed) {
	  partime = endtime-starttime;
	  printf("Parallel %6.2f s, %6.2f MFLOPS, C(6, 9) = %f \n",
			 partime, flops/(1000000*partime), (*sC)[6][9]);
	  printf("Speedup : %6.4f\n", seqtime/partime);
  }

  if (shmdt(smaddr) == -1)
	  perror("shmdt");

remove_segment:
  /* always remove the segment: it is created with IPC_EXCL on a fixed
     key, so a leftover segment would make the next run fail */
  if (shmctl(smid, IPC_RMID, NULL) == -1)
	  perror("shmctl");

release:
  free(A);
  free(B);
  free(C);

  return failed;
}

/*
 * Full matrix product C = A * B over all N x N entries.
 * Every entry is accumulated from 0.0 with the inner index ascending.
 */
void matmul(matrix *A, matrix *B, matrix *C)
{
	int r, col, t;

	for (r = 0; r < N; r++) {
		const double *lhs = (*A)[r];	/* row r of A */
		double *out = (*C)[r];		/* row r of C */

		for (col = 0; col < N; col++) {
			double sum = 0.0;

			for (t = 0; t < N; t++)
				sum += lhs[t] * (*B)[t][col];

			out[col] = sum;
		}
	}
}

/*
 * Computes one block of the product C = A * B.
 *
 * The size x size result is divided into a blocks x blocks grid and this
 * call fills the block at grid position (row, col).  A block spans
 * size/blocks rows and columns; the last block in each direction also
 * takes the leftover rows/columns when size is not a multiple of blocks,
 * so the blocks together cover the whole result.
 *
 *   A, B      input matrices; rows/columns 0..size-1 are read
 *   C         output matrix; only the selected block is written
 *   size      dimension of the product (must not exceed N, the
 *             dimension of the matrix type)
 *   blocks    number of blocks per dimension; nothing is done if <= 0
 *   row, col  zero-based block coordinates, expected in [0, blocks)
 */
void partmatmul(matrix *A, matrix *B, matrix *C,
				int size, int blocks, int row, int col)
{
	int i, j, k;
	double c;
	int step;
	int Frow, Erow;	/* first row, one past the last row */
	int Fcol, Ecol;	/* first column, one past the last column */

	/* no meaningful partition, and size/blocks would divide by zero */
	if (blocks <= 0)
		return;

	step = size/blocks;

	Frow = row*step;
	Erow = (row == blocks-1) ? size : Frow + step;
	Fcol = col*step;
	Ecol = (col == blocks-1) ? size : Fcol + step;

	for (i = Frow; i < Erow; i++) {
		/* columns run over this block's own column range; starting
		   at the row offset would leave blocks below the diagonal
		   uncomputed */
		for (j = Fcol; j < Ecol; j++) {

			c = 0.0;
			for (k = 0; k < size; k++) {
				c += (*A)[i][k] * (*B)[k][j];
			}
			(*C)[i][j] = c;
		}
	}

}

/*
 * Returns the current time of day in seconds, including the microsecond
 * fraction, as reported by gettimeofday().
 */
double dtime(void)
{
	/* zeroed so that a failed call yields 0.0 rather than garbage */
	struct timeval tv = {0, 0};

	/* the timezone argument is obsolete; POSIX leaves the behaviour
	   unspecified unless it is a null pointer */
	gettimeofday(&tv, NULL);
	return tv.tv_sec + (double)tv.tv_usec/1000000;
}