#include <stdio.h> #include <stdlib.h> #include "mpi.h" #define NUMBER_OF_TESTS 10 int main( argc, argv ) int argc; char **argv; { MPI_Datatype vec1, vec_n; int blocklens[2]; MPI_Aint indices[2]; MPI_Datatype old_types[2]; double *buf, *lbuf; register double *in_p, *out_p; int rank; int n, stride; double t1, t2, tmin; int i, j, k, nloop; MPI_Status status; MPI_Init( &argc, &argv ); MPI_Comm_rank( MPI_COMM_WORLD, &rank ); n = 1000; stride = 24; nloop = 100000/n; buf = (double *) malloc( n * stride * sizeof(double) ); if (!buf) { fprintf( stderr, "Could not allocate send/recv buffer of size %d\n", n * stride ); MPI_Abort( MPI_COMM_WORLD, 1 ); } lbuf = (double *) malloc( n * sizeof(double) ); if (!lbuf) { fprintf( stderr, "Could not allocated send/recv lbuffer of size %d\n", n ); MPI_Abort( MPI_COMM_WORLD, 1 ); } if (rank == 0) printf( "Kind\tn\tstride\ttime (sec)\tRate (MB/sec)\n" ); /* Use a fixed vector type */ MPI_Type_vector( n, 1, stride, MPI_DOUBLE, &vec1 ); MPI_Type_commit( &vec1 ); tmin = 1000; for (k=0; k<NUMBER_OF_TESTS; k++) { if (rank == 0) { /* Make sure both processes are ready */ MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, 1, 14, MPI_BOTTOM, 0, MPI_INT, 1, 14, MPI_COMM_WORLD, &status ); t1 = MPI_Wtime(); for (j=0; j<nloop; j++) { MPI_Send( buf, 1, vec1, 1, k, MPI_COMM_WORLD ); MPI_Recv( buf, 1, vec1, 1, k, MPI_COMM_WORLD, &status ); } t2 = (MPI_Wtime() - t1) / nloop; if (t2 < tmin) tmin = t2; } else if (rank == 1) { /* Make sure both processes are ready */ MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, 0, 14, MPI_BOTTOM, 0, MPI_INT, 0, 14, MPI_COMM_WORLD, &status ); for (j=0; j<nloop; j++) { MPI_Recv( buf, 1, vec1, 0, k, MPI_COMM_WORLD, &status ); MPI_Send( buf, 1, vec1, 0, k, MPI_COMM_WORLD ); } } } /* Convert to half the round-trip time */ tmin = tmin / 2.0; if (rank == 0) { printf( "Vector\t%d\t%d\t%f\t%f\n", n, stride, tmin, n * sizeof(double) * 1.0e-6 / tmin ); } MPI_Type_free( &vec1 ); /* Use a variable vector type */ blocklens[0] = 1; blocklens[1] = 1; indices[0] = 0; indices[1] = stride * sizeof(double); old_types[0] = MPI_DOUBLE; old_types[1] = MPI_UB; MPI_Type_struct( 2, blocklens, indices, old_types, &vec_n ); MPI_Type_commit( &vec_n ); tmin = 1000; for (k=0; k<NUMBER_OF_TESTS; k++) { if (rank == 0) { /* Make sure both processes are ready */ MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, 1, 14, MPI_BOTTOM, 0, MPI_INT, 1, 14, MPI_COMM_WORLD, &status ); t1 = MPI_Wtime(); for (j=0; j<nloop; j++) { MPI_Send( buf, n, vec_n, 1, k, MPI_COMM_WORLD ); MPI_Recv( buf, n, vec_n, 1, k, MPI_COMM_WORLD, &status ); } t2 = (MPI_Wtime() - t1) / nloop; if (t2 < tmin) tmin = t2; } else if (rank == 1) { /* Make sure both processes are ready */ MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, 0, 14, MPI_BOTTOM, 0, MPI_INT, 0, 14, MPI_COMM_WORLD, &status ); for (j=0; j<nloop; j++) { MPI_Recv( buf, n, vec_n, 0, k, MPI_COMM_WORLD, &status ); MPI_Send( buf, n, vec_n, 0, k, MPI_COMM_WORLD ); } } } /* Convert to half the round-trip time */ tmin = tmin / 2.0; if (rank == 0) { printf( "Struct\t%d\t%d\t%f\t%f\n", n, stride, tmin, n * sizeof(double) * 1.0e-6 / tmin ); } MPI_Type_free( &vec_n ); /* Use user-packing with known stride */ tmin = 1000; for (k=0; k<NUMBER_OF_TESTS; k++) { if (rank == 0) { /* Make sure both processes are ready */ MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, 1, 14, MPI_BOTTOM, 0, MPI_INT, 1, 14, MPI_COMM_WORLD, &status ); t1 = MPI_Wtime(); for (j=0; j<nloop; j++) { /* If the compiler isn't good at unrolling and changing multiplication to indexing, this won't be as good as it could be */ for (i=0; i<n; i++) lbuf[i] = buf[i*stride]; MPI_Send( lbuf, n, MPI_DOUBLE, 1, k, MPI_COMM_WORLD ); MPI_Recv( lbuf, n, MPI_DOUBLE, 1, k, MPI_COMM_WORLD, &status ); for (i=0; i<n; i++) buf[i*stride] = lbuf[i]; } t2 = (MPI_Wtime() - t1) / nloop; if (t2 < tmin) tmin = t2; } else if (rank == 1) { /* Make sure both processes are ready */ MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, 0, 14, MPI_BOTTOM, 0, MPI_INT, 0, 14, MPI_COMM_WORLD, &status ); for (j=0; j<nloop; j++) { MPI_Recv( lbuf, n, MPI_DOUBLE, 0, k, MPI_COMM_WORLD, &status ); for (i=0; i<n; i++) buf[i*stride] = lbuf[i]; for (i=0; i<n; i++) lbuf[i] = buf[i*stride]; MPI_Send( lbuf, n, MPI_DOUBLE, 0, k, MPI_COMM_WORLD ); } } } /* Convert to half the round-trip time */ tmin = tmin / 2.0; if (rank == 0) { printf( "User\t%d\t%d\t%f\t%f\n", n, stride, tmin, n * sizeof(double) * 1.0e-6 / tmin ); } /* Use user-packing with known stride, using addition in the user copy code */ tmin = 1000; for (k=0; k<NUMBER_OF_TESTS; k++) { if (rank == 0) { /* Make sure both processes are ready */ MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, 1, 14, MPI_BOTTOM, 0, MPI_INT, 1, 14, MPI_COMM_WORLD, &status ); t1 = MPI_Wtime(); for (j=0; j<nloop; j++) { /* If the compiler isn't good at unrolling and changing multiplication to indexing, this won't be as good as it could be */ in_p = buf; out_p = lbuf; for (i=0; i<n; i++) { out_p[i] = *in_p; in_p += stride; } MPI_Send( lbuf, n, MPI_DOUBLE, 1, k, MPI_COMM_WORLD ); MPI_Recv( lbuf, n, MPI_DOUBLE, 1, k, MPI_COMM_WORLD, &status ); out_p = buf; in_p = lbuf; for (i=0; i<n; i++) { *out_p = in_p[i]; out_p += stride; } } t2 = (MPI_Wtime() - t1) / nloop; if (t2 < tmin) tmin = t2; } else if (rank == 1) { /* Make sure both processes are ready */ MPI_Sendrecv( MPI_BOTTOM, 0, MPI_INT, 0, 14, MPI_BOTTOM, 0, MPI_INT, 0, 14, MPI_COMM_WORLD, &status ); for (j=0; j<nloop; j++) { MPI_Recv( lbuf, n, MPI_DOUBLE, 0, k, MPI_COMM_WORLD, &status ); in_p = lbuf; out_p = buf; for (i=0; i<n; i++) { *out_p = in_p[i]; out_p += stride; } out_p = lbuf; in_p = buf; for (i=0; i<n; i++) { out_p[i] = *in_p; in_p += stride; } MPI_Send( lbuf, n, MPI_DOUBLE, 0, k, MPI_COMM_WORLD ); } } } /* Convert to half the round-trip time */ tmin = tmin / 2.0; if (rank == 0) { printf( "User(add)\t%d\t%d\t%f\t%f\n", n, stride, tmin, n * sizeof(double) * 1.0e-6 / tmin ); } MPI_Finalize( ); return 0; }