Skip to content

Instantly share code, notes, and snippets.

@rmcgibbo
Last active August 16, 2022 23:15
Show Gist options
  • Select an option

  • Save rmcgibbo/7178576 to your computer and use it in GitHub Desktop.

Select an option

Save rmcgibbo/7178576 to your computer and use it in GitHub Desktop.

Revisions

  1. rmcgibbo renamed this gist Oct 27, 2013. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  2. rmcgibbo revised this gist Oct 27, 2013. 1 changed file with 58 additions and 74 deletions.
    132 changes: 58 additions & 74 deletions test.cpp
    Original file line number Diff line number Diff line change
    @@ -1,108 +1,92 @@
    /*
    * An efficient MPI parallel reduction without MPI_Scan or MPI_Reduce. (i.e.
    * only send/recv).
    *
    * TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
    * 0. You just DO WHAT THE FUCK YOU WANT TO.
    */
    #include <mpi.h>
    #include <cstdio>
    #include <cmath>
    #include <vector>
    using std::vector;

    int log2(int v) {
    // Returns floor(log2(v)) for positive 32-bit v -- the index of the
    // most significant set bit -- via the de Bruijn multiply trick.
    // http://graphics.stanford.edu/~seander/bithacks.html
    // i think this is 32 bit specific
    // NOTE(review): v <= 0 yields a meaningless result; callers here
    // pass the (positive) MPI communicator size, so that is fine.
    int r;
    static const int MultiplyDeBruijnBitPosition[32] =
    {
    0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
    8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31
    };

    v |= v >> 1; // first round down to one less than a power of 2
    v |= v >> 2;
    v |= v >> 4;
    v |= v >> 8;
    v |= v >> 16;

    // The multiply maps each possible MSB position to a unique value in
    // the top 5 bits, which indexes the lookup table.
    r = MultiplyDeBruijnBitPosition[(uint32_t)(v * 0x07C4ACDDU) >> 27];
    return r;
    }
    static int fastlog2(uint32_t v);


    int parallel_reduce(int mydata) {
    int MPI_ManualReduce(int value) {
    /*
    * Reduces values on all processes to a single value on rank 0.
    *
    * This function does the same thing as the function MPI_Reduce using
    * only MPI_Send and MPI_Recv. As shown, it operates with additions on
    * integers, so you could trivially use MPI_Reduce, but for operations
    * on variable size structs for which you cannot define an MPI_Datatype,
    * you can still use this method, by modifying it to use your op
    * and your datastructure.
    */
    int recvbuffer;
    int tag = 0;
    const int size = MPI::COMM_WORLD.Get_size();
    const int rank = MPI::COMM_WORLD.Get_rank();

    for (int i = 0; i < size; i++) {
    MPI::COMM_WORLD.Barrier();
    if (i == rank)
    printf("data[%d]=%d\n", rank, mydata);
    }
    MPI::COMM_WORLD.Barrier();
    const int lastpower = 1 << fastlog2(size);




    // each of the ranks greater than the last power of 2 less than size
    // need to downshift their data, since the binary tree reduction below
    // only works when N is a power of two.
    const int lastpower = 1<<log2(size);
    for (int i = lastpower; i < size; i++)
    if (rank == i) {
    printf("rank %d sending down to %d\n", i, i-lastpower);
    MPI::COMM_WORLD.Send(&mydata, 1, MPI_INT, i-lastpower, tag);
    mydata = 0;
    }
    if (rank == i)
    MPI::COMM_WORLD.Send(&value, 1, MPI_INT, i-lastpower, tag);
    for (int i = 0; i < size-lastpower; i++)
    if (rank == i) {
    printf("rank %d receiving up from %d\n", i, i+lastpower);
    MPI::COMM_WORLD.Recv(&recvbuffer, 1, MPI_INT, i+lastpower, tag);
    mydata += recvbuffer;
    value += recvbuffer; // your operation
    }

    for (int i = 0; i < size; i++) {
    MPI::COMM_WORLD.Barrier();
    if (i == rank)
    printf("data[%d]=%d\n", rank, mydata);
    }

    MPI::COMM_WORLD.Barrier();


    for (int d = 0; d < log2(lastpower); d++) {
    tag += 1;
    for (int k = 0; k < lastpower; k += 1<<(d+1)) {
    int receiver = k + (1<<(d+1)) - 1;
    int sender = k + (1<<d) - 1;

    if (rank == 0)
    printf("x[%d] = x[%d] + x[%d]\n", receiver, sender, receiver);

    for (int d = 0; d < fastlog2(lastpower); d++)
    for (int k = 0; k < lastpower; k += 1 << (d + 1)) {
    const int receiver = k;
    const int sender = k + (1 << d);
    if (rank == receiver) {
    MPI::COMM_WORLD.Recv(&recvbuffer, 1, MPI_INT, sender, tag);
    mydata += recvbuffer;
    }
    else if (rank == sender) {
    MPI::COMM_WORLD.Send(&mydata, 1, MPI_INT, receiver, tag);
    value += recvbuffer; // your operation
    }
    else if (rank == sender)
    MPI::COMM_WORLD.Send(&value, 1, MPI_INT, receiver, tag);
    }
    MPI::COMM_WORLD.Barrier();
    if (rank == 0)
    printf("\n");
    }
    return mydata;
    return value;
    }

    static int fastlog2(uint32_t v) {
        // Index of the most significant set bit, i.e. floor(log2(v)).
        // De Bruijn sequence trick; table from
        // http://graphics.stanford.edu/~seander/bithacks.html
        // NOTE(review): returns 0 for v == 0, where log2 is undefined.
        static const int kDeBruijnLut[32] = {
            0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
            8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31
        };

        uint32_t smeared = v;
        smeared |= smeared >> 1;  // fill every position below the MSB...
        smeared |= smeared >> 2;
        smeared |= smeared >> 4;
        smeared |= smeared >> 8;
        smeared |= smeared >> 16; // ...so smeared == 2^(r+1) - 1

        // The multiply pushes a unique 5-bit code for each MSB position
        // into the top bits of the product.
        const uint32_t index = (smeared * 0x07C4ACDDU) >> 27;
        return kDeBruijnLut[index];
    }



    int main(int argc, char* argv[]) {
    MPI::Init(argc, argv);
    const int size = MPI::COMM_WORLD.Get_size();
    const int rank = MPI::COMM_WORLD.Get_rank();

    vector<int> data(size);
    std::vector<int> data(size);
    for (int i = 0; i < size; i++)
    data[i] = i;

    // each rank only gets one entry, and
    // they need to sum them by sending messages
    int result = parallel_reduce(data[rank]);
    int result = MPI_ManualReduce(data[rank]);
    MPI::COMM_WORLD.Barrier();

    // Compute the correct result
    @@ -111,9 +95,9 @@ int main(int argc, char* argv[]) {
    sum += data[i];


    if (rank == (1<<log2(size))-1) {
    printf("Final Result on Rank%d = %d\n", rank, result);
    printf("Correct result = %d\n", sum);
    if (rank == 0) {
    printf("MPI Result = %d\n", result);
    printf("Correct Result = %d\n", sum);
    }
    MPI::Finalize();
    }
    }
  3. rmcgibbo revised this gist Oct 27, 2013. 1 changed file with 43 additions and 6 deletions.
    49 changes: 43 additions & 6 deletions test.cpp
    Original file line number Diff line number Diff line change
    @@ -27,24 +27,61 @@ int log2(int v) {

    int parallel_reduce(int mydata) {
    int recvbuffer;
    int tag = 0;
    const int size = MPI::COMM_WORLD.Get_size();
    const int rank = MPI::COMM_WORLD.Get_rank();

    for (int i = 0; i < size; i++) {
    MPI::COMM_WORLD.Barrier();
    if (i == rank)
    printf("data[%d]=%d\n", rank, mydata);
    }
    MPI::COMM_WORLD.Barrier();




    // each of the ranks greater than the last power of 2 less than size
    // need to downshift their data, since the binary tree reduction below
    // only works when N is a power of two.
    const int lastpower = 1<<log2(size);
    for (int i = lastpower; i < size; i++)
    if (rank == i) {
    printf("rank %d sending down to %d\n", i, i-lastpower);
    MPI::COMM_WORLD.Send(&mydata, 1, MPI_INT, i-lastpower, tag);
    mydata = 0;
    }
    for (int i = 0; i < size-lastpower; i++)
    if (rank == i) {
    printf("rank %d receiving up from %d\n", i, i+lastpower);
    MPI::COMM_WORLD.Recv(&recvbuffer, 1, MPI_INT, i+lastpower, tag);
    mydata += recvbuffer;
    }

    for (int i = 0; i < size; i++) {
    MPI::COMM_WORLD.Barrier();
    if (i == rank)
    printf("data[%d]=%d\n", rank, mydata);
    }

    MPI::COMM_WORLD.Barrier();

    for (int d = 0; d < log2(size); d++) {
    for (int k = 0; k < size; k += 1<<(d+1)) {

    for (int d = 0; d < log2(lastpower); d++) {
    tag += 1;
    for (int k = 0; k < lastpower; k += 1<<(d+1)) {
    int receiver = k + (1<<(d+1)) - 1;
    int sender = k + (1<<d) - 1;

    if (rank == 0)
    printf("x[%d] = x[%d] + x[%d]\n", receiver, sender, receiver);

    if (rank == receiver) {
    MPI::COMM_WORLD.Recv(&recvbuffer, 1, MPI_INT, sender, d);
    MPI::COMM_WORLD.Recv(&recvbuffer, 1, MPI_INT, sender, tag);
    mydata += recvbuffer;
    }
    else if (rank == sender) {
    MPI::COMM_WORLD.Send(&mydata, 1, MPI_INT, receiver, d);
    MPI::COMM_WORLD.Send(&mydata, 1, MPI_INT, receiver, tag);
    }
    }
    MPI::COMM_WORLD.Barrier();
    @@ -73,8 +110,8 @@ int main(int argc, char* argv[]) {
    for (int i = 0; i < size; i++)
    sum += data[i];


    if (rank == size-1) {
    if (rank == (1<<log2(size))-1) {
    printf("Final Result on Rank%d = %d\n", rank, result);
    printf("Correct result = %d\n", sum);
    }
  4. rmcgibbo created this gist Oct 27, 2013.
    82 changes: 82 additions & 0 deletions test.cpp
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,82 @@
    #include <mpi.h>
    #include <cstdio>
    #include <cmath>
    #include <vector>
    using std::vector;

    int log2(int v) {
        // floor(log2(v)) via the de Bruijn multiplication hack:
        // http://graphics.stanford.edu/~seander/bithacks.html
        // NOTE(review): 32-bit specific; v <= 0 is not meaningful input
        // (callers pass the positive MPI communicator size).
        static const int kMsbPosition[32] = {
            0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
            8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31
        };

        // Smear the highest set bit downward so v becomes one less than
        // the next power of two.
        for (int shift = 1; shift <= 16; shift <<= 1)
            v |= v >> shift;

        // Each MSB position produces a unique top-5-bit pattern after
        // the multiply; use it to index the table.
        return kMsbPosition[(uint32_t)(v * 0x07C4ACDDU) >> 27];
    }


    int parallel_reduce(int mydata) {
    /*
     * Sum-reduction of one int per rank using only MPI Send/Recv (no
     * MPI_Reduce).  Implemented as a binary tree: in round d, rank
     * k + 2^d - 1 sends its partial sum to rank k + 2^(d+1) - 1, so
     * after log2(size) rounds the total sits on rank size-1 (main
     * prints the result from that rank).
     * NOTE(review): the tree indexing only spans a power-of-two number
     * of ranks; with other communicator sizes some ranks' data is never
     * folded in -- confirm size == 2^k before trusting the result.
     */
    int recvbuffer;
    const int size = MPI::COMM_WORLD.Get_size();
    const int rank = MPI::COMM_WORLD.Get_rank();


    for (int d = 0; d < log2(size); d++) {
    for (int k = 0; k < size; k += 1<<(d+1)) {
    // Pair subtrees of width 2^(d+1): the left subtree's root sends,
    // the right subtree's root receives and accumulates.
    int receiver = k + (1<<(d+1)) - 1;
    int sender = k + (1<<d) - 1;

    // Rank 0 traces the operation planned for this pair.
    if (rank == 0)
    printf("x[%d] = x[%d] + x[%d]\n", receiver, sender, receiver);

    if (rank == receiver) {
    // The round number d doubles as the message tag.
    MPI::COMM_WORLD.Recv(&recvbuffer, 1, MPI_INT, sender, d);
    mydata += recvbuffer;
    }
    else if (rank == sender) {
    MPI::COMM_WORLD.Send(&mydata, 1, MPI_INT, receiver, d);
    }
    }
    MPI::COMM_WORLD.Barrier();
    if (rank == 0)
    printf("\n");
    }
    return mydata;
    }

    int main(int argc, char* argv[]) {
        MPI::Init(argc, argv);
        const int size = MPI::COMM_WORLD.Get_size();
        const int rank = MPI::COMM_WORLD.Get_rank();

        // One entry per rank: data[i] = i.
        std::vector<int> data(size);
        for (int i = 0; i != size; ++i)
            data[i] = i;

        // Each rank contributes its single entry; the tree reduction
        // sums them by exchanging explicit messages.
        int result = parallel_reduce(data[rank]);
        MPI::COMM_WORLD.Barrier();

        // Serial reference answer to compare against.
        int sum = 0;
        for (int x : data)
            sum += x;

        // The tree reduction leaves the total on the highest rank.
        if (rank == size-1) {
            printf("Final Result on Rank%d = %d\n", rank, result);
            printf("Correct result = %d\n", sum);
        }
        MPI::Finalize();
    }