Skip to content

Instantly share code, notes, and snippets.

@rmcgibbo
Last active August 16, 2022 23:15
Show Gist options
  • Select an option

  • Save rmcgibbo/7178576 to your computer and use it in GitHub Desktop.

Select an option

Save rmcgibbo/7178576 to your computer and use it in GitHub Desktop.

Revisions

  1. rmcgibbo renamed this gist Oct 27, 2013. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  2. rmcgibbo revised this gist Oct 27, 2013. 1 changed file with 58 additions and 74 deletions.
    132 changes: 58 additions & 74 deletions test.cpp
    Original file line number Diff line number Diff line change
    @@ -1,108 +1,92 @@
    /*
    * An efficient MPI parallel reduction without MPI_Scan or MPI_Reduce. (i.e.
    * only send/recv).
    *
    * TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
    * 0. You just DO WHAT THE FUCK YOU WANT TO.
    */
    #include <mpi.h>
    #include <cstdio>
    #include <cmath>
    #include <vector>
    using std::vector;

    int log2(int v) {
    // Returns floor(log2(v)) for positive 32-bit v -- the index of the
    // most significant set bit -- via the de Bruijn multiply trick.
    // http://graphics.stanford.edu/~seander/bithacks.html
    // i think this is 32 bit specific
    // NOTE(review): v <= 0 yields a meaningless result; callers here
    // pass the (positive) MPI communicator size, so that is fine.
    int r;
    static const int MultiplyDeBruijnBitPosition[32] =
    {
    0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
    8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31
    };

    v |= v >> 1; // first round down to one less than a power of 2
    v |= v >> 2;
    v |= v >> 4;
    v |= v >> 8;
    v |= v >> 16;

    // The multiply maps each possible MSB position to a unique value in
    // the top 5 bits, which indexes the lookup table.
    r = MultiplyDeBruijnBitPosition[(uint32_t)(v * 0x07C4ACDDU) >> 27];
    return r;
    }
    static int fastlog2(uint32_t v);


    int parallel_reduce(int mydata) {
    int MPI_ManualReduce(int value) {
    /*
    * Reduces values on all processes to a single value on rank 0.
    *
    * This function does the same thing as the function MPI_Reduce using
    * only MPI_Send and MPI_Recv. As shown, it operates with additions on
    * integers, so you could trivially use MPI_Reduce, but for operations
    * on variable size structs for which you cannot define an MPI_Datatype,
    * you can still use this method, by modifying it to use your op
    * and your datastructure.
    */
    int recvbuffer;
    int tag = 0;
    const int size = MPI::COMM_WORLD.Get_size();
    const int rank = MPI::COMM_WORLD.Get_rank();

    for (int i = 0; i < size; i++) {
    MPI::COMM_WORLD.Barrier();
    if (i == rank)
    printf("data[%d]=%d\n", rank, mydata);
    }
    MPI::COMM_WORLD.Barrier();
    const int lastpower = 1 << fastlog2(size);




    // each of the ranks greater than the last power of 2 less than size
    // need to downshift their data, since the binary tree reduction below
    // only works when N is a power of two.
    const int lastpower = 1<<log2(size);
    for (int i = lastpower; i < size; i++)
    if (rank == i) {
    printf("rank %d sending down to %d\n", i, i-lastpower);
    MPI::COMM_WORLD.Send(&mydata, 1, MPI_INT, i-lastpower, tag);
    mydata = 0;
    }
    if (rank == i)
    MPI::COMM_WORLD.Send(&value, 1, MPI_INT, i-lastpower, tag);
    for (int i = 0; i < size-lastpower; i++)
    if (rank == i) {
    printf("rank %d receiving up from %d\n", i, i+lastpower);
    MPI::COMM_WORLD.Recv(&recvbuffer, 1, MPI_INT, i+lastpower, tag);
    mydata += recvbuffer;
    value += recvbuffer; // your operation
    }

    for (int i = 0; i < size; i++) {
    MPI::COMM_WORLD.Barrier();
    if (i == rank)
    printf("data[%d]=%d\n", rank, mydata);
    }

    MPI::COMM_WORLD.Barrier();


    for (int d = 0; d < log2(lastpower); d++) {
    tag += 1;
    for (int k = 0; k < lastpower; k += 1<<(d+1)) {
    int receiver = k + (1<<(d+1)) - 1;
    int sender = k + (1<<d) - 1;

    if (rank == 0)
    printf("x[%d] = x[%d] + x[%d]\n", receiver, sender, receiver);

    for (int d = 0; d < fastlog2(lastpower); d++)
    for (int k = 0; k < lastpower; k += 1 << (d + 1)) {
    const int receiver = k;
    const int sender = k + (1 << d);
    if (rank == receiver) {
    MPI::COMM_WORLD.Recv(&recvbuffer, 1, MPI_INT, sender, tag);
    mydata += recvbuffer;
    }
    else if (rank == sender) {
    MPI::COMM_WORLD.Send(&mydata, 1, MPI_INT, receiver, tag);
    value += recvbuffer; // your operation
    }
    else if (rank == sender)
    MPI::COMM_WORLD.Send(&value, 1, MPI_INT, receiver, tag);
    }
    MPI::COMM_WORLD.Barrier();
    if (rank == 0)
    printf("\n");
    }
    return mydata;
    return value;
    }

    static int fastlog2(uint32_t v) {
        // Index of the most significant set bit, i.e. floor(log2(v)).
        // De Bruijn sequence trick; table from
        // http://graphics.stanford.edu/~seander/bithacks.html
        // NOTE(review): returns 0 for v == 0, where log2 is undefined.
        static const int kDeBruijnLut[32] = {
            0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
            8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31
        };

        uint32_t smeared = v;
        smeared |= smeared >> 1;  // fill every position below the MSB...
        smeared |= smeared >> 2;
        smeared |= smeared >> 4;
        smeared |= smeared >> 8;
        smeared |= smeared >> 16; // ...so smeared == 2^(r+1) - 1

        // The multiply pushes a unique 5-bit code for each MSB position
        // into the top bits of the product.
        const uint32_t index = (smeared * 0x07C4ACDDU) >> 27;
        return kDeBruijnLut[index];
    }



    int main(int argc, char* argv[]) {
    MPI::Init(argc, argv);
    const int size = MPI::COMM_WORLD.Get_size();
    const int rank = MPI::COMM_WORLD.Get_rank();

    vector<int> data(size);
    std::vector<int> data(size);
    for (int i = 0; i < size; i++)
    data[i] = i;

    // each rank only gets one entry, and
    // they need to sum them by sending messages
    int result = parallel_reduce(data[rank]);
    int result = MPI_ManualReduce(data[rank]);
    MPI::COMM_WORLD.Barrier();

    // Compute the correct result
    @@ -111,9 +95,9 @@ int main(int argc, char* argv[]) {
    sum += data[i];


    if (rank == (1<<log2(size))-1) {
    printf("Final Result on Rank%d = %d\n", rank, result);
    printf("Correct result = %d\n", sum);
    if (rank == 0) {
    printf("MPI Result = %d\n", result);
    printf("Correct Result = %d\n", sum);
    }
    MPI::Finalize();
    }
    }
  3. rmcgibbo revised this gist Oct 27, 2013. 1 changed file with 43 additions and 6 deletions.
    49 changes: 43 additions & 6 deletions test.cpp
    Original file line number Diff line number Diff line change
    @@ -27,24 +27,61 @@ int log2(int v) {

    int parallel_reduce(int mydata) {
    int recvbuffer;
    int tag = 0;
    const int size = MPI::COMM_WORLD.Get_size();
    const int rank = MPI::COMM_WORLD.Get_rank();

    for (int i = 0; i < size; i++) {
    MPI::COMM_WORLD.Barrier();
    if (i == rank)
    printf("data[%d]=%d\n", rank, mydata);
    }
    MPI::COMM_WORLD.Barrier();




    // each of the ranks greater than the last power of 2 less than size
    // need to downshift their data, since the binary tree reduction below
    // only works when N is a power of two.
    const int lastpower = 1<<log2(size);
    for (int i = lastpower; i < size; i++)
    if (rank == i) {
    printf("rank %d sending down to %d\n", i, i-lastpower);
    MPI::COMM_WORLD.Send(&mydata, 1, MPI_INT, i-lastpower, tag);
    mydata = 0;
    }
    for (int i = 0; i < size-lastpower; i++)
    if (rank == i) {
    printf("rank %d receiving up from %d\n", i, i+lastpower);
    MPI::COMM_WORLD.Recv(&recvbuffer, 1, MPI_INT, i+lastpower, tag);
    mydata += recvbuffer;
    }

    for (int i = 0; i < size; i++) {
    MPI::COMM_WORLD.Barrier();
    if (i == rank)
    printf("data[%d]=%d\n", rank, mydata);
    }

    MPI::COMM_WORLD.Barrier();

    for (int d = 0; d < log2(size); d++) {
    for (int k = 0; k < size; k += 1<<(d+1)) {

    for (int d = 0; d < log2(lastpower); d++) {
    tag += 1;
    for (int k = 0; k < lastpower; k += 1<<(d+1)) {
    int receiver = k + (1<<(d+1)) - 1;
    int sender = k + (1<<d) - 1;

    if (rank == 0)
    printf("x[%d] = x[%d] + x[%d]\n", receiver, sender, receiver);

    if (rank == receiver) {
    MPI::COMM_WORLD.Recv(&recvbuffer, 1, MPI_INT, sender, d);
    MPI::COMM_WORLD.Recv(&recvbuffer, 1, MPI_INT, sender, tag);
    mydata += recvbuffer;
    }
    else if (rank == sender) {
    MPI::COMM_WORLD.Send(&mydata, 1, MPI_INT, receiver, d);
    MPI::COMM_WORLD.Send(&mydata, 1, MPI_INT, receiver, tag);
    }
    }
    MPI::COMM_WORLD.Barrier();
    @@ -73,8 +110,8 @@ int main(int argc, char* argv[]) {
    for (int i = 0; i < size; i++)
    sum += data[i];


    if (rank == size-1) {
    if (rank == (1<<log2(size))-1) {
    printf("Final Result on Rank%d = %d\n", rank, result);
    printf("Correct result = %d\n", sum);
    }
  4. rmcgibbo created this gist Oct 27, 2013.
    82 changes: 82 additions & 0 deletions test.cpp
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,82 @@
    #include <mpi.h>
    #include <cstdio>
    #include <cmath>
    #include <vector>
    using std::vector;

    int log2(int v) {
        // floor(log2(v)) via the de Bruijn multiplication hack:
        // http://graphics.stanford.edu/~seander/bithacks.html
        // NOTE(review): 32-bit specific; v <= 0 is not meaningful input
        // (callers pass the positive MPI communicator size).
        static const int kMsbPosition[32] = {
            0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
            8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31
        };

        // Smear the highest set bit downward so v becomes one less than
        // the next power of two.
        for (int shift = 1; shift <= 16; shift <<= 1)
            v |= v >> shift;

        // Each MSB position produces a unique top-5-bit pattern after
        // the multiply; use it to index the table.
        return kMsbPosition[(uint32_t)(v * 0x07C4ACDDU) >> 27];
    }


    int parallel_reduce(int mydata) {
    /*
     * Sum-reduction of one int per rank using only MPI Send/Recv (no
     * MPI_Reduce).  Implemented as a binary tree: in round d, rank
     * k + 2^d - 1 sends its partial sum to rank k + 2^(d+1) - 1, so
     * after log2(size) rounds the total sits on rank size-1 (main
     * prints the result from that rank).
     * NOTE(review): the tree indexing only spans a power-of-two number
     * of ranks; with other communicator sizes some ranks' data is never
     * folded in -- confirm size == 2^k before trusting the result.
     */
    int recvbuffer;
    const int size = MPI::COMM_WORLD.Get_size();
    const int rank = MPI::COMM_WORLD.Get_rank();


    for (int d = 0; d < log2(size); d++) {
    for (int k = 0; k < size; k += 1<<(d+1)) {
    // Pair subtrees of width 2^(d+1): the left subtree's root sends,
    // the right subtree's root receives and accumulates.
    int receiver = k + (1<<(d+1)) - 1;
    int sender = k + (1<<d) - 1;

    // Rank 0 traces the operation planned for this pair.
    if (rank == 0)
    printf("x[%d] = x[%d] + x[%d]\n", receiver, sender, receiver);

    if (rank == receiver) {
    // The round number d doubles as the message tag.
    MPI::COMM_WORLD.Recv(&recvbuffer, 1, MPI_INT, sender, d);
    mydata += recvbuffer;
    }
    else if (rank == sender) {
    MPI::COMM_WORLD.Send(&mydata, 1, MPI_INT, receiver, d);
    }
    }
    MPI::COMM_WORLD.Barrier();
    if (rank == 0)
    printf("\n");
    }
    return mydata;
    }

    int main(int argc, char* argv[]) {
        MPI::Init(argc, argv);
        const int size = MPI::COMM_WORLD.Get_size();
        const int rank = MPI::COMM_WORLD.Get_rank();

        // One entry per rank: data[i] = i.
        std::vector<int> data(size);
        for (int i = 0; i != size; ++i)
            data[i] = i;

        // Each rank contributes its single entry; the tree reduction
        // sums them by exchanging explicit messages.
        int result = parallel_reduce(data[rank]);
        MPI::COMM_WORLD.Barrier();

        // Serial reference answer to compare against.
        int sum = 0;
        for (int x : data)
            sum += x;

        // The tree reduction leaves the total on the highest rank.
        if (rank == size-1) {
            printf("Final Result on Rank%d = %d\n", rank, result);
            printf("Correct result = %d\n", sum);
        }
        MPI::Finalize();
    }