3 files changed, 73 insertions, 7 deletions
diff --git a/Source/Parallelization/WarpXRegrid.cpp b/Source/Parallelization/WarpXRegrid.cpp
index 36a1ed776..43a03f5ed 100644
--- a/Source/Parallelization/WarpXRegrid.cpp
+++ b/Source/Parallelization/WarpXRegrid.cpp
@@ -30,22 +30,33 @@ WarpX::LoadBalance ()
     for (int lev = 0; lev <= nLevels; ++lev)
     {
 #ifdef AMREX_USE_MPI
-        // Parallel reduce the costs_heurisitc
+        // Parallel reduce the costs
         amrex::Vector<Real>::iterator it = (*costs[lev]).begin();
         amrex::Real* itAddr = &(*it);
-        ParallelAllReduce::Sum(itAddr,
-                               costs[lev]->size(),
-                               ParallelContext::CommunicatorSub());
+        ParallelAllReduce::Sum(itAddr, costs[lev]->size(), ParallelContext::CommunicatorSub());
 #endif
+        // Compute efficiency for the current distribution mapping
+        const DistributionMapping& currentdm = DistributionMap(lev);
+        amrex::Real currentEfficiency = 0.0;
+        if (load_balance_efficiency_ratio_threshold > 0.0)
+        {
+            ComputeDistributionMappingEfficiency(currentdm, *costs[lev],
+                                                 currentEfficiency);
+        }
+
         const amrex::Real nboxes = costs[lev]->size();
         const amrex::Real nprocs = ParallelContext::NProcsSub();
         const int nmax = static_cast<int>(std::ceil(nboxes/nprocs*load_balance_knapsack_factor));
 
+        amrex::Real proposedEfficiency = 0.0;
         const DistributionMapping newdm = (load_balance_with_sfc)
-            ? DistributionMapping::makeSFC(*costs[lev], boxArray(lev), false)
-            : DistributionMapping::makeKnapSack(*costs[lev], nmax);
+            ? DistributionMapping::makeSFC(*costs[lev], boxArray(lev), proposedEfficiency, false)
+            : DistributionMapping::makeKnapSack(*costs[lev], proposedEfficiency, nmax);
 
-        RemakeLevel(lev, t_new[lev], boxArray(lev), newdm);
+        if (proposedEfficiency > load_balance_efficiency_ratio_threshold*currentEfficiency)
+        {
+            RemakeLevel(lev, t_new[lev], boxArray(lev), newdm);
+        }
     }
     mypc->Redistribute();
 }
@@ -284,3 +295,39 @@ WarpX::ResetCosts ()
         costs[lev]->assign((*costs[lev]).size(), 0.0);
     }
 }
+
+void
+WarpX::ComputeDistributionMappingEfficiency (const DistributionMapping& dm,
+                                             const Vector<Real>& cost,
+                                             Real& efficiency)
+{
+    const Real nprocs = ParallelDescriptor::NProcs();
+
+    // Collect costs per fab corresponding to each rank, then collapse into vector
+    // of total cost per proc
+
+    // This will store mapping from (proc) --> ([cost_FAB_1, cost_FAB_2, ... ])
+    // for each proc
+    std::map<int, Vector<Real>> rankToCosts;
+
+    for (int i=0; i<cost.size(); ++i)
+    {
+        rankToCosts[dm[i]].push_back(cost[i]);
+    }
+
+    Real maxCost = -1.0;
+
+    // This will store mapping from (proc) --> (sum of cost) for each proc
+    Vector<Real> rankToCost = {0.0};
+    rankToCost.resize(nprocs);
+    for (int i=0; i<nprocs; ++i) {
+        const Real rwSum = std::accumulate(rankToCosts[i].begin(),
+                                           rankToCosts[i].end(), 0.0);
+        rankToCost[i] = rwSum;
+        maxCost = std::max(maxCost, rwSum);
+    }
+
+    // `efficiency` is mean cost per proc
+    efficiency = (std::accumulate(rankToCost.begin(),
+                  rankToCost.end(), 0.0) / (nprocs*maxCost));
+}
diff --git a/Source/WarpX.H b/Source/WarpX.H
index 3c66a6cd7..6d209b604 100644
--- a/Source/WarpX.H
+++ b/Source/WarpX.H
@@ -493,6 +493,18 @@ public:
      */
     void ComputeCostsHeuristic (amrex::Vector<std::unique_ptr<amrex::Vector<amrex::Real> > >& costs);
 
+    /** \brief Computes the average cost per MPI rank given a distribution mapping
+     * and vector of cost for each FAB.
+     * @param[in] dm distribution mapping (mapping from FAB to MPI processes)
+     * @param[in] cost vector which gives mapping from FAB index space to the
+     * respective FAB's cost
+     * @param[out] efficiency average cost per MPI process computed from the
+     * given distribution mapping and and cost
+     */
+    void ComputeDistributionMappingEfficiency (const amrex::DistributionMapping& dm,
+                                               const amrex::Vector<amrex::Real>& cost,
+                                               amrex::Real& efficiency);
+
 protected:
 
     /**
@@ -708,6 +720,12 @@ private:
      * `load_balance_knapsack_factor=2` limits the maximum number of boxes that can
      * be assigned to a rank to 8. */
     amrex::Real load_balance_knapsack_factor = 1.24;
+    /** Threshold value that controls whether to adopt the proposed distribution
+     * mapping during load balancing.  The new distribution mapping is adopted
+     * if the ratio of proposed distribution mapping efficiency to current
+     * distribution mapping efficiency is larger than the threshold; 'efficiency'
+     * here means the average cost per MPI rank.  */
+    amrex::Real load_balance_efficiency_ratio_threshold = 0;
     /** Weight factor for cells in `Heuristic` costs update.
      * Default values on GPU are determined from single-GPU tests on Summit.
      * The problem setup for these tests is an empty (i.e. no particles) domain
diff --git a/Source/WarpX.cpp b/Source/WarpX.cpp
index 3adc7c3d2..32c0b1c27 100644
--- a/Source/WarpX.cpp
+++ b/Source/WarpX.cpp
@@ -663,6 +663,7 @@ WarpX::ReadParameters ()
         pp.query("load_balance_int", load_balance_int);
         pp.query("load_balance_with_sfc", load_balance_with_sfc);
         pp.query("load_balance_knapsack_factor", load_balance_knapsack_factor);
+        pp.query("load_balance_efficiency_ratio_threshold", load_balance_efficiency_ratio_threshold);
 
         pp.query("do_dynamic_scheduling", do_dynamic_scheduling);