diff --git a/src/USER-TALLY/compute_force_tally.cpp b/src/USER-TALLY/compute_force_tally.cpp
index f4a056158b91ce3e5727054857b840b34459aeb6..d4bcb0158dcfa7fb48b8837c219b13b79d682fce 100644
--- a/src/USER-TALLY/compute_force_tally.cpp
+++ b/src/USER-TALLY/compute_force_tally.cpp
@@ -44,8 +44,7 @@ ComputeForceTally::ComputeForceTally(LAMMPS *lmp, int narg, char **arg) :
   extscalar = 1;
   peflag = 1;                   // we need Pair::ev_tally() to be run
 
-  did_compute = 0;
-  invoked_peratom = invoked_scalar = -1;
+  did_setup = invoked_peratom = invoked_scalar = -1;
   nmax = -1;
   fatom = NULL;
   vector = new double[size_peratom_cols];
@@ -77,15 +76,17 @@ void ComputeForceTally::init()
                     || force->improper || force->kspace)
       error->warning(FLERR,"Compute force/tally only called from pair style");
   }
-  did_compute = -1;
+  did_setup = -1;
 }
 
 /* ---------------------------------------------------------------------- */
 
-void ComputeForceTally::setup()
+void ComputeForceTally::pair_setup_callback(int, int)
 {
   const int ntotal = atom->nlocal + atom->nghost;
 
+  // grow per-atom storage, if needed
+
   if (atom->nmax > nmax) {
     memory->destroy(fatom);
     nmax = atom->nmax;
@@ -93,7 +94,7 @@ void ComputeForceTally::setup()
     array_atom = fatom;
   }
 
-  // clear storage as needed
+  // clear storage
 
   for (int i=0; i < ntotal; ++i)
     for (int j=0; j < size_peratom_cols; ++j)
@@ -101,10 +102,11 @@ void ComputeForceTally::setup()
 
   for (int i=0; i < size_peratom_cols; ++i)
     vector[i] = ftotal[i] = 0.0;
+
+  did_setup = update->ntimestep;
 }
 
 /* ---------------------------------------------------------------------- */
-
 void ComputeForceTally::pair_tally_callback(int i, int j, int nlocal, int newton,
                                             double, double, double fpair,
                                             double dx, double dy, double dz)
@@ -112,37 +114,6 @@ void ComputeForceTally::pair_tally_callback(int i, int j, int nlocal, int newton
   const int ntotal = atom->nlocal + atom->nghost;
   const int * const mask = atom->mask;
 
-  // do setup work that needs to be done only once per timestep
-
-  if (did_compute != update->ntimestep) {
-    did_compute = update->ntimestep;
-
-    // grow local force array if necessary
-    // needs to be atom->nmax in length
-
-    if (atom->nmax > nmax) {
-      memory->destroy(fatom);
-      nmax = atom->nmax;
-      memory->create(fatom,nmax,size_peratom_cols,"force/tally:fatom");
-      array_atom = fatom;
-    }
-
-    // clear storage as needed
-
-    if (newton) {
-      for (int i=0; i < ntotal; ++i)
-        for (int j=0; j < size_peratom_cols; ++j)
-          fatom[i][j] = 0.0;
-    } else {
-      for (int i=0; i < atom->nlocal; ++i)
-        for (int j=0; j < size_peratom_cols; ++j)
-          fatom[i][j] = 0.0;
-    }
-
-    for (int i=0; i < size_peratom_cols; ++i)
-      vector[i] = ftotal[i] = 0.0;
-  }
-
   if ( ((mask[i] & groupbit) && (mask[j] & groupbit2))
        || ((mask[i] & groupbit2) && (mask[j] & groupbit)) ) {
 
@@ -205,7 +176,8 @@ void ComputeForceTally::unpack_reverse_comm(int n, int *list, double *buf)
 double ComputeForceTally::compute_scalar()
 {
   invoked_scalar = update->ntimestep;
-  if ((did_compute != invoked_scalar) || (update->eflag_global != invoked_scalar))
+  if ((did_setup != invoked_scalar)
+      || (update->eflag_global != invoked_scalar))
     error->all(FLERR,"Energy was not tallied on needed timestep");
 
   // sum accumulated forces across procs
@@ -221,7 +193,8 @@ double ComputeForceTally::compute_scalar()
 void ComputeForceTally::compute_peratom()
 {
   invoked_peratom = update->ntimestep;
-  if (((atom->nlocal > 0) && (did_compute != invoked_peratom)) || (update->eflag_global != invoked_peratom))
+  if ((did_setup != invoked_peratom)
+      || (update->eflag_global != invoked_peratom))
     error->all(FLERR,"Energy was not tallied on needed timestep");
 
   // collect contributions from ghost atoms
@@ -229,6 +202,7 @@ void ComputeForceTally::compute_peratom()
   if (force->newton_pair) {
     comm->reverse_comm_compute(this);
 
+    // clear out ghost atom data after it has been collected to local atoms
     const int nall = atom->nlocal + atom->nghost;
     for (int i = atom->nlocal; i < nall; ++i)
       for (int j = 0; j < size_peratom_cols; ++j)
diff --git a/src/USER-TALLY/compute_force_tally.h b/src/USER-TALLY/compute_force_tally.h
index 96de68cdf2fc6e2e3f4daad50b112eeb51090b8c..ae2f06a096f0516e513c0102693b0338039c3961 100644
--- a/src/USER-TALLY/compute_force_tally.h
+++ b/src/USER-TALLY/compute_force_tally.h
@@ -31,7 +31,6 @@ class ComputeForceTally : public Compute {
   virtual ~ComputeForceTally();
 
   void init();
-  void setup();
 
   double compute_scalar();
   void compute_peratom();
@@ -40,12 +39,12 @@ class ComputeForceTally : public Compute {
   void unpack_reverse_comm(int, int *, double *);
   double memory_usage();
 
+  void pair_setup_callback(int, int);
   void pair_tally_callback(int, int, int, int,
                            double, double, double,
                            double, double, double);
-
  private:
-  bigint did_compute;
+  bigint did_setup;
   int nmax,igroup2,groupbit2;
   double **fatom;
   double ftotal[3];
diff --git a/src/USER-TALLY/compute_heat_flux_tally.cpp b/src/USER-TALLY/compute_heat_flux_tally.cpp
index b4da64b9d3e50adfe0e0f8cfd3bf6e46eacfe5f6..7834bdb0039579f6cf07df9891bd7a332e072547 100644
--- a/src/USER-TALLY/compute_heat_flux_tally.cpp
+++ b/src/USER-TALLY/compute_heat_flux_tally.cpp
@@ -43,7 +43,7 @@ ComputeHeatFluxTally::ComputeHeatFluxTally(LAMMPS *lmp, int narg, char **arg) :
   size_vector = 6;
   peflag = 1;                   // we need Pair::ev_tally() to be run
 
-  did_compute = 0;
+  did_setup = 0;
   invoked_peratom = invoked_scalar = -1;
   nmax = -1;
   stress = NULL;
@@ -80,63 +80,49 @@ void ComputeHeatFluxTally::init()
                     || force->improper || force->kspace)
       error->warning(FLERR,"Compute heat/flux/tally only called from pair style");
   }
-  did_compute = -1;
+  did_setup = -1;
 }
 
-
 /* ---------------------------------------------------------------------- */
-void ComputeHeatFluxTally::pair_tally_callback(int i, int j, int nlocal, int newton,
-                                             double evdwl, double ecoul, double fpair,
-                                             double dx, double dy, double dz)
+void ComputeHeatFluxTally::pair_setup_callback(int, int)
 {
   const int ntotal = atom->nlocal + atom->nghost;
-  const int * const mask = atom->mask;
 
-  // do setup work that needs to be done only once per timestep
+  // grow per-atom storage, if needed
 
-  if (did_compute != update->ntimestep) {
-    did_compute = update->ntimestep;
+  if (atom->nmax > nmax) {
+    memory->destroy(stress);
+    memory->destroy(eatom);
+    nmax = atom->nmax;
+    memory->create(stress,nmax,6,"heat/flux/tally:stress");
+    memory->create(eatom,nmax,"heat/flux/tally:eatom");
+  }
 
-    // grow local stress and eatom arrays if necessary
-    // needs to be atom->nmax in length
+  // clear storage
 
-    if (atom->nmax > nmax) {
-      memory->destroy(stress);
-      nmax = atom->nmax;
-      memory->create(stress,nmax,6,"heat/flux/tally:stress");
+  for (int i=0; i < ntotal; ++i) {
+    eatom[i] = 0.0;
+    stress[i][0] = 0.0;
+    stress[i][1] = 0.0;
+    stress[i][2] = 0.0;
+    stress[i][3] = 0.0;
+    stress[i][4] = 0.0;
+    stress[i][5] = 0.0;
+  }
 
-      memory->destroy(eatom);
-      nmax = atom->nmax;
-      memory->create(eatom,nmax,"heat/flux/tally:eatom");
-    }
+  for (int i=0; i < size_vector; ++i)
+    vector[i] = heatj[i] = 0.0;
 
-    // clear storage as needed
-
-    if (newton) {
-      for (int i=0; i < ntotal; ++i) {
-        eatom[i] = 0.0;
-        stress[i][0] = 0.0;
-        stress[i][1] = 0.0;
-        stress[i][2] = 0.0;
-        stress[i][3] = 0.0;
-        stress[i][4] = 0.0;
-        stress[i][5] = 0.0;
-      }
-    } else {
-      for (int i=0; i < atom->nlocal; ++i) {
-        eatom[i] = 0.0;
-        stress[i][0] = 0.0;
-        stress[i][1] = 0.0;
-        stress[i][2] = 0.0;
-        stress[i][3] = 0.0;
-        stress[i][4] = 0.0;
-        stress[i][5] = 0.0;
-      }
-    }
+  did_setup = update->ntimestep;
+}
 
-    for (int i=0; i < size_vector; ++i)
-      vector[i] = heatj[i] = 0.0;
-  }
+/* ---------------------------------------------------------------------- */
+void ComputeHeatFluxTally::pair_tally_callback(int i, int j, int nlocal, int newton,
+                                             double evdwl, double ecoul, double fpair,
+                                             double dx, double dy, double dz)
+{
+  const int ntotal = atom->nlocal + atom->nghost;
+  const int * const mask = atom->mask;
 
   if ( ((mask[i] & groupbit) && (mask[j] & groupbit2))
        || ((mask[i] & groupbit2) && (mask[j] & groupbit)) ) {
@@ -215,7 +201,7 @@ void ComputeHeatFluxTally::unpack_reverse_comm(int n, int *list, double *buf)
 void ComputeHeatFluxTally::compute_vector()
 {
   invoked_vector = update->ntimestep;
-  if ((did_compute != invoked_vector) || (update->eflag_global != invoked_vector))
+  if ((did_setup != invoked_vector) || (update->eflag_global != invoked_vector))
     error->all(FLERR,"Energy was not tallied on needed timestep");
 
   // collect contributions from ghost atoms
diff --git a/src/USER-TALLY/compute_heat_flux_tally.h b/src/USER-TALLY/compute_heat_flux_tally.h
index d8316e20614434f4614523730b3ccac83b6124b7..4158b2e29d9b16c04e93186b387eab0b2082d6c1 100644
--- a/src/USER-TALLY/compute_heat_flux_tally.h
+++ b/src/USER-TALLY/compute_heat_flux_tally.h
@@ -38,12 +38,13 @@ class ComputeHeatFluxTally : public Compute {
   void unpack_reverse_comm(int, int *, double *);
   double memory_usage();
 
+  void pair_setup_callback(int, int);
   void pair_tally_callback(int, int, int, int,
                            double, double, double,
                            double, double, double);
 
  private:
-  bigint did_compute;
+  bigint did_setup;
   int nmax,igroup2,groupbit2;
   double **stress,*eatom;
   double *heatj;
diff --git a/src/USER-TALLY/compute_pe_mol_tally.cpp b/src/USER-TALLY/compute_pe_mol_tally.cpp
index 8c616b26fa1742e8fb44a3ab4a65a0015e83b1e8..25a172b7f81eb6cd68a75fcb520c401bb89ec1de 100644
--- a/src/USER-TALLY/compute_pe_mol_tally.cpp
+++ b/src/USER-TALLY/compute_pe_mol_tally.cpp
@@ -42,7 +42,7 @@ ComputePEMolTally::ComputePEMolTally(LAMMPS *lmp, int narg, char **arg) :
   extvector = 1;
   peflag = 1;                   // we need Pair::ev_tally() to be run
 
-  did_compute = invoked_vector = -1;
+  did_setup = invoked_vector = -1;
   vector = new double[size_vector];
 }
 
@@ -74,9 +74,16 @@ void ComputePEMolTally::init()
                     || force->improper || force->kspace)
       error->warning(FLERR,"Compute pe/mol/tally only called from pair style");
   }
-  did_compute = -1;
+  did_setup = -1;
 }
 
+/* ---------------------------------------------------------------------- */
+
+void ComputePEMolTally::pair_setup_callback(int, int)
+{
+  etotal[0] = etotal[1] = etotal[2] = etotal[3] = 0.0;
+  did_setup = update->ntimestep;
+}
 
 /* ---------------------------------------------------------------------- */
 void ComputePEMolTally::pair_tally_callback(int i, int j, int nlocal, int newton,
@@ -86,14 +93,6 @@ void ComputePEMolTally::pair_tally_callback(int i, int j, int nlocal, int newton
   const int * const mask = atom->mask;
   const tagint * const molid = atom->molecule;
 
-  // do setup work that needs to be done only once per timestep
-
-  if (did_compute != update->ntimestep) {
-    did_compute = update->ntimestep;
-
-    etotal[0] = etotal[1] = etotal[2] = etotal[3] = 0.0;
-  }
-
   if ( ((mask[i] & groupbit) && (mask[j] & groupbit2))
      || ((mask[i] & groupbit2) && (mask[j] & groupbit)) ){
 
@@ -120,7 +119,7 @@ void ComputePEMolTally::pair_tally_callback(int i, int j, int nlocal, int newton
 void ComputePEMolTally::compute_vector()
 {
   invoked_vector = update->ntimestep;
-  if ((did_compute != invoked_vector) || (update->eflag_global != invoked_vector))
+  if ((did_setup != invoked_vector) || (update->eflag_global != invoked_vector))
     error->all(FLERR,"Energy was not tallied on needed timestep");
 
   // sum accumulated energies across procs
diff --git a/src/USER-TALLY/compute_pe_mol_tally.h b/src/USER-TALLY/compute_pe_mol_tally.h
index b2c5ffab7fc3ee3d23b538c8d4e19cfbfacf8668..1b022a9ef578cfb4d2cdf817605ace0ca53bfc79 100644
--- a/src/USER-TALLY/compute_pe_mol_tally.h
+++ b/src/USER-TALLY/compute_pe_mol_tally.h
@@ -33,12 +33,13 @@ class ComputePEMolTally : public Compute {
   void init();
   void compute_vector();
 
+  void pair_setup_callback(int, int);
   void pair_tally_callback(int, int, int, int,
                            double, double, double,
                            double, double, double);
 
  private:
-  bigint did_compute;
+  bigint did_setup;
   int igroup2,groupbit2;
   double etotal[4];
 };
diff --git a/src/USER-TALLY/compute_pe_tally.cpp b/src/USER-TALLY/compute_pe_tally.cpp
index 16a90d07673a03543ea534a2a99d301ec182b18f..f378a5e6b58caa8ab8ff1f2dd8581efe5c96ec1d 100644
--- a/src/USER-TALLY/compute_pe_tally.cpp
+++ b/src/USER-TALLY/compute_pe_tally.cpp
@@ -44,7 +44,7 @@ ComputePETally::ComputePETally(LAMMPS *lmp, int narg, char **arg) :
   extscalar = 1;
   peflag = 1;                   // we need Pair::ev_tally() to be run
 
-  did_compute = invoked_peratom = invoked_scalar = -1;
+  did_setup = invoked_peratom = invoked_scalar = -1;
   nmax = -1;
   eatom = NULL;
   vector = new double[size_peratom_cols];
@@ -76,48 +76,44 @@ void ComputePETally::init()
                     || force->improper || force->kspace)
       error->warning(FLERR,"Compute pe/tally only called from pair style");
   }
-  did_compute = -1;
+  did_setup = -1;
 }
 
-
 /* ---------------------------------------------------------------------- */
-void ComputePETally::pair_tally_callback(int i, int j, int nlocal, int newton,
-                                         double evdwl, double ecoul, double,
-                                         double, double, double)
+
+void ComputePETally::pair_setup_callback(int, int)
 {
   const int ntotal = atom->nlocal + atom->nghost;
-  const int * const mask = atom->mask;
 
-  // do setup work that needs to be done only once per timestep
+  // grow per-atom storage, if needed
 
-  if (did_compute != update->ntimestep) {
-    did_compute = update->ntimestep;
+  if (atom->nmax > nmax) {
+    memory->destroy(eatom);
+    nmax = atom->nmax;
+    memory->create(eatom,nmax,size_peratom_cols,"pe/tally:eatom");
+    array_atom = eatom;
+  }
 
-    // grow local eatom array if necessary
-    // needs to be atom->nmax in length
+  // clear storage
 
-    if (atom->nmax > nmax) {
-      memory->destroy(eatom);
-      nmax = atom->nmax;
-      memory->create(eatom,nmax,size_peratom_cols,"pe/tally:eatom");
-      array_atom = eatom;
-    }
+  for (int i=0; i < ntotal; ++i)
+    eatom[i][0] = eatom[i][1] = 0.0;
 
-    // clear storage as needed
+  vector[0] = etotal[0] = vector[1] = etotal[1] = 0.0;
 
-    if (newton) {
-      for (int i=0; i < ntotal; ++i)
-        eatom[i][0] = eatom[i][1] = 0.0;
-    } else {
-      for (int i=0; i < atom->nlocal; ++i)
-        eatom[i][0] = eatom[i][1] = 0.0;
-    }
+  did_setup = update->ntimestep;
+}
 
-    vector[0] = etotal[0] = vector[1] = etotal[1] = 0.0;
-  }
+/* ---------------------------------------------------------------------- */
+void ComputePETally::pair_tally_callback(int i, int j, int nlocal, int newton,
+                                         double evdwl, double ecoul, double,
+                                         double, double, double)
+{
+  const int ntotal = atom->nlocal + atom->nghost;
+  const int * const mask = atom->mask;
 
   if ( ((mask[i] & groupbit) && (mask[j] & groupbit2))
-     || ((mask[i] & groupbit2) && (mask[j] & groupbit)) ){
+       || ((mask[i] & groupbit2) && (mask[j] & groupbit)) ) {
 
     evdwl *= 0.5; ecoul *= 0.5;
     if (newton || i < nlocal) {
@@ -165,7 +161,8 @@ void ComputePETally::unpack_reverse_comm(int n, int *list, double *buf)
 double ComputePETally::compute_scalar()
 {
   invoked_scalar = update->ntimestep;
-  if ((did_compute != invoked_scalar) || (update->eflag_global != invoked_scalar))
+  if ((did_setup != invoked_scalar)
+      || (update->eflag_global != invoked_scalar))
     error->all(FLERR,"Energy was not tallied on needed timestep");
 
   // sum accumulated energies across procs
@@ -181,13 +178,16 @@ double ComputePETally::compute_scalar()
 void ComputePETally::compute_peratom()
 {
   invoked_peratom = update->ntimestep;
-  if ((did_compute != invoked_peratom) || (update->eflag_global != invoked_peratom))
+  if ((did_setup != invoked_peratom)
+      || (update->eflag_global != invoked_peratom))
     error->all(FLERR,"Energy was not tallied on needed timestep");
 
   // collect contributions from ghost atoms
 
   if (force->newton_pair) {
     comm->reverse_comm_compute(this);
+
+    // clear out ghost atom data after it has been collected to local atoms
     const int nall = atom->nlocal + atom->nghost;
     for (int i = atom->nlocal; i < nall; ++i)
       eatom[i][0] = eatom[i][1] = 0.0;
diff --git a/src/USER-TALLY/compute_pe_tally.h b/src/USER-TALLY/compute_pe_tally.h
index 2335bbeceeac278d45e9f62bfe8868a868a8071d..cd972e49dba3d0afce0e9387c7e0a461a9b1e881 100644
--- a/src/USER-TALLY/compute_pe_tally.h
+++ b/src/USER-TALLY/compute_pe_tally.h
@@ -39,12 +39,13 @@ class ComputePETally : public Compute {
   void unpack_reverse_comm(int, int *, double *);
   double memory_usage();
 
+  void pair_setup_callback(int, int);
   void pair_tally_callback(int, int, int, int,
                            double, double, double,
                            double, double, double);
 
  private:
-  bigint did_compute;
+  bigint did_setup;
   int nmax,igroup2,groupbit2;
   double **eatom;
   double etotal[2];
diff --git a/src/USER-TALLY/compute_stress_tally.cpp b/src/USER-TALLY/compute_stress_tally.cpp
index 153d788b8b5839f558ff0bf93077b9e7cd3fdf64..7373d53fd5ad195a0f24d8f2ffd20256b44c4ba3 100644
--- a/src/USER-TALLY/compute_stress_tally.cpp
+++ b/src/USER-TALLY/compute_stress_tally.cpp
@@ -44,11 +44,11 @@ ComputeStressTally::ComputeStressTally(LAMMPS *lmp, int narg, char **arg) :
   extscalar = 0;
   peflag = 1;                   // we need Pair::ev_tally() to be run
 
-  did_compute = 0;
-  invoked_peratom = invoked_scalar = -1;
+  did_setup = invoked_peratom = invoked_scalar = -1;
   nmax = -1;
   stress = NULL;
   vector = new double[size_peratom_cols];
+  virial = new double[size_peratom_cols];
 }
 
 /* ---------------------------------------------------------------------- */
@@ -57,6 +57,7 @@ ComputeStressTally::~ComputeStressTally()
 {
   if (force && force->pair) force->pair->del_tally_callback(this);
   memory->destroy(stress);
+  delete[] virial;
   delete[] vector;
 }
 
@@ -77,48 +78,43 @@ void ComputeStressTally::init()
                     || force->improper || force->kspace)
       error->warning(FLERR,"Compute stress/tally only called from pair style");
   }
-  did_compute = -1;
+  did_setup = -1;
 }
 
-
 /* ---------------------------------------------------------------------- */
-void ComputeStressTally::pair_tally_callback(int i, int j, int nlocal, int newton,
-                                             double, double, double fpair,
-                                             double dx, double dy, double dz)
+
+void ComputeStressTally::pair_setup_callback(int, int)
 {
   const int ntotal = atom->nlocal + atom->nghost;
-  const int * const mask = atom->mask;
 
-  // do setup work that needs to be done only once per timestep
+  // grow per-atom storage, if needed
 
-  if (did_compute != update->ntimestep) {
-    did_compute = update->ntimestep;
+  if (atom->nmax > nmax) {
+    memory->destroy(stress);
+    nmax = atom->nmax;
+    memory->create(stress,nmax,size_peratom_cols,"stress/tally:stress");
+    array_atom = stress;
+  }
 
-    // grow local stress array if necessary
-    // needs to be atom->nmax in length
+  // clear storage
 
-    if (atom->nmax > nmax) {
-      memory->destroy(stress);
-      nmax = atom->nmax;
-      memory->create(stress,nmax,size_peratom_cols,"stress/tally:stress");
-      array_atom = stress;
-    }
+  for (int i=0; i < ntotal; ++i)
+    for (int j=0; j < size_peratom_cols; ++j)
+      stress[i][j] = 0.0;
 
-    // clear storage as needed
+  for (int i=0; i < size_peratom_cols; ++i)
+    vector[i] = virial[i] = 0.0;
 
-    if (newton) {
-      for (int i=0; i < ntotal; ++i)
-        for (int j=0; j < size_peratom_cols; ++j)
-          stress[i][j] = 0.0;
-    } else {
-      for (int i=0; i < atom->nlocal; ++i)
-        for (int j=0; j < size_peratom_cols; ++j)
-          stress[i][j] = 0.0;
-    }
+  did_setup = update->ntimestep;
+}
 
-    for (int i=0; i < size_peratom_cols; ++i)
-      vector[i] = virial[i] = 0.0;
-  }
+/* ---------------------------------------------------------------------- */
+void ComputeStressTally::pair_tally_callback(int i, int j, int nlocal, int newton,
+                                             double, double, double fpair,
+                                             double dx, double dy, double dz)
+{
+  const int ntotal = atom->nlocal + atom->nghost;
+  const int * const mask = atom->mask;
 
   if ( ((mask[i] & groupbit) && (mask[j] & groupbit2))
        || ((mask[i] & groupbit2) && (mask[j] & groupbit)) ) {
@@ -192,7 +188,8 @@ void ComputeStressTally::unpack_reverse_comm(int n, int *list, double *buf)
 double ComputeStressTally::compute_scalar()
 {
   invoked_scalar = update->ntimestep;
-  if ((did_compute != invoked_scalar) || (update->eflag_global != invoked_scalar))
+  if ((did_setup != invoked_scalar)
+      || (update->eflag_global != invoked_scalar))
     error->all(FLERR,"Energy was not tallied on needed timestep");
 
   // sum accumulated forces across procs
@@ -212,7 +209,8 @@ double ComputeStressTally::compute_scalar()
 void ComputeStressTally::compute_peratom()
 {
   invoked_peratom = update->ntimestep;
-  if ((did_compute != invoked_peratom) || (update->eflag_global != invoked_peratom))
+  if ((did_setup != invoked_peratom)
+      || (update->eflag_global != invoked_peratom))
     error->all(FLERR,"Energy was not tallied on needed timestep");
 
   // collect contributions from ghost atoms
diff --git a/src/USER-TALLY/compute_stress_tally.h b/src/USER-TALLY/compute_stress_tally.h
index a677d2eef6b58909a104e272af70e9493df8e79c..22f27a4a41b4e04953b20a61b6f311e1e9b9d9b7 100644
--- a/src/USER-TALLY/compute_stress_tally.h
+++ b/src/USER-TALLY/compute_stress_tally.h
@@ -39,15 +39,16 @@ class ComputeStressTally : public Compute {
   void unpack_reverse_comm(int, int *, double *);
   double memory_usage();
 
+  void pair_setup_callback(int, int);
   void pair_tally_callback(int, int, int, int,
                            double, double, double,
                            double, double, double);
 
  private:
-  bigint did_compute;
+  bigint did_setup;
   int nmax,igroup2,groupbit2;
   double **stress;
-  double virial[6];
+  double *virial;
 };
 
 }
diff --git a/src/compute.h b/src/compute.h
index 7f12cd97e20d1c25b5d6b4e9eddffd81cd2657a5..8fd3880aa71f112ae9b69f012b3ea64f1bacd8dc 100644
--- a/src/compute.h
+++ b/src/compute.h
@@ -135,6 +135,7 @@ class Compute : protected Pointers {
 
   virtual double memory_usage() {return 0.0;}
 
+  virtual void pair_setup_callback(int, int) {}
   virtual void pair_tally_callback(int, int, int, int,
                                    double, double, double,
                                    double, double, double) {}
diff --git a/src/pair.cpp b/src/pair.cpp
index d90ed8bb43a90e422dccf6b39a296b092d2e938f..06792060ce507ad32ee9c97fd2481c47ca8a2a7d 100644
--- a/src/pair.cpp
+++ b/src/pair.cpp
@@ -814,6 +814,16 @@ void Pair::ev_setup(int eflag, int vflag, int alloc)
     if (vflag_atom == 0) vflag_either = 0;
     if (vflag_either == 0 && eflag_either == 0) evflag = 0;
   } else vflag_fdotr = 0;
+
+
+  // run ev_setup option for USER-TALLY computes
+
+  if (num_tally_compute > 0) {
+    for (int k=0; k < num_tally_compute; ++k) {
+      Compute *c = list_tally_compute[k];
+      c->pair_setup_callback(eflag,vflag);
+    }
+  }
 }
 
 /* ----------------------------------------------------------------------