diff --git a/examples/USER/tally/log.12Jun17.force.1 b/examples/USER/tally/log.12Jun17.force.1
new file mode 100644
index 0000000000000000000000000000000000000000..98b3bad2457c44d328556eecf183f537be2b9b93
--- /dev/null
+++ b/examples/USER/tally/log.12Jun17.force.1
@@ -0,0 +1,177 @@
+LAMMPS (19 May 2017)
+
+units		real
+atom_style	full
+
+read_data	data.spce
+  orthogonal box = (0.02645 0.02645 0.02641) to (35.5328 35.5328 35.4736)
+  1 by 1 by 1 MPI processor grid
+  reading atoms ...
+  4500 atoms
+  scanning bonds ...
+  2 = max bonds/atom
+  scanning angles ...
+  1 = max angles/atom
+  reading bonds ...
+  3000 bonds
+  reading angles ...
+  1500 angles
+  2 = max # of 1-2 neighbors
+  1 = max # of 1-3 neighbors
+  1 = max # of 1-4 neighbors
+  2 = max # of special neighbors
+
+pair_style	lj/cut/coul/long 12.0 12.0
+kspace_style	pppm 1.0e-4
+
+pair_coeff	1 1 0.15535 3.166
+pair_coeff	* 2 0.0000 0.0000
+
+bond_style	harmonic
+angle_style	harmonic
+dihedral_style	none
+improper_style	none
+
+bond_coeff	1 1000.00 1.000
+angle_coeff	1 100.0 109.47
+
+special_bonds   lj/coul 0.0 0.0 1.0
+  2 = max # of 1-2 neighbors
+  1 = max # of 1-3 neighbors
+  2 = max # of special neighbors
+
+neighbor        2.0 bin
+
+fix		1 all shake 0.0001 20 0 b 1 a 1
+  0 = # of size 2 clusters
+  0 = # of size 3 clusters
+  0 = # of size 4 clusters
+  1500 = # of frozen angles
+fix		2 all nvt temp 300.0 300.0 100.0
+
+# make certain that shake constraints are satisfied
+run 0 post no
+PPPM initialization ...
+WARNING: Using 12-bit tables for long-range coulomb (../kspace.cpp:321)
+  G vector (1/distance) = 0.218482
+  grid = 15 15 15
+  stencil order = 5
+  estimated absolute RMS force accuracy = 0.0319435
+  estimated relative force accuracy = 9.61968e-05
+  using double precision FFTs
+  3d grid and FFT values/proc = 8000 3375
+Neighbor list info ...
+  update every 1 steps, delay 10 steps, check yes
+  max neighbors/atom: 2000, page size: 100000
+  master list distance cutoff = 14
+  ghost atom cutoff = 14
+  binsize = 7, bins = 6 6 6
+  1 neighbor lists, perpetual/occasional/extra = 1 0 0
+  (1) pair lj/cut/coul/long, perpetual
+      attributes: half, newton on
+      pair build: half/bin/newton
+      stencil: half/bin/3d/newton
+      bin: standard
+Per MPI rank memory allocation (min/avg/max) = 26.54 | 26.54 | 26.54 Mbytes
+Step Temp E_pair E_mol TotEng Press 
+       0            0   -16692.358            0   -16692.358   -1289.8319 
+Loop time of 3e-06 on 1 procs for 0 steps with 4500 atoms
+
+
+group		one molecule 1 2
+6 atoms in group one
+
+# the following section shows equivalences between using the force/tally compute and other computes and thermo keywords
+
+# compute pairwise force between two molecules and everybody
+compute		fpa one group/group all pair yes kspace no boundary no
+# tally pairwise force between two molecules and the all molecules
+compute		c1 one force/tally all
+# tally the force of all with all (should be zero)
+compute		c2 all force/tally all
+# collect per atom data. only reduce over the first group.
+compute		one one reduce sum c_c1[1] c_c1[2] c_c1[3]
+compute		red all reduce sum c_c2[1] c_c2[2] c_c2[3]
+# determine magnitude of force
+variable	fpa equal sqrt(c_fpa[1]*c_fpa[1]+c_fpa[2]*c_fpa[2]+c_fpa[3]*c_fpa[3])
+variable	for equal sqrt(c_one[1]*c_one[1]+c_one[2]*c_one[2]+c_one[3]*c_one[3])
+# round to 10**-10 absolute precision.
+variable	ref equal round(1e10*sqrt(c_red[1]*c_red[1]+c_red[2]*c_red[2]+c_red[3]*c_red[3]))*1e-10
+variable	all equal round(1e10*c_c2)*1e-10
+
+velocity	all create 300 432567 dist uniform
+
+timestep	2.0
+
+# v_fpa and v_for and c_c1, c_fpa[] and c_one[] should all each have the same value. v_ref and c_c2 should be zero
+thermo_style    custom step v_fpa v_for c_c1 c_fpa[1] c_one[1] c_fpa[2] c_one[2] c_fpa[3] c_one[3] v_ref v_all
+thermo		10
+
+run 50
+PPPM initialization ...
+WARNING: Using 12-bit tables for long-range coulomb (../kspace.cpp:321)
+  G vector (1/distance) = 0.218482
+  grid = 15 15 15
+  stencil order = 5
+  estimated absolute RMS force accuracy = 0.0319435
+  estimated relative force accuracy = 9.61968e-05
+  using double precision FFTs
+  3d grid and FFT values/proc = 8000 3375
+WARNING: Compute force/tally only called from pair style (../compute_force_tally.cpp:77)
+WARNING: Compute force/tally only called from pair style (../compute_force_tally.cpp:77)
+Neighbor list info ...
+  update every 1 steps, delay 10 steps, check yes
+  max neighbors/atom: 2000, page size: 100000
+  master list distance cutoff = 14
+  ghost atom cutoff = 14
+  binsize = 7, bins = 6 6 6
+  2 neighbor lists, perpetual/occasional/extra = 1 1 0
+  (1) pair lj/cut/coul/long, perpetual
+      attributes: half, newton on
+      pair build: half/bin/newton
+      stencil: half/bin/3d/newton
+      bin: standard
+  (2) compute group/group, occasional, copy from (1)
+      attributes: half, newton on
+      pair build: copy
+      stencil: none
+      bin: none
+Per MPI rank memory allocation (min/avg/max) = 28.47 | 28.47 | 28.47 Mbytes
+Step v_fpa v_for c_c1 c_fpa[1] c_one[1] c_fpa[2] c_one[2] c_fpa[3] c_one[3] v_ref v_all 
+       0      22.7331      22.7331      22.7331   -17.068295   -17.068295   -8.8348335   -8.8348334   -12.141369   -12.141369            0            0 
+      10    11.736901    11.736901    11.736901   -3.3897029   -3.3897029    9.1193856    9.1193856   -6.5651786   -6.5651786            0            0 
+      20    5.6120339    5.6120339    5.6120339  -0.60046861  -0.60046861   -4.4481306   -4.4481306    3.3687528    3.3687528            0            0 
+      30     17.29261     17.29261     17.29261     6.179302     6.179302   -10.593979   -10.593979    12.190906    12.190906            0            0 
+      40    18.664433    18.664433    18.664433    5.4727782    5.4727782   -6.9329319   -6.9329319    16.442148    16.442148            0            0 
+      50    12.130407    12.130407    12.130407   -1.0321196   -1.0321196    8.0035558    8.0035558   -9.0567428   -9.0567428            0            0 
+Loop time of 13.9507 on 1 procs for 50 steps with 4500 atoms
+
+Performance: 0.619 ns/day, 38.752 hours/ns, 3.584 timesteps/s
+32.0% CPU use with 1 MPI tasks x no OpenMP threads
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 12.594     | 12.594     | 12.594     |   0.0 | 90.27
+Bond    | 7.3e-05    | 7.3e-05    | 7.3e-05    |   0.0 |  0.00
+Kspace  | 0.56296    | 0.56296    | 0.56296    |   0.0 |  4.04
+Neigh   | 0.65858    | 0.65858    | 0.65858    |   0.0 |  4.72
+Comm    | 0.019093   | 0.019093   | 0.019093   |   0.0 |  0.14
+Output  | 0.055025   | 0.055025   | 0.055025   |   0.0 |  0.39
+Modify  | 0.057276   | 0.057276   | 0.057276   |   0.0 |  0.41
+Other   |            | 0.004003   |            |       |  0.03
+
+Nlocal:    4500 ave 4500 max 4500 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Nghost:    21131 ave 21131 max 21131 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Neighs:    2.60198e+06 ave 2.60198e+06 max 2.60198e+06 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+
+Total # of neighbors = 2601983
+Ave neighs/atom = 578.218
+Ave special neighs/atom = 2
+Neighbor list builds = 4
+Dangerous builds = 1
+
+Total wall time: 0:00:15
diff --git a/examples/USER/tally/log.12Jun17.force.4 b/examples/USER/tally/log.12Jun17.force.4
new file mode 100644
index 0000000000000000000000000000000000000000..4238173fb60e3a5eb806da82101b0282c0185cac
--- /dev/null
+++ b/examples/USER/tally/log.12Jun17.force.4
@@ -0,0 +1,177 @@
+LAMMPS (19 May 2017)
+
+units		real
+atom_style	full
+
+read_data	data.spce
+  orthogonal box = (0.02645 0.02645 0.02641) to (35.5328 35.5328 35.4736)
+  2 by 2 by 1 MPI processor grid
+  reading atoms ...
+  4500 atoms
+  scanning bonds ...
+  2 = max bonds/atom
+  scanning angles ...
+  1 = max angles/atom
+  reading bonds ...
+  3000 bonds
+  reading angles ...
+  1500 angles
+  2 = max # of 1-2 neighbors
+  1 = max # of 1-3 neighbors
+  1 = max # of 1-4 neighbors
+  2 = max # of special neighbors
+
+pair_style	lj/cut/coul/long 12.0 12.0
+kspace_style	pppm 1.0e-4
+
+pair_coeff	1 1 0.15535 3.166
+pair_coeff	* 2 0.0000 0.0000
+
+bond_style	harmonic
+angle_style	harmonic
+dihedral_style	none
+improper_style	none
+
+bond_coeff	1 1000.00 1.000
+angle_coeff	1 100.0 109.47
+
+special_bonds   lj/coul 0.0 0.0 1.0
+  2 = max # of 1-2 neighbors
+  1 = max # of 1-3 neighbors
+  2 = max # of special neighbors
+
+neighbor        2.0 bin
+
+fix		1 all shake 0.0001 20 0 b 1 a 1
+  0 = # of size 2 clusters
+  0 = # of size 3 clusters
+  0 = # of size 4 clusters
+  1500 = # of frozen angles
+fix		2 all nvt temp 300.0 300.0 100.0
+
+# make certain that shake constraints are satisfied
+run 0 post no
+PPPM initialization ...
+WARNING: Using 12-bit tables for long-range coulomb (../kspace.cpp:321)
+  G vector (1/distance) = 0.218482
+  grid = 15 15 15
+  stencil order = 5
+  estimated absolute RMS force accuracy = 0.0319435
+  estimated relative force accuracy = 9.61968e-05
+  using double precision FFTs
+  3d grid and FFT values/proc = 3380 960
+Neighbor list info ...
+  update every 1 steps, delay 10 steps, check yes
+  max neighbors/atom: 2000, page size: 100000
+  master list distance cutoff = 14
+  ghost atom cutoff = 14
+  binsize = 7, bins = 6 6 6
+  1 neighbor lists, perpetual/occasional/extra = 1 0 0
+  (1) pair lj/cut/coul/long, perpetual
+      attributes: half, newton on
+      pair build: half/bin/newton
+      stencil: half/bin/3d/newton
+      bin: standard
+Per MPI rank memory allocation (min/avg/max) = 10.6 | 10.61 | 10.61 Mbytes
+Step Temp E_pair E_mol TotEng Press 
+       0            0   -16692.358            0   -16692.358   -1289.8319 
+Loop time of 4.5e-06 on 4 procs for 0 steps with 4500 atoms
+
+
+group		one molecule 1 2
+6 atoms in group one
+
+# the following section shows equivalences between using the force/tally compute and other computes and thermo keywords
+
+# compute pairwise force between two molecules and everybody
+compute		fpa one group/group all pair yes kspace no boundary no
+# tally pairwise force between two molecules and the all molecules
+compute		c1 one force/tally all
+# tally the force of all with all (should be zero)
+compute		c2 all force/tally all
+# collect per atom data. only reduce over the first group.
+compute		one one reduce sum c_c1[1] c_c1[2] c_c1[3]
+compute		red all reduce sum c_c2[1] c_c2[2] c_c2[3]
+# determine magnitude of force
+variable	fpa equal sqrt(c_fpa[1]*c_fpa[1]+c_fpa[2]*c_fpa[2]+c_fpa[3]*c_fpa[3])
+variable	for equal sqrt(c_one[1]*c_one[1]+c_one[2]*c_one[2]+c_one[3]*c_one[3])
+# round to 10**-10 absolute precision.
+variable	ref equal round(1e10*sqrt(c_red[1]*c_red[1]+c_red[2]*c_red[2]+c_red[3]*c_red[3]))*1e-10
+variable	all equal round(1e10*c_c2)*1e-10
+
+velocity	all create 300 432567 dist uniform
+
+timestep	2.0
+
+# v_fpa and v_for and c_c1, c_fpa[] and c_one[] should all each have the same value. v_ref and c_c2 should be zero
+thermo_style    custom step v_fpa v_for c_c1 c_fpa[1] c_one[1] c_fpa[2] c_one[2] c_fpa[3] c_one[3] v_ref v_all
+thermo		10
+
+run 50
+PPPM initialization ...
+WARNING: Using 12-bit tables for long-range coulomb (../kspace.cpp:321)
+  G vector (1/distance) = 0.218482
+  grid = 15 15 15
+  stencil order = 5
+  estimated absolute RMS force accuracy = 0.0319435
+  estimated relative force accuracy = 9.61968e-05
+  using double precision FFTs
+  3d grid and FFT values/proc = 3380 960
+WARNING: Compute force/tally only called from pair style (../compute_force_tally.cpp:77)
+WARNING: Compute force/tally only called from pair style (../compute_force_tally.cpp:77)
+Neighbor list info ...
+  update every 1 steps, delay 10 steps, check yes
+  max neighbors/atom: 2000, page size: 100000
+  master list distance cutoff = 14
+  ghost atom cutoff = 14
+  binsize = 7, bins = 6 6 6
+  2 neighbor lists, perpetual/occasional/extra = 1 1 0
+  (1) pair lj/cut/coul/long, perpetual
+      attributes: half, newton on
+      pair build: half/bin/newton
+      stencil: half/bin/3d/newton
+      bin: standard
+  (2) compute group/group, occasional, copy from (1)
+      attributes: half, newton on
+      pair build: copy
+      stencil: none
+      bin: none
+Per MPI rank memory allocation (min/avg/max) = 11.58 | 11.59 | 11.6 Mbytes
+Step v_fpa v_for c_c1 c_fpa[1] c_one[1] c_fpa[2] c_one[2] c_fpa[3] c_one[3] v_ref v_all 
+       0      22.7331      22.7331      22.7331   -17.068295   -17.068295   -8.8348335   -8.8348334   -12.141369   -12.141369            0            0 
+      10    11.736901    11.736901    11.736901   -3.3897029   -3.3897029    9.1193856    9.1193856   -6.5651786   -6.5651786            0            0 
+      20    5.6120339    5.6120339    5.6120339  -0.60046861  -0.60046861   -4.4481306   -4.4481306    3.3687528    3.3687528            0            0 
+      30     17.29261     17.29261     17.29261     6.179302     6.179302   -10.593979   -10.593979    12.190906    12.190906            0            0 
+      40    18.664433    18.664433    18.664433    5.4727782    5.4727782   -6.9329319   -6.9329319    16.442148    16.442148            0            0 
+      50    12.130407    12.130407    12.130407   -1.0321196   -1.0321196    8.0035558    8.0035558   -9.0567428   -9.0567428            0            0 
+Loop time of 4.31614 on 4 procs for 50 steps with 4500 atoms
+
+Performance: 2.002 ns/day, 11.989 hours/ns, 11.584 timesteps/s
+31.6% CPU use with 4 MPI tasks x no OpenMP threads
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 3.5075     | 3.6114     | 3.7489     |   4.7 | 83.67
+Bond    | 8.6e-05    | 0.00010525 | 0.000141   |   0.0 |  0.00
+Kspace  | 0.2581     | 0.39489    | 0.49723    |  14.2 |  9.15
+Neigh   | 0.19826    | 0.19888    | 0.19918    |   0.1 |  4.61
+Comm    | 0.034639   | 0.037137   | 0.038938   |   0.9 |  0.86
+Output  | 0.025465   | 0.025997   | 0.027558   |   0.6 |  0.60
+Modify  | 0.044022   | 0.044175   | 0.044407   |   0.1 |  1.02
+Other   |            | 0.003593   |            |       |  0.08
+
+Nlocal:    1125 ave 1148 max 1097 min
+Histogram: 1 0 0 1 0 0 0 0 1 1
+Nghost:    12212.5 ave 12269 max 12162 min
+Histogram: 1 0 0 1 0 1 0 0 0 1
+Neighs:    650496 ave 675112 max 631353 min
+Histogram: 1 0 0 1 1 0 0 0 0 1
+
+Total # of neighbors = 2601983
+Ave neighs/atom = 578.218
+Ave special neighs/atom = 2
+Neighbor list builds = 4
+Dangerous builds = 1
+
+Total wall time: 0:00:04
diff --git a/examples/USER/tally/log.21Aug15.pe.1 b/examples/USER/tally/log.12Jun17.pe.1
similarity index 50%
rename from examples/USER/tally/log.21Aug15.pe.1
rename to examples/USER/tally/log.12Jun17.pe.1
index aea553003871b4fec713295409636c14f3afb446..8b0f7534144b7b00ad05ec59926250503934a0a9 100644
--- a/examples/USER/tally/log.21Aug15.pe.1
+++ b/examples/USER/tally/log.12Jun17.pe.1
@@ -1,5 +1,4 @@
-LAMMPS (21 Aug 2015-ICMS)
-  using 1 OpenMP thread(s) per MPI task
+LAMMPS (19 May 2017)
 
 units		real
 atom_style	full
@@ -50,6 +49,35 @@ fix		1 all shake 0.0001 20 0 b 1 a 1
   1500 = # of frozen angles
 fix		2 all nvt temp 300.0 300.0 100.0
 
+# make certain that shake constraints are satisfied
+run 0 post no
+PPPM initialization ...
+WARNING: Using 12-bit tables for long-range coulomb (../kspace.cpp:321)
+  G vector (1/distance) = 0.218482
+  grid = 15 15 15
+  stencil order = 5
+  estimated absolute RMS force accuracy = 0.0319435
+  estimated relative force accuracy = 9.61968e-05
+  using double precision FFTs
+  3d grid and FFT values/proc = 8000 3375
+Neighbor list info ...
+  update every 1 steps, delay 10 steps, check yes
+  max neighbors/atom: 2000, page size: 100000
+  master list distance cutoff = 14
+  ghost atom cutoff = 14
+  binsize = 7, bins = 6 6 6
+  1 neighbor lists, perpetual/occasional/extra = 1 0 0
+  (1) pair lj/cut/coul/long, perpetual
+      attributes: half, newton on
+      pair build: half/bin/newton
+      stencil: half/bin/3d/newton
+      bin: standard
+Per MPI rank memory allocation (min/avg/max) = 26.54 | 26.54 | 26.54 Mbytes
+Step Temp E_pair E_mol TotEng Press 
+       0            0   -16692.358            0   -16692.358   -1289.8319 
+Loop time of 1e-06 on 1 procs for 0 steps with 4500 atoms
+
+
 group		oxy type 1
 1500 atoms in group oxy
 group		hyd type 2
@@ -88,6 +116,7 @@ thermo		10
 
 run 50
 PPPM initialization ...
+WARNING: Using 12-bit tables for long-range coulomb (../kspace.cpp:321)
   G vector (1/distance) = 0.218482
   grid = 15 15 15
   stencil order = 5
@@ -95,38 +124,49 @@ PPPM initialization ...
   estimated relative force accuracy = 9.61968e-05
   using double precision FFTs
   3d grid and FFT values/proc = 8000 3375
-WARNING: Compute pe/tally only called from pair style (../compute_pe_tally.cpp:75)
-WARNING: Compute pe/tally only called from pair style (../compute_pe_tally.cpp:75)
+WARNING: Compute pe/tally only called from pair style (../compute_pe_tally.cpp:77)
+WARNING: Compute pe/tally only called from pair style (../compute_pe_tally.cpp:77)
 Neighbor list info ...
-  2 neighbor list requests
   update every 1 steps, delay 10 steps, check yes
+  max neighbors/atom: 2000, page size: 100000
   master list distance cutoff = 14
   ghost atom cutoff = 14
-  binsize = 7 -> bins = 6 6 6
-Memory usage per processor = 17.381 Mbytes
-Step epa epa E_vdwl vdwl E_coul coul eref pe c2 pair 
-       0   -516632.19   -516632.19    3169.9382    3169.9382    46213.889    46213.889    49383.827    49383.827    49383.827    49383.827 
-      10   -517027.36   -517027.36    3099.1322    3099.1322     45891.84     45891.84    48990.972    48990.972    48990.972    48990.972 
-      20   -516828.06   -516828.06    3101.4321    3101.4321     45884.14     45884.14    48985.572    48985.572    48985.572    48985.572 
-      30    -517032.1    -517032.1    3198.5939    3198.5939    45793.571    45793.571    48992.165    48992.165    48992.165    48992.165 
-      40   -517095.56   -517095.56    3244.0797    3244.0797    45715.265    45715.265    48959.345    48959.345    48959.345    48959.345 
-      50   -517273.54   -517273.54    3274.9142    3274.9142    45665.997    45665.997    48940.911    48940.911    48940.911    48940.911 
-
-Loop time of 4.31105 on 1 procs for 50 steps with 4500 atoms
-100.1% CPU use with 1 MPI tasks x 1 OpenMP threads
-Performance: 2.004 ns/day  11.975 hours/ns  11.598 timesteps/s
-
-MPI task timings breakdown:
+  binsize = 7, bins = 6 6 6
+  2 neighbor lists, perpetual/occasional/extra = 1 1 0
+  (1) pair lj/cut/coul/long, perpetual
+      attributes: half, newton on
+      pair build: half/bin/newton
+      stencil: half/bin/3d/newton
+      bin: standard
+  (2) compute group/group, occasional, copy from (1)
+      attributes: half, newton on
+      pair build: copy
+      stencil: none
+      bin: none
+Per MPI rank memory allocation (min/avg/max) = 29.08 | 29.08 | 29.08 Mbytes
+Step c_epa v_epa E_vdwl v_vdwl E_coul v_coul v_eref v_pe c_c2 v_pair 
+       0   -516634.27   -516634.27    3169.9427    3169.9427    46212.482    46212.482    49382.425    49382.425    49382.425    49382.425 
+      10   -517027.35   -517027.35    3099.1374    3099.1374    45891.866    45891.866    48991.003    48991.003    48991.003    48991.003 
+      20   -516828.05   -516828.05    3101.4373    3101.4373    45884.156    45884.156    48985.594    48985.594    48985.594    48985.594 
+      30   -517032.07   -517032.07    3198.5951    3198.5951    45793.595    45793.595    48992.191    48992.191    48992.191    48992.191 
+      40   -517095.54   -517095.54    3244.0771    3244.0771    45715.292    45715.292    48959.369    48959.369    48959.369    48959.369 
+      50    -517273.5    -517273.5    3274.9097    3274.9097    45666.025    45666.025    48940.935    48940.935    48940.935    48940.935 
+Loop time of 15.3339 on 1 procs for 50 steps with 4500 atoms
+
+Performance: 0.563 ns/day, 42.594 hours/ns, 3.261 timesteps/s
+32.0% CPU use with 1 MPI tasks x no OpenMP threads
+
+MPI task timing breakdown:
 Section |  min time  |  avg time  |  max time  |%varavg| %total
 ---------------------------------------------------------------
-Pair    | 3.5071     | 3.5071     | 3.5071     |   0.0 | 81.35
-Bond    | 0.00025034 | 0.00025034 | 0.00025034 |   0.0 |  0.01
-Kspace  | 0.19991    | 0.19991    | 0.19991    |   0.0 |  4.64
-Neigh   | 0.31459    | 0.31459    | 0.31459    |   0.0 |  7.30
-Comm    | 0.010338   | 0.010338   | 0.010338   |   0.0 |  0.24
-Output  | 0.24722    | 0.24722    | 0.24722    |   0.0 |  5.73
-Modify  | 0.029466   | 0.029466   | 0.029466   |   0.0 |  0.68
-Other   |            | 0.002182   |            |       |  0.05
+Pair    | 13.432     | 13.432     | 13.432     |   0.0 | 87.60
+Bond    | 0.000365   | 0.000365   | 0.000365   |   0.0 |  0.00
+Kspace  | 0.581      | 0.581      | 0.581      |   0.0 |  3.79
+Neigh   | 0.66081    | 0.66081    | 0.66081    |   0.0 |  4.31
+Comm    | 0.019908   | 0.019908   | 0.019908   |   0.0 |  0.13
+Output  | 0.57731    | 0.57731    | 0.57731    |   0.0 |  3.76
+Modify  | 0.058515   | 0.058515   | 0.058515   |   0.0 |  0.38
+Other   |            | 0.003889   |            |       |  0.03
 
 Nlocal:    4500 ave 4500 max 4500 min
 Histogram: 1 0 0 0 0 0 0 0 0 0
@@ -135,10 +175,10 @@ Histogram: 1 0 0 0 0 0 0 0 0 0
 Neighs:    2.60198e+06 ave 2.60198e+06 max 2.60198e+06 min
 Histogram: 1 0 0 0 0 0 0 0 0 0
 
-Total # of neighbors = 2601984
-Ave neighs/atom = 578.219
+Total # of neighbors = 2601983
+Ave neighs/atom = 578.218
 Ave special neighs/atom = 2
 Neighbor list builds = 4
 Dangerous builds = 1
 
-Total wall time: 0:00:04
+Total wall time: 0:00:16
diff --git a/examples/USER/tally/log.21Aug15.pe.4 b/examples/USER/tally/log.12Jun17.pe.4
similarity index 50%
rename from examples/USER/tally/log.21Aug15.pe.4
rename to examples/USER/tally/log.12Jun17.pe.4
index 303af555ff2655073827812bfdeab7e737731b39..f684fabe01032051684737adf1241a6b0471a78a 100644
--- a/examples/USER/tally/log.21Aug15.pe.4
+++ b/examples/USER/tally/log.12Jun17.pe.4
@@ -1,5 +1,4 @@
-LAMMPS (21 Aug 2015-ICMS)
-  using 1 OpenMP thread(s) per MPI task
+LAMMPS (19 May 2017)
 
 units		real
 atom_style	full
@@ -50,6 +49,35 @@ fix		1 all shake 0.0001 20 0 b 1 a 1
   1500 = # of frozen angles
 fix		2 all nvt temp 300.0 300.0 100.0
 
+# make certain that shake constraints are satisfied
+run 0 post no
+PPPM initialization ...
+WARNING: Using 12-bit tables for long-range coulomb (../kspace.cpp:321)
+  G vector (1/distance) = 0.218482
+  grid = 15 15 15
+  stencil order = 5
+  estimated absolute RMS force accuracy = 0.0319435
+  estimated relative force accuracy = 9.61968e-05
+  using double precision FFTs
+  3d grid and FFT values/proc = 3380 960
+Neighbor list info ...
+  update every 1 steps, delay 10 steps, check yes
+  max neighbors/atom: 2000, page size: 100000
+  master list distance cutoff = 14
+  ghost atom cutoff = 14
+  binsize = 7, bins = 6 6 6
+  1 neighbor lists, perpetual/occasional/extra = 1 0 0
+  (1) pair lj/cut/coul/long, perpetual
+      attributes: half, newton on
+      pair build: half/bin/newton
+      stencil: half/bin/3d/newton
+      bin: standard
+Per MPI rank memory allocation (min/avg/max) = 10.6 | 10.61 | 10.61 Mbytes
+Step Temp E_pair E_mol TotEng Press 
+       0            0   -16692.358            0   -16692.358   -1289.8319 
+Loop time of 1.75e-06 on 4 procs for 0 steps with 4500 atoms
+
+
 group		oxy type 1
 1500 atoms in group oxy
 group		hyd type 2
@@ -88,6 +116,7 @@ thermo		10
 
 run 50
 PPPM initialization ...
+WARNING: Using 12-bit tables for long-range coulomb (../kspace.cpp:321)
   G vector (1/distance) = 0.218482
   grid = 15 15 15
   stencil order = 5
@@ -95,38 +124,49 @@ PPPM initialization ...
   estimated relative force accuracy = 9.61968e-05
   using double precision FFTs
   3d grid and FFT values/proc = 3380 960
-WARNING: Compute pe/tally only called from pair style (../compute_pe_tally.cpp:75)
-WARNING: Compute pe/tally only called from pair style (../compute_pe_tally.cpp:75)
+WARNING: Compute pe/tally only called from pair style (../compute_pe_tally.cpp:77)
+WARNING: Compute pe/tally only called from pair style (../compute_pe_tally.cpp:77)
 Neighbor list info ...
-  2 neighbor list requests
   update every 1 steps, delay 10 steps, check yes
+  max neighbors/atom: 2000, page size: 100000
   master list distance cutoff = 14
   ghost atom cutoff = 14
-  binsize = 7 -> bins = 6 6 6
-Memory usage per processor = 8.44413 Mbytes
-Step epa epa E_vdwl vdwl E_coul coul eref pe c2 pair 
-       0   -516632.19   -516632.19    3169.9382    3169.9382    46213.889    46213.889    49383.827    49383.827    49383.827    49383.827 
-      10   -517027.36   -517027.36    3099.1322    3099.1322     45891.84     45891.84    48990.972    48990.972    48990.972    48990.972 
-      20   -516828.06   -516828.06    3101.4321    3101.4321     45884.14     45884.14    48985.572    48985.572    48985.572    48985.572 
-      30    -517032.1    -517032.1    3198.5939    3198.5939    45793.571    45793.571    48992.165    48992.165    48992.165    48992.165 
-      40   -517095.56   -517095.56    3244.0797    3244.0797    45715.265    45715.265    48959.345    48959.345    48959.345    48959.345 
-      50   -517273.54   -517273.54    3274.9142    3274.9142    45665.997    45665.997    48940.911    48940.911    48940.911    48940.911 
-
-Loop time of 1.20533 on 4 procs for 50 steps with 4500 atoms
-100.0% CPU use with 4 MPI tasks x 1 OpenMP threads
-Performance: 7.168 ns/day  3.348 hours/ns  41.482 timesteps/s
-
-MPI task timings breakdown:
+  binsize = 7, bins = 6 6 6
+  2 neighbor lists, perpetual/occasional/extra = 1 1 0
+  (1) pair lj/cut/coul/long, perpetual
+      attributes: half, newton on
+      pair build: half/bin/newton
+      stencil: half/bin/3d/newton
+      bin: standard
+  (2) compute group/group, occasional, copy from (1)
+      attributes: half, newton on
+      pair build: copy
+      stencil: none
+      bin: none
+Per MPI rank memory allocation (min/avg/max) = 11.86 | 11.87 | 11.88 Mbytes
+Step c_epa v_epa E_vdwl v_vdwl E_coul v_coul v_eref v_pe c_c2 v_pair 
+       0   -516634.27   -516634.27    3169.9427    3169.9427    46212.482    46212.482    49382.425    49382.425    49382.425    49382.425 
+      10   -517027.35   -517027.35    3099.1374    3099.1374    45891.866    45891.866    48991.003    48991.003    48991.003    48991.003 
+      20   -516828.05   -516828.05    3101.4373    3101.4373    45884.156    45884.156    48985.594    48985.594    48985.594    48985.594 
+      30   -517032.07   -517032.07    3198.5951    3198.5951    45793.595    45793.595    48992.191    48992.191    48992.191    48992.191 
+      40   -517095.54   -517095.54    3244.0771    3244.0771    45715.292    45715.292    48959.369    48959.369    48959.369    48959.369 
+      50    -517273.5    -517273.5    3274.9097    3274.9097    45666.025    45666.025    48940.935    48940.935    48940.935    48940.935 
+Loop time of 2.32344 on 4 procs for 50 steps with 4500 atoms
+
+Performance: 3.719 ns/day, 6.454 hours/ns, 21.520 timesteps/s
+64.0% CPU use with 4 MPI tasks x no OpenMP threads
+
+MPI task timing breakdown:
 Section |  min time  |  avg time  |  max time  |%varavg| %total
 ---------------------------------------------------------------
-Pair    | 0.87053    | 0.90325    | 0.94364    |   2.8 | 74.94
-Bond    | 0.00015402 | 0.00018191 | 0.00020909 |   0.2 |  0.02
-Kspace  | 0.061657   | 0.10164    | 0.13394    |   8.4 |  8.43
-Neigh   | 0.088292   | 0.088332   | 0.088373   |   0.0 |  7.33
-Comm    | 0.017319   | 0.017806   | 0.018291   |   0.4 |  1.48
-Output  | 0.07067    | 0.070706   | 0.070813   |   0.0 |  5.87
-Modify  | 0.021655   | 0.021694   | 0.02173    |   0.0 |  1.80
-Other   |            | 0.001719   |            |       |  0.14
+Pair    | 1.5561     | 1.8883     | 2.0327     |  14.1 | 81.27
+Bond    | 8.8e-05    | 0.000116   | 0.000135   |   0.0 |  0.00
+Kspace  | 0.094718   | 0.1933     | 0.26055    |  14.1 |  8.32
+Neigh   | 0.085117   | 0.1073     | 0.1147     |   3.9 |  4.62
+Comm    | 0.014156   | 0.017907   | 0.020005   |   1.8 |  0.77
+Output  | 0.071634   | 0.090599   | 0.097665   |   3.6 |  3.90
+Modify  | 0.019447   | 0.024101   | 0.026277   |   1.8 |  1.04
+Other   |            | 0.001804   |            |       |  0.08
 
 Nlocal:    1125 ave 1148 max 1097 min
 Histogram: 1 0 0 1 0 0 0 0 1 1
@@ -135,10 +175,10 @@ Histogram: 1 0 0 1 0 1 0 0 0 1
 Neighs:    650496 ave 675112 max 631353 min
 Histogram: 1 0 0 1 1 0 0 0 0 1
 
-Total # of neighbors = 2601984
-Ave neighs/atom = 578.219
+Total # of neighbors = 2601983
+Ave neighs/atom = 578.218
 Ave special neighs/atom = 2
 Neighbor list builds = 4
 Dangerous builds = 1
 
-Total wall time: 0:00:01
+Total wall time: 0:00:02
diff --git a/examples/USER/tally/log.21Aug15.stress.1 b/examples/USER/tally/log.12Jun17.stress.1
similarity index 56%
rename from examples/USER/tally/log.21Aug15.stress.1
rename to examples/USER/tally/log.12Jun17.stress.1
index c20b51559655d55f10d69024485f95f791214975..a76012487cca3c6d73f4331fc27f62165d326c8d 100644
--- a/examples/USER/tally/log.21Aug15.stress.1
+++ b/examples/USER/tally/log.12Jun17.stress.1
@@ -1,5 +1,4 @@
-LAMMPS (21 Aug 2015-ICMS)
-  using 1 OpenMP thread(s) per MPI task
+LAMMPS (19 May 2017)
 
 units		real
 atom_style	full
@@ -50,6 +49,35 @@ fix		1 all shake 0.0001 20 0 b 1 a 1
   1500 = # of frozen angles
 fix		2 all nvt temp 300.0 300.0 100.0
 
+# make certain that shake constraints are satisfied
+run 0 post no
+PPPM initialization ...
+WARNING: Using 12-bit tables for long-range coulomb (../kspace.cpp:321)
+  G vector (1/distance) = 0.218482
+  grid = 15 15 15
+  stencil order = 5
+  estimated absolute RMS force accuracy = 0.0319435
+  estimated relative force accuracy = 9.61968e-05
+  using double precision FFTs
+  3d grid and FFT values/proc = 8000 3375
+Neighbor list info ...
+  update every 1 steps, delay 10 steps, check yes
+  max neighbors/atom: 2000, page size: 100000
+  master list distance cutoff = 14
+  ghost atom cutoff = 14
+  binsize = 7, bins = 6 6 6
+  1 neighbor lists, perpetual/occasional/extra = 1 0 0
+  (1) pair lj/cut/coul/long, perpetual
+      attributes: half, newton on
+      pair build: half/bin/newton
+      stencil: half/bin/3d/newton
+      bin: standard
+Per MPI rank memory allocation (min/avg/max) = 26.54 | 26.54 | 26.54 Mbytes
+Step Temp E_pair E_mol TotEng Press 
+       0            0   -16692.358            0   -16692.358   -1289.8319 
+Loop time of 2e-06 on 1 procs for 0 steps with 4500 atoms
+
+
 group		one molecule 1 2
 6 atoms in group one
 
@@ -79,6 +107,7 @@ thermo		10
 
 run 50
 PPPM initialization ...
+WARNING: Using 12-bit tables for long-range coulomb (../kspace.cpp:321)
   G vector (1/distance) = 0.218482
   grid = 15 15 15
   stencil order = 5
@@ -86,38 +115,32 @@ PPPM initialization ...
   estimated relative force accuracy = 9.61968e-05
   using double precision FFTs
   3d grid and FFT values/proc = 8000 3375
-WARNING: Compute stress/tally only called from pair style (../compute_stress_tally.cpp:75)
-WARNING: Compute stress/tally only called from pair style (../compute_stress_tally.cpp:75)
-Neighbor list info ...
-  1 neighbor list requests
-  update every 1 steps, delay 10 steps, check yes
-  master list distance cutoff = 14
-  ghost atom cutoff = 14
-  binsize = 7 -> bins = 6 6 6
-Memory usage per processor = 24.631 Mbytes
-Step press spa press one ref 
-       0    26497.547    26497.547    26497.547   -2357033.6   -2357033.6 
-      10    23665.073    23665.073    23665.073   -2096057.3   -2096057.3 
-      20    23338.149    23338.149    23338.149     -2034283     -2034283 
-      30      25946.4      25946.4      25946.4     -2002817     -2002817 
-      40    27238.349    27238.349    27238.349   -2155411.5   -2155411.5 
-      50    27783.092    27783.092    27783.092   -1862190.3   -1862190.3 
-
-Loop time of 4.15609 on 1 procs for 50 steps with 4500 atoms
-100.1% CPU use with 1 MPI tasks x 1 OpenMP threads
-Performance: 2.079 ns/day  11.545 hours/ns  12.031 timesteps/s
-
-MPI task timings breakdown:
+WARNING: Compute stress/tally only called from pair style (../compute_stress_tally.cpp:79)
+WARNING: Compute stress/tally only called from pair style (../compute_stress_tally.cpp:79)
+Per MPI rank memory allocation (min/avg/max) = 35.9 | 35.9 | 35.9 Mbytes
+Step c_press v_spa v_press v_one v_ref 
+       0    26496.811    26496.811    26496.811   -2356992.7   -2356992.7 
+      10    23665.129    23665.129    23665.129     -2096059     -2096059 
+      20    23338.197    23338.197    23338.197   -2034284.1   -2034284.1 
+      30    25946.434    25946.434    25946.434   -2002815.3   -2002815.3 
+      40    27238.374    27238.374    27238.374   -2155408.7   -2155408.7 
+      50    27783.107    27783.107    27783.107   -1862191.5   -1862191.5 
+Loop time of 14.2089 on 1 procs for 50 steps with 4500 atoms
+
+Performance: 0.608 ns/day, 39.469 hours/ns, 3.519 timesteps/s
+32.0% CPU use with 1 MPI tasks x no OpenMP threads
+
+MPI task timing breakdown:
 Section |  min time  |  avg time  |  max time  |%varavg| %total
 ---------------------------------------------------------------
-Pair    | 3.6444     | 3.6444     | 3.6444     |   0.0 | 87.69
-Bond    | 0.0016105  | 0.0016105  | 0.0016105  |   0.0 |  0.04
-Kspace  | 0.22345    | 0.22345    | 0.22345    |   0.0 |  5.38
-Neigh   | 0.23588    | 0.23588    | 0.23588    |   0.0 |  5.68
-Comm    | 0.010035   | 0.010035   | 0.010035   |   0.0 |  0.24
-Output  | 0.0084085  | 0.0084085  | 0.0084085  |   0.0 |  0.20
-Modify  | 0.029978   | 0.029978   | 0.029978   |   0.0 |  0.72
-Other   |            | 0.002368   |            |       |  0.06
+Pair    | 12.983     | 12.983     | 12.983     |   0.0 | 91.37
+Bond    | 0.002788   | 0.002788   | 0.002788   |   0.0 |  0.02
+Kspace  | 0.62745    | 0.62745    | 0.62745    |   0.0 |  4.42
+Neigh   | 0.49839    | 0.49839    | 0.49839    |   0.0 |  3.51
+Comm    | 0.018597   | 0.018597   | 0.018597   |   0.0 |  0.13
+Output  | 0.015852   | 0.015852   | 0.015852   |   0.0 |  0.11
+Modify  | 0.058415   | 0.058415   | 0.058415   |   0.0 |  0.41
+Other   |            | 0.004126   |            |       |  0.03
 
 Nlocal:    4500 ave 4500 max 4500 min
 Histogram: 1 0 0 0 0 0 0 0 0 0
@@ -132,4 +155,4 @@ Ave special neighs/atom = 2
 Neighbor list builds = 3
 Dangerous builds = 0
 
-Total wall time: 0:00:04
+Total wall time: 0:00:15
diff --git a/examples/USER/tally/log.21Aug15.stress.4 b/examples/USER/tally/log.12Jun17.stress.4
similarity index 55%
rename from examples/USER/tally/log.21Aug15.stress.4
rename to examples/USER/tally/log.12Jun17.stress.4
index c681960c99d4759c302907fbb07791281d6c48d9..37bb60f013ee2b589a569a11a65139b6cc0a3f10 100644
--- a/examples/USER/tally/log.21Aug15.stress.4
+++ b/examples/USER/tally/log.12Jun17.stress.4
@@ -1,5 +1,4 @@
-LAMMPS (21 Aug 2015-ICMS)
-  using 1 OpenMP thread(s) per MPI task
+LAMMPS (19 May 2017)
 
 units		real
 atom_style	full
@@ -50,6 +49,35 @@ fix		1 all shake 0.0001 20 0 b 1 a 1
   1500 = # of frozen angles
 fix		2 all nvt temp 300.0 300.0 100.0
 
+# make certain that shake constraints are satisfied
+run 0 post no
+PPPM initialization ...
+WARNING: Using 12-bit tables for long-range coulomb (../kspace.cpp:321)
+  G vector (1/distance) = 0.218482
+  grid = 15 15 15
+  stencil order = 5
+  estimated absolute RMS force accuracy = 0.0319435
+  estimated relative force accuracy = 9.61968e-05
+  using double precision FFTs
+  3d grid and FFT values/proc = 3380 960
+Neighbor list info ...
+  update every 1 steps, delay 10 steps, check yes
+  max neighbors/atom: 2000, page size: 100000
+  master list distance cutoff = 14
+  ghost atom cutoff = 14
+  binsize = 7, bins = 6 6 6
+  1 neighbor lists, perpetual/occasional/extra = 1 0 0
+  (1) pair lj/cut/coul/long, perpetual
+      attributes: half, newton on
+      pair build: half/bin/newton
+      stencil: half/bin/3d/newton
+      bin: standard
+Per MPI rank memory allocation (min/avg/max) = 10.6 | 10.61 | 10.61 Mbytes
+Step Temp E_pair E_mol TotEng Press 
+       0            0   -16692.358            0   -16692.358   -1289.8319 
+Loop time of 4e-06 on 4 procs for 0 steps with 4500 atoms
+
+
 group		one molecule 1 2
 6 atoms in group one
 
@@ -79,6 +107,7 @@ thermo		10
 
 run 50
 PPPM initialization ...
+WARNING: Using 12-bit tables for long-range coulomb (../kspace.cpp:321)
   G vector (1/distance) = 0.218482
   grid = 15 15 15
   stencil order = 5
@@ -86,44 +115,38 @@ PPPM initialization ...
   estimated relative force accuracy = 9.61968e-05
   using double precision FFTs
   3d grid and FFT values/proc = 3380 960
-WARNING: Compute stress/tally only called from pair style (../compute_stress_tally.cpp:75)
-WARNING: Compute stress/tally only called from pair style (../compute_stress_tally.cpp:75)
-Neighbor list info ...
-  1 neighbor list requests
-  update every 1 steps, delay 10 steps, check yes
-  master list distance cutoff = 14
-  ghost atom cutoff = 14
-  binsize = 7 -> bins = 6 6 6
-Memory usage per processor = 12.0691 Mbytes
-Step press spa press one ref 
-       0    26497.547    26497.547    26497.547   -2357033.6   -2357033.6 
-      10    23665.073    23665.073    23665.073   -2096057.3   -2096057.3 
-      20    23338.149    23338.149    23338.149     -2034283     -2034283 
-      30      25946.4      25946.4      25946.4     -2002817     -2002817 
-      40    27238.349    27238.349    27238.349   -2155411.5   -2155411.5 
-      50    27783.092    27783.092    27783.092   -1862190.3   -1862190.3 
-
-Loop time of 1.17266 on 4 procs for 50 steps with 4500 atoms
-100.0% CPU use with 4 MPI tasks x 1 OpenMP threads
-Performance: 7.368 ns/day  3.257 hours/ns  42.638 timesteps/s
-
-MPI task timings breakdown:
+WARNING: Compute stress/tally only called from pair style (../compute_stress_tally.cpp:79)
+WARNING: Compute stress/tally only called from pair style (../compute_stress_tally.cpp:79)
+Per MPI rank memory allocation (min/avg/max) = 15.25 | 15.26 | 15.27 Mbytes
+Step c_press v_spa v_press v_one v_ref 
+       0    26496.811    26496.811    26496.811   -2356992.7   -2356992.7 
+      10    23665.129    23665.129    23665.129     -2096059     -2096059 
+      20    23338.197    23338.197    23338.197   -2034284.1   -2034284.1 
+      30    25946.434    25946.434    25946.434   -2002815.3   -2002815.3 
+      40    27238.374    27238.374    27238.374   -2155408.7   -2155408.7 
+      50    27783.107    27783.107    27783.107   -1862191.5   -1862191.5 
+Loop time of 4.32017 on 4 procs for 50 steps with 4500 atoms
+
+Performance: 2.000 ns/day, 12.000 hours/ns, 11.574 timesteps/s
+31.8% CPU use with 4 MPI tasks x no OpenMP threads
+
+MPI task timing breakdown:
 Section |  min time  |  avg time  |  max time  |%varavg| %total
 ---------------------------------------------------------------
-Pair    | 0.89832    | 0.93222    | 0.98611    |   3.4 | 79.50
-Bond    | 0.00081754 | 0.00096095 | 0.0011327  |   0.4 |  0.08
-Kspace  | 0.068058   | 0.12154    | 0.15522    |   9.4 | 10.36
-Neigh   | 0.065756   | 0.065785   | 0.065824   |   0.0 |  5.61
-Comm    | 0.017489   | 0.017982   | 0.018623   |   0.4 |  1.53
-Output  | 0.010985   | 0.011017   | 0.011111   |   0.1 |  0.94
-Modify  | 0.021429   | 0.021491   | 0.021551   |   0.0 |  1.83
-Other   |            | 0.001671   |            |       |  0.14
+Pair    | 3.5816     | 3.6917     | 3.839      |   4.9 | 85.45
+Bond    | 0.001579   | 0.0016563  | 0.001709   |   0.1 |  0.04
+Kspace  | 0.22505    | 0.3716     | 0.48023    |  15.3 |  8.60
+Neigh   | 0.14558    | 0.14568    | 0.14575    |   0.0 |  3.37
+Comm    | 0.032009   | 0.03441    | 0.036274   |   0.8 |  0.80
+Output  | 0.02253    | 0.023115   | 0.024844   |   0.7 |  0.54
+Modify  | 0.046954   | 0.047086   | 0.047132   |   0.0 |  1.09
+Other   |            | 0.004935   |            |       |  0.11
 
 Nlocal:    1125 ave 1154 max 1092 min
 Histogram: 1 0 0 0 1 0 1 0 0 1
 Nghost:    12263.5 ave 12300 max 12219 min
 Histogram: 1 0 1 0 0 0 0 0 0 2
-Neighs:    650438 ave 678786 max 626279 min
+Neighs:    650438 ave 678787 max 626279 min
 Histogram: 1 0 0 1 1 0 0 0 0 1
 
 Total # of neighbors = 2601750
@@ -132,4 +155,4 @@ Ave special neighs/atom = 2
 Neighbor list builds = 3
 Dangerous builds = 0
 
-Total wall time: 0:00:01
+Total wall time: 0:00:04
diff --git a/examples/USER/tally/log.21Aug15.force.1 b/examples/USER/tally/log.21Aug15.force.1
deleted file mode 100644
index 8e7bdb9520deef2758c2a556edb69f5483e1604d..0000000000000000000000000000000000000000
--- a/examples/USER/tally/log.21Aug15.force.1
+++ /dev/null
@@ -1,136 +0,0 @@
-LAMMPS (21 Aug 2015-ICMS)
-  using 1 OpenMP thread(s) per MPI task
-
-units		real
-atom_style	full
-
-read_data	data.spce
-  orthogonal box = (0.02645 0.02645 0.02641) to (35.5328 35.5328 35.4736)
-  1 by 1 by 1 MPI processor grid
-  reading atoms ...
-  4500 atoms
-  scanning bonds ...
-  2 = max bonds/atom
-  scanning angles ...
-  1 = max angles/atom
-  reading bonds ...
-  3000 bonds
-  reading angles ...
-  1500 angles
-  2 = max # of 1-2 neighbors
-  1 = max # of 1-3 neighbors
-  1 = max # of 1-4 neighbors
-  2 = max # of special neighbors
-
-pair_style	lj/cut/coul/long 12.0 12.0
-kspace_style	pppm 1.0e-4
-
-pair_coeff	1 1 0.15535 3.166
-pair_coeff	* 2 0.0000 0.0000
-
-bond_style	harmonic
-angle_style	harmonic
-dihedral_style	none
-improper_style	none
-
-bond_coeff	1 1000.00 1.000
-angle_coeff	1 100.0 109.47
-
-special_bonds   lj/coul 0.0 0.0 1.0
-  2 = max # of 1-2 neighbors
-  1 = max # of 1-3 neighbors
-  2 = max # of special neighbors
-
-neighbor        2.0 bin
-
-fix		1 all shake 0.0001 20 0 b 1 a 1
-  0 = # of size 2 clusters
-  0 = # of size 3 clusters
-  0 = # of size 4 clusters
-  1500 = # of frozen angles
-fix		2 all nvt temp 300.0 300.0 100.0
-
-group		one molecule 1 2
-6 atoms in group one
-
-# the following section shows equivalences between using the pe/tally compute and other computes and thermo keywords
-
-# compute pairwise force between two molecules and everybody
-compute		fpa one group/group all pair yes kspace no boundary no
-# tally pairwise force between two molecules and the all molecules
-compute		c1 one force/tally all
-# tally the force of all with all (should be zero)
-compute		c2 all force/tally all
-# collect per atom data. only reduce over the first group.
-compute		one one reduce sum c_c1[1] c_c1[2] c_c1[3]
-compute		red all reduce sum c_c2[1] c_c2[2] c_c2[3]
-# determine magnitude of force
-variable	fpa equal sqrt(c_fpa[1]*c_fpa[1]+c_fpa[2]*c_fpa[2]+c_fpa[3]*c_fpa[3])
-variable	for equal sqrt(c_one[1]*c_one[1]+c_one[2]*c_one[2]+c_one[3]*c_one[3])
-# round to 10**-10 absolute precision.
-variable	ref equal round(1e10*sqrt(c_red[1]*c_red[1]+c_red[2]*c_red[2]+c_red[3]*c_red[3]))*1e-10
-
-velocity	all create 300 432567 dist uniform
-
-timestep	2.0
-
-# v_fpa and v_for and c_c1, c_fpa[] and c_one[] should all each have the same value. v_ref and c_c2 should be zero
-thermo_style    custom step v_fpa v_for c_c1 c_fpa[1] c_one[1] c_fpa[2] c_one[2] c_fpa[3] c_one[3] v_ref  c_c2
-thermo		10
-
-run 50
-PPPM initialization ...
-  G vector (1/distance) = 0.218482
-  grid = 15 15 15
-  stencil order = 5
-  estimated absolute RMS force accuracy = 0.0319435
-  estimated relative force accuracy = 9.61968e-05
-  using double precision FFTs
-  3d grid and FFT values/proc = 8000 3375
-WARNING: Compute force/tally only called from pair style (../compute_force_tally.cpp:75)
-WARNING: Compute force/tally only called from pair style (../compute_force_tally.cpp:75)
-Neighbor list info ...
-  2 neighbor list requests
-  update every 1 steps, delay 10 steps, check yes
-  master list distance cutoff = 14
-  ghost atom cutoff = 14
-  binsize = 7 -> bins = 6 6 6
-Memory usage per processor = 16.7648 Mbytes
-Step fpa for c1 fpa[1] one[1] fpa[2] one[2] fpa[3] one[3] ref c2 
-       0    22.732789    22.732789    22.732789   -17.068392   -17.068392   -8.8345214   -8.8345214   -12.140878   -12.140878            0            0 
-      10    11.736915    11.736915    11.736915   -3.3898298   -3.3898298     9.119272     9.119272   -6.5652948   -6.5652948            0            0 
-      20    5.6119761    5.6119761    5.6119761  -0.60028931  -0.60028931   -4.4479886   -4.4479886     3.368876     3.368876            0            0 
-      30    17.292617    17.292617    17.292617    6.1793856    6.1793856   -10.593927   -10.593927    12.190919    12.190919            0            0 
-      40    18.664226    18.664226    18.664226    5.4725079    5.4725079    -6.933046    -6.933046    16.441955    16.441955            0            0 
-      50    12.130282    12.130282    12.130282   -1.0321244   -1.0321244    8.0032646    8.0032646   -9.0568326   -9.0568326            0            0 
-
-Loop time of 4.11825 on 1 procs for 50 steps with 4500 atoms
-100.0% CPU use with 1 MPI tasks x 1 OpenMP threads
-Performance: 2.098 ns/day  11.440 hours/ns  12.141 timesteps/s
-
-MPI task timings breakdown:
-Section |  min time  |  avg time  |  max time  |%varavg| %total
----------------------------------------------------------------
-Pair    | 3.5286     | 3.5286     | 3.5286     |   0.0 | 85.68
-Bond    | 6.1274e-05 | 6.1274e-05 | 6.1274e-05 |   0.0 |  0.00
-Kspace  | 0.1937     | 0.1937     | 0.1937     |   0.0 |  4.70
-Neigh   | 0.31454    | 0.31454    | 0.31454    |   0.0 |  7.64
-Comm    | 0.01037    | 0.01037    | 0.01037    |   0.0 |  0.25
-Output  | 0.039355   | 0.039355   | 0.039355   |   0.0 |  0.96
-Modify  | 0.029273   | 0.029273   | 0.029273   |   0.0 |  0.71
-Other   |            | 0.002351   |            |       |  0.06
-
-Nlocal:    4500 ave 4500 max 4500 min
-Histogram: 1 0 0 0 0 0 0 0 0 0
-Nghost:    21131 ave 21131 max 21131 min
-Histogram: 1 0 0 0 0 0 0 0 0 0
-Neighs:    2.60198e+06 ave 2.60198e+06 max 2.60198e+06 min
-Histogram: 1 0 0 0 0 0 0 0 0 0
-
-Total # of neighbors = 2601984
-Ave neighs/atom = 578.219
-Ave special neighs/atom = 2
-Neighbor list builds = 4
-Dangerous builds = 1
-
-Total wall time: 0:00:04
diff --git a/examples/USER/tally/log.21Aug15.force.4 b/examples/USER/tally/log.21Aug15.force.4
deleted file mode 100644
index 17cbc50f02fba59ef993419f88e91428ac824653..0000000000000000000000000000000000000000
--- a/examples/USER/tally/log.21Aug15.force.4
+++ /dev/null
@@ -1,136 +0,0 @@
-LAMMPS (21 Aug 2015-ICMS)
-  using 1 OpenMP thread(s) per MPI task
-
-units		real
-atom_style	full
-
-read_data	data.spce
-  orthogonal box = (0.02645 0.02645 0.02641) to (35.5328 35.5328 35.4736)
-  2 by 2 by 1 MPI processor grid
-  reading atoms ...
-  4500 atoms
-  scanning bonds ...
-  2 = max bonds/atom
-  scanning angles ...
-  1 = max angles/atom
-  reading bonds ...
-  3000 bonds
-  reading angles ...
-  1500 angles
-  2 = max # of 1-2 neighbors
-  1 = max # of 1-3 neighbors
-  1 = max # of 1-4 neighbors
-  2 = max # of special neighbors
-
-pair_style	lj/cut/coul/long 12.0 12.0
-kspace_style	pppm 1.0e-4
-
-pair_coeff	1 1 0.15535 3.166
-pair_coeff	* 2 0.0000 0.0000
-
-bond_style	harmonic
-angle_style	harmonic
-dihedral_style	none
-improper_style	none
-
-bond_coeff	1 1000.00 1.000
-angle_coeff	1 100.0 109.47
-
-special_bonds   lj/coul 0.0 0.0 1.0
-  2 = max # of 1-2 neighbors
-  1 = max # of 1-3 neighbors
-  2 = max # of special neighbors
-
-neighbor        2.0 bin
-
-fix		1 all shake 0.0001 20 0 b 1 a 1
-  0 = # of size 2 clusters
-  0 = # of size 3 clusters
-  0 = # of size 4 clusters
-  1500 = # of frozen angles
-fix		2 all nvt temp 300.0 300.0 100.0
-
-group		one molecule 1 2
-6 atoms in group one
-
-# the following section shows equivalences between using the pe/tally compute and other computes and thermo keywords
-
-# compute pairwise force between two molecules and everybody
-compute		fpa one group/group all pair yes kspace no boundary no
-# tally pairwise force between two molecules and the all molecules
-compute		c1 one force/tally all
-# tally the force of all with all (should be zero)
-compute		c2 all force/tally all
-# collect per atom data. only reduce over the first group.
-compute		one one reduce sum c_c1[1] c_c1[2] c_c1[3]
-compute		red all reduce sum c_c2[1] c_c2[2] c_c2[3]
-# determine magnitude of force
-variable	fpa equal sqrt(c_fpa[1]*c_fpa[1]+c_fpa[2]*c_fpa[2]+c_fpa[3]*c_fpa[3])
-variable	for equal sqrt(c_one[1]*c_one[1]+c_one[2]*c_one[2]+c_one[3]*c_one[3])
-# round to 10**-10 absolute precision.
-variable	ref equal round(1e10*sqrt(c_red[1]*c_red[1]+c_red[2]*c_red[2]+c_red[3]*c_red[3]))*1e-10
-
-velocity	all create 300 432567 dist uniform
-
-timestep	2.0
-
-# v_fpa and v_for and c_c1, c_fpa[] and c_one[] should all each have the same value. v_ref and c_c2 should be zero
-thermo_style    custom step v_fpa v_for c_c1 c_fpa[1] c_one[1] c_fpa[2] c_one[2] c_fpa[3] c_one[3] v_ref  c_c2
-thermo		10
-
-run 50
-PPPM initialization ...
-  G vector (1/distance) = 0.218482
-  grid = 15 15 15
-  stencil order = 5
-  estimated absolute RMS force accuracy = 0.0319435
-  estimated relative force accuracy = 9.61968e-05
-  using double precision FFTs
-  3d grid and FFT values/proc = 3380 960
-WARNING: Compute force/tally only called from pair style (../compute_force_tally.cpp:75)
-WARNING: Compute force/tally only called from pair style (../compute_force_tally.cpp:75)
-Neighbor list info ...
-  2 neighbor list requests
-  update every 1 steps, delay 10 steps, check yes
-  master list distance cutoff = 14
-  ghost atom cutoff = 14
-  binsize = 7 -> bins = 6 6 6
-Memory usage per processor = 8.16441 Mbytes
-Step fpa for c1 fpa[1] one[1] fpa[2] one[2] fpa[3] one[3] ref c2 
-       0    22.732789    22.732789    22.732789   -17.068392   -17.068392   -8.8345214   -8.8345214   -12.140878   -12.140878            0            0 
-      10    11.736915    11.736915    11.736915   -3.3898298   -3.3898298     9.119272     9.119272   -6.5652948   -6.5652948            0            0 
-      20    5.6119761    5.6119761    5.6119761  -0.60028931  -0.60028931   -4.4479886   -4.4479886     3.368876     3.368876            0            0 
-      30    17.292617    17.292617    17.292617    6.1793856    6.1793856   -10.593927   -10.593927    12.190919    12.190919            0            0 
-      40    18.664226    18.664226    18.664226    5.4725079    5.4725079    -6.933046    -6.933046    16.441955    16.441955            0            0 
-      50    12.130282    12.130282    12.130282   -1.0321244   -1.0321244    8.0032646    8.0032646   -9.0568326   -9.0568326            0            0 
-
-Loop time of 1.13658 on 4 procs for 50 steps with 4500 atoms
-100.0% CPU use with 4 MPI tasks x 1 OpenMP threads
-Performance: 7.602 ns/day  3.157 hours/ns  43.991 timesteps/s
-
-MPI task timings breakdown:
-Section |  min time  |  avg time  |  max time  |%varavg| %total
----------------------------------------------------------------
-Pair    | 0.85795    | 0.89088    | 0.93636    |   3.0 | 78.38
-Bond    | 3.4571e-05 | 4.4644e-05 | 5.4598e-05 |   0.1 |  0.00
-Kspace  | 0.059847   | 0.1051     | 0.1384     |   8.9 |  9.25
-Neigh   | 0.085891   | 0.085954   | 0.086      |   0.0 |  7.56
-Comm    | 0.01758    | 0.018091   | 0.019178   |   0.5 |  1.59
-Output  | 0.013697   | 0.013725   | 0.013805   |   0.0 |  1.21
-Modify  | 0.021068   | 0.021137   | 0.021205   |   0.0 |  1.86
-Other   |            | 0.001656   |            |       |  0.15
-
-Nlocal:    1125 ave 1148 max 1097 min
-Histogram: 1 0 0 1 0 0 0 0 1 1
-Nghost:    12212.5 ave 12269 max 12162 min
-Histogram: 1 0 0 1 0 1 0 0 0 1
-Neighs:    650496 ave 675112 max 631353 min
-Histogram: 1 0 0 1 1 0 0 0 0 1
-
-Total # of neighbors = 2601984
-Ave neighs/atom = 578.219
-Ave special neighs/atom = 2
-Neighbor list builds = 4
-Dangerous builds = 1
-
-Total wall time: 0:00:01
diff --git a/src/USER-TALLY/compute_force_tally.cpp b/src/USER-TALLY/compute_force_tally.cpp
index e97a1c751c323e4482d14477c1b76f75a448c2eb..cb7e3a4f23e54d1086b53bad14fba071c31e3dad 100644
--- a/src/USER-TALLY/compute_force_tally.cpp
+++ b/src/USER-TALLY/compute_force_tally.cpp
@@ -44,8 +44,7 @@ ComputeForceTally::ComputeForceTally(LAMMPS *lmp, int narg, char **arg) :
   extscalar = 1;
   peflag = 1;                   // we need Pair::ev_tally() to be run
 
-  did_compute = 0;
-  invoked_peratom = invoked_scalar = -1;
+  did_setup = invoked_peratom = invoked_scalar = -1;
   nmax = -1;
   fatom = NULL;
   vector = new double[size_peratom_cols];
@@ -69,55 +68,50 @@ void ComputeForceTally::init()
   else
     force->pair->add_tally_callback(this);
 
-  if (force->pair->single_enable == 0 || force->pair->manybody_flag)
-    error->warning(FLERR,"Compute force/tally used with incompatible pair style");
+  if (comm->me == 0) {
+    if (force->pair->single_enable == 0 || force->pair->manybody_flag)
+      error->warning(FLERR,"Compute force/tally used with incompatible pair style");
 
-  if ((comm->me == 0) && (force->bond || force->angle || force->dihedral
-                          || force->improper || force->kspace))
-    error->warning(FLERR,"Compute force/tally only called from pair style");
-
-  did_compute = -1;
+    if (force->bond || force->angle || force->dihedral
+                    || force->improper || force->kspace)
+      error->warning(FLERR,"Compute force/tally only called from pair style");
+  }
+  did_setup = -1;
 }
 
-
 /* ---------------------------------------------------------------------- */
-void ComputeForceTally::pair_tally_callback(int i, int j, int nlocal, int newton,
-                                            double, double, double fpair,
-                                            double dx, double dy, double dz)
+
+void ComputeForceTally::pair_setup_callback(int, int)
 {
   const int ntotal = atom->nlocal + atom->nghost;
-  const int * const mask = atom->mask;
 
-  // do setup work that needs to be done only once per timestep
+  // grow per-atom storage, if needed
 
-  if (did_compute != update->ntimestep) {
-    did_compute = update->ntimestep;
+  if (atom->nmax > nmax) {
+    memory->destroy(fatom);
+    nmax = atom->nmax;
+    memory->create(fatom,nmax,size_peratom_cols,"force/tally:fatom");
+    array_atom = fatom;
+  }
 
-    // grow local force array if necessary
-    // needs to be atom->nmax in length
+  // clear storage
 
-    if (atom->nmax > nmax) {
-      memory->destroy(fatom);
-      nmax = atom->nmax;
-      memory->create(fatom,nmax,size_peratom_cols,"force/tally:fatom");
-      array_atom = fatom;
-    }
+  for (int i=0; i < ntotal; ++i)
+    for (int j=0; j < size_peratom_cols; ++j)
+      fatom[i][j] = 0.0;
 
-    // clear storage as needed
+  for (int i=0; i < size_peratom_cols; ++i)
+    vector[i] = ftotal[i] = 0.0;
 
-    if (newton) {
-      for (int i=0; i < ntotal; ++i)
-        for (int j=0; j < size_peratom_cols; ++j)
-          fatom[i][j] = 0.0;
-    } else {
-      for (int i=0; i < atom->nlocal; ++i)
-        for (int j=0; j < size_peratom_cols; ++j)
-          fatom[i][j] = 0.0;
-    }
+  did_setup = update->ntimestep;
+}
 
-    for (int i=0; i < size_peratom_cols; ++i)
-      vector[i] = ftotal[i] = 0.0;
-  }
+/* ---------------------------------------------------------------------- */
+void ComputeForceTally::pair_tally_callback(int i, int j, int nlocal, int newton,
+                                            double, double, double fpair,
+                                            double dx, double dy, double dz)
+{
+  const int * const mask = atom->mask;
 
   if ( ((mask[i] & groupbit) && (mask[j] & groupbit2))
        || ((mask[i] & groupbit2) && (mask[j] & groupbit)) ) {
@@ -181,7 +175,8 @@ void ComputeForceTally::unpack_reverse_comm(int n, int *list, double *buf)
 double ComputeForceTally::compute_scalar()
 {
   invoked_scalar = update->ntimestep;
-  if ((did_compute != invoked_scalar) || (update->eflag_global != invoked_scalar))
+  if ((did_setup != invoked_scalar)
+      || (update->eflag_global != invoked_scalar))
     error->all(FLERR,"Energy was not tallied on needed timestep");
 
   // sum accumulated forces across procs
@@ -197,7 +192,8 @@ double ComputeForceTally::compute_scalar()
 void ComputeForceTally::compute_peratom()
 {
   invoked_peratom = update->ntimestep;
-  if ((did_compute != invoked_peratom) || (update->eflag_global != invoked_peratom))
+  if ((did_setup != invoked_peratom)
+      || (update->eflag_global != invoked_peratom))
     error->all(FLERR,"Energy was not tallied on needed timestep");
 
   // collect contributions from ghost atoms
@@ -205,6 +201,7 @@ void ComputeForceTally::compute_peratom()
   if (force->newton_pair) {
     comm->reverse_comm_compute(this);
 
+    // clear out ghost atom data after it has been collected to local atoms
     const int nall = atom->nlocal + atom->nghost;
     for (int i = atom->nlocal; i < nall; ++i)
       for (int j = 0; j < size_peratom_cols; ++j)
diff --git a/src/USER-TALLY/compute_force_tally.h b/src/USER-TALLY/compute_force_tally.h
index 0f7bc35a6d7ab774e3ef7574abb64bcf66fbdc6d..ae2f06a096f0516e513c0102693b0338039c3961 100644
--- a/src/USER-TALLY/compute_force_tally.h
+++ b/src/USER-TALLY/compute_force_tally.h
@@ -39,12 +39,12 @@ class ComputeForceTally : public Compute {
   void unpack_reverse_comm(int, int *, double *);
   double memory_usage();
 
+  void pair_setup_callback(int, int);
   void pair_tally_callback(int, int, int, int,
                            double, double, double,
                            double, double, double);
-
  private:
-  bigint did_compute;
+  bigint did_setup;
   int nmax,igroup2,groupbit2;
   double **fatom;
   double ftotal[3];
diff --git a/src/USER-TALLY/compute_heat_flux_tally.cpp b/src/USER-TALLY/compute_heat_flux_tally.cpp
index 48cad538d5e09d5f9e32eff23ce3cbf02fc43afd..65f57b7678096e5a066f3710a54f2c4f70faedd3 100644
--- a/src/USER-TALLY/compute_heat_flux_tally.cpp
+++ b/src/USER-TALLY/compute_heat_flux_tally.cpp
@@ -43,12 +43,13 @@ ComputeHeatFluxTally::ComputeHeatFluxTally(LAMMPS *lmp, int narg, char **arg) :
   size_vector = 6;
   peflag = 1;                   // we need Pair::ev_tally() to be run
 
-  did_compute = 0;
+  did_setup = 0;
   invoked_peratom = invoked_scalar = -1;
   nmax = -1;
   stress = NULL;
   eatom = NULL;
   vector = new double[size_vector];
+  heatj = new double[size_vector];
 }
 
 /* ---------------------------------------------------------------------- */
@@ -56,6 +57,9 @@ ComputeHeatFluxTally::ComputeHeatFluxTally(LAMMPS *lmp, int narg, char **arg) :
 ComputeHeatFluxTally::~ComputeHeatFluxTally()
 {
   if (force && force->pair) force->pair->del_tally_callback(this);
+  memory->destroy(stress);
+  memory->destroy(eatom);
+  delete[] heatj;
   delete[] vector;
 }
 
@@ -68,70 +72,56 @@ void ComputeHeatFluxTally::init()
   else
     force->pair->add_tally_callback(this);
 
-  if (force->pair->single_enable == 0 || force->pair->manybody_flag)
-    error->warning(FLERR,"Compute heat/flux/tally used with incompatible pair style");
+  if (comm->me == 0) {
+    if (force->pair->single_enable == 0 || force->pair->manybody_flag)
+      error->warning(FLERR,"Compute heat/flux/tally used with incompatible pair style");
 
-  if ((comm->me == 0) && (force->bond || force->angle || force->dihedral
-                          || force->improper || force->kspace))
-    error->warning(FLERR,"Compute heat/flux/tally only called from pair style");
-
-  did_compute = -1;
+    if (force->bond || force->angle || force->dihedral
+                    || force->improper || force->kspace)
+      error->warning(FLERR,"Compute heat/flux/tally only called from pair style");
+  }
+  did_setup = -1;
 }
 
-
 /* ---------------------------------------------------------------------- */
-void ComputeHeatFluxTally::pair_tally_callback(int i, int j, int nlocal, int newton,
-                                             double evdwl, double ecoul, double fpair,
-                                             double dx, double dy, double dz)
+void ComputeHeatFluxTally::pair_setup_callback(int, int)
 {
   const int ntotal = atom->nlocal + atom->nghost;
-  const int * const mask = atom->mask;
 
-  // do setup work that needs to be done only once per timestep
+  // grow per-atom storage, if needed
 
-  if (did_compute != update->ntimestep) {
-    did_compute = update->ntimestep;
+  if (atom->nmax > nmax) {
+    memory->destroy(stress);
+    memory->destroy(eatom);
+    nmax = atom->nmax;
+    memory->create(stress,nmax,6,"heat/flux/tally:stress");
+    memory->create(eatom,nmax,"heat/flux/tally:eatom");
+  }
 
-    // grow local stress and eatom arrays if necessary
-    // needs to be atom->nmax in length
+  // clear storage
 
-    if (atom->nmax > nmax) {
-      memory->destroy(stress);
-      nmax = atom->nmax;
-      memory->create(stress,nmax,6,"heat/flux/tally:stress");
+  for (int i=0; i < ntotal; ++i) {
+    eatom[i] = 0.0;
+    stress[i][0] = 0.0;
+    stress[i][1] = 0.0;
+    stress[i][2] = 0.0;
+    stress[i][3] = 0.0;
+    stress[i][4] = 0.0;
+    stress[i][5] = 0.0;
+  }
 
-      memory->destroy(eatom);
-      nmax = atom->nmax;
-      memory->create(eatom,nmax,"heat/flux/tally:eatom");
-    }
+  for (int i=0; i < size_vector; ++i)
+    vector[i] = heatj[i] = 0.0;
 
-    // clear storage as needed
-
-    if (newton) {
-      for (int i=0; i < ntotal; ++i) {
-        eatom[i] = 0.0;
-        stress[i][0] = 0.0;
-        stress[i][1] = 0.0;
-        stress[i][2] = 0.0;
-        stress[i][3] = 0.0;
-        stress[i][4] = 0.0;
-        stress[i][5] = 0.0;
-      }
-    } else {
-      for (int i=0; i < atom->nlocal; ++i) {
-        eatom[i] = 0.0;
-        stress[i][0] = 0.0;
-        stress[i][1] = 0.0;
-        stress[i][2] = 0.0;
-        stress[i][3] = 0.0;
-        stress[i][4] = 0.0;
-        stress[i][5] = 0.0;
-      }
-    }
+  did_setup = update->ntimestep;
+}
 
-    for (int i=0; i < size_vector; ++i)
-      vector[i] = heatj[i] = 0.0;
-  }
+/* ---------------------------------------------------------------------- */
+void ComputeHeatFluxTally::pair_tally_callback(int i, int j, int nlocal, int newton,
+                                             double evdwl, double ecoul, double fpair,
+                                             double dx, double dy, double dz)
+{
+  const int * const mask = atom->mask;
 
   if ( ((mask[i] & groupbit) && (mask[j] & groupbit2))
        || ((mask[i] & groupbit2) && (mask[j] & groupbit)) ) {
@@ -210,7 +200,7 @@ void ComputeHeatFluxTally::unpack_reverse_comm(int n, int *list, double *buf)
 void ComputeHeatFluxTally::compute_vector()
 {
   invoked_vector = update->ntimestep;
-  if ((did_compute != invoked_vector) || (update->eflag_global != invoked_vector))
+  if ((did_setup != invoked_vector) || (update->eflag_global != invoked_vector))
     error->all(FLERR,"Energy was not tallied on needed timestep");
 
   // collect contributions from ghost atoms
@@ -243,6 +233,7 @@ void ComputeHeatFluxTally::compute_vector()
   const double pfactor = 0.5 * force->mvv2e;
   double **v = atom->v;
   double *mass = atom->mass;
+  double *rmass = atom->rmass;
   int *type = atom->type;
 
   double jc[3] = {0.0,0.0,0.0};
@@ -250,17 +241,21 @@ void ComputeHeatFluxTally::compute_vector()
 
   for (int i = 0; i < nlocal; i++) {
     if (mask[i] & groupbit) {
-      double ke_i = pfactor * mass[type[i]] *
-                  (v[i][0]*v[i][0] + v[i][1]*v[i][1] + v[i][2]*v[i][2]);
-      jc[0] += (ke_i + eatom[i]) * v[i][0];
-      jc[1] += (ke_i + eatom[i]) * v[i][1];
-      jc[2] += (ke_i + eatom[i]) * v[i][2];
-      jv[0] += stress[i][0]*v[i][0] + stress[i][3]*v[i][1] +
-                stress[i][4]*v[i][2];
-      jv[1] += stress[i][3]*v[i][0] + stress[i][1]*v[i][1] +
-                stress[i][5]*v[i][2];
-      jv[2] += stress[i][4]*v[i][0] + stress[i][5]*v[i][1] +
-                stress[i][2]*v[i][2];
+      const double * const vi = v[i];
+      const double * const si = stress[i];
+      double ke_i;
+
+      if (rmass) ke_i = pfactor * rmass[i];
+      else ke_i *= pfactor * mass[type[i]];
+      ke_i *= (vi[0]*vi[0] + vi[1]*vi[1] + vi[2]*vi[2]);
+      ke_i += eatom[i];
+
+      jc[0] += ke_i*vi[0];
+      jc[1] += ke_i*vi[1];
+      jc[2] += ke_i*vi[2];
+      jv[0] += si[0]*vi[0] + si[3]*vi[1] + si[4]*vi[2];
+      jv[1] += si[3]*vi[0] + si[1]*vi[1] + si[5]*vi[2];
+      jv[2] += si[4]*vi[0] + si[5]*vi[1] + si[2]*vi[2];
     }
   }
 
diff --git a/src/USER-TALLY/compute_heat_flux_tally.h b/src/USER-TALLY/compute_heat_flux_tally.h
index 8c6671cf1e59c04fa7ca25e765021233e41d135b..4158b2e29d9b16c04e93186b387eab0b2082d6c1 100644
--- a/src/USER-TALLY/compute_heat_flux_tally.h
+++ b/src/USER-TALLY/compute_heat_flux_tally.h
@@ -38,15 +38,16 @@ class ComputeHeatFluxTally : public Compute {
   void unpack_reverse_comm(int, int *, double *);
   double memory_usage();
 
+  void pair_setup_callback(int, int);
   void pair_tally_callback(int, int, int, int,
                            double, double, double,
                            double, double, double);
 
  private:
-  bigint did_compute;
+  bigint did_setup;
   int nmax,igroup2,groupbit2;
   double **stress,*eatom;
-  double heatj[6];
+  double *heatj;
 };
 
 }
diff --git a/src/USER-TALLY/compute_pe_mol_tally.cpp b/src/USER-TALLY/compute_pe_mol_tally.cpp
index a30f2d6b9a9c9bc0d8edf509059a5f054abc0b49..25a172b7f81eb6cd68a75fcb520c401bb89ec1de 100644
--- a/src/USER-TALLY/compute_pe_mol_tally.cpp
+++ b/src/USER-TALLY/compute_pe_mol_tally.cpp
@@ -42,7 +42,7 @@ ComputePEMolTally::ComputePEMolTally(LAMMPS *lmp, int narg, char **arg) :
   extvector = 1;
   peflag = 1;                   // we need Pair::ev_tally() to be run
 
-  did_compute = invoked_vector = -1;
+  did_setup = invoked_vector = -1;
   vector = new double[size_vector];
 }
 
@@ -66,16 +66,24 @@ void ComputePEMolTally::init()
   if (atom->molecule_flag == 0)
     error->all(FLERR,"Compute pe/mol/tally requires molecule IDs");
 
-  if (force->pair->single_enable == 0 || force->pair->manybody_flag)
-    error->warning(FLERR,"Compute pe/mol/tally used with incompatible pair style");
+  if (comm->me == 0) {
+    if (force->pair->single_enable == 0 || force->pair->manybody_flag)
+      error->warning(FLERR,"Compute pe/mol/tally used with incompatible pair style");
 
-  if ((comm->me == 0) && (force->bond || force->angle || force->dihedral
-                          || force->improper || force->kspace))
-    error->warning(FLERR,"Compute pe/mol/tally only called from pair style");
-
-  did_compute = -1;
+    if (force->bond || force->angle || force->dihedral
+                    || force->improper || force->kspace)
+      error->warning(FLERR,"Compute pe/mol/tally only called from pair style");
+  }
+  did_setup = -1;
 }
 
+/* ---------------------------------------------------------------------- */
+
+void ComputePEMolTally::pair_setup_callback(int, int)
+{
+  etotal[0] = etotal[1] = etotal[2] = etotal[3] = 0.0;
+  did_setup = update->ntimestep;
+}
 
 /* ---------------------------------------------------------------------- */
 void ComputePEMolTally::pair_tally_callback(int i, int j, int nlocal, int newton,
@@ -85,14 +93,6 @@ void ComputePEMolTally::pair_tally_callback(int i, int j, int nlocal, int newton
   const int * const mask = atom->mask;
   const tagint * const molid = atom->molecule;
 
-  // do setup work that needs to be done only once per timestep
-
-  if (did_compute != update->ntimestep) {
-    did_compute = update->ntimestep;
-
-    etotal[0] = etotal[1] = etotal[2] = etotal[3] = 0.0;
-  }
-
   if ( ((mask[i] & groupbit) && (mask[j] & groupbit2))
      || ((mask[i] & groupbit2) && (mask[j] & groupbit)) ){
 
@@ -119,7 +119,7 @@ void ComputePEMolTally::pair_tally_callback(int i, int j, int nlocal, int newton
 void ComputePEMolTally::compute_vector()
 {
   invoked_vector = update->ntimestep;
-  if ((did_compute != invoked_vector) || (update->eflag_global != invoked_vector))
+  if ((did_setup != invoked_vector) || (update->eflag_global != invoked_vector))
     error->all(FLERR,"Energy was not tallied on needed timestep");
 
   // sum accumulated energies across procs
diff --git a/src/USER-TALLY/compute_pe_mol_tally.h b/src/USER-TALLY/compute_pe_mol_tally.h
index b2c5ffab7fc3ee3d23b538c8d4e19cfbfacf8668..1b022a9ef578cfb4d2cdf817605ace0ca53bfc79 100644
--- a/src/USER-TALLY/compute_pe_mol_tally.h
+++ b/src/USER-TALLY/compute_pe_mol_tally.h
@@ -33,12 +33,13 @@ class ComputePEMolTally : public Compute {
   void init();
   void compute_vector();
 
+  void pair_setup_callback(int, int);
   void pair_tally_callback(int, int, int, int,
                            double, double, double,
                            double, double, double);
 
  private:
-  bigint did_compute;
+  bigint did_setup;
   int igroup2,groupbit2;
   double etotal[4];
 };
diff --git a/src/USER-TALLY/compute_pe_tally.cpp b/src/USER-TALLY/compute_pe_tally.cpp
index 2117f2cb15952e09189d0600a4b96ad17ed39f74..e7c0bdd03cca25e152d24ef35df936edcede0fd9 100644
--- a/src/USER-TALLY/compute_pe_tally.cpp
+++ b/src/USER-TALLY/compute_pe_tally.cpp
@@ -44,7 +44,7 @@ ComputePETally::ComputePETally(LAMMPS *lmp, int narg, char **arg) :
   extscalar = 1;
   peflag = 1;                   // we need Pair::ev_tally() to be run
 
-  did_compute = invoked_peratom = invoked_scalar = -1;
+  did_setup = invoked_peratom = invoked_scalar = -1;
   nmax = -1;
   eatom = NULL;
   vector = new double[size_peratom_cols];
@@ -68,55 +68,51 @@ void ComputePETally::init()
   else
     force->pair->add_tally_callback(this);
 
-  if (force->pair->single_enable == 0 || force->pair->manybody_flag)
-    error->warning(FLERR,"Compute pe/tally used with incompatible pair style");
+  if (comm->me == 0) {
+    if (force->pair->single_enable == 0 || force->pair->manybody_flag)
+      error->warning(FLERR,"Compute pe/tally used with incompatible pair style");
 
-  if ((comm->me == 0) && (force->bond || force->angle || force->dihedral
-                          || force->improper || force->kspace))
-    error->warning(FLERR,"Compute pe/tally only called from pair style");
-
-  did_compute = -1;
+    if (force->bond || force->angle || force->dihedral
+                    || force->improper || force->kspace)
+      error->warning(FLERR,"Compute pe/tally only called from pair style");
+  }
+  did_setup = -1;
 }
 
-
 /* ---------------------------------------------------------------------- */
-void ComputePETally::pair_tally_callback(int i, int j, int nlocal, int newton,
-                                         double evdwl, double ecoul, double,
-                                         double, double, double)
+
+void ComputePETally::pair_setup_callback(int, int)
 {
   const int ntotal = atom->nlocal + atom->nghost;
-  const int * const mask = atom->mask;
 
-  // do setup work that needs to be done only once per timestep
+  // grow per-atom storage, if needed
 
-  if (did_compute != update->ntimestep) {
-    did_compute = update->ntimestep;
+  if (atom->nmax > nmax) {
+    memory->destroy(eatom);
+    nmax = atom->nmax;
+    memory->create(eatom,nmax,size_peratom_cols,"pe/tally:eatom");
+    array_atom = eatom;
+  }
 
-    // grow local eatom array if necessary
-    // needs to be atom->nmax in length
+  // clear storage
 
-    if (atom->nmax > nmax) {
-      memory->destroy(eatom);
-      nmax = atom->nmax;
-      memory->create(eatom,nmax,size_peratom_cols,"pe/tally:eatom");
-      array_atom = eatom;
-    }
+  for (int i=0; i < ntotal; ++i)
+    eatom[i][0] = eatom[i][1] = 0.0;
 
-    // clear storage as needed
+  vector[0] = etotal[0] = vector[1] = etotal[1] = 0.0;
 
-    if (newton) {
-      for (int i=0; i < ntotal; ++i)
-        eatom[i][0] = eatom[i][1] = 0.0;
-    } else {
-      for (int i=0; i < atom->nlocal; ++i)
-        eatom[i][0] = eatom[i][1] = 0.0;
-    }
+  did_setup = update->ntimestep;
+}
 
-    vector[0] = etotal[0] = vector[1] = etotal[1] = 0.0;
-  }
+/* ---------------------------------------------------------------------- */
+void ComputePETally::pair_tally_callback(int i, int j, int nlocal, int newton,
+                                         double evdwl, double ecoul, double,
+                                         double, double, double)
+{
+  const int * const mask = atom->mask;
 
   if ( ((mask[i] & groupbit) && (mask[j] & groupbit2))
-     || ((mask[i] & groupbit2) && (mask[j] & groupbit)) ){
+       || ((mask[i] & groupbit2) && (mask[j] & groupbit)) ) {
 
     evdwl *= 0.5; ecoul *= 0.5;
     if (newton || i < nlocal) {
@@ -164,7 +160,8 @@ void ComputePETally::unpack_reverse_comm(int n, int *list, double *buf)
 double ComputePETally::compute_scalar()
 {
   invoked_scalar = update->ntimestep;
-  if ((did_compute != invoked_scalar) || (update->eflag_global != invoked_scalar))
+  if ((did_setup != invoked_scalar)
+      || (update->eflag_global != invoked_scalar))
     error->all(FLERR,"Energy was not tallied on needed timestep");
 
   // sum accumulated energies across procs
@@ -180,13 +177,16 @@ double ComputePETally::compute_scalar()
 void ComputePETally::compute_peratom()
 {
   invoked_peratom = update->ntimestep;
-  if ((did_compute != invoked_peratom) || (update->eflag_global != invoked_peratom))
+  if ((did_setup != invoked_peratom)
+      || (update->eflag_global != invoked_peratom))
     error->all(FLERR,"Energy was not tallied on needed timestep");
 
   // collect contributions from ghost atoms
 
   if (force->newton_pair) {
     comm->reverse_comm_compute(this);
+
+    // clear out ghost atom data after it has been collected to local atoms
     const int nall = atom->nlocal + atom->nghost;
     for (int i = atom->nlocal; i < nall; ++i)
       eatom[i][0] = eatom[i][1] = 0.0;
diff --git a/src/USER-TALLY/compute_pe_tally.h b/src/USER-TALLY/compute_pe_tally.h
index 2335bbeceeac278d45e9f62bfe8868a868a8071d..cd972e49dba3d0afce0e9387c7e0a461a9b1e881 100644
--- a/src/USER-TALLY/compute_pe_tally.h
+++ b/src/USER-TALLY/compute_pe_tally.h
@@ -39,12 +39,13 @@ class ComputePETally : public Compute {
   void unpack_reverse_comm(int, int *, double *);
   double memory_usage();
 
+  void pair_setup_callback(int, int);
   void pair_tally_callback(int, int, int, int,
                            double, double, double,
                            double, double, double);
 
  private:
-  bigint did_compute;
+  bigint did_setup;
   int nmax,igroup2,groupbit2;
   double **eatom;
   double etotal[2];
diff --git a/src/USER-TALLY/compute_stress_tally.cpp b/src/USER-TALLY/compute_stress_tally.cpp
index 66df9f6e4ffc28d9dc68587e1393c03a7caf3f34..28baafb9f8f451f37771ea84d3e4a1b432fcb62f 100644
--- a/src/USER-TALLY/compute_stress_tally.cpp
+++ b/src/USER-TALLY/compute_stress_tally.cpp
@@ -44,11 +44,11 @@ ComputeStressTally::ComputeStressTally(LAMMPS *lmp, int narg, char **arg) :
   extscalar = 0;
   peflag = 1;                   // we need Pair::ev_tally() to be run
 
-  did_compute = 0;
-  invoked_peratom = invoked_scalar = -1;
+  did_setup = invoked_peratom = invoked_scalar = -1;
   nmax = -1;
   stress = NULL;
   vector = new double[size_peratom_cols];
+  virial = new double[size_peratom_cols];
 }
 
 /* ---------------------------------------------------------------------- */
@@ -57,6 +57,7 @@ ComputeStressTally::~ComputeStressTally()
 {
   if (force && force->pair) force->pair->del_tally_callback(this);
   memory->destroy(stress);
+  delete[] virial;
   delete[] vector;
 }
 
@@ -69,55 +70,50 @@ void ComputeStressTally::init()
   else
     force->pair->add_tally_callback(this);
 
-  if (force->pair->single_enable == 0 || force->pair->manybody_flag)
-    error->warning(FLERR,"Compute stress/tally used with incompatible pair style");
+  if (comm->me == 0) {
+    if (force->pair->single_enable == 0 || force->pair->manybody_flag)
+      error->warning(FLERR,"Compute stress/tally used with incompatible pair style");
 
-  if ((comm->me == 0) && (force->bond || force->angle || force->dihedral
-                          || force->improper || force->kspace))
-    error->warning(FLERR,"Compute stress/tally only called from pair style");
-
-  did_compute = -1;
+    if (force->bond || force->angle || force->dihedral
+                    || force->improper || force->kspace)
+      error->warning(FLERR,"Compute stress/tally only called from pair style");
+  }
+  did_setup = -1;
 }
 
-
 /* ---------------------------------------------------------------------- */
-void ComputeStressTally::pair_tally_callback(int i, int j, int nlocal, int newton,
-                                             double, double, double fpair,
-                                             double dx, double dy, double dz)
+
+void ComputeStressTally::pair_setup_callback(int, int)
 {
   const int ntotal = atom->nlocal + atom->nghost;
-  const int * const mask = atom->mask;
 
-  // do setup work that needs to be done only once per timestep
+  // grow per-atom storage, if needed
 
-  if (did_compute != update->ntimestep) {
-    did_compute = update->ntimestep;
+  if (atom->nmax > nmax) {
+    memory->destroy(stress);
+    nmax = atom->nmax;
+    memory->create(stress,nmax,size_peratom_cols,"stress/tally:stress");
+    array_atom = stress;
+  }
 
-    // grow local stress array if necessary
-    // needs to be atom->nmax in length
+  // clear storage
 
-    if (atom->nmax > nmax) {
-      memory->destroy(stress);
-      nmax = atom->nmax;
-      memory->create(stress,nmax,size_peratom_cols,"stress/tally:stress");
-      array_atom = stress;
-    }
+  for (int i=0; i < ntotal; ++i)
+    for (int j=0; j < size_peratom_cols; ++j)
+      stress[i][j] = 0.0;
 
-    // clear storage as needed
+  for (int i=0; i < size_peratom_cols; ++i)
+    vector[i] = virial[i] = 0.0;
 
-    if (newton) {
-      for (int i=0; i < ntotal; ++i)
-        for (int j=0; j < size_peratom_cols; ++j)
-          stress[i][j] = 0.0;
-    } else {
-      for (int i=0; i < atom->nlocal; ++i)
-        for (int j=0; j < size_peratom_cols; ++j)
-          stress[i][j] = 0.0;
-    }
+  did_setup = update->ntimestep;
+}
 
-    for (int i=0; i < size_peratom_cols; ++i)
-      vector[i] = virial[i] = 0.0;
-  }
+/* ---------------------------------------------------------------------- */
+void ComputeStressTally::pair_tally_callback(int i, int j, int nlocal, int newton,
+                                             double, double, double fpair,
+                                             double dx, double dy, double dz)
+{
+  const int * const mask = atom->mask;
 
   if ( ((mask[i] & groupbit) && (mask[j] & groupbit2))
        || ((mask[i] & groupbit2) && (mask[j] & groupbit)) ) {
@@ -191,7 +187,8 @@ void ComputeStressTally::unpack_reverse_comm(int n, int *list, double *buf)
 double ComputeStressTally::compute_scalar()
 {
   invoked_scalar = update->ntimestep;
-  if ((did_compute != invoked_scalar) || (update->eflag_global != invoked_scalar))
+  if ((did_setup != invoked_scalar)
+      || (update->eflag_global != invoked_scalar))
     error->all(FLERR,"Energy was not tallied on needed timestep");
 
   // sum accumulated forces across procs
@@ -211,7 +208,8 @@ double ComputeStressTally::compute_scalar()
 void ComputeStressTally::compute_peratom()
 {
   invoked_peratom = update->ntimestep;
-  if ((did_compute != invoked_peratom) || (update->eflag_global != invoked_peratom))
+  if ((did_setup != invoked_peratom)
+      || (update->eflag_global != invoked_peratom))
     error->all(FLERR,"Energy was not tallied on needed timestep");
 
   // collect contributions from ghost atoms
diff --git a/src/USER-TALLY/compute_stress_tally.h b/src/USER-TALLY/compute_stress_tally.h
index a677d2eef6b58909a104e272af70e9493df8e79c..22f27a4a41b4e04953b20a61b6f311e1e9b9d9b7 100644
--- a/src/USER-TALLY/compute_stress_tally.h
+++ b/src/USER-TALLY/compute_stress_tally.h
@@ -39,15 +39,16 @@ class ComputeStressTally : public Compute {
   void unpack_reverse_comm(int, int *, double *);
   double memory_usage();
 
+  void pair_setup_callback(int, int);
   void pair_tally_callback(int, int, int, int,
                            double, double, double,
                            double, double, double);
 
  private:
-  bigint did_compute;
+  bigint did_setup;
   int nmax,igroup2,groupbit2;
   double **stress;
-  double virial[6];
+  double *virial;
 };
 
 }
diff --git a/src/compute.h b/src/compute.h
index 7f12cd97e20d1c25b5d6b4e9eddffd81cd2657a5..8fd3880aa71f112ae9b69f012b3ea64f1bacd8dc 100644
--- a/src/compute.h
+++ b/src/compute.h
@@ -135,6 +135,7 @@ class Compute : protected Pointers {
 
   virtual double memory_usage() {return 0.0;}
 
+  virtual void pair_setup_callback(int, int) {}
   virtual void pair_tally_callback(int, int, int, int,
                                    double, double, double,
                                    double, double, double) {}
diff --git a/src/pair.cpp b/src/pair.cpp
index d90ed8bb43a90e422dccf6b39a296b092d2e938f..06792060ce507ad32ee9c97fd2481c47ca8a2a7d 100644
--- a/src/pair.cpp
+++ b/src/pair.cpp
@@ -814,6 +814,16 @@ void Pair::ev_setup(int eflag, int vflag, int alloc)
     if (vflag_atom == 0) vflag_either = 0;
     if (vflag_either == 0 && eflag_either == 0) evflag = 0;
   } else vflag_fdotr = 0;
+
+
+  // run ev_setup option for USER-TALLY computes
+
+  if (num_tally_compute > 0) {
+    for (int k=0; k < num_tally_compute; ++k) {
+      Compute *c = list_tally_compute[k];
+      c->pair_setup_callback(eflag,vflag);
+    }
+  }
 }
 
 /* ----------------------------------------------------------------------