Fix negative mutator time in GC stats in prof builds

Because garbage collector calls `retainerProfile()` and `heapCensus()`, GC times normally include some of PROF times too. To fix this we have these lines: // heapCensus() is called by the GC, so RP and HC time are // included in the GC stats. We therefore subtract them to // obtain the actual GC cpu time. stats.gc_cpu_ns -= prof_cpu; stats.gc_elapsed_ns -= prof_elapsed; These variables are later used for calculating GC time excluding the final GC (which should be attributed to EXIT). exit_gc_elapsed = stats.gc_elapsed_ns - start_exit_gc_elapsed; The problem is if we subtract PROF times from `gc_elapsed_ns` and then subtract `start_exit_gc_elapsed` from the result, we end up subtracting PROF times twice, because `start_exit_gc_elapsed` also includes PROF times. We now subtract PROF times from GC after the calculations for EXIT and MUT times. The existing assertion that checks INIT + MUT + GC + EXIT = TOTAL now holds. When we subtract PROF numbers from GC, and a new assertion INIT + MUT + GC + PROF + EXIT = TOTAL also holds. Fixes #15897. New assertions added in this commit also revealed #16102, which is also fixed by this commit.
author: Ömer Sinan Ağacan <omeragacan@gmail.com> 2018-12-28 10:51:28 (GMT)
committer: Ömer Sinan Ağacan <omeragacan@gmail.com> 2019-01-12 11:52:38 (GMT)
commit: 19670bc397d858b04eb9b4eb01480f7f8c59e2f5 (patch)
tree: 0e84040446183e032bcd05f2a8c1fd620ba1b9b2
parent: 74cd4ec5d2f9321aad5db3285cb60d78f2562996 (diff)
download: ghc-19670bc397d858b04eb9b4eb01480f7f8c59e2f5.zip
ghc-19670bc397d858b04eb9b4eb01480f7f8c59e2f5.tar.gz
ghc-19670bc397d858b04eb9b4eb01480f7f8c59e2f5.tar.bz2
6 files changed, 90 insertions, 49 deletions
diff --git a/rts/Schedule.c b/rts/Schedule.c
index eb9203f..02055d2 100644
--- a/rts/Schedule.c
+++ b/rts/Schedule.c
@@ -2018,6 +2018,10 @@ forkProcess(HsStablePtr *entry
 
     } else { // child
 
+        // Current process times reset in the child process, so we should reset
+        // the stats too. See #16102.
+        resetChildProcessStats();
+
 #if defined(THREADED_RTS)
         initMutex(&sched_mutex);
         initMutex(&sm_mutex);
diff --git a/rts/Stats.c b/rts/Stats.c
index e42524f..b2e48c4 100644
--- a/rts/Stats.c
+++ b/rts/Stats.c
@@ -43,12 +43,6 @@ static Time HC_start_time, HC_tot_time = 0;     // heap census prof user time
 static Time HCe_start_time, HCe_tot_time = 0;   // heap census prof elap time
 #endif
 
-#if defined(PROFILING)
-#define PROF_VAL(x)   (x)
-#else
-#define PROF_VAL(x)   0
-#endif
-
 #if defined(PROF_SPIN)
 volatile StgWord64 whitehole_lockClosure_spin = 0;
 volatile StgWord64 whitehole_lockClosure_yield = 0;
@@ -204,11 +198,11 @@ initStats0(void)
    initStats1() can be called after setupRtsFlags()
    ------------------------------------------------------------------------ */
 
+void initGenerationStats(void);
+
 void
 initStats1 (void)
 {
-    uint32_t i;
-
     if (RtsFlags.GcFlags.giveStats >= VERBOSE_GC_STATS) {
         statsPrintf("    Alloc    Copied     Live     GC     GC      TOT      TOT  Page Flts\n");
         statsPrintf("    bytes     bytes     bytes   user   elap     user     elap\n");
@@ -225,13 +219,29 @@ initStats1 (void)
         (Time *)stgMallocBytes(
             sizeof(Time)*RtsFlags.GcFlags.generations,
             "initStats");
-    for (i = 0; i < RtsFlags.GcFlags.generations; i++) {
+    initGenerationStats();
+}
+
+void
+initGenerationStats()
+{
+    for (uint32_t i = 0; i < RtsFlags.GcFlags.generations; i++) {
         GC_coll_cpu[i] = 0;
         GC_coll_elapsed[i] = 0;
         GC_coll_max_pause[i] = 0;
     }
 }
 
+/* ---------------------------------------------------------------------------
+   Reset stats of child process after fork()
+   ------------------------------------------------------------------------ */
+
+void resetChildProcessStats()
+{
+    initStats0();
+    initGenerationStats();
+}
+
 /* -----------------------------------------------------------------------------
    Initialisation time...
    -------------------------------------------------------------------------- */
@@ -517,14 +527,14 @@ stat_endRP(
     RP_tot_time += user - RP_start_time;
     RPe_tot_time += elapsed - RPe_start_time;
 
-  fprintf(prof_file, "Retainer Profiling: %d, at %f seconds\n",
-    retainerGeneration, mut_user_time_during_RP());
+    fprintf(prof_file, "Retainer Profiling: %d, at %f seconds\n",
+      retainerGeneration, mut_user_time_during_RP());
 #if defined(DEBUG_RETAINER)
-  fprintf(prof_file, "\tMax C stack size = %u\n", maxCStackSize);
-  fprintf(prof_file, "\tMax auxiliary stack size = %u\n", maxStackSize);
+    fprintf(prof_file, "\tMax C stack size = %u\n", maxCStackSize);
+    fprintf(prof_file, "\tMax auxiliary stack size = %u\n", maxStackSize);
 #endif
-  fprintf(prof_file, "\tAverage number of visits per object = %f\n",
-          averageNumVisit);
+    fprintf(prof_file, "\tAverage number of visits per object = %f\n",
+            averageNumVisit);
 }
 #endif /* PROFILING */
 
@@ -1049,21 +1059,15 @@ void
 stat_exit (void)
 {
     RTSSummaryStats sum;
-    uint32_t g;
-
     init_RTSSummaryStats(&sum);
+
     if (RtsFlags.GcFlags.giveStats != NO_GC_STATS) {
         // First we tidy the times in stats, and populate the times in sum.
         // In particular, we adjust the gc_* time in stats to remove
         // profiling times.
         {
             Time now_cpu_ns, now_elapsed_ns;
-            Time exit_gc_cpu = 0;
-            Time exit_gc_elapsed = 0;
-            Time prof_cpu = 0;
-            Time prof_elapsed = 0;
-
-            getProcessTimes( &now_cpu_ns, &now_elapsed_ns);
+            getProcessTimes(&now_cpu_ns, &now_elapsed_ns);
 
             stats.cpu_ns = now_cpu_ns - start_init_cpu;
             stats.elapsed_ns = now_elapsed_ns - start_init_elapsed;
@@ -1072,15 +1076,6 @@ stat_exit (void)
             if (stats.cpu_ns <= 0) { stats.cpu_ns = 1; }
             if (stats.elapsed_ns <= 0) { stats.elapsed_ns = 1; }
 
-            prof_cpu = PROF_VAL(RP_tot_time + HC_tot_time);
-            prof_elapsed = PROF_VAL(RPe_tot_time + HCe_tot_time);
-
-            // heapCensus() is called by the GC, so RP and HC time are
-            // included in the GC stats.  We therefore subtract them to
-            // obtain the actual GC cpu time.
-            stats.gc_cpu_ns      -=  prof_cpu;
-            stats.gc_elapsed_ns  -=  prof_elapsed;
-
 #if defined(PROFILING)
             sum.rp_cpu_ns = RP_tot_time;
             sum.rp_elapsed_ns = RPe_tot_time;
@@ -1091,8 +1086,11 @@ stat_exit (void)
             // We do a GC during the EXIT phase. We'll attribute the cost of
             // that to GC instead of EXIT, so carefully subtract it from the
             // EXIT time.
-            exit_gc_cpu          = stats.gc_cpu_ns - start_exit_gc_cpu;
-            exit_gc_elapsed      = stats.gc_elapsed_ns - start_exit_gc_elapsed;
+            // Note that exit_gc includes RP and HC for the exit GC too.
+            Time exit_gc_cpu     = stats.gc_cpu_ns - start_exit_gc_cpu;
+            Time exit_gc_elapsed = stats.gc_elapsed_ns - start_exit_gc_elapsed;
+
+            ASSERT(exit_gc_elapsed > 0);
 
             sum.exit_cpu_ns     = end_exit_cpu
                                       - start_exit_cpu
@@ -1101,14 +1099,16 @@ stat_exit (void)
                                        - start_exit_elapsed
                                        - exit_gc_elapsed;
 
+            ASSERT(sum.exit_elapsed_ns >= 0);
+
             stats.mutator_cpu_ns     = start_exit_cpu
                                  - end_init_cpu
-                                 - (stats.gc_cpu_ns - exit_gc_cpu)
-                                 - prof_cpu;
+                                 - (stats.gc_cpu_ns - exit_gc_cpu);
             stats.mutator_elapsed_ns = start_exit_elapsed
                                  - end_init_elapsed
-                                 - (stats.gc_elapsed_ns - exit_gc_elapsed)
-                                 - prof_elapsed;
+                                 - (stats.gc_elapsed_ns - exit_gc_elapsed);
+
+            ASSERT(stats.mutator_elapsed_ns >= 0);
 
             if (stats.mutator_cpu_ns < 0) { stats.mutator_cpu_ns = 0; }
 
@@ -1116,12 +1116,29 @@ stat_exit (void)
             // and subtracting, so the parts should add up to the total exactly.
             // Note that stats->total_ns is captured a tiny bit later than
             // end_exit_elapsed, so we don't use it here.
-            ASSERT(stats.init_elapsed_ns \
-                   + stats.mutator_elapsed_ns \
-                   + stats.gc_elapsed_ns \
-                   + sum.exit_elapsed_ns \
+            ASSERT(stats.init_elapsed_ns // INIT
+                   + stats.mutator_elapsed_ns // MUT
+                   + stats.gc_elapsed_ns // GC
+                   + sum.exit_elapsed_ns // EXIT
                    == end_exit_elapsed - start_init_elapsed);
 
+            // heapCensus() is called by the GC, so RP and HC time are
+            // included in the GC stats.  We therefore subtract them to
+            // obtain the actual GC cpu time.
+            Time prof_cpu     = sum.rp_cpu_ns + sum.hc_cpu_ns;
+            Time prof_elapsed = sum.rp_elapsed_ns + sum.hc_elapsed_ns;
+
+            stats.gc_cpu_ns      -=  prof_cpu;
+            stats.gc_elapsed_ns  -=  prof_elapsed;
+
+            // This assertion is probably not necessary; make sure the
+            // subdivision with PROF also makes sense
+            ASSERT(stats.init_elapsed_ns // INIT
+                   + stats.mutator_elapsed_ns // MUT
+                   + stats.gc_elapsed_ns // GC
+                   + sum.exit_elapsed_ns // EXIT
+                   + (sum.rp_elapsed_ns + sum.hc_elapsed_ns) // PROF
+                   == end_exit_elapsed - start_init_elapsed);
         }
 
         // REVIEWERS: it's not clear to me why the following isn't done in
@@ -1142,10 +1159,9 @@ stat_exit (void)
         // We populate the remainder (non-time elements) of sum
         {
     #if defined(THREADED_RTS)
-            uint32_t i;
             sum.bound_task_count = taskCount - workerCount;
 
-            for (i = 0; i < n_capabilities; i++) {
+            for (uint32_t i = 0; i < n_capabilities; i++) {
                 sum.sparks.created   += capabilities[i]->spark_stats.created;
                 sum.sparks.dud       += capabilities[i]->spark_stats.dud;
                 sum.sparks.overflowed+=
@@ -1195,22 +1211,22 @@ stat_exit (void)
             sum.productivity_cpu_percent =
                 TimeToSecondsDbl(stats.cpu_ns
                                 - stats.gc_cpu_ns
-                                - sum.rp_cpu_ns
-                                - sum.hc_cpu_ns
                                 - stats.init_cpu_ns
                                 - sum.exit_cpu_ns)
                 / TimeToSecondsDbl(stats.cpu_ns);
 
+            ASSERT(sum.productivity_cpu_percent >= 0);
+
             sum.productivity_elapsed_percent =
                 TimeToSecondsDbl(stats.elapsed_ns
                                 - stats.gc_elapsed_ns
-                                - sum.rp_elapsed_ns
-                                - sum.hc_elapsed_ns
                                 - stats.init_elapsed_ns
                                 - sum.exit_elapsed_ns)
                 / TimeToSecondsDbl(stats.elapsed_ns);
 
-            for(g = 0; g < RtsFlags.GcFlags.generations; ++g) {
+            ASSERT(sum.productivity_elapsed_percent >= 0);
+
+            for(uint32_t g = 0; g < RtsFlags.GcFlags.generations; ++g) {
                 const generation* gen = &generations[g];
                 GenerationSummaryStats* gen_stats = &sum.gc_summary_stats[g];
                 gen_stats->collections = gen->collections;
diff --git a/rts/Stats.h b/rts/Stats.h
index fbcca11..ed24798 100644
--- a/rts/Stats.h
+++ b/rts/Stats.h
@@ -59,6 +59,7 @@ void      stat_workerStop(void);
 
 void      initStats0(void);
 void      initStats1(void);
+void      resetChildProcessStats(void);
 
 double    mut_user_time_until(Time t);
 double    mut_user_time(void);
diff --git a/testsuite/tests/profiling/should_run/Makefile b/testsuite/tests/profiling/should_run/Makefile
index d52bda1..19a682f 100644
--- a/testsuite/tests/profiling/should_run/Makefile
+++ b/testsuite/tests/profiling/should_run/Makefile
@@ -22,3 +22,11 @@ T14257:
 	./T14257 +RTS -hc
 	# Make sure that samples are monotonically increasing
 	awk 'BEGIN{t=0} /BEGIN_SAMPLE/{if ($$2 < t) print "uh oh", $$t, $$0; t=$$2;}' T14257.hp
+
+.PHONY: T15897
+T15897:
+	# The bug is caught by an assertion so we run the tests with debug runtime
+	# and ignore the outputs
+	"$(TEST_HC)" -prof -fprof-auto -debug -v0 T15897.hs
+	./T15897 10000000 +RTS -s -hc 2>/dev/null
+	./T15897 10000000 +RTS -s -hr 2>/dev/null
diff --git a/testsuite/tests/profiling/should_run/T15897.hs b/testsuite/tests/profiling/should_run/T15897.hs
new file mode 100644
index 0000000..b004cd1
--- /dev/null
+++ b/testsuite/tests/profiling/should_run/T15897.hs
@@ -0,0 +1,7 @@
+import Control.Monad
+import Data.IORef
+import System.Environment
+
+main = do
+    [n] <- getArgs
+    replicateM (read n) (newIORef [1,2,3])
diff --git a/testsuite/tests/profiling/should_run/all.T b/testsuite/tests/profiling/should_run/all.T
index 968d678..9bca86b 100644
--- a/testsuite/tests/profiling/should_run/all.T
+++ b/testsuite/tests/profiling/should_run/all.T
@@ -145,3 +145,8 @@ test('toplevel_scc_1',
 test('T12962', [], compile_and_run, [''])
 
 test('T14257', [], run_command, ['$MAKE -s --no-print-directory T14257'])
+
+test('T15897',
+     [extra_ways(['profasm']), only_ways(['profasm']), run_timeout_multiplier(2)],
+     run_command,
+     ['$MAKE -s --no-print-directory T15897'])
author	Ömer Sinan Ağacan <omeragacan@gmail.com>	2018-12-28 10:51:28 (GMT)
committer	Ömer Sinan Ağacan <omeragacan@gmail.com>	2019-01-12 11:52:38 (GMT)
commit	19670bc397d858b04eb9b4eb01480f7f8c59e2f5 (patch)
tree	0e84040446183e032bcd05f2a8c1fd620ba1b9b2
parent	74cd4ec5d2f9321aad5db3285cb60d78f2562996 (diff)
download	ghc-19670bc397d858b04eb9b4eb01480f7f8c59e2f5.zip ghc-19670bc397d858b04eb9b4eb01480f7f8c59e2f5.tar.gz ghc-19670bc397d858b04eb9b4eb01480f7f8c59e2f5.tar.bz2