From f685b18be0bbf08899b84818174de19bfc8b53e4 Mon Sep 17 00:00:00 2001
From: "Eric S. Raymond"
Date: Thu, 26 Feb 2015 17:33:55 -0500
Subject: [PATCH] Refactor the SHM refclock driver and add protective memory barriers.

This change factors the SHM segment querying out of shm_timer() into a
new function, shm_query(), which isolates everything that depends on
the SHM segment data layout.  The new function fills in a struct
containing the extracted data and returns a useful status code.  Age
control and the delta check remain in shm_timer().

The memory_barrier() function needs a C11 extension from stdatomic.h;
it is guarded by STD_ATOMIC_H.  If the compiler does not have this
feature, but does compile memcpy to an uninterruptible bitblt
instruction, update atomicity should still be guaranteed.  If neither
of these is true, the pre-existing count check will still yield a
CLASH status when a concurrent write might screw up a read.

The shm_query() code has been tested and works as part of the new
ntpmon utility in GPSD.  The integration with the revised shm_timer()
code has not been tested but should be straightforward to understand
and validate.
---
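A note for reviewers on the fence guard (not part of the commit message):
the STD_ATOMIC_H conditional in memory_barrier() is assumed to be defined
by the build machinery once <stdatomic.h> has been pulled in.  Purely as
an illustration of the intended fallback behaviour, a self-contained guard
keyed off the standard C11 feature-test macros could look like the sketch
below; the shm_fence name is invented for the example.

    /* Sketch only: a C11 fence when the compiler provides one, otherwise a
     * no-op, in which case correctness falls back on memcpy() atomicity or
     * the count check described above.
     */
    #if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L && \
        !defined(__STDC_NO_ATOMICS__)
    #include <stdatomic.h>
    #define shm_fence() atomic_thread_fence(memory_order_seq_cst)
    #else
    #define shm_fence() do { } while (0)   /* no fence available */
    #endif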
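The CLASH path depends on the writer bumping the count field around each
update and raising valid last.  For illustration only, a hypothetical
mode-1 writer following that convention might look like the sketch below;
the field names come from struct shmTime in refclock_shm.c and <time.h> is
assumed, while shm_put_sample and its arguments are invented for the
example.  A production writer would also want the same fence treatment as
the reader.

    /* Hypothetical writer sketch: count is incremented before and after the
     * timestamps are stored and valid is raised last, so a reader whose copy
     * straddles an update sees count change and reports CLASH.
     * (mode, leap, precision, etc. omitted for brevity.)
     */
    static void shm_put_sample(volatile struct shmTime *seg,
                               const struct timespec *clock_ts,
                               const struct timespec *recv_ts)
    {
        seg->valid = 0;
        seg->count++;                        /* update in progress */
        seg->clockTimeStampSec    = clock_ts->tv_sec;
        seg->clockTimeStampUSec   = (int)(clock_ts->tv_nsec / 1000);
        seg->clockTimeStampNSec   = (unsigned)clock_ts->tv_nsec;
        seg->receiveTimeStampSec  = recv_ts->tv_sec;
        seg->receiveTimeStampUSec = (int)(recv_ts->tv_nsec / 1000);
        seg->receiveTimeStampNSec = (unsigned)recv_ts->tv_nsec;
        seg->count++;                        /* update complete */
        seg->valid = 1;
    }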
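For context on the "tested as part of ntpmon in GPSD" remark: shm_query()
is meant to be callable from a simple polling loop outside ntpd.  A
minimal, hypothetical consumer (assuming <stdio.h> and visibility of the
struct shmTime, shm_stat_t and shm_query definitions added below) could
look roughly like this; poll_segment is invented for the example.

    /* Sketch of a one-shot poll; NOT_READY just means no fresh sample yet. */
    static void poll_segment(volatile struct shmTime *seg)
    {
        struct shm_stat_t shm_stat;

        switch (shm_query(seg, &shm_stat)) {
        case OK:
            printf("clock %ld.%09ld recv %ld.%09ld leap %d prec %d\n",
                   (long)shm_stat.tvt.tv_sec, shm_stat.tvt.tv_nsec,
                   (long)shm_stat.tvr.tv_sec, shm_stat.tvr.tv_nsec,
                   shm_stat.leap, shm_stat.precision);
            break;
        case NOT_READY:
            break;
        default:
            fprintf(stderr, "SHM segment not usable\n");
            break;
        }
    }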
 ntpd/refclock_shm.c              | 300 +++++++++++++++++++++++++----------------
 ntpdc/nl.pl                      |   2 +-
 sntp/libevent/evconfig-private.h |   2 +-
 3 files changed, 194 insertions(+), 110 deletions(-)

diff --git a/ntpd/refclock_shm.c b/ntpd/refclock_shm.c
index 7174abd..8eccf48 100644
--- a/ntpd/refclock_shm.c
+++ b/ntpd/refclock_shm.c
@@ -348,10 +348,153 @@ shm_poll(
 	shm_clockstats(unit, peer);
 }
 
+
+enum segstat_t {
+    OK, NO_SEGMENT, NOT_READY, BAD_MODE, CLASH
+};
+
+struct shm_stat_t {
+    int status;
+    int mode;
+    struct timespec tvc, tvr, tvt;
+    int precision;
+    int leap;
+};
+
+static inline void memory_barrier(void)
+{
+#ifdef STD_ATOMIC_H
+    atomic_thread_fence(memory_order_seq_cst);
+#endif /* STD_ATOMIC_H */
+}
+
+static enum segstat_t shm_query(volatile struct shmTime *shm_in, struct shm_stat_t *shm_stat)
+/* try to grab a sample from the specified SHM segment */
+{
+    volatile struct shmTime shmcopy, *shm = shm_in;
+    volatile int cnt;
+
+    unsigned int cns_new, rns_new;
+
+    /*
+     * This is the main routine. It snatches the time from the shm
+     * board and tacks on a local timestamp.
+     */
+    if (shm == NULL) {
+        shm_stat->status = NO_SEGMENT;
+        return NO_SEGMENT;
+    }
+
+    /*@-type@*//* splint is confused about struct timespec */
+    shm_stat->tvc.tv_sec = shm_stat->tvc.tv_nsec = 0;
+    clock_gettime(CLOCK_REALTIME, &shm_stat->tvc);
+
+    /* relying on word access to be atomic here */
+    if (shm->valid == 0) {
+        shm_stat->status = NOT_READY;
+        return NOT_READY;
+    }
+
+    cnt = shm->count;
+
+    /*
+     * This is proof against concurrency issues if either
+     * (a) the memory_barrier() call works on this host, or
+     * (b) memcpy compiles to an uninterruptible single-instruction bitblt.
+     */
+    memory_barrier();
+    memcpy((void *)&shmcopy, (void *)shm, sizeof(struct shmTime));
+    shm->valid = 0;
+    memory_barrier();
+
+    /*
+     * Clash detection in case neither (a) nor (b) was true.
+     * Not supported in mode 0, and word access to the count field
+     * must be atomic for this to work.
+     */
+    if (shmcopy.mode > 0 && cnt != shm->count) {
+        shm_stat->status = CLASH;
+        return shm_stat->status;
+    }
+
+    shm_stat->status = OK;
+    shm_stat->mode = shmcopy.mode;
+
+    switch (shmcopy.mode) {
+    case 0:
+        shm_stat->tvr.tv_sec = shmcopy.receiveTimeStampSec;
+        shm_stat->tvr.tv_nsec = shmcopy.receiveTimeStampUSec * 1000;
+        rns_new = shmcopy.receiveTimeStampNSec;
+        shm_stat->tvt.tv_sec = shmcopy.clockTimeStampSec;
+        shm_stat->tvt.tv_nsec = shmcopy.clockTimeStampUSec * 1000;
+        cns_new = shmcopy.clockTimeStampNSec;
+
+        /* Since the following comparisons are between unsigned
+        ** variables they are always well defined, and any
+        ** (signed) underflow will turn into very large unsigned
+        ** values, well above the 1000 cutoff.
+        **
+        ** Note: The usecs *must* be a *truncated*
+        ** representation of the nsecs. This code will fail for
+        ** *rounded* usecs, and the logic to deal with
+        ** wrap-arounds in the presence of rounded values is
+        ** much more convoluted.
+        */
+        if ( ((cns_new - (unsigned)shm_stat->tvt.tv_nsec) < 1000)
+             && ((rns_new - (unsigned)shm_stat->tvr.tv_nsec) < 1000)) {
+            shm_stat->tvt.tv_nsec = cns_new;
+            shm_stat->tvr.tv_nsec = rns_new;
+        }
+        /* At this point shm_stat->tvr and shm_stat->tvt contain valid ns-level
+        ** timestamps, possibly generated by extending the old
+        ** us-level timestamps
+        */
+        break;
+
+    case 1:
+
+        shm_stat->tvr.tv_sec = shmcopy.receiveTimeStampSec;
+        shm_stat->tvr.tv_nsec = shmcopy.receiveTimeStampUSec * 1000;
+        rns_new = shmcopy.receiveTimeStampNSec;
+        shm_stat->tvt.tv_sec = shmcopy.clockTimeStampSec;
+        shm_stat->tvt.tv_nsec = shmcopy.clockTimeStampUSec * 1000;
+        cns_new = shmcopy.clockTimeStampNSec;
+
+        /* See the case above for an explanation of the
+        ** following test.
+        */
+        if ( ((cns_new - (unsigned)shm_stat->tvt.tv_nsec) < 1000)
+             && ((rns_new - (unsigned)shm_stat->tvr.tv_nsec) < 1000)) {
+            shm_stat->tvt.tv_nsec = cns_new;
+            shm_stat->tvr.tv_nsec = rns_new;
+        }
+        /* At this point shm_stat->tvr and shm_stat->tvt contain valid ns-level
+        ** timestamps, possibly generated by extending the old
+        ** us-level timestamps
+        */
+        break;
+
+    default:
+        shm_stat->status = BAD_MODE;
+        break;
+    }
+    /*@+type@*/
+
+    /*
+     * leap field is not a leap offset but a leap notification code.
+     * The values are magic numbers used by NTP and set by GPSD, if at all, in
+     * the subframe code.
+     */
+    shm_stat->leap = shmcopy.leap;
+    shm_stat->precision = shmcopy.precision;
+
+    return shm_stat->status;
+}
+
 /*
- * shm_timer - called onece every second.
+ * shm_timer - called once every second.
  *
- * This tries to grab a sample from the SHM segment
+ * This tries to grab a sample from the SHM segment, filtering bad ones
  */
 static void
 shm_timer(
@@ -362,33 +505,20 @@ shm_timer(
     struct refclockproc * const pp = peer->procptr;
     struct shmunit * const up = pp->unitptr;
 
-    /* access order is important for lock-free SHM access; we
-    ** enforce order by treating the whole structure volatile.
-    **
-    ** IMPORTANT NOTE: This does not protect from reordering on CPU
-    ** level, and it does nothing for cache consistency and
-    ** visibility of changes by other cores. We need atomic and/or
-    ** fence instructions for that.
-    */
     volatile struct shmTime *shm;
 
-    struct timespec tvr;
-    struct timespec tvt;
     l_fp tsrcv;
     l_fp tsref;
-    unsigned int c;
-    unsigned int cns_new, rns_new;
-    int cnt;
+    int c;
 
     /* for formatting 'a_lastcode': */
     struct calendar cd;
-    time_t tt, now;
+    time_t tt;
     vint64 ts;
 
-    /*
-     * This is the main routine. It snatches the time from the shm
-     * board and tacks on a local timestamp.
-     */
+    enum segstat_t status;
+    struct shm_stat_t shm_stat;
+
     up->ticks++;
     if ((shm = up->shm) == NULL) {
         /* try to map again - this may succeed if meanwhile some-
@@ -400,88 +530,43 @@ shm_timer(
             return;
         }
     }
-    if ( ! shm->valid) {
-        DPRINTF(1, ("%s: SHM not ready\n",
-                    refnumtoa(&peer->srcadr)));
-        up->notready++;
-        return;
-    }
-
-    switch (shm->mode) {
-    case 0:
-        tvr.tv_sec = shm->receiveTimeStampSec;
-        tvr.tv_nsec = shm->receiveTimeStampUSec * 1000;
-        rns_new = shm->receiveTimeStampNSec;
-        tvt.tv_sec = shm->clockTimeStampSec;
-        tvt.tv_nsec = shm->clockTimeStampUSec * 1000;
-        cns_new = shm->clockTimeStampNSec;
-
-        /* Since the following comparisons are between unsigned
-        ** variables they are always well defined, and any
-        ** (signed) underflow will turn into very large unsigned
-        ** values, well above the 1000 cutoff.
-        **
-        ** Note: The usecs *must* be a *truncated*
-        ** representation of the nsecs. This code will fail for
-        ** *rounded* usecs, and the logic to deal with
-        ** wrap-arounds in the presence of rounded values is
-        ** much more convoluted.
-        */
-        if ( ((cns_new - (unsigned)tvt.tv_nsec) < 1000)
-             && ((rns_new - (unsigned)tvr.tv_nsec) < 1000)) {
-            tvt.tv_nsec = cns_new;
-            tvr.tv_nsec = rns_new;
-        }
-        /* At this point tvr and tvt contains valid ns-level
-        ** timestamps, possibly generated by extending the old
-        ** us-level timestamps
-        */
-        DPRINTF(2, ("%s: SHM type 0 sample\n",
-                    refnumtoa(&peer->srcadr)));
-        break;
-
-    case 1:
-        cnt = shm->count;
-
-        tvr.tv_sec = shm->receiveTimeStampSec;
-        tvr.tv_nsec = shm->receiveTimeStampUSec * 1000;
-        rns_new = shm->receiveTimeStampNSec;
-        tvt.tv_sec = shm->clockTimeStampSec;
-        tvt.tv_nsec = shm->clockTimeStampUSec * 1000;
-        cns_new = shm->clockTimeStampNSec;
-        if (cnt != shm->count) {
-            DPRINTF(1, ("%s: type 1 access clash\n",
-                        refnumtoa(&peer->srcadr)));
-            msyslog (LOG_NOTICE, "SHM: access clash in shared memory");
-            up->clash++;
-            return;
-        }
-
-        /* See the case above for an explanation of the
-        ** following test.
-        */
-        if ( ((cns_new - (unsigned)tvt.tv_nsec) < 1000)
-             && ((rns_new - (unsigned)tvr.tv_nsec) < 1000)) {
-            tvt.tv_nsec = cns_new;
-            tvr.tv_nsec = rns_new;
-        }
-        /* At this point tvr and tvt contains valid ns-level
-        ** timestamps, possibly generated by extending the old
-        ** us-level timestamps
-        */
-        DPRINTF(2, ("%s: SHM type 1 sample\n",
-                    refnumtoa(&peer->srcadr)));
-        break;
+    /* query the segment, atomically */
+    status = shm_query(shm, &shm_stat);
+
+    switch (status) {
+    case OK:
+        DPRINTF(2, ("%s: SHM type %d sample\n",
+                    refnumtoa(&peer->srcadr), shm_stat.mode));
+        break;
+    case NO_SEGMENT:
+        /* should never happen, but is harmless */
+        return;
+    case NOT_READY:
+        DPRINTF(1, ("%s: SHM not ready\n",refnumtoa(&peer->srcadr)));
+        up->notready++;
+        return;
+    case BAD_MODE:
+        DPRINTF(1, ("%s: SHM type blooper, mode=%d\n",
+                    refnumtoa(&peer->srcadr), shm->mode));
+        up->bad++;
+        msyslog (LOG_ERR, "SHM: bad mode found in shared memory: %d",
+                 shm->mode);
+        return;
+    case CLASH:
+        DPRINTF(1, ("%s: type 1 access clash\n",
+                    refnumtoa(&peer->srcadr)));
+        msyslog (LOG_NOTICE, "SHM: access clash in shared memory");
+        up->clash++;
+        return;
     default:
-        DPRINTF(1, ("%s: SHM type blooper, mode=%d\n",
-                    refnumtoa(&peer->srcadr), shm->mode));
-        up->bad++;
-        msyslog (LOG_ERR, "SHM: bad mode found in shared memory: %d",
-                 shm->mode);
-        return;
+        DPRINTF(1, ("%s: internal error, unknown SHM fetch status\n",
+                    refnumtoa(&peer->srcadr)));
+        msyslog (LOG_NOTICE, "internal error, unknown SHM fetch status");
+        up->bad++;
+        return;
     }
-    shm->valid = 0;
+
 
     /* format the last time code in human-readable form into
      * 'pp->a_lastcode'. Someone claimed: "NetBSD has incompatible
@@ -489,7 +574,7 @@ shm_timer(
      * around that potential problem. BTW, simply casting a pointer
      * is a receipe for disaster on some architectures.
      */
-    tt = (time_t)tvt.tv_sec;
+    tt = (time_t)shm_stat.tvt.tv_sec;
     ts = time_to_vint64(&tt);
     ntpcal_time_to_date(&cd, &ts);
 
@@ -498,12 +583,11 @@
              "%04u-%02u-%02uT%02u:%02u:%02u.%09ldZ",
              cd.year, cd.month, cd.monthday,
              cd.hour, cd.minute, cd.second,
-             (long)tvt.tv_nsec);
+             (long)shm_stat.tvt.tv_nsec);
     pp->lencode = (c < sizeof(pp->a_lastcode)) ? c : 0;
 
     /* check 1: age control of local time stamp */
-    time(&now);
-    tt = now - tvr.tv_sec;
+    tt = shm_stat.tvc.tv_sec - shm_stat.tvr.tv_sec;
     if (tt < 0 || tt > up->max_delay) {
         DPRINTF(1, ("%s:SHM stale/bad receive time, delay=%llds\n",
                     refnumtoa(&peer->srcadr), (long long)tt));
@@ -514,7 +598,7 @@
     }
 
     /* check 2: delta check */
-    tt = tvr.tv_sec - tvt.tv_sec - (tvr.tv_nsec < tvt.tv_nsec);
+    tt = shm_stat.tvr.tv_sec - shm_stat.tvt.tv_sec - (shm_stat.tvr.tv_nsec < shm_stat.tvt.tv_nsec);
     if (tt < 0)
         tt = -tt;
     if (up->max_delta > 0 && tt > up->max_delta) {
@@ -529,10 +613,10 @@
     /* if we really made it to this point... we're winners! */
     DPRINTF(2, ("%s: SHM feeding data\n",
                 refnumtoa(&peer->srcadr)));
-    tsrcv = tspec_stamp_to_lfp(tvr);
-    tsref = tspec_stamp_to_lfp(tvt);
-    pp->leap = shm->leap;
-    peer->precision = shm->precision;
+    tsrcv = tspec_stamp_to_lfp(shm_stat.tvr);
+    tsref = tspec_stamp_to_lfp(shm_stat.tvt);
+    pp->leap = shm_stat.leap;
+    peer->precision = shm_stat.precision;
     refclock_process_offset(pp, tsref, tsrcv, pp->fudgetime1);
     up->good++;
 }
diff --git a/ntpdc/nl.pl b/ntpdc/nl.pl
index 4a418fb..546f9ba 100755
--- a/ntpdc/nl.pl
+++ b/ntpdc/nl.pl
@@ -1,4 +1,4 @@
-#! /usr/local/perl-5.8.8/bin/perl -w
+#! /usr/bin/perl -w
 
 $found = 0;
 $last = 0;
diff --git a/sntp/libevent/evconfig-private.h b/sntp/libevent/evconfig-private.h
index ea4940d..f6b08e3 100644
--- a/sntp/libevent/evconfig-private.h
+++ b/sntp/libevent/evconfig-private.h
@@ -26,7 +26,7 @@
 #endif
 
 /* Number of bits in a file offset, on hosts where this is settable. */
-#define _FILE_OFFSET_BITS 64
+/* #undef _FILE_OFFSET_BITS */
 
 /* Define for large files, on AIX-style hosts. */
 /* #undef _LARGE_FILES */
-- 
2.1.0