[RFC PATCH v3 06/12] lib: vdso: __iter_div_u64_rem() is suboptimal for 32 bit time

Tue Jan 14 04:08:44 AEDT 2020

Using __iter_div_u64_rem() is suboptimal on 32 bits.
Nanoseconds are only 32 bits, and VDSO data is updated every 10ms
so nsec will never overflow 32 bits.

Add an equivalent of __iter_div_u64_rem() that operates on
unsigned long, which is a better fit for 32-bit arches.

Before:
gettimeofday:    vdso: 1078 nsec/call
clock-gettime-monotonic-raw:    vdso: 1317 nsec/call
clock-gettime-monotonic:    vdso: 1255 nsec/call

After:
gettimeofday:    vdso: 1032 nsec/call
clock-gettime-monotonic-raw:    vdso: 1312 nsec/call
clock-gettime-monotonic:    vdso: 1243 nsec/call

Signed-off-by: Christophe Leroy <christophe.leroy at c-s.fr>
---
 lib/vdso/gettimeofday.c | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/lib/vdso/gettimeofday.c b/lib/vdso/gettimeofday.c
index decd3f2b37af..da15a8842825 100644
--- a/lib/vdso/gettimeofday.c
+++ b/lib/vdso/gettimeofday.c
@@ -38,12 +38,32 @@ u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult)
 }
 #endif
 
+static __always_inline u32
+__iter_div_ulong_rem(unsigned long dividend, u32 divisor, unsigned long *remainder)
+{
+	u32 ret = 0;
+
+	while (dividend >= divisor) {
+		/* The following asm() prevents the compiler from
+		   optimising this loop into a modulo operation.  */
+		asm("" : "+rm"(dividend));
+
+		dividend -= divisor;
+		ret++;
+	}
+
+	*remainder = dividend;
+
+	return ret;
+}
+
 static __always_inline int do_hres(const struct vdso_data *vd, clockid_t clk,
 				   struct __kernel_timespec *ts)
 {
 	const struct vdso_timestamp *vdso_ts = &vd->basetime[clk];
 	u64 cycles, last, sec, ns;
 	u32 seq;
+	unsigned long nsec;
 
 	do {
 		seq = vdso_read_begin(vd);
@@ -54,7 +74,7 @@ static __always_inline int do_hres(const struct vdso_data *vd, clockid_t clk,
 			return -1;
 
 		ns += vdso_calc_delta(cycles, last, vd->mask, vd->mult);
-		ns >>= vd->shift;
+		nsec = ns >> vd->shift;
 		sec = vdso_ts->sec;
 	} while (unlikely(vdso_read_retry(vd, seq)));
 
@@ -62,8 +82,8 @@ static __always_inline int do_hres(const struct vdso_data *vd, clockid_t clk,
 	 * Do this outside the loop: a race inside the loop could result
 	 * in __iter_div_u64_rem() being extremely slow.
 	 */
-	ts->tv_sec = sec + __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
-	ts->tv_nsec = ns;
+	ts->tv_sec = sec + __iter_div_ulong_rem(nsec, NSEC_PER_SEC, &nsec);
+	ts->tv_nsec = nsec;
 
 	return 0;
 }
-- 
2.13.3