[PATCH 3/3] powerpc/e6500: hw tablewalk: order the memory access when acquire/release tcd lock
Kevin Hao
haokexin at gmail.com
Mon Aug 17 21:19:14 AEST 2015
On Fri, Aug 14, 2015 at 07:44:23PM -0500, Scott Wood wrote:
> On Fri, 2015-08-14 at 15:13 +0800, Kevin Hao wrote:
> > On Thu, Aug 13, 2015 at 10:39:19PM -0500, Scott Wood wrote:
> > > On Thu, 2015-08-13 at 19:51 +0800, Kevin Hao wrote:
> > > > I didn't find anything unusual. But I think we do need to order the
> > > > load/store of esel_next when acquire/release tcd lock. For acquire,
> > > > add a data dependency to order the loads of lock and esel_next.
> > > > For release, even there already have a "isync" here, but it doesn't
> > > > guarantee any memory access order. So we still need "lwsync" for
> > > > the two stores for lock and esel_next.
> > >
> > > I was going to say that esel_next is just a hint and it doesn't really
> > > matter
> > > if we occasionally get the wrong value, unless it happens often enough to
> > > cause more performance degradation than the lwsync causes. However, with
> > > the
> > > A-008139 workaround we do need to read the same value from esel_next both
> > > times. It might be less costly to save/restore an additional register
> > > instead of lwsync, though.
> >
> > I will try to get some benchmark number to compare which method is a bit
> > better.
> > Do you have any recommended benchmark for a case this is?
>
> lmbench lat_mem_rd with a stride chosen to maximize TLB misses. For the
> uncontended case, one instance; for the contended case, two instances, one
> pinned to each thread of a core.
I have tried the approach of saving/restoring an additional register for the
esel entry, using the following code:
diff --git a/arch/powerpc/include/asm/exception-64e.h b/arch/powerpc/include/asm/exception-64e.h
index a8b52b61043f..8267c1404050 100644
--- a/arch/powerpc/include/asm/exception-64e.h
+++ b/arch/powerpc/include/asm/exception-64e.h
@@ -69,9 +69,9 @@
#define EX_TLB_ESR ( 9 * 8) /* Level 0 and 2 only */
#define EX_TLB_SRR0 (10 * 8)
#define EX_TLB_SRR1 (11 * 8)
+#define EX_TLB_R9 (12 * 8)
#ifdef CONFIG_BOOK3E_MMU_TLB_STATS
-#define EX_TLB_R8 (12 * 8)
-#define EX_TLB_R9 (13 * 8)
+#define EX_TLB_R8 (13 * 8)
#define EX_TLB_LR (14 * 8)
#define EX_TLB_SIZE (15 * 8)
#else
diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S
index e4185581c5a7..8d184dd530c4 100644
--- a/arch/powerpc/mm/tlb_low_64e.S
+++ b/arch/powerpc/mm/tlb_low_64e.S
@@ -68,11 +68,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV)
ld r14,PACAPGD(r13)
std r15,EX_TLB_R15(r12)
std r10,EX_TLB_CR(r12)
+ std r9,EX_TLB_R9(r12)
TLB_MISS_PROLOG_STATS
.endm
.macro tlb_epilog_bolted
ld r14,EX_TLB_CR(r12)
+ ld r9,EX_TLB_R9(r12)
ld r10,EX_TLB_R10(r12)
ld r11,EX_TLB_R11(r12)
ld r13,EX_TLB_R13(r12)
@@ -297,6 +299,7 @@ itlb_miss_fault_bolted:
* r13 = PACA
* r11 = tlb_per_core ptr
* r10 = crap (free to use)
+ * r9 = esel entry
*/
tlb_miss_common_e6500:
crmove cr2*4+2,cr0*4+2 /* cr2.eq != 0 if kernel address */
@@ -334,8 +337,8 @@ BEGIN_FTR_SECTION /* CPU_FTR_SMT */
* with tlbilx before overwriting.
*/
- lbz r15,TCD_ESEL_NEXT(r11)
- rlwinm r10,r15,16,0xff0000
+ lbz r9,TCD_ESEL_NEXT(r11)
+ rlwinm r10,r9,16,0xff0000
oris r10,r10,MAS0_TLBSEL(1)@h
mtspr SPRN_MAS0,r10
isync
@@ -429,15 +432,14 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_SMT)
mtspr SPRN_MAS2,r15
tlb_miss_huge_done_e6500:
- lbz r15,TCD_ESEL_NEXT(r11)
lbz r16,TCD_ESEL_MAX(r11)
lbz r14,TCD_ESEL_FIRST(r11)
- rlwimi r10,r15,16,0x00ff0000 /* insert esel_next into MAS0 */
- addi r15,r15,1 /* increment esel_next */
+ rlwimi r10,r9,16,0x00ff0000 /* insert esel_next into MAS0 */
+ addi r9,r9,1 /* increment esel_next */
mtspr SPRN_MAS0,r10
- cmpw r15,r16
- iseleq r15,r14,r15 /* if next == last use first */
- stb r15,TCD_ESEL_NEXT(r11)
+ cmpw r9,r16
+ iseleq r9,r14,r9 /* if next == last use first */
+ stb r9,TCD_ESEL_NEXT(r11)
tlbwe
The following are the benchmark numbers on a t2080rdb board:
For uncontended case (taskset -c 0 lat_mem_rd 2048 2097152):
origin lwsync r9
2.00000 1.958 1.958 1.958
3.00000 1.958 1.958 1.958
4.00000 1.958 1.958 1.958
6.00000 1.958 1.958 1.958
8.00000 1.958 1.958 1.958
12.00000 6.528 6.528 6.528
16.00000 6.528 6.528 6.528
24.00000 37.862 37.862 37.861
32.00000 37.862 37.862 37.862
48.00000 37.862 37.862 37.862
64.00000 37.862 37.862 37.862
96.00000 37.862 37.863 37.862
128.00000 221.621 232.067 222.927
192.00000 221.874 232.333 222.925
256.00000 221.623 232.066 222.927
384.00000 221.758 232.331 222.927
512.00000 221.621 232.165 222.926
768.00000 221.776 236.870 226.598
1024.00000 264.199 271.351 259.281
1536.00000 370.748 380.910 372.544
2048.00000 392.185 404.696 395.881
For contended case (taskset -c 0 lat_mem_rd 2048 2097152 &
taskset -c 1 lat_mem_rd 2048 2097152 >/dev/null 2>&1):
origin lwsync r9
2.00000 3.366 2.944 3.086
3.00000 2.915 3.256 3.095
4.00000 3.043 2.443 2.617
6.00000 2.742 3.367 2.629
8.00000 3.145 3.365 2.443
12.00000 18.267 11.885 13.736
16.00000 15.607 13.312 18.048
24.00000 37.856 37.208 37.855
32.00000 37.856 37.861 37.855
48.00000 37.856 37.861 37.855
64.00000 57.487 37.861 52.505
96.00000 270.445 229.641 143.241
128.00000 284.535 279.907 305.540
192.00000 275.491 298.282 295.592
256.00000 265.155 331.212 259.096
384.00000 276.084 291.023 251.282
512.00000 275.852 335.602 267.074
768.00000 289.682 354.521 312.180
1024.00000 344.968 355.977 343.725
1536.00000 410.961 454.381 412.790
2048.00000 392.984 461.818 397.185
It seems that using an additional register does give better performance than
the lwsync approach in both cases.
Thanks,
Kevin
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 473 bytes
Desc: not available
URL: <http://lists.ozlabs.org/pipermail/linuxppc-dev/attachments/20150817/811e70ed/attachment.sig>
More information about the Linuxppc-dev
mailing list