[RFC PATCH v2 05/18] sched: add task flag for preempt IRQ tracking

Josh Poimboeuf jpoimboe at redhat.com
Fri Jun 24 06:40:17 AEST 2016


On Thu, Jun 23, 2016 at 01:31:32PM -0500, Josh Poimboeuf wrote:
> On Thu, Jun 23, 2016 at 09:35:29AM -0700, Andy Lutomirski wrote:
> > > So which is the least-bad option?  To summarize:
> > >
> > >   1) task flag(s) for preemption and page faults
> > >
> > >   2) turn pt_regs into a stack frame
> > >
> > >   3) annotate all calls from entry code in a table
> > >
> > >   4) encode rbp on entry
> > >
> > > They all have their issues, though I'm partial to #2.
> > >
> > > Any more hare-brained ideas? :-)
> > 
> > I'll try to take a closer look at #2 and see just how much I dislike
> > all the stack frame munging.
> 
> Ok.
> 
> > Also, in principle, it's only the
> > sleeping calls and the calls that make it into real (non-entry) kernel
> > code that really want to be unwindable through this mechanism.
> 
> Yeah, that's true.  We could modify options 2 or 3 to be less absolute.
> Though I think that makes them more prone to future breakage.
> 
> > FWIW, I don't care that much about preserving gdb's partial ability to
> > unwind through pt_regs, especially because gdb really ought to be able
> > to use DWARF, too.
> 
> Hm, that's a good point.  I really don't know if there are any other
> external tools out there that would care.  Maybe we could try option 4
> and then see if anybody complains.

I'm starting to think hare-brained option 4 is the way to go.  Any
external tooling should really be relying on DWARF anyway.

Here's a sneak preview.  If this general approach looks ok to you, I'll
go ahead and port all the in-tree unwinders and post a proper patch.

Instead of using xor -1 on the pt_regs pointer, I just cleared the
high-order bit.  That makes the unwinding experience much more pleasant
for a human stack walker, and also ensures that anybody trying to
dereference it gets slapped with an oops, at least in the 48-bit address
space era.

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 9a9e588..bf397426 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -201,6 +201,23 @@ For 32-bit we have the following conventions - kernel is built with
 	.byte 0xf1
 	.endm
 
+	/*
+	 * This is a sneaky trick to help the unwinder find pt_regs on the
+	 * stack.  The frame pointer is replaced with an encoded pointer to
+	 * pt_regs.  The encoding is just a clearing of the highest-order bit,
+	 * which makes it an invalid address and is also a signal to the
+	 * unwinder that it's a pt_regs pointer in disguise.
+	 *
+	 * NOTE: This must be called *after* SAVE_EXTRA_REGS because it
+	 * corrupts rbp.
+	 */
+	.macro ENCODE_FRAME_POINTER ptregs_offset=0
+#ifdef CONFIG_FRAME_POINTER
+	leaq \ptregs_offset(%rsp), %rbp
+	btr $63, %rbp
+#endif
+	.endm
+
 #endif /* CONFIG_X86_64 */
 
 /*
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 9ee0da1..eb79652 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -431,6 +431,7 @@ END(irq_entries_start)
 	ALLOC_PT_GPREGS_ON_STACK
 	SAVE_C_REGS
 	SAVE_EXTRA_REGS
+	ENCODE_FRAME_POINTER
 
 	testb	$3, CS(%rsp)
 	jz	1f
@@ -893,6 +894,7 @@ ENTRY(xen_failsafe_callback)
 	ALLOC_PT_GPREGS_ON_STACK
 	SAVE_C_REGS
 	SAVE_EXTRA_REGS
+	ENCODE_FRAME_POINTER
 	jmp	error_exit
 END(xen_failsafe_callback)
 
@@ -936,6 +938,7 @@ ENTRY(paranoid_entry)
 	cld
 	SAVE_C_REGS 8
 	SAVE_EXTRA_REGS 8
+	ENCODE_FRAME_POINTER 8
 	movl	$1, %ebx
 	movl	$MSR_GS_BASE, %ecx
 	rdmsr
@@ -983,6 +986,7 @@ ENTRY(error_entry)
 	cld
 	SAVE_C_REGS 8
 	SAVE_EXTRA_REGS 8
+	ENCODE_FRAME_POINTER 8
 	xorl	%ebx, %ebx
 	testb	$3, CS+8(%rsp)
 	jz	.Lerror_kernelspace
@@ -1165,6 +1169,7 @@ ENTRY(nmi)
 	pushq	%r13		/* pt_regs->r13 */
 	pushq	%r14		/* pt_regs->r14 */
 	pushq	%r15		/* pt_regs->r15 */
+	ENCODE_FRAME_POINTER
 
 	/*
 	 * At this point we no longer need to worry about stack damage
@@ -1182,7 +1187,7 @@ ENTRY(nmi)
 	 * do_nmi doesn't modify pt_regs.
 	 */
 	SWAPGS
-	jmp	restore_c_regs_and_iret
+	jmp	restore_regs_and_iret
 
 .Lnmi_from_kernel:
 	/*


More information about the Linuxppc-dev mailing list