[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[PATCH 4/5] x86_64: refactor segment register handling
From: |
Luca Dariz |
Subject: |
[PATCH 4/5] x86_64: refactor segment register handling |
Date: |
Sat, 29 Jul 2023 19:47:52 +0200 |
The actual values are not saved together with the rest of the thread
state, both because it would be quite espensive (reading MSR, unless
rdfsbase instructions are supported, but that's optional) and not
really needed. The only way the user has to change its value is with a
specific RPC, so we can intercept the change easily. Furthermore,
Leaving the values there exposes them to being corrupted in case of a
double interruption, e.g. an irq is handled just before iretq but
after starting to restore the thread state. This solution was
suggested by Sergey Bugaev.
* i386/i386/db_trace.c: remove fsbase/gsbase from the registers
available
* i386/i386/debug_i386.c: remove fsbase/gsbase from the printed thread
state
* i386/i386/i386asm.sym: remove fsbase/gsbase as it's not needed in
asm anymore
* i386/i386/pcb.c: point fsbase/gsbase to the new location
* i386/i386/thread.h: move fsbase/gsbase to the machine state
* x86_64/locore.S: generalize segment-handling including es/ds/gs/fs
and remove fsbase/gsbase handling. Also, factor out kernel segment
selector setting to a macro.
---
i386/i386/db_trace.c | 2 -
i386/i386/debug_i386.c | 2 -
i386/i386/i386asm.sym | 2 -
i386/i386/pcb.c | 12 +--
i386/i386/thread.h | 22 +++-
x86_64/locore.S | 228 +++++++++++++++--------------------------
6 files changed, 108 insertions(+), 160 deletions(-)
diff --git a/i386/i386/db_trace.c b/i386/i386/db_trace.c
index 2b6ad741..8bd86fa5 100644
--- a/i386/i386/db_trace.c
+++ b/i386/i386/db_trace.c
@@ -78,8 +78,6 @@ struct db_variable db_regs[] = {
{ "r13",(long *)&ddb_regs.r13, db_i386_reg_value },
{ "r14",(long *)&ddb_regs.r14, db_i386_reg_value },
{ "r15",(long *)&ddb_regs.r15, db_i386_reg_value },
- { "fsb",(long *)&ddb_regs.fsbase,db_i386_reg_value },
- { "gsb",(long *)&ddb_regs.gsbase,db_i386_reg_value },
#endif
};
struct db_variable *db_eregs = db_regs + sizeof(db_regs)/sizeof(db_regs[0]);
diff --git a/i386/i386/debug_i386.c b/i386/i386/debug_i386.c
index b5465796..41d032e3 100644
--- a/i386/i386/debug_i386.c
+++ b/i386/i386/debug_i386.c
@@ -40,8 +40,6 @@ void dump_ss(const struct i386_saved_state *st)
st->r8, st->r9, st->r10, st->r11);
printf("R12 %016lx R13 %016lx R14 %016lx R15 %016lx\n",
st->r12, st->r13, st->r14, st->r15);
- printf("FSBASE %016lx GSBASE %016lx\n",
- st->fsbase, st->gsbase);
printf("RIP %016lx EFLAGS %08lx\n", st->eip, st->efl);
#else
printf("EAX %08lx EBX %08lx ECX %08lx EDX %08lx\n",
diff --git a/i386/i386/i386asm.sym b/i386/i386/i386asm.sym
index fd0be557..1b9b40bb 100644
--- a/i386/i386/i386asm.sym
+++ b/i386/i386/i386asm.sym
@@ -108,8 +108,6 @@ offset i386_saved_state r r12
offset i386_saved_state r r13
offset i386_saved_state r r14
offset i386_saved_state r r15
-offset i386_saved_state r fsbase
-offset i386_saved_state r gsbase
#endif
offset i386_interrupt_state i eip
diff --git a/i386/i386/pcb.c b/i386/i386/pcb.c
index d1c5fb50..1cf87eb1 100644
--- a/i386/i386/pcb.c
+++ b/i386/i386/pcb.c
@@ -229,8 +229,8 @@ void switch_ktss(pcb_t pcb)
#endif /* MACH_PV_DESCRIPTORS */
#if defined(__x86_64__) && !defined(USER32)
- wrmsr(MSR_REG_FSBASE, pcb->iss.fsbase);
- wrmsr(MSR_REG_GSBASE, pcb->iss.gsbase);
+ wrmsr(MSR_REG_FSBASE, pcb->ims.sbs.fsbase);
+ wrmsr(MSR_REG_GSBASE, pcb->ims.sbs.gsbase);
#endif
db_load_context(pcb);
@@ -687,8 +687,8 @@ kern_return_t thread_setstatus(
return KERN_INVALID_ARGUMENT;
state = (struct i386_fsgs_base_state *) tstate;
- thread->pcb->iss.fsbase = state->fs_base;
- thread->pcb->iss.gsbase = state->gs_base;
+ thread->pcb->ims.sbs.fsbase = state->fs_base;
+ thread->pcb->ims.sbs.gsbase = state->gs_base;
if (thread == current_thread()) {
wrmsr(MSR_REG_FSBASE, state->fs_base);
wrmsr(MSR_REG_GSBASE, state->gs_base);
@@ -889,8 +889,8 @@ kern_return_t thread_getstatus(
return KERN_INVALID_ARGUMENT;
state = (struct i386_fsgs_base_state *) tstate;
- state->fs_base = thread->pcb->iss.fsbase;
- state->gs_base = thread->pcb->iss.gsbase;
+ state->fs_base = thread->pcb->ims.sbs.fsbase;
+ state->gs_base = thread->pcb->ims.sbs.gsbase;
*count = i386_FSGS_BASE_STATE_COUNT;
break;
}
diff --git a/i386/i386/thread.h b/i386/i386/thread.h
index 2378154f..86a44098 100644
--- a/i386/i386/thread.h
+++ b/i386/i386/thread.h
@@ -51,10 +51,6 @@
*/
struct i386_saved_state {
-#ifdef __x86_64__
- unsigned long fsbase;
- unsigned long gsbase;
-#endif
unsigned long gs;
unsigned long fs;
unsigned long es;
@@ -162,6 +158,13 @@ struct v86_assist_state {
#define V86_IF_PENDING 0x8000 /* unused bit */
#endif
+#if defined(__x86_64__) && !defined(USER32)
+struct i386_segment_base_state {
+ unsigned long fsbase;
+ unsigned long gsbase;
+};
+#endif
+
/*
* i386_interrupt_state:
*
@@ -206,14 +209,23 @@ struct i386_machine_state {
#endif
struct real_descriptor user_gdt[USER_GDT_SLOTS];
struct i386_debug_state ids;
+#if defined(__x86_64__) && !defined(USER32)
+ struct i386_segment_base_state sbs;
+#endif
};
typedef struct pcb {
+ /* START of the exception stack.
+ * NOTE: this area is used as exception stack when switching
+ * CPL, and it MUST be big enough to save the thread state and
+ * switch to a proper stack area, even considering recursive
+ * exceptions, otherwise it could corrupt nearby memory */
struct i386_interrupt_state iis[2]; /* interrupt and NMI */
#ifdef __x86_64__
- unsigned long pad; /* ensure exception stack is aligned to 16 */
+ unsigned long pad; /* ensure exception stack is aligned to 16 */
#endif
struct i386_saved_state iss;
+ /* END of exception stack*/
struct i386_machine_state ims;
decl_simple_lock_data(, lock)
unsigned short init_control; /* Initial FPU control to set */
diff --git a/x86_64/locore.S b/x86_64/locore.S
index 7127957b..66a9436a 100644
--- a/x86_64/locore.S
+++ b/x86_64/locore.S
@@ -39,6 +39,10 @@
#include <i386/i386/cpu_number.h>
#include <i386/i386/xen.h>
+
+/*
+ * Helpers for thread state as saved in the pcb area, during trap or irq
handling
+ */
#define pusha \
pushq %rax ;\
pushq %rcx ;\
@@ -75,45 +79,74 @@
popq %rcx ;\
popq %rax
+/*
+ * Note that we have to load the kernel segment registers even if this
+ * is a trap from the kernel, because the kernel uses user segment
+ * registers for copyin/copyout.
+ * (XXX Would it be smarter just to use fs or gs for that?)
+ */
#ifdef USER32
-#define PUSH_FSGS \
+#define PUSH_SEGMENTS(reg) \
+ movq %ds,reg ;\
+ pushq reg ;\
+ movq %es,reg ;\
+ pushq reg ;\
pushq %fs ;\
- pushq %gs ;\
- subq $16,%rsp
+ pushq %gs
#else
-#define PUSH_FSGS \
+#define PUSH_SEGMENTS(reg) \
subq $32,%rsp
#endif
#ifdef USER32
-#define POP_FSGS \
+#define POP_SEGMENTS(reg) \
popq %gs ;\
popq %fs ;\
- addq $16,%rsp
+ popq reg ;\
+ movq reg,%es ;\
+ popq reg ;\
+ movq reg,%ds
#else
-#define POP_FSGS \
+#define POP_SEGMENTS(reg) \
addq $32,%rsp
#endif
#ifdef USER32
-#define PUSH_FSGS_ISR \
+#define PUSH_SEGMENTS_ISR(reg) \
+ movq %ds,reg ;\
+ pushq reg ;\
+ movq %es,reg ;\
+ pushq reg ;\
pushq %fs ;\
pushq %gs
#else
-#define PUSH_FSGS_ISR \
- subq $16,%rsp
+#define PUSH_SEGMENTS_ISR(reg) \
+ subq $32,%rsp
#endif
#ifdef USER32
-#define POP_FSGS_ISR \
+#define POP_SEGMENTS_ISR(reg) \
popq %gs ;\
- popq %fs
+ popq %fs ;\
+ popq reg ;\
+ movq reg,%es ;\
+ popq reg ;\
+ movq reg,%ds
#else
-#define POP_FSGS_ISR \
- addq $16,%rsp
+#define POP_SEGMENTS_ISR(reg) \
+ addq $32,%rsp
#endif
-
+#ifdef USER32
+#define SET_KERNEL_SEGMENTS \
+ mov %ss,%dx /* switch to kernel segments */ ;\
+ mov %dx,%ds ;\
+ mov %dx,%es ;\
+ mov %dx,%fs ;\
+ mov %dx,%gs
+#else
+#define SET_KERNEL_SEGMENTS
+#endif
/*
* Fault recovery.
@@ -350,32 +383,17 @@ ENTRY(start_timer)
/*
* Trap/interrupt entry points.
*
- * All traps must create the following save area on the kernel stack:
- *
- * gs
- * fs
- * es
- * ds
- * edi
- * esi
- * ebp
- * cr2 if page fault - otherwise unused
- * ebx
- * edx
- * ecx
- * eax
- * trap number
- * error code
- * eip
- * cs
- * eflags
- * user rsp - if from user
- * user ss - if from user
- * es - if from V86 thread
- * ds - if from V86 thread
- * fs - if from V86 thread
- * gs - if from V86 thread
+ * All traps must create the i386_saved_state struct on the stack on
+ * entry. Note that:
+ * - CR2 is only used if the trap is a page fault
+ * - user_rsp/user_ss are only used if entering from user space
+ * - v86_regs are used only from V86 threads
+ * (TODO check if V86 is still used with USER32)
*
+ * Depending the CPL before entry, the stack might be switched or not;
+ * if entering from user-space the CPU loads TSS->RSP0 in RSP,
+ * otherwise RSP is unchanged. After this, the cpu pushes
+ * SS/RSP/RFLAFS/CS/RIP and optionally ErrorCode and executes the handler.
*/
/* Try to save/show some information when a double fault happens
@@ -426,16 +444,16 @@ trap_check_kernel_exit:
/* check for the kernel exit sequence */
cmpq $_kret_iret,16(%rsp) /* on IRET? */
je fault_iret
-#if 0
+#ifdef USER32
cmpq $_kret_popl_ds,16(%rsp) /* popping DS? */
je fault_popl_ds
cmpq $_kret_popl_es,16(%rsp) /* popping ES? */
je fault_popl_es
-#endif
cmpq $_kret_popl_fs,16(%rsp) /* popping FS? */
je fault_popl_fs
cmpq $_kret_popl_gs,16(%rsp) /* popping GS? */
je fault_popl_gs
+#endif
take_fault: /* if none of the above: */
jmp EXT(alltraps) /* treat as normal trap. */
@@ -464,6 +482,7 @@ fault_iret:
popq %rax /* restore eax */
jmp EXT(alltraps) /* take fault */
+#ifdef USER32
/*
* Fault restoring a segment register. The user's registers are still
* saved on the stack. The offending segment register has not been
@@ -499,6 +518,7 @@ push_gs:
push_gsbase:
pushq $0
pushq $0
+#endif
push_segregs:
movq %rax,R_TRAPNO(%rsp) /* set trap number */
movq %rdx,R_ERR(%rsp) /* set error code */
@@ -562,23 +582,8 @@ ENTRY(t_page_fault)
ENTRY(alltraps)
pusha /* save the general registers */
trap_push_segs:
- movq %ds,%rax /* and the segment registers */
- pushq %rax
- movq %es,%rax /* and the segment registers */
- pushq %rax
- PUSH_FSGS
-
- /* Note that we have to load the segment registers
- even if this is a trap from the kernel,
- because the kernel uses user segment registers for copyin/copyout.
- (XXX Would it be smarter just to use fs or gs for that?) */
- mov %ss,%ax /* switch to kernel data segment */
- mov %ax,%ds /* (same as kernel stack segment) */
- mov %ax,%es
-#ifdef USER32
- mov %ax,%fs
- mov %ax,%gs
-#endif
+ PUSH_SEGMENTS(%rax)
+ SET_KERNEL_SEGMENTS
trap_set_segs:
cld /* clear direction flag */
#ifdef USER32
@@ -634,23 +639,20 @@ _return_to_user:
*/
_return_from_kernel:
- addq $16,%rsp /* skip FS/GS base */
#ifndef USER32
-_kret_popl_gs:
-_kret_popl_fs:
- addq $16,%rsp /* skip FS/GS selector */
+ addq $32,%rsp /* skip FS/GS selector */
#else
_kret_popl_gs:
popq %gs /* restore segment registers */
_kret_popl_fs:
popq %fs
-#endif
_kret_popl_es:
popq %rax
movq %rax,%es
_kret_popl_ds:
popq %rax
movq %rax,%ds
+#endif
popa /* restore general registers */
addq $16,%rsp /* discard trap number and error code */
_kret_iret:
@@ -777,8 +779,11 @@ INTERRUPT(255)
/* XXX handle NMI - at least print a warning like Linux does. */
/*
- * All interrupts enter here.
- * old %eax on stack; interrupt number in %eax.
+ * All interrupts enter here. The cpu might have loaded a new RSP,
+ * depending on the previous CPL, as in alltraps.
+ * Old %eax on stack, interrupt number in %eax; we need to fill the remaining
+ * fields of struct i386_interrupt_state, which might be in the pcb or in the
+ * interrupt stack.
*/
ENTRY(all_intrs)
pushq %rcx /* save registers */
@@ -791,24 +796,15 @@ ENTRY(all_intrs)
pushq %r11
cld /* clear direction flag */
- movq %ds,%rdx /* save segment registers */
- pushq %rdx
- movq %es,%rdx
- pushq %rdx
- PUSH_FSGS_ISR
+ PUSH_SEGMENTS_ISR(%rdx)
movq %rsp,%rdx /* on an interrupt stack? */
and $(~(INTSTACK_SIZE-1)),%rdx
cmpq %ss:EXT(int_stack_base),%rdx
je int_from_intstack /* if not: */
- mov %ss,%dx /* switch to kernel segments */
- mov %dx,%ds
- mov %dx,%es
-#ifdef USER32
- mov %dx,%fs
- mov %dx,%gs
-#endif
+ SET_KERNEL_SEGMENTS
+
CPU_NUMBER(%edx)
movq CX(EXT(int_stack_top),%edx),%rcx
@@ -849,11 +845,7 @@ LEXT(return_to_iret) /* ( label for
kdb_kintr and hardclock) */
cmpq $0,CX(EXT(need_ast),%edx)
jnz ast_from_interrupt /* take it if so */
1:
- POP_FSGS_ISR
- pop %rdx
- mov %rdx,%es
- pop %rdx
- mov %rdx,%ds
+ POP_SEGMENTS_ISR(%rdx)
pop %r11
pop %r10
pop %r9
@@ -871,12 +863,7 @@ int_from_intstack:
jb stack_overflowed /* if not: */
call EXT(interrupt) /* call interrupt routine */
_return_to_iret_i: /* ( label for kdb_kintr) */
- POP_FSGS_ISR
- pop %rdx
- mov %rdx,%es
- pop %rdx
- mov %rdx,%ds
-
+ POP_SEGMENTS_ISR(%rdx)
pop %r11
pop %r10
pop %r9
@@ -909,11 +896,7 @@ stack_overflowed:
* ss
*/
ast_from_interrupt:
- POP_FSGS_ISR
- pop %rdx
- mov %rdx,%es
- pop %rdx
- mov %rdx,%ds
+ POP_SEGMENTS_ISR(%rdx)
popq %r11
popq %r10
popq %r9
@@ -926,19 +909,8 @@ ast_from_interrupt:
pushq $0 /* zero code */
pushq $0 /* zero trap number */
pusha /* save general registers */
- mov %ds,%rdx /* save segment registers */
- push %rdx
- mov %es,%rdx
- push %rdx
- PUSH_FSGS_ISR
-
- mov %ss,%dx /* switch to kernel segments */
- mov %dx,%ds
- mov %dx,%es
-#ifdef USER32
- mov %dx,%fs
- mov %dx,%gs
-#endif
+ PUSH_SEGMENTS_ISR(%rdx)
+ SET_KERNEL_SEGMENTS
CPU_NUMBER(%edx)
TIME_TRAP_UENTRY
@@ -1056,20 +1028,12 @@ kdb_from_iret_i: /* on interrupt
stack */
pushq $0 /* zero error code */
pushq $0 /* zero trap number */
pusha /* save general registers */
- mov %ds,%rdx /* save segment registers */
- push %rdx
- mov %es,%rdx
- push %rdx
- PUSH_FSGS
+ PUSH_SEGMENTS(%rdx)
movq %rsp,%rdx /* pass regs, */
movq $0,%rsi /* code, */
movq $-1,%rdi /* type to kdb */
call EXT(kdb_trap)
- POP_FSGS
- pop %rdx
- mov %rdx,%es
- pop %rdx
- mov %rdx,%ds
+ POP_SEGMENTS(%rdx)
popa /* restore general registers */
addq $16,%rsp
@@ -1144,23 +1108,13 @@ ttd_from_iret_i: /* on interrupt
stack */
pushq $0 /* zero error code */
pushq $0 /* zero trap number */
pusha /* save general registers */
- mov %ds,%rdx /* save segment registers */
- push %rdx
- mov %es,%rdx
- push %rdx
- push %fs
- push %gs
+ PUSH_SEGMENTS_ISR(%rdx)
ud2 // TEST it
movq %rsp,%rdx /* pass regs, */
movq $0,%rsi /* code, */
movq $-1,%rdi /* type to kdb */
call _kttd_trap
- pop %gs /* restore segment registers */
- pop %fs
- pop %rdx
- mov %rdx,%es
- pop %rdx
- mov %rdx,%ds
+ POP_SEGMENTS_ISR(%rdx)
popa /* restore general registers */
addq $16,%rsp
@@ -1193,20 +1147,8 @@ syscall_entry_2:
pushq $0 /* clear trap number slot */
pusha /* save the general registers */
- movq %ds,%rdx /* and the segment registers */
- pushq %rdx
- movq %es,%rdx
- pushq %rdx
- pushq %fs
- pushq %gs
- pushq $0 // gsbase
- pushq $0 // fsbase
-
- mov %ss,%dx /* switch to kernel data segment */
- mov %dx,%ds
- mov %dx,%es
- mov %dx,%fs
- mov %dx,%gs
+ PUSH_SEGMENTS(%rdx)
+ SET_KERNEL_SEGMENTS
/*
* Shuffle eflags,eip,cs into proper places
--
2.39.2