OriginalAuthor: Dave Jones <djones@redhat.com>, Solar Designer <solar at openwall.com>
OriginalLocation: http://www.codemonkey.org.uk/projects/execshield/
Bug: #369978
This implements NX emulation via CS-limits. It closes a gap in security
protections on ia32 kernels without PAE, and for ia32 hardware that lacks
the NX feature.
Upstream feels this NX emulation is not appropriate for mainline,
and as such, RedHat has carried it in their kernels for a long time
now. The approach is well-tested, though recent forward-porting
may need additional testing.
Also reference https://blueprints.edge.launchpad.net/ubuntu/+spec/use-pae-when-possible
+#ifdef CONFIG_X86_32
+ /*
+ * emulation of NX with segment limits unfortunately means
+ * we have to disable the fast system calls, due to the way that
+ * sysexit clears the segment limits on return.
+ * If we have NX, then we don't need to do this.
+ */
+#ifdef CONFIG_X86_PAE
+ if (!test_cpu_cap(c, X86_FEATURE_NX))
+#endif
+ clear_cpu_cap(c, X86_FEATURE_SEP);
+#endif /*CONFIG_X86_32*/
+
/* If the model name is still unset, do table lookup. */
if (!c->x86_model_id[0]) {
const char *p;
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 1b1739d..c2dda16 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -345,6 +345,9 @@ struct pv_cpu_ops pv_cpu_ops = {
.read_tscp = native_read_tscp,
.load_tr_desc = native_load_tr_desc,
.set_ldt = native_set_ldt,
+#ifdef CONFIG_X86_32
+ .load_user_cs_desc = native_load_user_cs_desc,
+#endif /*CONFIG_X86_32*/
.load_gdt = native_load_gdt,
.load_idt = native_load_idt,
.store_gdt = native_store_gdt,
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 4cf7956..3eee776 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -296,6 +296,8 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
+ int cpu;
+
set_user_gs(regs, 0);
regs->fs = 0;
set_fs(USER_DS);
@@ -305,6 +307,11 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
regs->cs = __USER_CS;
regs->ip = new_ip;
regs->sp = new_sp;
+
+ cpu = get_cpu();
+ load_user_cs_desc(cpu, current->mm);
+ put_cpu();
+
/*
* Free the old FP and other extended state
*/
@@ -359,6 +366,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;
__unlazy_fpu(prev_p);
+ if (next_p->mm)
+ load_user_cs_desc(cpu, next_p->mm);
/* we're going to use this soon, after a few expensive things */
if (preload_fpu)
@@ -497,3 +506,40 @@ unsigned long get_wchan(struct task_struct *p)
return 0;
}
+static void modify_cs(struct mm_struct *mm, unsigned long limit)
+{
+ mm->context.exec_limit = limit;
+ set_user_cs(&mm->context.user_cs, limit);
+ if (mm == current->mm) {
+ int cpu;
+
+ cpu = get_cpu();
+ load_user_cs_desc(cpu, mm);
+ put_cpu();
+ }
+}
+
+void arch_add_exec_range(struct mm_struct *mm, unsigned long limit)
+{
+ if (limit > mm->context.exec_limit)
+ modify_cs(mm, limit);
+}
+
+void arch_remove_exec_range(struct mm_struct *mm, unsigned long old_end)
+{
+ struct vm_area_struct *vma;
+ unsigned long limit = PAGE_SIZE;
+
+ if (old_end == mm->context.exec_limit) {
+ for (vma = mm->mmap; vma; vma = vma->vm_next)
+ if ((vma->vm_flags & VM_EXEC) && (vma->vm_end > limit))
+ limit = vma->vm_end;
+ modify_cs(mm, limit);
+ }
+}
+
+void arch_flush_exec_range(struct mm_struct *mm)
+{
+ mm->context.exec_limit = 0;
+ set_user_cs(&mm->context.user_cs, 0);
+}
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 7e37dce..627de31 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -115,6 +115,67 @@ die_if_kernel(const char *str, struct pt_regs *regs, long err)
if (!user_mode_vm(regs))
die(str, regs, err);
}
+
+static inline int
+__compare_user_cs_desc(const struct desc_struct *desc1,
+ const struct desc_struct *desc2)
+{
+ return ((desc1->limit0 != desc2->limit0) ||
+ (desc1->limit != desc2->limit) ||
+ (desc1->base0 != desc2->base0) ||
+ (desc1->base1 != desc2->base1) ||
+ (desc1->base2 != desc2->base2));
+}
+
+/*
+ * lazy-check for CS validity on exec-shield binaries:
+ *
+ * the original non-exec stack patch was written by
+ * Solar Designer <solar at openwall.com>. Thanks!
+ */
+static int
+check_lazy_exec_limit(int cpu, struct pt_regs *regs, long error_code)
+{
+ struct desc_struct *desc1, *desc2;
+ struct vm_area_struct *vma;
+ unsigned long limit;
+
+ if (current->mm == NULL)
+ return 0;
+
+ limit = -1UL;
+ if (current->mm->context.exec_limit != -1UL) {
+ limit = PAGE_SIZE;
+ spin_lock(¤t->mm->page_table_lock);
+ for (vma = current->mm->mmap; vma; vma = vma->vm_next)
+ if ((vma->vm_flags & VM_EXEC) && (vma->vm_end > limit))
+ limit = vma->vm_end;
+ vma = get_gate_vma(current);
+ if (vma && (vma->vm_flags & VM_EXEC) && (vma->vm_end > limit))
+ limit = vma->vm_end;
+ spin_unlock(¤t->mm->page_table_lock);
+ if (limit >= TASK_SIZE)
+ limit = -1UL;
+ current->mm->context.exec_limit = limit;
+ }
+ set_user_cs(¤t->mm->context.user_cs, limit);
+
+ desc1 = ¤t->mm->context.user_cs;
+ desc2 = get_cpu_gdt_table(cpu) + GDT_ENTRY_DEFAULT_USER_CS;
+
+ if (__compare_user_cs_desc(desc1, desc2)) {
+ /*
+ * The CS was not in sync - reload it and retry the
+ * instruction. If the instruction still faults then
+ * we won't hit this branch next time around.
+ */
+ load_user_cs_desc(cpu, current->mm);
+
+ return 1;
+ }
+
+ return 0;
+}
#endif
static void __kprobes
@@ -273,6 +334,20 @@ do_general_protection(struct pt_regs *regs, long error_code)
if (!user_mode(regs))
goto gp_in_kernel;
+#ifdef CONFIG_X86_32
+{
+ int cpu;
+ int ok;
+
+ cpu = get_cpu();
+ ok = check_lazy_exec_limit(cpu, regs, error_code);
+ put_cpu();
+
+ if (ok)
+ return;
+}
+#endif
+
tsk->thread.error_code = error_code;
tsk->thread.trap_no = 13;
@@ -881,11 +956,29 @@ do_device_not_available(struct pt_regs *regs, long error_code)
}
#ifdef CONFIG_X86_32
+/*
+ * The fixup code for errors in iret jumps to here (iret_exc). It loses
+ * the original trap number and erorr code. The bogus trap 32 and error
+ * code 0 are what the vanilla kernel delivers via:
+ * DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1)
+ *
+ * NOTE: Because of the final "1" in the macro we need to enable interrupts.
+ *
+ * In case of a general protection fault in the iret instruction, we
+ * need to check for a lazy CS update for exec-shield.
+ */
dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
{
siginfo_t info;
+ int ok;
+ int cpu;
local_irq_enable();
+#ifdef CONFIG_X86_32
+ /*
+ * Turn off the CS limit completely if NX active:
+ */
+ if (executable_stack != EXSTACK_DISABLE_X || nx_enabled)
+ arch_add_exec_range(current->mm, -1);
+#endif
+
/* OK, This is the point of no return */
current->flags &= ~PF_FORKNOEXEC;
current->mm->def_flags = def_flags;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 24c3956..1af5c44 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1129,7 +1129,15 @@ extern int install_special_mapping(struct mm_struct *mm,
unsigned long addr, unsigned long len,
unsigned long flags, struct page **pages);
-extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
+extern unsigned long get_unmapped_area_prot(struct file *, unsigned long,
+ unsigned long, unsigned long, unsigned long, int);
+
+static inline unsigned long get_unmapped_area(struct file *file,
+ unsigned long addr, unsigned long len, unsigned long pgoff,
+ unsigned long flags)
+{
+ return get_unmapped_area_prot(file, addr, len, pgoff, flags, 0);
+}
extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 84a524a..a81e0db 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -206,6 +206,9 @@ struct mm_struct {
unsigned long (*get_unmapped_area) (struct file *filp,
unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags);
+ unsigned long (*get_unmapped_exec_area) (struct file *filp,
+ unsigned long addr, unsigned long len,
+ unsigned long pgoff, unsigned long flags);
void (*unmap_area) (struct mm_struct *mm, unsigned long addr);
unsigned long mmap_base; /* base of mmap area */
unsigned long task_size; /* size of task vm space */
diff --git a/include/linux/resource.h b/include/linux/resource.h
index 40fc7e6..68c2549 100644
--- a/include/linux/resource.h
+++ b/include/linux/resource.h
@@ -55,8 +55,11 @@ struct rlimit {
/*
* Limit the stack by to some sane default: root can always
* increase this limit if needed.. 8MB seems reasonable.
+ *
+ * (2MB more to cover randomization effects.)
*/
-#define _STK_LIM (8*1024*1024)
+#define _STK_LIM (10*1024*1024)
+#define EXEC_STACK_BIAS (2*1024*1024)
/*
* GPG2 wants 64kB of mlocked memory, to make sure pass phrases
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 75e6e60..a8026e4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -379,6 +379,9 @@ extern unsigned long
arch_get_unmapped_area(struct file *, unsigned long, unsigned long,
unsigned long, unsigned long);
extern unsigned long
+arch_get_unmapped_exec_area(struct file *, unsigned long, unsigned long,
+ unsigned long, unsigned long);
+extern unsigned long
arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
unsigned long len, unsigned long pgoff,
unsigned long flags);
diff --git a/mm/mmap.c b/mm/mmap.c
index 73f5e4b..814b95f 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -29,6 +29,7 @@
#include <linux/rmap.h>
#include <linux/mmu_notifier.h>
#include <linux/perf_event.h>
+#include <linux/random.h>
@@ -970,7 +989,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
/* Obtain the address to map to. we verify (or select) it and ensure
* that it represents a valid section of the address space.
*/
- addr = get_unmapped_area(file, addr, len, pgoff, flags);
+ addr = get_unmapped_area_prot(file, addr, len, pgoff, flags,
+ prot & PROT_EXEC);
if (addr & ~PAGE_MASK)
return addr;
unsigned long
-get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
- unsigned long pgoff, unsigned long flags)
+get_unmapped_area_prot(struct file *file, unsigned long addr, unsigned long len,
+ unsigned long pgoff, unsigned long flags, int exec)
{
unsigned long (*get_area)(struct file *, unsigned long,
unsigned long, unsigned long, unsigned long);
return arch_rebalance_pgtables(addr, len);
}
+EXPORT_SYMBOL(get_unmapped_area_prot);
+
+#define SHLIB_BASE 0x00110000
+
+unsigned long
+arch_get_unmapped_exec_area(struct file *filp, unsigned long addr0,
+ unsigned long len0, unsigned long pgoff, unsigned long flags)
+{
+ unsigned long addr = addr0, len = len0;
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ unsigned long tmp;
+
+ if (len > TASK_SIZE)
+ return -ENOMEM;
+
+ if (flags & MAP_FIXED)
+ return addr;
+
+ if (!addr)
+ addr = randomize_range(SHLIB_BASE, 0x01000000, len);
+
+ if (addr) {
+ addr = PAGE_ALIGN(addr);
+ vma = find_vma(mm, addr);
+ if (TASK_SIZE - len >= addr &&
+ (!vma || addr + len <= vma->vm_start))
+ return addr;
+ }
+
+ addr = SHLIB_BASE;
+ for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
+ /* At this point: (!vma || addr < vma->vm_end). */
+ if (TASK_SIZE - len < addr)
+ return -ENOMEM;
+
+ if (!vma || addr + len <= vma->vm_start) {
+ /*
+ * Must not let a PROT_EXEC mapping get into the
+ * brk area:
+ */
+ if (addr + len > mm->brk)
+ goto failed;
+
+ /*
+ * Up until the brk area we randomize addresses
+ * as much as possible:
+ */
+ if (addr >= 0x01000000) {
+ tmp = randomize_range(0x01000000,
+ PAGE_ALIGN(max(mm->start_brk,
+ (unsigned long)0x08000000)), len);
+ vma = find_vma(mm, tmp);
+ if (TASK_SIZE - len >= tmp &&
+ (!vma || tmp + len <= vma->vm_start))
+ return tmp;
+ }
+ /*
+ * Ok, randomization didnt work out - return
+ * the result of the linear search:
+ */
+ return addr;
+ }
+ addr = vma->vm_end;
+ }
+
+failed:
+ return current->mm->get_unmapped_area(filp, addr0, len0, pgoff, flags);
+}
-EXPORT_SYMBOL(get_unmapped_area);
/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
@@ -1549,6 +1641,14 @@ out:
return prev ? prev->vm_next : vma;
}
+static int over_stack_limit(unsigned long sz)
+{
+ if (sz < EXEC_STACK_BIAS)
+ return 0;
+ return (sz - EXEC_STACK_BIAS) >
+ current->signal->rlim[RLIMIT_STACK].rlim_cur;
+}
+
/*
* Verify that the stack growth is acceptable and
* update accounting. This is shared with both the
@@ -1565,7 +1665,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
return -ENOMEM;
/* Stack limit test */
- if (size > rlim[RLIMIT_STACK].rlim_cur)
+ if (over_stack_limit(size))
return -ENOMEM;
/* mlock limit tests */
@@ -1875,10 +1975,14 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
if (new->vm_ops && new->vm_ops->open)
new->vm_ops->open(new);