/* pcb_native.c */


/*
 * Copyright (c) 2000-2010 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 * 
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 * 
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 * 
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 * 
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */
/* 
 * Mach Operating System
 * Copyright (c) 1991,1990 Carnegie Mellon University
 * All Rights Reserved.
 * 
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 * 
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 * 
 * Carnegie Mellon requests users of this software to return to
 * 
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 * 
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */

#include <mach_rt.h>
#include <mach_debug.h>
#include <mach_ldebug.h>

#include <sys/kdebug.h>

#include <mach/kern_return.h>
#include <mach/thread_status.h>
#include <mach/vm_param.h>

#include <kern/counters.h>
#include <kern/kalloc.h>
#include <kern/mach_param.h>
#include <kern/processor.h>
#include <kern/cpu_data.h>
#include <kern/cpu_number.h>
#include <kern/task.h>
#include <kern/thread.h>
#include <kern/sched_prim.h>
#include <kern/misc_protos.h>
#include <kern/assert.h>
#include <kern/spl.h>
#include <kern/machine.h>
#include <ipc/ipc_port.h>
#include <vm/vm_kern.h>
#include <vm/vm_map.h>
#include <vm/pmap.h>
#include <vm/vm_protos.h>

#include <i386/commpage/commpage.h>
#include <i386/cpu_data.h>
#include <i386/cpu_number.h>
#include <i386/eflags.h>
#include <i386/proc_reg.h>
#include <i386/tss.h>
#include <i386/user_ldt.h>
#include <i386/fpu.h>
#include <i386/mp_desc.h>
#include <i386/misc_protos.h>
#include <i386/thread.h>
#if defined(__i386__)
#include <i386/fpu.h>
#endif
#include <i386/seg.h>
#include <i386/machine_routines.h>

/*
 * Compile-time size check.  Expands to the declaration of an extern char
 * array (never defined, so it occupies no storage) whose bound is 1 when
 * sizeof(type_) is a multiple of 16 and -1 otherwise; a negative bound is
 * ill-formed, so a wrongly sized type fails the build.
 */
#define ASSERT_IS_16BYTE_MULTIPLE_SIZEOF(type_)			\
extern char assert_is_16byte_multiple_sizeof_ ## type_			\
		[((sizeof(type_) & 0xF) != 0) ? -1 : 1]

/* Save-area types whose sizes must be multiples of 16 bytes: */
ASSERT_IS_16BYTE_MULTIPLE_SIZEOF(x86_64_intr_stack_frame_t);
ASSERT_IS_16BYTE_MULTIPLE_SIZEOF(x86_sframe64_t);
ASSERT_IS_16BYTE_MULTIPLE_SIZEOF(x86_saved_state_compat32_t);
ASSERT_IS_16BYTE_MULTIPLE_SIZEOF(x86_saved_state_t);

/*
 * Non-zero when either DEBUG or DEVELOPMENT is non-zero.  Referenced by the
 * direction-flag sanity check in act_machine_switch_pcb().
 */
#define DIRECTION_FLAG_DEBUG (DEBUG | DEVELOPMENT)

extern zone_t		iss_zone;		/* zone for saved_state area */
extern zone_t		ids_zone;		/* zone for debug_state area */

/*
 * Declared locally rather than through a header.  Its result is used in
 * act_machine_switch_pcb() as the address of the syscall argument store
 * for 32-bit threads.
 */
extern void *get_bsduthreadarg(thread_t);
/*
 * act_machine_switch_pcb:
 *
 * Load the per-CPU machine state that depends on the thread being switched
 * to: the pointer to the PCB's interrupt stack frame, the ring-0 and
 * sysenter stack tops, the task map type, the privilege bits of the
 * USER_CS / USER64_CS descriptors, the cthread (pthread) descriptor and
 * user GS base, and the LDT.  Finally bumps the commpage scheduler
 * generation count.
 *
 *	old	outgoing thread (unused)
 *	new	incoming thread; asserted to have a kernel stack
 *
 * Interrupts are asserted to be disabled on entry.
 */
void
act_machine_switch_pcb(__unused thread_t old, thread_t new)
{
	pcb_t			pcb = THREAD_TO_PCB(new);
	cpu_data_t		*cdp = current_cpu_datap();
	struct real_descriptor	*ldtp;
	mach_vm_offset_t	pcb_stack_top;

	assert(new->kernel_stack != 0);
	assert(ml_get_interrupts_enabled() == FALSE);
	/*
	 * DIRECTION_FLAG_DEBUG is always defined, so its *value* must be
	 * tested (#if, not #ifdef) for this check to be limited to
	 * DEBUG/DEVELOPMENT builds.
	 */
#if	DIRECTION_FLAG_DEBUG
	if (x86_get_flags() & EFL_DF) {
		panic("Direction flag detected: 0x%lx", x86_get_flags());
	}
#endif

#if defined(__x86_64__)
	/*
	 * Clear segment state
	 * unconditionally for DS/ES/FS but more carefully for GS whose
	 * cached state we track.
	 */
	set_ds(NULL_SEG);
	set_es(NULL_SEG);
	set_fs(NULL_SEG);
	if (get_gs() != NULL_SEG) {
		swapgs();		/* switch to user's GS context */
		set_gs(NULL_SEG);
		swapgs();		/* and back to kernel */

		/* record the active machine state lost */
		cdp->cpu_uber.cu_user_gs_base = 0;
	}

	if (is_saved_state64(pcb->iss)) {
		/*
		 * The test above is performed against the thread save state
		 * flavor and not task's 64-bit feature flag because of the
		 * thread/task 64-bit state divergence that can arise in
		 * task_set_64bit() x86: the task state is changed before
		 * the individual thread(s).
		 */
		x86_saved_state64_tagged_t	*iss64;
		vm_offset_t			isf;

		assert(is_saved_state64(pcb->iss));

		iss64 = (x86_saved_state64_tagged_t *) pcb->iss;

		/*
		 * Set pointer to PCB's interrupt stack frame in cpu data.
		 * Used by syscall and double-fault trap handlers.
		 */
		isf = (vm_offset_t) &iss64->state.isf;
		cdp->cpu_uber.cu_isf = isf;
		/* The stack top is the first byte past the tagged save area */
		pcb_stack_top = (vm_offset_t) (iss64 + 1);
		/* require 16-byte alignment */
		assert((pcb_stack_top & 0xF) == 0);

		/* Interrupt stack is pcb */
		current_ktss64()->rsp0 = pcb_stack_top;

		/*
		 * Top of temporary sysenter stack points to pcb stack.
		 * Although this is not normally used by 64-bit users,
		 * it needs to be set in case a sysenter is attempted.
		 */
		*current_sstk64() = pcb_stack_top;

		cdp->cpu_task_map = new->map->pmap->pm_task_map;

		/*
		 * Enable the 64-bit user code segment, USER64_CS.
		 * Disable the 32-bit user code segment, USER_CS.
		 */
		ldt_desc_p(USER64_CS)->access |= ACC_PL_U;
		ldt_desc_p(USER_CS)->access &= ~ACC_PL_U;

		/*
		 * Switch user's GS base if necessary
		 * by setting the Kernel's GS base MSR
		 * - this will become the user's on the swapgs when
		 * returning to user-space.  Avoid this for
		 * kernel threads (no user TLS support required)
		 * and verify the memory shadow of the segment base
		 * in the event it was altered in user space.
		 */
		if ((pcb->cthread_self != 0) || (new->task != kernel_task)) {
			if ((cdp->cpu_uber.cu_user_gs_base != pcb->cthread_self) || (pcb->cthread_self != rdmsr64(MSR_IA32_KERNEL_GS_BASE))) {
				cdp->cpu_uber.cu_user_gs_base = pcb->cthread_self;
				wrmsr64(MSR_IA32_KERNEL_GS_BASE, pcb->cthread_self);
			}
		}
	} else {
		x86_saved_state_compat32_t	*iss32compat;
		vm_offset_t			isf;

		assert(is_saved_state32(pcb->iss));
		iss32compat = (x86_saved_state_compat32_t *) pcb->iss;

		/* The stack top is the first byte past the compat save area */
		pcb_stack_top = (uintptr_t) (iss32compat + 1);
		/* require 16-byte alignment */
		assert((pcb_stack_top & 0xF) == 0);

		/*
		 * Set pointer to PCB's interrupt stack frame in cpu data.
		 * Used by debug trap handler.
		 */
		isf = (vm_offset_t) &iss32compat->isf64;
		cdp->cpu_uber.cu_isf = isf;

		/* Top of temporary sysenter stack points to pcb stack */
		*current_sstk64() = pcb_stack_top;

		/* Interrupt stack is pcb */
		current_ktss64()->rsp0 = pcb_stack_top;

		cdp->cpu_task_map = TASK_MAP_32BIT;
		/* Precalculate pointers to syscall argument store, for use
		 * in the trampolines.
		 */
		cdp->cpu_uber_arg_store = (vm_offset_t)get_bsduthreadarg(new);
		cdp->cpu_uber_arg_store_valid = (vm_offset_t)&pcb->arg_store_valid;
		pcb->arg_store_valid = 0;

		/*
		 * Disable USER64_CS
		 * Enable USER_CS
		 */
		ldt_desc_p(USER64_CS)->access &= ~ACC_PL_U;
		ldt_desc_p(USER_CS)->access |= ACC_PL_U;

		/*
		 * Set the thread's cthread (a.k.a pthread)
		 * For 32-bit user this involves setting the USER_CTHREAD
		 * descriptor in the LDT to point to the cthread data.
		 * This involves copying in the pre-initialized descriptor.
		 */
		ldtp = (struct real_descriptor *)current_ldt();
		ldtp[sel_idx(USER_CTHREAD)] = pcb->cthread_desc;
		if (pcb->uldt_selector != 0)
			ldtp[sel_idx(pcb->uldt_selector)] = pcb->uldt_desc;
		cdp->cpu_uber.cu_user_gs_base = pcb->cthread_self;

		/*
		 * Set the thread's LDT or LDT entry.
		 */
		if (new->task == TASK_NULL || new->task->i386_ldt == 0) {
			/*
			 * Use system LDT.
			 */
			ml_cpu_set_ldt(KERNEL_LDT);
		} else {
			/*
			 * Task has its own LDT.
			 */
			user_ldt_set(new);
		}
	}

#else /* !__x86_64__ */

	vm_offset_t		hi_pcb_stack_top;
	vm_offset_t		hi_iss;

	if (!cpu_mode_is64bit()) {
		x86_saved_state32_tagged_t	*hi_iss32;
		/*
		 *	Save a pointer to the top of the "kernel" stack -
		 *	actually the place in the PCB where a trap into
		 *	kernel mode will push the registers.
		 */
		/*
		 * Combine the per-cpu high-map window address with the
		 * save area's offset within its page.
		 */
		hi_iss = (vm_offset_t)((unsigned long)
			pmap_cpu_high_map_vaddr(cpu_number(), HIGH_CPU_ISS0) |
			((unsigned long)pcb->iss & PAGE_MASK));

		cdp->cpu_hi_iss = (void *)hi_iss;

		pmap_high_map(pcb->iss_pte0, HIGH_CPU_ISS0);
		pmap_high_map(pcb->iss_pte1, HIGH_CPU_ISS1);

		hi_iss32 = (x86_saved_state32_tagged_t *) hi_iss;
		assert(hi_iss32->tag == x86_SAVED_STATE32);

		hi_pcb_stack_top = (int) (hi_iss32 + 1);

		/*
		 * For fast syscall, top of interrupt stack points to pcb stack
		 */
		*(vm_offset_t *) current_sstk() = hi_pcb_stack_top;

		current_ktss()->esp0 = hi_pcb_stack_top;

	} else if (is_saved_state64(pcb->iss)) {
		/*
		 * The test above is performed against the thread save state
		 * flavor and not task's 64-bit feature flag because of the
		 * thread/task 64-bit state divergence that can arise in
		 * task_set_64bit() x86: the task state is changed before
		 * the individual thread(s).
		 */
		x86_saved_state64_tagged_t	*iss64;
		vm_offset_t			isf;

		assert(is_saved_state64(pcb->iss));

		iss64 = (x86_saved_state64_tagged_t *) pcb->iss;

		/*
		 * Set pointer to PCB's interrupt stack frame in cpu data.
		 * Used by syscall and double-fault trap handlers.
		 */
		isf = (vm_offset_t) &iss64->state.isf;
		cdp->cpu_uber.cu_isf = UBER64(isf);
		pcb_stack_top = (vm_offset_t) (iss64 + 1);
		/* require 16-byte alignment */
		assert((pcb_stack_top & 0xF) == 0);
		/* Interrupt stack is pcb */
		current_ktss64()->rsp0 = UBER64(pcb_stack_top);

		/*
		 * Top of temporary sysenter stack points to pcb stack.
		 * Although this is not normally used by 64-bit users,
		 * it needs to be set in case a sysenter is attempted.
		 */
		*current_sstk64() = UBER64(pcb_stack_top);

		cdp->cpu_task_map = new->map->pmap->pm_task_map;

		/*
		 * Enable the 64-bit user code segment, USER64_CS.
		 * Disable the 32-bit user code segment, USER_CS.
		 */
		ldt_desc_p(USER64_CS)->access |= ACC_PL_U;
		ldt_desc_p(USER_CS)->access &= ~ACC_PL_U;

	} else {
		x86_saved_state_compat32_t	*iss32compat;
		vm_offset_t			isf;

		assert(is_saved_state32(pcb->iss));
		iss32compat = (x86_saved_state_compat32_t *) pcb->iss;

		pcb_stack_top = (int) (iss32compat + 1);
		/* require 16-byte alignment */
		assert((pcb_stack_top & 0xF) == 0);

		/*
		 * Set pointer to PCB's interrupt stack frame in cpu data.
		 * Used by debug trap handler.
		 */
		isf = (vm_offset_t) &iss32compat->isf64;
		cdp->cpu_uber.cu_isf = UBER64(isf);

		/* Top of temporary sysenter stack points to pcb stack */
		*current_sstk64() = UBER64(pcb_stack_top);

		/* Interrupt stack is pcb */
		current_ktss64()->rsp0 = UBER64(pcb_stack_top);

		cdp->cpu_task_map = TASK_MAP_32BIT;
		/* Precalculate pointers to syscall argument store, for use
		 * in the trampolines.
		 */
		cdp->cpu_uber_arg_store = UBER64((vm_offset_t)get_bsduthreadarg(new));
		cdp->cpu_uber_arg_store_valid = UBER64((vm_offset_t)&pcb->arg_store_valid);
		pcb->arg_store_valid = 0;

		/*
		 * Disable USER64_CS
		 * Enable USER_CS
		 */
		ldt_desc_p(USER64_CS)->access &= ~ACC_PL_U;
		ldt_desc_p(USER_CS)->access |= ACC_PL_U;
	}

	/*
	 * Set the thread's cthread (a.k.a pthread)
	 * For 32-bit user this involves setting the USER_CTHREAD
	 * descriptor in the LDT to point to the cthread data.
	 * This involves copying in the pre-initialized descriptor.
	 */
	ldtp = (struct real_descriptor *)current_ldt();
	ldtp[sel_idx(USER_CTHREAD)] = pcb->cthread_desc;
	if (pcb->uldt_selector != 0)
		ldtp[sel_idx(pcb->uldt_selector)] = pcb->uldt_desc;

	/*
	 * For 64-bit, we additionally set the 64-bit User GS base
	 * address. On return to 64-bit user, the GS.Base MSR will be written.
	 */
	cdp->cpu_uber.cu_user_gs_base = pcb->cthread_self;

	/*
	 * Set the thread's LDT or LDT entry.
	 */
	if (new->task == TASK_NULL || new->task->i386_ldt == 0) {
		/*
		 * Use system LDT.
		 */
		ml_cpu_set_ldt(KERNEL_LDT);
	} else {
		/*
		 * Task has its own LDT.
		 */
		user_ldt_set(new);
	}
#endif

	/*
	 * Bump the scheduler generation count in the commpage.
	 * This can be read by user code to detect its preemption.
	 */
	commpage_sched_gen_inc();
}
/*
 * thread_set_wq_state32:
 *
 * Overwrite the saved 32-bit user register state of `thread' from the
 * x86_thread_state32_t pointed to by `tstate'.  eip, esp and the
 * eax/ebx/ecx/edx/edi/esi registers are copied from the caller's state;
 * ebp is zeroed, efl is forced to EFL_USER_SET and cs/ss/ds/es are set to
 * USER_CS / USER_DS.
 *
 * When the target is not the calling thread, the update is performed at
 * splsched with the thread lock held.
 */
void
thread_set_wq_state32(thread_t thread, thread_state_t tstate)
{
	x86_thread_state32_t	*new_regs = (x86_thread_state32_t *)tstate;
	x86_saved_state32_t	*regs;
	thread_t		self = current_thread();
	const int		other_thread = (self != thread);
	spl_t			saved_spl = 0;

	pal_register_cache_state(thread, DIRTY);

	regs = USER_REGS32(thread);

	if (other_thread) {
		saved_spl = splsched();
		thread_lock(thread);
	}

	/* General-purpose registers supplied by the caller */
	regs->ebp = 0;
	regs->eip = new_regs->eip;
	regs->eax = new_regs->eax;
	regs->ebx = new_regs->ebx;
	regs->ecx = new_regs->ecx;
	regs->edx = new_regs->edx;
	regs->edi = new_regs->edi;
	regs->esi = new_regs->esi;
	regs->uesp = new_regs->esp;

	/* Flags and segment selectors are never taken from the caller */
	regs->efl = EFL_USER_SET;
	regs->cs = USER_CS;
	regs->ss = USER_DS;
	regs->ds = USER_DS;
	regs->es = USER_DS;

	if (other_thread) {
		thread_unlock(thread);
		splx(saved_spl);
	}
}


/*
 * thread_set_wq_state64:
 *
 * Overwrite the saved 64-bit user register state of `thread' from the
 * x86_thread_state64_t pointed to by `tstate'.  rdi, rsi, rdx, rcx, r8, r9,
 * rip and rsp are copied from the caller's state; rbp is zeroed, cs is set
 * to USER64_CS and rflags is forced to EFL_USER_SET.
 *
 * When the target is not the calling thread, the update is performed at
 * splsched with the thread lock held.
 */
void
thread_set_wq_state64(thread_t thread, thread_state_t tstate)
{
	x86_thread_state64_t	*new_regs = (x86_thread_state64_t *)tstate;
	x86_saved_state64_t	*regs;
	thread_t		self = current_thread();
	const int		other_thread = (self != thread);
	spl_t			saved_spl = 0;

	pal_register_cache_state(thread, DIRTY);

	regs = USER_REGS64(thread);

	if (other_thread) {
		saved_spl = splsched();
		thread_lock(thread);
	}

	/* Registers supplied by the caller */
	regs->rbp = 0;
	regs->rdi = new_regs->rdi;
	regs->rsi = new_regs->rsi;
	regs->rdx = new_regs->rdx;
	regs->rcx = new_regs->rcx;
	regs->r8  = new_regs->r8;
	regs->r9  = new_regs->r9;

	/* Interrupt stack frame: where and how the thread resumes */
	regs->isf.rip = new_regs->rip;
	regs->isf.rsp = new_regs->rsp;
	regs->isf.cs = USER64_CS;
	regs->isf.rflags = EFL_USER_SET;

	if (other_thread) {
		thread_unlock(thread);
		splx(saved_spl);
	}
}

/*
 * machine_thread_create:
 *
 * Set up the machine-dependent state (PCB) of a new thread.
 *
 * A save frame is taken from iss_zone unless the PCB already has one, then
 * zeroed and tagged with the flavor matching the task (64-bit) or the CPU
 * mode (32-bit compatibility frame, or the legacy 32-bit frame on i386).
 * The saved segment selectors and flags are preset to user-mode values.
 *
 *	thread	thread whose PCB is initialized
 *	task	owning task; selects the 64-bit or 32-bit state flavor
 *
 * Always returns KERN_SUCCESS; panics if the save frame cannot be allocated.
 */
kern_return_t
machine_thread_create(
	thread_t		thread,
	task_t			task)
{
	pcb_t			pcb = THREAD_TO_PCB(thread);
	x86_saved_state_t	*iss;

#if NCOPY_WINDOWS > 0
	inval_copy_windows(thread);

	thread->machine.physwindow_pte = 0;
	thread->machine.physwindow_busy = 0;
#endif

	/*
	 * A save frame already attached to the PCB is reused as is.
	 */
	if (pcb->sf == NULL) {
		assert((get_preemption_level() == 0));
		pcb->sf = zalloc(iss_zone);
		if (pcb->sf == NULL)
			panic("iss_zone");
	}

	if (!task_has_64BitAddr(task)) {
		if (cpu_mode_is64bit()) {
			/* 32-bit task on a CPU in 64-bit mode: compat frame */
			x86_sframe_compat32_t	*compat_frame;

			compat_frame = (x86_sframe_compat32_t *)pcb->sf;

			bzero((char *)compat_frame, sizeof(x86_sframe_compat32_t));

			iss = (x86_saved_state_t *) &compat_frame->ssf.iss32;
			iss->flavor = x86_SAVED_STATE32;
#if defined(__i386__)
#if DEBUG
			/* Fill the alignment padding with a fixed pattern */
			compat_frame->pad_for_16byte_alignment[0] = 0x64326432;
			compat_frame->pad_for_16byte_alignment[1] = 0x64326432;
#endif /* DEBUG */
#endif /* __i386__ */
		}
#if defined(__i386__)
		else {
			/* Legacy 32-bit frame */
			x86_sframe32_t		*frame32;
			struct real_descriptor	*fixed_ldt;
			pmap_paddr_t		next_page;

			frame32 = (x86_sframe32_t *) pcb->sf;

			bzero((char *)frame32, sizeof(x86_sframe32_t));

			iss = (x86_saved_state_t *) &frame32->ssf;
			iss->flavor = x86_SAVED_STATE32;

			/*
			 * Record kernel read/write PTEs for the page holding
			 * the save area and for the page that follows it
			 * (invalid if that page has no physical address).
			 */
			pcb->iss_pte0 = pte_kernel_rw(kvtophys((vm_offset_t)iss));
			next_page = pa_to_pte(kvtophys((vm_offset_t)iss + PAGE_SIZE));
			if (next_page == 0)
				pcb->iss_pte1 = INTEL_PTE_INVALID;
			else
				pcb->iss_pte1 = pte_kernel_rw(next_page);

			/* Start both per-thread descriptors as copies of USER_DS */
			fixed_ldt = (struct real_descriptor *)
				    pmap_index_to_virt(HIGH_FIXED_LDT_BEGIN);
			pcb->cthread_desc = fixed_ldt[sel_idx(USER_DS)];
			pcb->uldt_desc = fixed_ldt[sel_idx(USER_DS)];
		}
#endif /* __i386__ */

		/*
		 * Preset user-mode selectors and flags so that the
		 * bootstrapped thread is guaranteed to be in user mode.
		 */
		iss->ss_32.cs = USER_CS;
		iss->ss_32.ss = USER_DS;
		iss->ss_32.ds = USER_DS;
		iss->ss_32.es = USER_DS;
		iss->ss_32.fs = USER_DS;
		iss->ss_32.gs = USER_DS;
		iss->ss_32.efl = EFL_USER_SET;
	} else {
		/* 64-bit task: native 64-bit frame */
		x86_sframe64_t		*frame64;

		frame64 = (x86_sframe64_t *) pcb->sf;

		bzero((char *)frame64, sizeof(x86_sframe64_t));

		iss = (x86_saved_state_t *) &frame64->ssf;
		iss->flavor = x86_SAVED_STATE64;

		/*
		 * Preset user-mode selectors and flags so that the
		 * bootstrapped thread is guaranteed to be in user mode.
		 */
		iss->ss_64.isf.rflags = EFL_USER_SET;
		iss->ss_64.isf.cs = USER64_CS;
		iss->ss_64.isf.ss = USER_DS;
		iss->ss_64.fs = USER_DS;
		iss->ss_64.gs = USER_DS;
	}

	pcb->iss = iss;

	simple_lock_init(&pcb->lock, 0);

	pcb->arg_store_valid = 0;
	pcb->cthread_self = 0;
	pcb->uldt_selector = 0;

	/*
	 * If the "cthread" descriptor is not marked present, make it a
	 * valid segment by copying the current LDT's USER_DS descriptor.
	 */
	if (!(pcb->cthread_desc.access & ACC_P)) {
		struct real_descriptor	*cur_ldt;

		cur_ldt = (struct real_descriptor *)current_ldt();
		pcb->cthread_desc = cur_ldt[sel_idx(USER_DS)];
	}

	return KERN_SUCCESS;
}

/*
 * machine_thread_destroy:
 *
 * Machine-dependent cleanup prior to destroying a thread.  Releases the
 * PCB's floating-point state, its save frame (iss_zone) and its debug
 * state (ids_zone), where present, and clears each pointer after the
 * free so that no stale reference is left behind.
 *
 *	thread	thread whose PCB resources are released
 */
void
machine_thread_destroy(
	thread_t		thread)
{
	pcb_t	pcb = THREAD_TO_PCB(thread);

	if (pcb->ifps != 0) {
		fpu_free(pcb->ifps);
		/* Clear it, as is done for sf and ids below */
		pcb->ifps = 0;
	}
	if (pcb->sf != 0) {
		/*
		 * NOTE(review): machine_thread_create() points pcb->iss into
		 * this frame; it is left untouched here, as before.
		 */
		zfree(iss_zone, pcb->sf);
		pcb->sf = 0;
	}
	if (pcb->ids) {
		zfree(ids_zone, pcb->ids);
		pcb->ids = NULL;
	}
}