pmCPU.c   [plain text]


/*
 * Copyright (c) 2004-2007 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 * 
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 * 
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 * 
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 * 
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * CPU-specific power management support.
 *
 * Implements the "wrappers" to the KEXT.
 */
#include <i386/machine_routines.h>
#include <i386/machine_cpu.h>
#include <i386/misc_protos.h>
#include <i386/pmap.h>
#include <i386/asm.h>
#include <i386/mp.h>
#include <i386/proc_reg.h>
#include <kern/pms.h>
#include <kern/processor.h>
#include <i386/cpu_threads.h>
#include <i386/pmCPU.h>
#include <i386/cpuid.h>
#include <i386/rtclock.h>

/* Declared here; not referenced by any code visible in this file. */
extern int disableConsoleOutput;

/*
 * Lock associated with KEXT initialization.  It is set up once by
 * power_management_init().
 */
decl_simple_lock_data(,pm_init_lock);

/*
 * The following is set when the KEXT loads and initializes.
 *
 * pmKextRegister() installs the table and pmUnRegister() resets it to
 * NULL.  The wrappers below test both this pointer and the individual
 * function slot before calling through it.
 */
pmDispatch_t	*pmDispatch	= NULL;

/*
 * Current power management states (for use until KEXT is loaded).
 *
 * The fallback paths of the wrappers record their arguments here when no
 * KEXT handler is registered; pmKextRegister() passes the address of this
 * structure to the KEXT through callbacks->InitState.
 */
static pmInitState_t	pmInitState;

/*
 * Set to 1 by pmInitComplete(), which is handed to the KEXT as its
 * initComplete callback.  Wrappers that test this flag do not call into
 * the KEXT until it is non-zero.
 */
static uint32_t		pmInitDone	= 0;

/*
 * Nap control variables:
 *
 * When forcenap is non-zero, Cstate_table_set() returns success without
 * recording or forwarding the supplied table.
 */
uint32_t forcenap = 0;			/* Force nap (fn) boot-arg controls */

/*
 * Do any initialization needed
 */
void
pmsInit(void)
{
    static int		seeded	= 0;

    /*
     * First call only: stamp the pre-KEXT state with -1 ("never set")
     * so that a later reader can tell untouched fields apart from
     * values that were explicitly stored.
     */
    if (seeded == 0) {
	seeded = 1;
	pmInitState.maxBusDelay = -1;
	pmInitState.PLimit = -1;
	pmInitState.PState = -1;
    }

    /* No stepper-init hook has been registered: nothing further to do. */
    if (pmDispatch == NULL || pmDispatch->pmsInit == NULL) {
	return;
    }

    pmDispatch->pmsInit();
}

/*
 * Start the power management stepper on all processors
 *
 * All processors must be parked.  This should be called when the hardware
 * is ready to step.  Probably only at boot and after wake from sleep.
 *
 */
void
pmsStart(void)
{
    /* Silently do nothing until a KEXT supplies a pmsStart handler. */
    if (pmDispatch == NULL || pmDispatch->pmsStart == NULL) {
	return;
    }

    pmDispatch->pmsStart();
}

/*
 * Park the stepper execution.  This will force the stepper on this
 * processor to abandon its current step and stop.  No changes to the
 * hardware state are made and any previous step is lost.
 *	
 * This is used as the initial state at startup and when the step table
 * is being changed.
 *
 */
void
pmsPark(void)
{
    /* Silently do nothing until a KEXT supplies a pmsPark handler. */
    if (pmDispatch == NULL || pmDispatch->pmsPark == NULL) {
	return;
    }

    pmDispatch->pmsPark();
}

/*
 * Control the Power Management Stepper.
 * Called from user state by the superuser.
 * Interrupts disabled.
 *
 * This interface is deprecated and is now a no-op.
 */
kern_return_t
pmsControl(__unused uint32_t request,
	   __unused user_addr_t reqaddr,
	   __unused uint32_t reqsize)
{
    /* Deprecated entry point: all arguments are ignored. */
    return KERN_SUCCESS;
}

/*
 * Broadcast a change to all processors including ourselves.
 *
 * Interrupts disabled.
 */
void
pmsRun(uint32_t nstep)
{
    /* Without a registered pmsRun handler the request is dropped. */
    if (pmDispatch == NULL || pmDispatch->pmsRun == NULL) {
	return;
    }

    pmDispatch->pmsRun(nstep);
}

/*
 * Build the tables needed for the stepper.  This includes both the step
 * definitions and the step control table.
 *
 * We most absolutely need to be parked before this happens because we're
 * going to change the table.  We also have to be complete about checking
 * for errors.  A copy is always made because we don't want to be crippled
 * by not being able to change the table or description formats.
 *
 * We pass in a table of external functions and the new stepper def uses
 * the corresponding indexes rather than actual function addresses.  This
 * is done so that a proper table can be built with the control syscall.
 * It can't supply addresses, so the index has to do.  We internalize the
 * table so our caller does not need to keep it.  Note that passing in a 0
 * will use the current function table.  Also note that entry 0 is reserved
 * and must be 0, we will check and fail the build.
 *
 * The platformData parameter is a 32-bit word of data that is passed unaltered
 * to the set function.
 *
 * The queryFunc parameter is the address of a function that will return the
 * current state of the platform. The format of the data returned is the same
 * as the platform specific portions of pmsSetCmd, i.e., pmsXClk, pmsVoltage,
 * and any part of pmsPowerID that is maintained by the platform hardware
 * (an example would be the values of the gpios that correspond to pmsPowerID).
 * The value should be constructed by querying hardware rather than returning
 * a value cached by software. One of the intents of this function is to help
 * recover lost or determine initial power states.
 *
 */
kern_return_t
pmsBuild(pmsDef *pd, uint32_t pdsize, pmsSetFunc_t *functab,
	 uint32_t platformData, pmsQueryFunc_t queryFunc)
{
    /*
     * With no KEXT build handler registered there is nothing to build;
     * report 0 to the caller.
     */
    if (pmDispatch == NULL || pmDispatch->pmsBuild == NULL) {
	return 0;
    }

    /* Forward every argument untouched and relay the handler's result. */
    return pmDispatch->pmsBuild(pd, pdsize, functab, platformData, queryFunc);
}


/*
 * Load a new ratio/VID table.
 *
 * Note that this interface is specific to the Intel SpeedStep implementation.
 * It is expected that this will only be called once to override the default
 * ratio/VID table when the platform starts.
 *
 * Normally, the table will need to be replaced at the same time that the
 * stepper program proper is replaced, as the PState indices from an old
 * program may no longer be valid.  When replacing the default program this
 * should not be a problem as any new table will have at least two PState
 * entries and the default program only references P0 and P1.
 */
kern_return_t
pmsCPULoadVIDTable(uint16_t *tablep, int nstates)
{
    int		i;

    /* A registered KEXT handler owns the table; relay its result. */
    if (pmDispatch != NULL && pmDispatch->pmsCPULoadVIDTable != NULL)
	return((*pmDispatch->pmsCPULoadVIDTable)(tablep, nstates));

    /*
     * No handler yet: stash the table in the pre-KEXT state.
     *
     * Reject counts that cannot describe a table (negative) or that
     * would overflow VIDTable[], and refuse a NULL table when entries
     * are expected.  An empty table (nstates == 0) is accepted and
     * copies nothing.
     */
    if (nstates < 0 || nstates > MAX_PSTATES)
	return(KERN_FAILURE);
    if (nstates > 0 && tablep == NULL)
	return(KERN_FAILURE);

    for (i = 0; i < nstates; i += 1)
	pmInitState.VIDTable[i] = tablep[i];

    return(KERN_SUCCESS);
}

/*
 * Set the (global) PState limit.  CPUs will not be permitted to run at
 * a lower (more performant) PState than this.
 */
kern_return_t
pmsCPUSetPStateLimit(uint32_t limit)
{
    /*
     * Until a KEXT handler is registered, just remember the limit in
     * the pre-KEXT state and report success.
     */
    if (pmDispatch == NULL || pmDispatch->pmsCPUSetPStateLimit == NULL) {
	pmInitState.PLimit = limit;
	return KERN_SUCCESS;
    }

    return pmDispatch->pmsCPUSetPStateLimit(limit);
}

/*
 * Initialize the Cstate change code.
 */
void
power_management_init(void)
{
    static boolean_t	lock_ready	= FALSE;

    /* The KEXT-initialization lock is set up on the first call only. */
    if (!lock_ready) {
	simple_lock_init(&pm_init_lock, 0);
	lock_ready = TRUE;
    }

    /* Run the KEXT's C-state initialization when one is registered. */
    if (pmDispatch == NULL || pmDispatch->cstateInit == NULL) {
	return;
    }

    pmDispatch->cstateInit();
}

/*
 * ACPI calls the following routine to set/update mwait hints.  A table
 * (possibly null) specifies the available Cstates and their hints, all
 * other states are assumed to be invalid.  ACPI may update available
 * states to change the nap policy (for example, while AC power is
 * available).
 */
kern_return_t
Cstate_table_set(Cstate_hint_t *tablep, unsigned int nstates)
{
    unsigned int	i;
    unsigned int	capacity;

    /* With forcenap set, table updates are ignored and reported as OK. */
    if (forcenap)
	return(KERN_SUCCESS);

    /* A registered KEXT handler owns the table; relay its result. */
    if (pmDispatch != NULL && pmDispatch->cstateTableSet != NULL)
	return((*pmDispatch->cstateTableSet)(tablep, nstates));

    /*
     * No handler yet: record the table in the pre-KEXT state.
     *
     * The copy must not run past the end of CStates[], so reject a
     * count larger than the array.  (This assumes CStates is a
     * fixed-size array member of pmInitState_t, as its use here
     * implies.)  A NULL table is only acceptable when it is empty.
     */
    capacity = (unsigned int)(sizeof(pmInitState.CStates)
			      / sizeof(pmInitState.CStates[0]));
    if (nstates > capacity)
	return(KERN_FAILURE);
    if (nstates != 0 && tablep == NULL)
	return(KERN_FAILURE);

    for (i = 0; i < nstates; i += 1) {
	pmInitState.CStates[i].number = tablep[i].number;
	pmInitState.CStates[i].hint   = tablep[i].hint;
    }

    pmInitState.CStatesCount = nstates;

    return(KERN_SUCCESS);
}

/*
 * Called when the CPU is idle.  It will choose the best C state to
 * be in.
 */
void
machine_idle_cstate(boolean_t halted)
{
	if (!pmInitDone
	    || pmDispatch == NULL
	    || pmDispatch->cstateMachineIdle == NULL) {
		/*
		 * No power management available.
		 */
		if (halted) {
			/*
			 * The processor is being taken off-line: invalidate
			 * the cache and halt it (it will not be able to be
			 * brought back on-line without resetting the CPU).
			 */
			__asm__ volatile ( "wbinvd; hlt" );
		} else {
			/*
			 * Ordinary idle: re-enable interrupts and halt.
			 * This keeps the CPU from spinning through the
			 * scheduler and allows at least some minimal power
			 * savings (but it may cause problems in some MP
			 * configurations with respect to the APIC stopping
			 * during a P-State transition).
			 */
			__asm__ volatile ( "sti; hlt" );
		}
		return;
	}

	/*
	 * Let the KEXT pick the C state.  It is passed 0 when the CPU is
	 * halted and the largest positive 64-bit value otherwise; the
	 * meaning of that argument is defined by the KEXT.
	 */
	pmDispatch->cstateMachineIdle(halted ? 0ULL : 0x7FFFFFFFFFFFFFFFULL);
}

/*
 * Called when the CPU is to be halted.  It will choose the best C-State
 * to be in.
 */
void
pmCPUHalt(uint32_t reason)
{
    /*
     * 'reason' selects how the CPU is stopped:
     *   PM_HALT_DEBUG  - write back/invalidate the cache and halt; the
     *                    interrupt flag is not changed here.
     *   PM_HALT_PANIC  - disable interrupts, then write back/invalidate
     *                    the cache and halt.
     *   PM_HALT_NORMAL - (and any unrecognized value) disable interrupts,
     *                    then defer to the KEXT if it has finished
     *                    initializing, otherwise halt locally.
     */

    switch (reason) {
    case PM_HALT_DEBUG:
	__asm__ volatile ("wbinvd; hlt");
	break;

    case PM_HALT_PANIC:
	__asm__ volatile ("cli; wbinvd; hlt");
	break;

    case PM_HALT_NORMAL:
    default:
	__asm__ volatile ("cli");

	if (pmInitDone
	    && pmDispatch != NULL
	    && pmDispatch->pmCPUHalt != NULL) {
	    /* The KEXT decides how to halt this CPU. */
	    (*pmDispatch->pmCPUHalt)();
	} else {
	    cpu_data_t	*cpup	= current_cpu_datap();

	    /*
	     * If no power management and a processor is taken off-line,
	     * then invalidate the cache and halt it (it will not be able
	     * to be brought back on-line without resetting the CPU).
	     *
	     * The cache is flushed, this CPU's lcpu is marked halted, and
	     * the cache is flushed again immediately before the hlt.
	     * NOTE(review): the second wbinvd presumably pushes the
	     * 'halted' store out to memory as well -- confirm.
	     */
	    __asm__ volatile ("wbinvd");
	    cpup->lcpu.halted = TRUE;
	    __asm__ volatile ( "wbinvd; hlt" );
	}
	break;
    }
}

/*
 * Called to initialize the power management structures for the CPUs.
 */
void
pmCPUStateInit(void)
{
    /* Nothing to initialize until a KEXT supplies the handler. */
    if (pmDispatch == NULL || pmDispatch->pmCPUStateInit == NULL) {
	return;
    }

    pmDispatch->pmCPUStateInit();
}

/*
 * Callback handed to the KEXT (as initComplete) to mark initialization
 * as finished.  Once the flag is set, the wrappers that test pmInitDone
 * start calling into the KEXT.
 */
static void
pmInitComplete(void)
{
    pmInitDone = 1;
}

/*
 * Callback: returns the logical CPU structure for CPU number 'cpu'.
 */
static x86_lcpu_t *
pmGetLogicalCPU(int cpu)
{
    x86_lcpu_t	*lcpu	= cpu_to_lcpu(cpu);

    return lcpu;
}

static x86_lcpu_t *
pmGetMyLogicalCPU(void)
{
    cpu_data_t	*cpup	= current_cpu_datap();

    return(&cpup->lcpu);
}

/*
 * Callback: returns the core structure for CPU number 'cpu'.
 */
static x86_core_t *
pmGetCore(int cpu)
{
    x86_core_t	*core	= cpu_to_core(cpu);

    return core;
}

static x86_core_t *
pmGetMyCore(void)
{
    cpu_data_t	*cpup	= current_cpu_datap();

    return(cpup->lcpu.core);
}

/*
 * Callback: returns the package structure for CPU number 'cpu'.
 */
static x86_pkg_t *
pmGetPackage(int cpu)
{
    x86_pkg_t	*pkg	= cpu_to_package(cpu);

    return pkg;
}

static x86_pkg_t *
pmGetMyPackage(void)
{
    cpu_data_t	*cpup	= current_cpu_datap();

    return(cpup->lcpu.core->package);
}

/*
 * Callback: takes the CPU topology lock when 'lock' is non-zero and
 * releases it when 'lock' is zero.
 */
static void
pmLockCPUTopology(int lock)
{
    if (lock == 0) {
	simple_unlock(&x86_topo_lock);
	return;
    }

    simple_lock(&x86_topo_lock);
}

/*
 * Called to get the next deadline that has been set by the
 * power management code.
 */
uint64_t
pmCPUGetDeadline(cpu_data_t *cpu)
{
    /*
     * Before the KEXT is initialized (or if it has no handler) there is
     * no power management deadline, so report EndOfAllTime.
     */
    if (!pmInitDone
	|| pmDispatch == NULL
	|| pmDispatch->GetDeadline == NULL) {
	return EndOfAllTime;
    }

    return pmDispatch->GetDeadline(&cpu->lcpu);
}

/*
 * Called to determine if the supplied deadline or the power management
 * deadline is sooner.  Returns which ever one is first.
 */
uint64_t
pmCPUSetDeadline(cpu_data_t *cpu, uint64_t deadline)
{
    /*
     * With no initialized KEXT handler the caller's deadline is handed
     * back unchanged.
     */
    if (!pmInitDone
	|| pmDispatch == NULL
	|| pmDispatch->SetDeadline == NULL) {
	return deadline;
    }

    /* Otherwise the KEXT decides which deadline to return. */
    return pmDispatch->SetDeadline(&cpu->lcpu, deadline);
}

/*
 * Called when a power management deadline expires.
 */
void
pmCPUDeadline(cpu_data_t *cpu)
{
    /* Ignored until the KEXT is initialized and supplies a handler. */
    if (!pmInitDone
	|| pmDispatch == NULL
	|| pmDispatch->Deadline == NULL) {
	return;
    }

    pmDispatch->Deadline(&cpu->lcpu);
}

/*
 * Called to get a CPU out of idle.
 */
boolean_t
pmCPUExitIdle(cpu_data_t *cpu)
{
    /* Default answer when the KEXT cannot be consulted: TRUE. */
    boolean_t		need_ipi	= TRUE;

    if (pmInitDone
	&& pmDispatch != NULL
	&& pmDispatch->exitIdle != NULL) {
	need_ipi = pmDispatch->exitIdle(&cpu->lcpu);
    }

    return need_ipi;
}

/*
 * Called when a CPU is being restarted after being powered off (as in S3).
 */
void
pmCPUMarkRunning(cpu_data_t *cpu)
{
    /* Ignored until the KEXT is initialized and supplies a handler. */
    if (!pmInitDone
	|| pmDispatch == NULL
	|| pmDispatch->markCPURunning == NULL) {
	return;
    }

    pmDispatch->markCPURunning(&cpu->lcpu);
}

/*
 * Called from the HPET interrupt handler to perform the
 * necessary power management work.
 */
void
pmHPETInterrupt(void)
{
    /* Ignored until the KEXT is initialized and supplies a handler. */
    if (!pmInitDone
	|| pmDispatch == NULL
	|| pmDispatch->HPETInterrupt == NULL) {
	return;
    }

    pmDispatch->HPETInterrupt();
}

/*
 * Called to get/set CPU power management state.
 */
int
pmCPUControl(uint32_t cmd, void *datap)
{
    /* -1 tells the caller that no KEXT handler was available. */
    if (pmDispatch == NULL || pmDispatch->pmCPUControl == NULL) {
	return -1;
    }

    return pmDispatch->pmCPUControl(cmd, datap);
}

/*
 * Set the worst-case time for the C4 to C2 transition.
 * No longer does anything.
 */
void
ml_set_maxsnoop(__unused uint32_t maxdelay)
{
    /* Intentionally empty: the argument is accepted and discarded. */
}


/*
 * Get the worst-case time for the C4 to C2 transition.  Returns nanoseconds.
 */
unsigned
ml_get_maxsnoop(void)
{
    uint64_t	snoop_ns;

    /* No KEXT handler: report zero. */
    if (pmDispatch == NULL || pmDispatch->getMaxSnoop == NULL) {
	return 0;
    }

    /* Only the low 32 bits of the KEXT's value are returned. */
    snoop_ns = pmDispatch->getMaxSnoop();
    return (unsigned)(snoop_ns & 0xffffffff);
}


/*
 * Get the maximum bus delay reported by the KEXT, truncated to its low
 * 32 bits.  Returns 0 when no KEXT handler is registered.
 */
uint32_t
ml_get_maxbusdelay(void)
{
    uint64_t	bus_delay;

    if (pmDispatch == NULL || pmDispatch->getMaxBusDelay == NULL) {
	return 0;
    }

    bus_delay = pmDispatch->getMaxBusDelay();
    return (uint32_t)bus_delay;
}

/*
 * Set the maximum delay time allowed for snoop on the bus.
 *
 * Note that this value will be compared to the amount of time that it takes
 * to transition from a non-snooping power state (C4) to a snooping state (C2).
 * If maxBusDelay is less than C4C2SnoopDelay,
 * we will not enter the lowest power state.
 */
void
ml_set_maxbusdelay(uint32_t mdelay)
{
    /* The value is carried as 64 bits from here on. */
    uint64_t	widened	= mdelay;

    /*
     * Until a KEXT handler is registered, remember the value in the
     * pre-KEXT state.
     */
    if (pmDispatch == NULL || pmDispatch->setMaxBusDelay == NULL) {
	pmInitState.maxBusDelay = widened;
	return;
    }

    pmDispatch->setMaxBusDelay(widened);
}

/*
 * Put a CPU into "safe" mode with respect to power.
 *
 * Some systems cannot operate at a continuous "normal" speed without
 * exceeding the thermal design.  This is called per-CPU to place the
 * CPUs into a "safe" operating mode.
 */
void
pmSafeMode(x86_lcpu_t *lcpu, uint32_t flags)
{
    /* A registered KEXT handler takes over completely. */
    if (pmDispatch != NULL && pmDispatch->pmCPUSafeMode != NULL) {
	pmDispatch->pmCPUSafeMode(lcpu, flags);
	return;
    }

    /*
     * Fallback for when the KEXT isn't present.
     *
     * Only the PAUSE and RESUME flags are examined; the other flag(s)
     * make no sense without the KEXT and are ignored.
     */

    /*
     * PAUSE: mark the logical CPU as halted so that it does nothing.
     * If that CPU is the one executing this code, spin here until the
     * halted flag is cleared.
     */
    if ((flags & PM_SAFE_FL_PAUSE) != 0) {
	lcpu->halted = TRUE;
	if (x86_lcpu() == lcpu) {
	    while (lcpu->halted) {
		cpu_pause();
	    }
	}
    }

    /*
     * RESUME: clear the halted flag for the specified CPU, which gets
     * it out of its spin loop.
     */
    if ((flags & PM_SAFE_FL_RESUME) != 0) {
	lcpu->halted = FALSE;
    }
}

/*
 * Returns the root of the package tree.
 */
static x86_pkg_t *
pmGetPkgRoot(void)
{
    /* x86_pkgs is the head of the package tree. */
    x86_pkg_t	*root	= x86_pkgs;

    return root;
}

/*
 * Callback: returns the cpu_hibernate field of the per-CPU data for
 * CPU number 'cpu'.
 */
static boolean_t
pmCPUGetHibernate(int cpu)
{
    boolean_t	hibernate	= cpu_datap(cpu)->cpu_hibernate;

    return hibernate;
}

/*
 * Callback: returns the cpu_processor field of the per-CPU data for
 * CPU number 'lcpu'.
 */
static processor_t
pmLCPUtoProcessor(int lcpu)
{
    processor_t	processor	= cpu_datap(lcpu)->cpu_processor;

    return processor;
}

/*
 * Called by the power management kext to register itself and to get the
 * callbacks it might need into other kernel functions.  This interface
 * is versioned to allow for slight mis-matches between the kext and the
 * kernel.
 */
void
pmKextRegister(uint32_t version, pmDispatch_t *cpuFuncs,
	       pmCallBacks_t *callbacks)
{
    /*
     * Fill in the kernel-side callback table, but only when the KEXT
     * supplied one and reports the dispatch version this kernel was
     * built with.
     */
    if (version == PM_DISPATCH_VERSION && callbacks != NULL) {
	/* Pre-KEXT state and completion notification. */
	callbacks->InitState       = &pmInitState;
	callbacks->initComplete    = pmInitComplete;

	/* Timer support. */
	callbacks->setRTCPop       = setPop;
	callbacks->resyncDeadlines = etimer_resync_deadlines;

	/* Topology lookups by CPU number. */
	callbacks->GetLCPU         = pmGetLogicalCPU;
	callbacks->GetCore         = pmGetCore;
	callbacks->GetPackage      = pmGetPackage;

	/* Topology lookups for the calling CPU. */
	callbacks->GetMyLCPU       = pmGetMyLogicalCPU;
	callbacks->GetMyCore       = pmGetMyCore;
	callbacks->GetMyPackage    = pmGetMyPackage;

	/* Whole-topology information and locking. */
	callbacks->CoresPerPkg     = cpuid_info()->cpuid_cores_per_package;
	callbacks->GetPkgRoot      = pmGetPkgRoot;
	callbacks->LockCPUTopology = pmLockCPUTopology;

	/* Per-CPU data accessors. */
	callbacks->GetHibernate    = pmCPUGetHibernate;
	callbacks->LCPUtoProcessor = pmLCPUtoProcessor;
    }

    /*
     * Install the KEXT's dispatch table.
     * NOTE(review): this happens whenever cpuFuncs is non-NULL, even if
     * the version check above failed -- confirm that is intended.
     */
    if (cpuFuncs == NULL) {
	return;
    }

    pmDispatch = cpuFuncs;
}

/*
 * Unregisters the power management functions from the kext.
 */
void
pmUnRegister(pmDispatch_t *cpuFuncs)
{
    /*
     * Only the table that is currently installed may be removed; a NULL
     * or non-matching pointer leaves the registration untouched.
     */
    if (cpuFuncs == NULL || pmDispatch != cpuFuncs) {
	return;
    }

    pmDispatch = NULL;
}
}