/*
 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 */

#ifndef _KERN_ZALLOC_INTERNAL_H_
#define _KERN_ZALLOC_INTERNAL_H_

#include <kern/zalloc.h>
#include <kern/locks.h>
#include <kern/btlog.h>
#include <kern/simple_lock.h>

#include <os/atomic_private.h>
#include <sys/queue.h>

#if KASAN
#include <san/kasan.h>
#include <kern/spl.h>
#endif /* KASAN */

/*!
 * @file <kern/zalloc_internal.h>
 *
 * @abstract
 * Exposes some guts of zalloc to interact with the VM, debugging, copyio and
 * kalloc subsystems.
 */

__BEGIN_DECLS

#pragma GCC visibility push(hidden)

#if CONFIG_GZALLOC
typedef struct gzalloc_data {
	uint32_t        gzfc_index;
	vm_offset_t     *gzfc;
} gzalloc_data_t;
#endif

/*
 *	A zone is a collection of fixed size blocks for which there
 *	is fast allocation/deallocation access.  Kernel routines can
 *	use zones to manage data structures dynamically, creating a zone
 *	for each type of data structure to be managed.
 */

/*!
 * @typedef zone_pva_t
 *
 * @brief
 * Type used to point to a page virtual address in the zone allocator.
 *
 * @discussion
 * - Valid pages have the top bit set.
 * - 0 represents the "NULL" page.
 * - Non-0 values with the top bit cleared do not represent any valid page;
 *   the zone freelists use this space to encode "queue" addresses.
 */
typedef struct zone_packed_virtual_address {
	uint32_t packed_address;
} zone_pva_t;
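
/*
 * Illustrative sketch of an encoding consistent with the invariants above.
 * These helpers are hypothetical (the real pack/unpack accessors live in
 * zalloc.c); the point is that packing keeps the page number of a
 * sign-extended kernel VA, which leaves the top bit set for valid pages:
 *
 *	static inline zone_pva_t
 *	example_pva_pack(vm_offset_t page_addr)
 *	{
 *		// keep the page number; kernel VAs leave the top bit set
 *		return (zone_pva_t){ (uint32_t)((intptr_t)page_addr >> PAGE_SHIFT) };
 *	}
 *
 *	static inline vm_offset_t
 *	example_pva_unpack(zone_pva_t pva)
 *	{
 *		// sign-extend so that high kernel addresses round-trip
 *		return (vm_offset_t)((intptr_t)(int32_t)pva.packed_address << PAGE_SHIFT);
 *	}
 */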

/*!
 * @struct zone_stats
 *
 * @abstract
 * Per-cpu structure used for basic zone stats.
 *
 * @discussion
 * The values aren't scaled for per-cpu zones.
 */
struct zone_stats {
	uint64_t            zs_mem_allocated;
	uint64_t            zs_mem_freed;
	uint32_t            zs_poison_seqno; /* counter for poisoning every N frees */
	uint32_t            zs_alloc_rr;     /* allocation round-robin bias */
};

STAILQ_HEAD(zone_depot, zone_magazine);

struct zone {
	/*
	 * Readonly / rarely written fields
	 */

	/*
	 * The first 4 fields match a zone_view.
	 *
	 * z_self points back to the zone when the zone is initialized,
	 * and is NULL otherwise.
	 */
	struct zone        *z_self;
	zone_stats_t        z_stats;
	const char         *z_name;
	struct zone_view   *z_views;

	struct thread      *z_expander;
	struct zone_cache  *__zpercpu z_pcpu_cache;

	uint16_t            z_chunk_pages;  /* pages per chunk used to grow the zone */
	uint16_t            z_chunk_elems;  /* count of allocations per chunk */
	uint16_t            z_elems_rsv;    /* maintain a free reserve of elements */
	uint16_t            z_elem_size;    /* size of an element                  */

	uint64_t
	/*
	 * Lifecycle state (Mutable after creation)
	 */
	    z_destroyed        :1,  /* zone is (being) destroyed */
	    z_async_refilling  :1,  /* asynchronous allocation pending? */
	    z_replenish_wait   :1,  /* someone is waiting on the replenish thread */
	    z_expanding_wait   :1,  /* is thread waiting for expansion? */
	    z_expander_vm_priv :1,  /* a vm privileged thread is expanding */

	/*
	 * Security sensitive configuration bits
	 */
	    z_allows_foreign   :1,  /* allow non-zalloc space  */
	    z_destructible     :1,  /* zone can be zdestroy()ed  */
	    kalloc_heap        :2,  /* zone_kheap_id_t when part of a kalloc heap */
	    z_noencrypt        :1,  /* do not encrypt pages when hibernating */
	    z_submap_idx       :2,  /* a Z_SUBMAP_IDX_* value */
	    z_va_sequester     :1,  /* page sequester: no VA reuse with other zones */
	    z_free_zeroes      :1,  /* clear memory of elements on free and assert on alloc */

	/*
	 * Behavior configuration bits
	 */
	    z_percpu           :1,  /* the zone is percpu */
	    z_permanent        :1,  /* the zone allocations are permanent */
	    z_replenishes      :1,  /* uses the async replenish mechanism for VM */
	    z_nocaching        :1,  /* disallow zone caching for this zone */
	    collectable        :1,  /* garbage collect empty pages */
	    exhaustible        :1,  /* merely return if empty? */
	    expandable         :1,  /* expand zone (with message)? */
	    no_callout         :1,

	    _reserved          :26,

	/*
	 * Debugging features
	 */
	    alignment_required :1,  /* element alignment needs to be preserved */
	    gzalloc_tracked    :1,  /* this zone is tracked by gzalloc */
	    gzalloc_exempt     :1,  /* this zone doesn't participate with gzalloc */
	    kasan_fakestacks   :1,
	    kasan_noquarantine :1,  /* whether to use the kasan quarantine */
	    tag_zone_index     :7,
	    tags               :1,
	    tags_inline        :1,
	    zleak_on           :1,  /* Are we collecting allocation information? */
	    zone_logging       :1;  /* Enable zone logging for this zone. */

	/*
	 * often mutated fields
	 */

	lck_spin_t          z_lock;
	struct zone_depot   z_recirc;

	/*
	 * Page accounting (wired / VA)
	 *
	 * Those numbers are unscaled for z_percpu zones
	 * (zone_scale_for_percpu() needs to be used to find the true value).
	 */
	uint32_t            z_wired_max;    /* how large can this zone grow        */
	uint32_t            z_wired_hwm;    /* z_wired_cur high watermark          */
	uint32_t            z_wired_cur;    /* number of pages used by this zone   */
	uint32_t            z_wired_empty;  /* pages collectable by GC             */
	uint32_t            z_va_cur;       /* amount of VA used by this zone      */

	/*
	 * list of metadata structs, which maintain per-page free element lists
	 *
	 * Note: Due to the index packing in page metadata,
	 *       these pointers can't be at the beginning of the zone struct.
	 */
	zone_pva_t          z_pageq_empty;  /* populated, completely empty pages   */
	zone_pva_t          z_pageq_partial;/* populated, partially filled pages   */
	zone_pva_t          z_pageq_full;   /* populated, completely full pages    */
	zone_pva_t          z_pageq_va;     /* non-populated VA pages              */

	/*
	 * Zone statistics
	 *
	 * z_contention_wma:
	 *   weighted moving average of the number of contentions per second,
	 *   in Z_CONTENTION_WMA_UNIT units (fixed point decimal).
	 *
	 * z_contention_cur:
	 *   count of recorded contentions that will be fused in z_contention_wma
	 *   at the next period.
	 *
	 * z_recirc_cur:
	 *   number of magazines in the recirculation depot.
	 *
	 * z_elems_free:
	 *   number of free elements in the zone.
	 *
	 * z_elems_{min,max}:
	 *   tracks the low/high watermark of z_elems_free for the current
	 *   weighted moving average period.
	 *
	 * z_elems_free_wss:
	 *   weighted moving average of the (z_elems_free_max - z_elems_free_min)
	 *   amplitude, which is used by the GC for trim operations.
	 *
	 * z_elems_avail:
	 *   total number of elements in the zone.
	 */
#define Z_CONTENTION_WMA_UNIT (1u << 8)
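	/*
	 * For example, with Z_CONTENTION_WMA_UNIT == 256, a z_contention_wma
	 * value of 640 encodes a moving average of 640 / 256 == 2.5
	 * contentions per second.
	 */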
	uint32_t            z_contention_wma;
	uint32_t            z_contention_cur;
	uint32_t            z_recirc_cur;
	uint32_t            z_elems_free_max;
	uint32_t            z_elems_free_wss;
	uint32_t            z_elems_free_min;
	uint32_t            z_elems_free;   /* Number of free elements             */
	uint32_t            z_elems_avail;  /* Number of elements available        */

#if CONFIG_ZLEAKS
	uint32_t            zleak_capture;  /* per-zone counter for capturing every N allocations */
#endif
#if CONFIG_GZALLOC
	gzalloc_data_t      gz;
#endif
#if KASAN_ZALLOC
	uint32_t            z_kasan_redzone;
	spl_t               z_kasan_spl;
#endif
#if DEBUG || DEVELOPMENT || CONFIG_ZLEAKS
	/* zone logging structure to hold stacks and element references to those stacks. */
	btlog_t            *zlog_btlog;
#endif
};


__options_decl(zone_security_options_t, uint64_t, {
	/*
	 * Zsecurity option to enable sequestering the VA of zones
	 */
	ZSECURITY_OPTIONS_SEQUESTER             = 0x00000001,
	/*
	 * Zsecurity option to enable creating separate kalloc zones for
	 * bags of bytes
	 */
	ZSECURITY_OPTIONS_SUBMAP_USER_DATA      = 0x00000004,
	/*
	 * Zsecurity option to enable sequestering of kalloc zones used by
	 * kexts (KHEAP_KEXT heap)
	 */
	ZSECURITY_OPTIONS_SEQUESTER_KEXT_KALLOC = 0x00000008,
	/*
	 * Zsecurity option to enable strict free of iokit objects to the
	 * zone or heap they were allocated from.
	 */
	ZSECURITY_OPTIONS_STRICT_IOKIT_FREE     = 0x00000010,
});

#define KALLOC_MINALIGN     (1 << KALLOC_LOG2_MINALIGN)
#define KALLOC_DLUT_SIZE    (2048 / KALLOC_MINALIGN)

struct kheap_zones {
	struct kalloc_zone_cfg         *cfg;
	struct kalloc_heap             *views;
	zone_kheap_id_t                 heap_id;
	uint16_t                        max_k_zone;
	uint8_t                         dlut[KALLOC_DLUT_SIZE];   /* table of indices into k_zone[] */
	uint8_t                         k_zindex_start;
	/* If there's no hit in the DLUT, then start searching from k_zindex_start. */
	zone_t                         *k_zone;
};
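
/*
 * Illustrative DLUT lookup, assuming the conventions described above
 * (hypothetical helper: the real lookup lives in the kalloc implementation,
 * and ZONE_NULL here stands for "no DLUT hit, scan from k_zindex_start"):
 *
 *	static inline zone_t
 *	example_kheap_zone_for_size(struct kheap_zones *kh, vm_size_t size)
 *	{
 *		// one dlut entry per KALLOC_MINALIGN granule, covering ~2KiB
 *		vm_size_t idx = (size + KALLOC_MINALIGN - 1) / KALLOC_MINALIGN;
 *		if (idx < KALLOC_DLUT_SIZE) {
 *			return kh->k_zone[kh->dlut[idx]];
 *		}
 *		return ZONE_NULL; // caller searches from k_zindex_start
 *	}
 */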

extern zone_security_options_t zsecurity_options;
extern zone_id_t _Atomic       num_zones;
extern uint32_t                zone_view_count;
extern struct zone             zone_array[];
extern const char * const      kalloc_heap_names[KHEAP_ID_COUNT];
extern bool                    panic_include_zprint;
#if CONFIG_ZLEAKS
extern bool                    panic_include_ztrace;
extern struct ztrace          *top_ztrace;
#endif
extern mach_memory_info_t     *panic_kext_memory_info;
extern vm_size_t               panic_kext_memory_size;
extern unsigned int            zone_map_jetsam_limit;

#define zone_index_foreach(i) \
	for (zone_id_t i = 1, num_zones_##i = os_atomic_load(&num_zones, acquire); \
	    i < num_zones_##i; i++)

#define zone_foreach(z) \
	for (zone_t z = &zone_array[1], \
	    last_zone_##z = &zone_array[os_atomic_load(&num_zones, acquire)]; \
	    z < last_zone_##z; z++)
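
/*
 * Usage sketch: both iterators snapshot num_zones once and start at index 1,
 * skipping the unused zone_array[0] slot, e.g.:
 *
 *	zone_foreach(z) {
 *		if (z->z_self == z) {
 *			// z is a fully initialized zone
 *		}
 *	}
 */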

struct zone_map_range {
	vm_offset_t min_address;
	vm_offset_t max_address;
} __attribute__((aligned(2 * sizeof(vm_offset_t))));

__pure2
static inline vm_offset_t
zone_elem_size(zone_t zone)
{
	return zone->z_elem_size;
}

static inline uint32_t
zone_count_allocated(zone_t zone)
{
	return zone->z_elems_avail - zone->z_elems_free;
}

static inline vm_size_t
zone_scale_for_percpu(zone_t zone, vm_size_t size)
{
	if (zone->z_percpu) {
		size *= zpercpu_count();
	}
	return size;
}
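
/*
 * For example, for a z_percpu zone on a system where zpercpu_count() == 4,
 * zone_scale_for_percpu(zone, 8) returns 32: each logical element has one
 * per-CPU replica, so sizes and counts are multiplied by the CPU count.
 */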

static inline vm_size_t
zone_size_wired(zone_t zone)
{
	/*
	 * This either requires the zone lock to be held,
	 * or must be used for statistics purposes only.
	 */
	vm_size_t size = ptoa(os_atomic_load(&zone->z_wired_cur, relaxed));
	return zone_scale_for_percpu(zone, size);
}

static inline vm_size_t
zone_size_free(zone_t zone)
{
	return zone_scale_for_percpu(zone,
	           (vm_size_t)zone->z_elem_size * zone->z_elems_free);
}

static inline vm_size_t
zone_size_allocated(zone_t zone)
{
	return zone_scale_for_percpu(zone,
	           (vm_size_t)zone->z_elem_size * zone_count_allocated(zone));
}

static inline vm_size_t
zone_size_wasted(zone_t zone)
{
	return zone_size_wired(zone) - zone_scale_for_percpu(zone,
	           (vm_size_t)zone->z_elem_size * zone->z_elems_avail);
}

/*
 * Backs the sysctl kern.zones_collectable_bytes, used by memory_maintenance
 * to check whether a userspace reboot is needed. The only other way to query
 * this information is via mach_memory_info(), which is unavailable on
 * release kernels.
 */
extern uint64_t get_zones_collectable_bytes(void);

/*!
 * @enum zone_gc_level_t
 *
 * @const ZONE_GC_TRIM
 * Request a trimming GC: it will trim allocations in excess
 * of the working set size estimate only.
 *
 * @const ZONE_GC_DRAIN
 * Request a draining GC: this is an aggressive mode that will
 * cause all caches to be drained and all free pages returned to the system.
 *
 * @const ZONE_GC_JETSAM
 * Request to consider a jetsam, and then fallback to @c ZONE_GC_TRIM or
 * @c ZONE_GC_DRAIN depending on the state of the zone map.
 * To avoid deadlocks, only @c vm_pageout_garbage_collect() should ever
 * request a @c ZONE_GC_JETSAM level.
 */
__enum_closed_decl(zone_gc_level_t, uint32_t, {
	ZONE_GC_TRIM,
	ZONE_GC_DRAIN,
	ZONE_GC_JETSAM,
});

/*!
 * @function zone_gc
 *
 * @brief
 * Reduces memory used by zones by trimming caches and freelists.
 *
 * @discussion
 * @c zone_gc() is called:
 * - by the pageout daemon when the system needs more free pages.
 * - by the VM when contiguous page allocation requests get stuck
 *   (see vm_page_find_contiguous()).
 *
 * @param level         The zone GC level requested.
 */
extern void     zone_gc(zone_gc_level_t level);

extern void     zone_gc_trim(void);
extern void     zone_gc_drain(void);

#define ZONE_WSS_UPDATE_PERIOD  10
/*!
 * @function compute_zone_working_set_size
 *
 * @brief
 * Recomputes the working set size for every zone
 *
 * @discussion
 * This runs about every @c ZONE_WSS_UPDATE_PERIOD seconds (10),
 * computing an exponential moving average with a weight of 75%,
 * so that the history of the last minute is the dominating factor.
 */
extern void     compute_zone_working_set_size(void *);
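
/*
 * Illustrative update step, assuming the 75% weight described above applies
 * to the previous average (hypothetical helper; the real computation lives
 * in zalloc.c):
 *
 *	static inline uint32_t
 *	example_wss_update(uint32_t old_wss, uint32_t sample)
 *	{
 *		// new = 0.75 * old + 0.25 * sample; after 6 periods (~1 minute)
 *		// the older history only weighs 0.75^6 ~= 18%
 *		return (3 * old_wss + sample) / 4;
 *	}
 */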

/* Debug logging for zone-map-exhaustion jetsams. */
extern void     get_zone_map_size(uint64_t *current_size, uint64_t *capacity);
extern void     get_largest_zone_info(char *zone_name, size_t zone_name_len, uint64_t *zone_size);

/* Bootstrap zone module (create zone zone) */
extern void     zone_bootstrap(void);

/*!
 * @function zone_foreign_mem_init
 *
 * @brief
 * Steal memory from the pmap (prior to the initialization of zalloc) for the
 * special VM zones that allow foreign memory, and store the range so as to
 * facilitate range checking in zfree.
 */
__startup_func
extern vm_offset_t zone_foreign_mem_init(
	vm_size_t       size);

/*!
 * @function zone_get_foreign_alloc_size
 *
 * @brief
 * Compute the correct size (greater than @c ptoa(min_pages)) that is a multiple
 * of the allocation granule for the zone with the given creation flags and
 * element size.
 */
__startup_func
extern vm_size_t zone_get_foreign_alloc_size(
	const char          *name __unused,
	vm_size_t            elem_size,
	zone_create_flags_t  flags,
	uint16_t             min_pages);

/*!
 * @function zone_cram_foreign
 *
 * @brief
 * Cram memory allocated with @c zone_foreign_mem_init() into a zone.
 *
 * @param zone          The zone to cram memory into.
 * @param newmem        The base address for the memory to cram.
 * @param size          The size of the memory to cram into the zone.
 */
__startup_func
extern void     zone_cram_foreign(
	zone_t          zone,
	vm_offset_t     newmem,
	vm_size_t       size);

extern bool     zone_maps_owned(
	vm_address_t    addr,
	vm_size_t       size);

extern void     zone_map_sizes(
	vm_map_size_t  *psize,
	vm_map_size_t  *pfree,
	vm_map_size_t  *plargest_free);

extern bool
zone_map_nearing_exhaustion(void);

#if defined(__LP64__)
#define ZONE_POISON       0xdeadbeefdeadbeef
#else
#define ZONE_POISON       0xdeadbeef
#endif

static inline vm_tag_t
zalloc_flags_get_tag(zalloc_flags_t flags)
{
	return (vm_tag_t)((flags & Z_VM_TAG_MASK) >> Z_VM_TAG_SHIFT);
}
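
/*
 * For example, assuming the tag was packed into the flags with the
 * Z_VM_TAG() encoding from <kern/zalloc.h>:
 *
 *	zalloc_flags_t flags = Z_WAITOK | Z_VM_TAG(VM_KERN_MEMORY_KALLOC);
 *	vm_tag_t tag = zalloc_flags_get_tag(flags);
 *	// tag == VM_KERN_MEMORY_KALLOC
 */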

extern void    *zalloc_ext(
	zone_t          zone,
	zone_stats_t    zstats,
	zalloc_flags_t  flags);

extern void     zfree_ext(
	zone_t          zone,
	zone_stats_t    zstats,
	void           *addr);

/*!
 * @function zone_replenish_configure
 *
 * @brief
 * Used by zones backing the VM to maintain a reserve of free elements.
 *
 * @discussion
 * This function should not be used by anyone else than the VM.
 */
extern void     zone_replenish_configure(
	zone_t          zone);

extern vm_size_t zone_element_size(
	void           *addr,
	zone_t         *z);

/*!
 * @function zone_owns
 *
 * @abstract
 * This function is a soft version of zone_require: it checks whether a given
 * pointer belongs to the specified zone. It should not be used outside
 * allocator code.
 *
 * @discussion
 * Note that zone_owns() can only work with:
 * - zones not allowing foreign memory
 * - zones in the general submap.
 *
 * @param zone          the zone the address needs to belong to.
 * @param addr          the element address to check.
 */
extern bool     zone_owns(
	zone_t          zone,
	void           *addr);

/*
 *  Structure for keeping track of a backtrace, used for leak detection.
 *  This is in the .h file because it is used during panic, see kern/debug.c
 *  A non-zero size indicates that the trace is in use.
 */
struct ztrace {
	vm_size_t               zt_size;                        /* How much memory are all the allocations referring to this trace taking up? */
	uint32_t                zt_depth;                       /* depth of stack (0 to MAX_ZTRACE_DEPTH) */
	void*                   zt_stack[MAX_ZTRACE_DEPTH];     /* series of return addresses from OSBacktrace */
	uint32_t                zt_collisions;                  /* How many times did a different stack land here while it was occupied? */
	uint32_t                zt_hit_count;                   /* for determining effectiveness of hash function */
};

#ifndef VM_MAX_TAG_ZONES
#error "VM_MAX_TAG_ZONES is not defined"
#endif
#if VM_MAX_TAG_ZONES

extern uint32_t zone_index_from_tag_index(
	uint32_t        tag_zone_index,
	vm_size_t      *elem_size);

#endif /* VM_MAX_TAG_ZONES */

static inline void
zone_lock(zone_t zone)
{
#if KASAN_ZALLOC
	spl_t s = 0;
	if (zone->kasan_fakestacks) {
		s = splsched();
	}
#endif /* KASAN_ZALLOC */
	lck_spin_lock(&zone->z_lock);
#if KASAN_ZALLOC
	zone->z_kasan_spl = s;
#endif /* KASAN_ZALLOC */
}

static inline void
zone_unlock(zone_t zone)
{
#if KASAN_ZALLOC
	spl_t s = zone->z_kasan_spl;
	zone->z_kasan_spl = 0;
#endif /* KASAN_ZALLOC */
	lck_spin_unlock(&zone->z_lock);
#if KASAN_ZALLOC
	if (zone->kasan_fakestacks) {
		splx(s);
	}
#endif /* KASAN_ZALLOC */
}
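
/*
 * Usage sketch: z_lock is a spinlock, so the critical section must stay
 * short. With KASAN fakestacks, the spl dance above additionally disables
 * interrupts (an assumption based on the splsched()/splx() pairing: it
 * keeps interrupt-level fakestack activity from taking the lock
 * recursively):
 *
 *	zone_lock(zone);
 *	// ... mutate z_pageq_* / z_elems_free ...
 *	zone_unlock(zone);
 */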

#if CONFIG_GZALLOC
void gzalloc_init(vm_size_t);
void gzalloc_zone_init(zone_t);
void gzalloc_empty_free_cache(zone_t);
boolean_t gzalloc_enabled(void);

vm_offset_t gzalloc_alloc(zone_t, zone_stats_t zstats, zalloc_flags_t flags);
void gzalloc_free(zone_t, zone_stats_t zstats, void *);
boolean_t gzalloc_element_size(void *, zone_t *, vm_size_t *);
#endif /* CONFIG_GZALLOC */

#define MAX_ZONE_NAME   32      /* max length of a zone name we can take from the boot-args */
int track_this_zone(const char *zonename, const char *logname);

#if DEBUG || DEVELOPMENT
extern boolean_t run_zone_test(void);
extern void zone_gc_replenish_test(void);
extern void zone_alloc_replenish_test(void);
extern vm_size_t zone_element_info(void *addr, vm_tag_t * ptag);
extern bool zalloc_disable_copyio_check;
#else
#define zalloc_disable_copyio_check false
#endif /* DEBUG || DEVELOPMENT */

#pragma GCC visibility pop

__END_DECLS

#endif  /* _KERN_ZALLOC_INTERNAL_H_ */