#if defined(__aarch64__) || defined(__arm64__)|| defined (_M_ARM64)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <fficonfig.h>
#include <ffi.h>
#include <ffi_common.h>
#include "internal.h"
#ifdef _M_ARM64
#include <windows.h>
#endif
#if __has_feature(ptrauth_calls)
#include <ptrauth.h>
#endif
#if FFI_TYPE_DOUBLE != FFI_TYPE_LONGDOUBLE
# if FFI_TYPE_LONGDOUBLE != 4
# error FFI_TYPE_LONGDOUBLE out of date
# endif
#else
# undef FFI_TYPE_LONGDOUBLE
# define FFI_TYPE_LONGDOUBLE 4
#endif
/* One 64-bit lane of a vector-register slot, addressable either as a
   single 64-bit value or as two 32-bit halves.  */
union _d
{
UINT64 d;
UINT32 s[2];
};
/* Storage for one vector argument register: two 64-bit lanes, with the
   array forced to 16-byte alignment.  */
struct _v
{
union _d d[2] __attribute__((aligned(16)));
};
/* Register image handed to / received from the assembly entry points:
   the vector argument slots come first, followed by the general-purpose
   argument slots.
   NOTE(review): the field order presumably has to match offsets that
   are hard-coded in the assembly -- confirm before changing it.  */
struct call_context
{
struct _v v[N_V_ARG_REG];
UINT64 x[N_X_ARG_REG];
};
#if FFI_EXEC_TRAMPOLINE_TABLE
#ifdef __MACH__
#include <mach/vm_param.h>
#endif
#else
#if defined (__clang__) && defined (__APPLE__)
extern void sys_icache_invalidate (void *start, size_t len);
#endif
/* Flush the instruction cache for the half-open range [START, END),
   using whichever primitive the current toolchain provides.  */
static inline void
ffi_clear_cache (void *start, void *end)
{
#if defined (__clang__) && defined (__APPLE__)
  char *const first = start;
  char *const last = end;

  sys_icache_invalidate (first, last - first);
#elif defined (__GNUC__)
  __builtin___clear_cache (start, end);
#elif defined (_M_ARM64)
  char *const first = start;
  char *const last = end;

  FlushInstructionCache (GetCurrentProcess (), first, last - first);
#else
#error "Missing builtin to flush instruction cache"
#endif
}
#endif
/* Return the type code of the first non-aggregate member found by a
   depth-first walk of TY's elements, descending into nested struct and
   complex members.  Returns -1 when no such member exists (including
   when TY has no element list).  */
static int
is_hfa0 (const ffi_type *ty)
{
  ffi_type **member = ty->elements;
  int found = -1;

  if (member == NULL)
    return found;

  for (; *member != NULL; ++member)
    {
      found = (*member)->type;
      if (found != FFI_TYPE_STRUCT && found != FFI_TYPE_COMPLEX)
	return found;

      /* Aggregate member: look inside it; keep scanning siblings only
	 if it turned out to contain nothing usable.  */
      found = is_hfa0 (*member);
      if (found >= 0)
	return found;
    }

  return found;
}
/* Return 1 if every leaf member of TY (recursing through nested struct
   and complex members) has type code CANDIDATE, otherwise 0.  A type
   with no element list trivially passes.  */
static int
is_hfa1 (const ffi_type *ty, int candidate)
{
  ffi_type **member = ty->elements;

  if (member == NULL)
    return 1;

  while (*member != NULL)
    {
      const ffi_type *cur = *member++;
      int kind = cur->type;
      int nested = (kind == FFI_TYPE_STRUCT || kind == FFI_TYPE_COMPLEX);

      if (nested ? !is_hfa1 (cur, candidate) : kind != candidate)
	return 0;
    }

  return 1;
}
/* Classify TY as a floating-point scalar, a complex of floating-point
   parts, or a homogeneous floating-point aggregate of one to four
   elements.

   Returns 0 when TY is none of those.  Otherwise returns
   BASE * 4 + (4 - COUNT), where BASE is the FFI_TYPE_* code of the
   element type and COUNT the number of elements; callers in this file
   recover COUNT as 4 - (value & 3).
   NOTE(review): the encoding presumably lines up with the
   AARCH64_RET_S4.. constants -- confirm against internal.h.  */
static int
is_vfp_type (const ffi_type *ty)
{
  ffi_type **members;
  int base = ty->type;
  int idx;
  size_t total, unit, count;

  /* Plain floating-point scalar: exactly one element.  */
  if (base == FFI_TYPE_FLOAT || base == FFI_TYPE_DOUBLE
      || base == FFI_TYPE_LONGDOUBLE)
    return base * 4 + (4 - 1);

  /* Complex: two elements of the part type, if that is floating point.  */
  if (base == FFI_TYPE_COMPLEX)
    {
      base = ty->elements[0]->type;
      if (base == FFI_TYPE_FLOAT || base == FFI_TYPE_DOUBLE
	  || base == FFI_TYPE_LONGDOUBLE)
	return base * 4 + (4 - 2);
      return 0;
    }

  if (base != FFI_TYPE_STRUCT)
    return 0;

  /* Quick size rejection before walking the members.  */
  total = ty->size;
  if (total < 4 || total > 64)
    return 0;

  /* Determine the type of the first non-aggregate member.  */
  members = ty->elements;
  base = members[0]->type;
  if (base == FFI_TYPE_STRUCT || base == FFI_TYPE_COMPLEX)
    {
      idx = 0;
      do
	base = is_hfa0 (members[idx++]);
      while (base < 0);
    }

  /* The whole struct must be an exact multiple of that element size.  */
  switch (base)
    {
    case FFI_TYPE_FLOAT:
      unit = sizeof (float);
      break;
    case FFI_TYPE_DOUBLE:
      unit = sizeof (double);
      break;
    case FFI_TYPE_LONGDOUBLE:
      unit = sizeof (long double);
      break;
    default:
      return 0;
    }

  count = total / unit;
  if (count * unit != total || count > 4)
    return 0;

  /* Finally, every leaf member must have that same type.  */
  for (idx = 0; members[idx] != NULL; ++idx)
    {
      const ffi_type *cur = members[idx];
      int kind = cur->type;

      if (kind == FFI_TYPE_STRUCT || kind == FFI_TYPE_COMPLEX)
	{
	  if (!is_hfa1 (cur, base))
	    return 0;
	}
      else if (kind != base)
	return 0;
    }

  return base * 4 + (4 - (int) count);
}
/* Running allocation state used while assigning arguments to registers
   and to the stack.  */
struct arg_state
{
/* Index of the next unused entry of call_context.x.  */
unsigned ngrn;
/* Index of the next unused entry of call_context.v.  */
unsigned nsrn;
/* Byte offset, from the start of the stack argument area, of the next
   unused stack location.  */
size_t nsaa;
#if defined (__APPLE__)
/* Set to 1 once the last fixed argument has been processed; from then
   on allocate_to_stack raises alignments below 8 up to 8.  */
unsigned allocating_variadic;
#endif
};
/* Reset STATE so that no register or stack space is considered used.  */
static void
arg_init (struct arg_state *state)
{
  state->ngrn = state->nsrn = 0;
  state->nsaa = 0;
#if defined (__APPLE__)
  state->allocating_variadic = 0;
#endif
}
/* Reserve SIZE bytes in the stack argument area that starts at STACK,
   aligned to ALIGNMENT, and return the address of the reservation.
   STATE->nsaa is advanced past it.

   Alignments below 8 are raised to 8, except on Apple targets where
   that only happens once variadic arguments are being allocated.  */
static void *
allocate_to_stack (struct arg_state *state, void *stack,
		   size_t alignment, size_t size)
{
  size_t offset;
#if defined (__APPLE__)
  const int widen = (state->allocating_variadic != 0);
#else
  const int widen = 1;
#endif

  if (widen && alignment < 8)
    alignment = 8;

  offset = FFI_ALIGN (state->nsaa, alignment);
  state->nsaa = offset + size;

  return (char *) stack + offset;
}
/* Load the integer or pointer of kind TYPE stored at SOURCE and widen
   it to ffi_arg; signed kinds are read through signed types, unsigned
   kinds through unsigned ones.  Aborts on any other TYPE.  */
static ffi_arg
extend_integer_type (void *source, int type)
{
  ffi_arg widened;

  switch (type)
    {
    case FFI_TYPE_UINT8:
      widened = *(UINT8 *) source;
      break;
    case FFI_TYPE_SINT8:
      widened = *(SINT8 *) source;
      break;
    case FFI_TYPE_UINT16:
      widened = *(UINT16 *) source;
      break;
    case FFI_TYPE_SINT16:
      widened = *(SINT16 *) source;
      break;
    case FFI_TYPE_UINT32:
      widened = *(UINT32 *) source;
      break;
    case FFI_TYPE_INT:
    case FFI_TYPE_SINT32:
      widened = *(SINT32 *) source;
      break;
    case FFI_TYPE_UINT64:
    case FFI_TYPE_SINT64:
      widened = *(UINT64 *) source;
      break;
    case FFI_TYPE_POINTER:
      widened = *(uintptr_t *) source;
      break;
    default:
      abort ();
    }

  return widened;
}
#if defined(_MSC_VER)
void extend_hfa_type (void *dest, void *src, int h);
#else
/* Spread the elements of the floating-point value or homogeneous
   aggregate at SRC into consecutive 16-byte slots at DEST, one element
   per slot.

   H selects the case; F is its distance from AARCH64_RET_S4.  The
   assembly branches into a table that starts at label 0 and whose
   entries are three instructions (12 bytes) each, which is why F is
   scaled by 12 and why shorter entries are padded with NOPs.  The table
   order below is four, three, two, one element(s) of S (32-bit), then of
   D (64-bit), then of Q (128-bit) registers.
   NOTE(review): this relies on the AARCH64_RET_S4.. constants being
   consecutive in exactly that order -- confirm against internal.h.

   Each entry loads its elements into v16..v19 and jumps to label
   4, 3, 2 or 1.  Those labels fall through into each other, so entering
   at label N stores N full Q registers at DEST + 0/16/32/48.  */
static void
extend_hfa_type (void *dest, void *src, int h)
{
ssize_t f = h - AARCH64_RET_S4;
void *x0;
asm volatile (
/* Compute the address of the selected table entry and jump to it.  */
"adr %0, 0f\n"
" add %0, %0, %1\n"
" br %0\n"
/* Single-precision entries: 4, 3, 2, 1 element(s).  */
"0: ldp s16, s17, [%3]\n"
" ldp s18, s19, [%3, #8]\n"
" b 4f\n"
" ldp s16, s17, [%3]\n"
" ldr s18, [%3, #8]\n"
" b 3f\n"
" ldp s16, s17, [%3]\n"
" b 2f\n"
" nop\n"
" ldr s16, [%3]\n"
" b 1f\n"
" nop\n"
/* Double-precision entries: 4, 3, 2, 1 element(s).  */
" ldp d16, d17, [%3]\n"
" ldp d18, d19, [%3, #16]\n"
" b 4f\n"
" ldp d16, d17, [%3]\n"
" ldr d18, [%3, #16]\n"
" b 3f\n"
" ldp d16, d17, [%3]\n"
" b 2f\n"
" nop\n"
" ldr d16, [%3]\n"
" b 1f\n"
" nop\n"
/* 128-bit entries: 4, 3, 2, 1 element(s).  */
" ldp q16, q17, [%3]\n"
" ldp q18, q19, [%3, #32]\n"
" b 4f\n"
" ldp q16, q17, [%3]\n"
" ldr q18, [%3, #32]\n"
" b 3f\n"
" ldp q16, q17, [%3]\n"
" b 2f\n"
" nop\n"
" ldr q16, [%3]\n"
" b 1f\n"
/* Store tail: entering at label N writes N 16-byte slots.  */
"4: str q19, [%2, #48]\n"
"3: str q18, [%2, #32]\n"
"2: str q17, [%2, #16]\n"
"1: str q16, [%2]"
/* %0: scratch register holding the computed branch target.  */
: "=&r"(x0)
/* %1: byte offset into the table, %2: destination, %3: source.  */
: "r"(f * 12), "r"(dest), "r"(src)
: "memory", "v16", "v17", "v18", "v19");
}
#endif
#if defined(_MSC_VER)
void* compress_hfa_type (void *dest, void *src, int h);
#else
/* Pack the elements selected by H, held one per 16-byte slot at REG,
   into contiguous storage at DEST, and return the address at which the
   packed value can be read.  DEST may equal REG, in which case the
   packing is done in place.

   For the single-element float and double cases with DEST == REG
   nothing is copied; on big-endian builds the returned pointer is
   advanced within the slot (by 12 or 8 bytes respectively).  For the
   two- to four-element float and double cases the slots are loaded
   into v16..v19 and lane 0 of each is written out with an st2/st3/st4
   structure store.  Every other H is handled by copying
   16 * (4 - (H & 3)) bytes unless DEST == REG.  */
static void *
compress_hfa_type (void *dest, void *reg, int h)
{
switch (h)
{
case AARCH64_RET_S1:
if (dest == reg)
{
#ifdef __AARCH64EB__
/* Arithmetic on void * here relies on the GNU extension that treats
   it like char *.  */
dest += 12;
#endif
}
else
*(float *)dest = *(float *)reg;
break;
case AARCH64_RET_S2:
asm ("ldp q16, q17, [%1]\n\t"
"st2 { v16.s, v17.s }[0], [%0]"
: : "r"(dest), "r"(reg) : "memory", "v16", "v17");
break;
case AARCH64_RET_S3:
asm ("ldp q16, q17, [%1]\n\t"
"ldr q18, [%1, #32]\n\t"
"st3 { v16.s, v17.s, v18.s }[0], [%0]"
: : "r"(dest), "r"(reg) : "memory", "v16", "v17", "v18");
break;
case AARCH64_RET_S4:
asm ("ldp q16, q17, [%1]\n\t"
"ldp q18, q19, [%1, #32]\n\t"
"st4 { v16.s, v17.s, v18.s, v19.s }[0], [%0]"
: : "r"(dest), "r"(reg) : "memory", "v16", "v17", "v18", "v19");
break;
case AARCH64_RET_D1:
if (dest == reg)
{
#ifdef __AARCH64EB__
dest += 8;
#endif
}
else
*(double *)dest = *(double *)reg;
break;
case AARCH64_RET_D2:
asm ("ldp q16, q17, [%1]\n\t"
"st2 { v16.d, v17.d }[0], [%0]"
: : "r"(dest), "r"(reg) : "memory", "v16", "v17");
break;
case AARCH64_RET_D3:
asm ("ldp q16, q17, [%1]\n\t"
"ldr q18, [%1, #32]\n\t"
"st3 { v16.d, v17.d, v18.d }[0], [%0]"
: : "r"(dest), "r"(reg) : "memory", "v16", "v17", "v18");
break;
case AARCH64_RET_D4:
asm ("ldp q16, q17, [%1]\n\t"
"ldp q18, q19, [%1, #32]\n\t"
"st4 { v16.d, v17.d, v18.d, v19.d }[0], [%0]"
: : "r"(dest), "r"(reg) : "memory", "v16", "v17", "v18", "v19");
break;
default:
/* Elements already fill their 16-byte slots; a plain copy suffices
   when the destination differs.  */
if (dest != reg)
return memcpy (dest, reg, 16 * (4 - (h & 3)));
break;
}
return dest;
}
#endif
static void *
allocate_int_to_reg_or_stack (struct call_context *context,
struct arg_state *state,
void *stack, size_t size)
{
if (state->ngrn < N_X_ARG_REG)
return &context->x[state->ngrn++];
state->ngrn = N_X_ARG_REG;
return allocate_to_stack (state, stack, size, size);
}
/* Fill in the machine-dependent fields of CIF.

   cif->flags receives the return-value classification: one of the
   AARCH64_RET_* codes chosen from the return type, optionally combined
   with AARCH64_RET_IN_MEM (non-VFP aggregate larger than 16 bytes; 8
   extra bytes are added to the stack requirement) or
   AARCH64_RET_NEED_COPY (non-VFP aggregate whose size is neither 8 nor
   16), and with AARCH64_FLAG_ARG_V when at least one argument is a VFP
   type.  cif->bytes is rounded up to a multiple of 16.

   Aborts on an unknown return type code; otherwise returns FFI_OK.  */
ffi_status FFI_HIDDEN
ffi_prep_cif_machdep (ffi_cif *cif)
{
  ffi_type *ret = cif->rtype;
  size_t frame_bytes = cif->bytes;
  int ret_flags;
  int idx, count;

  switch (ret->type)
    {
    case FFI_TYPE_VOID:
      ret_flags = AARCH64_RET_VOID;
      break;

    case FFI_TYPE_UINT8:
      ret_flags = AARCH64_RET_UINT8;
      break;
    case FFI_TYPE_SINT8:
      ret_flags = AARCH64_RET_SINT8;
      break;

    case FFI_TYPE_UINT16:
      ret_flags = AARCH64_RET_UINT16;
      break;
    case FFI_TYPE_SINT16:
      ret_flags = AARCH64_RET_SINT16;
      break;

    case FFI_TYPE_UINT32:
      ret_flags = AARCH64_RET_UINT32;
      break;
    case FFI_TYPE_INT:
    case FFI_TYPE_SINT32:
      ret_flags = AARCH64_RET_SINT32;
      break;

    case FFI_TYPE_SINT64:
    case FFI_TYPE_UINT64:
      ret_flags = AARCH64_RET_INT64;
      break;

    case FFI_TYPE_POINTER:
      if (sizeof (void *) == 4)
	ret_flags = AARCH64_RET_UINT32;
      else
	ret_flags = AARCH64_RET_INT64;
      break;

    case FFI_TYPE_FLOAT:
    case FFI_TYPE_DOUBLE:
    case FFI_TYPE_LONGDOUBLE:
    case FFI_TYPE_STRUCT:
    case FFI_TYPE_COMPLEX:
      ret_flags = is_vfp_type (ret);
      if (ret_flags != 0)
	break;

      /* Not a VFP type: classify purely by size.  */
      if (ret->size > 16)
	{
	  ret_flags = AARCH64_RET_VOID | AARCH64_RET_IN_MEM;
	  frame_bytes += 8;
	}
      else
	switch (ret->size)
	  {
	  case 16:
	    ret_flags = AARCH64_RET_INT128;
	    break;
	  case 8:
	    ret_flags = AARCH64_RET_INT64;
	    break;
	  default:
	    ret_flags = AARCH64_RET_INT128 | AARCH64_RET_NEED_COPY;
	    break;
	  }
      break;

    default:
      abort ();
    }

  /* Record whether any argument needs the vector registers.  */
  count = cif->nargs;
  idx = 0;
  while (idx < count && !is_vfp_type (cif->arg_types[idx]))
    idx++;
  if (idx < count)
    ret_flags |= AARCH64_FLAG_ARG_V;

  cif->bytes = (unsigned) FFI_ALIGN(frame_bytes, 16);
  cif->flags = ret_flags;
#if defined (__APPLE__)
  cif->aarch64_nfixedargs = 0;
#endif

  return FFI_OK;
}
#if defined (__APPLE__)
/* Variadic variant of ffi_prep_cif_machdep: performs the same
   preparation, then records NFIXEDARGS in the CIF.  NTOTALARGS is not
   used.  Returns the status of ffi_prep_cif_machdep.  */
ffi_status FFI_HIDDEN
ffi_prep_cif_machdep_var(ffi_cif *cif, unsigned int nfixedargs,
			 unsigned int ntotalargs)
{
  ffi_status rc;

  (void) ntotalargs;

  rc = ffi_prep_cif_machdep (cif);
  /* Must follow the call above, which resets the field to 0.  */
  cif->aarch64_nfixedargs = nfixedargs;

  return rc;
}
#endif
extern void ffi_call_SYSV (struct call_context *context, void *frame,
void (*fn)(void), void *rvalue, int flags,
void *closure) FFI_HIDDEN;
/* Marshal the arguments AVALUE described by CIF into a register image
   plus stack area, then invoke FN through ffi_call_SYSV.

   ORIG_RVALUE is where the caller wants the result; it may be NULL.
   CLOSURE is passed through to ffi_call_SYSV unchanged.

   One alloca'd region holds, in order: the call_context register image,
   cif->bytes of stack-argument space, a 40-byte area addressed as
   FRAME, and optionally scratch space for the return value.
   NOTE(review): the layout of the 40-byte frame area is defined by the
   assembly in ffi_call_SYSV and is not visible here.  */
static void
ffi_call_int (ffi_cif *cif, void (*fn)(void), void *orig_rvalue,
	      void **avalue, void *closure)
{
  struct call_context *context;
  void *stack, *frame, *rvalue;
  struct arg_state state;
  size_t stack_bytes, rtype_size, rsize;
  int i, nargs, flags;
  ffi_type *rtype;

  flags = cif->flags;
  rtype = cif->rtype;
  rtype_size = rtype->size;
  stack_bytes = cif->bytes;

  /* Decide whether scratch space for the return value is needed:
     - returned in memory but the caller passed no buffer: allocate one;
     - caller passed no buffer otherwise: drop the return handling and
       keep only the "uses vector registers" flag;
     - result needs a trailing copy: stage it in a 16-byte scratch.  */
  rsize = 0;
  if (flags & AARCH64_RET_IN_MEM)
    {
      if (orig_rvalue == NULL)
	rsize = rtype_size;
    }
  else if (orig_rvalue == NULL)
    flags &= AARCH64_FLAG_ARG_V;
  else if (flags & AARCH64_RET_NEED_COPY)
    rsize = 16;

  context = alloca (sizeof(struct call_context) + stack_bytes + 40 + rsize);
  stack = context + 1;
  frame = (void*)((uintptr_t)stack + (uintptr_t)stack_bytes);
  rvalue = (rsize ? (void*)((uintptr_t)frame + 40) : orig_rvalue);

  arg_init (&state);
  for (i = 0, nargs = cif->nargs; i < nargs; i++)
    {
      ffi_type *ty = cif->arg_types[i];
      size_t s = ty->size;
      /* Alignment to use if this argument ends up in an integer-class
	 stack slot.  Kept separate from TY because a large composite is
	 replaced by a pointer below, and the pointer's slot must not
	 inherit the composite's alignment.  */
      size_t align = ty->alignment;
      void *a = avalue[i];
      int h, t;

      t = ty->type;
      switch (t)
	{
	case FFI_TYPE_VOID:
	  FFI_ASSERT (0);
	  break;

	/* Integer and pointer arguments: next general-purpose register
	   if one is free, otherwise the stack.  */
	case FFI_TYPE_INT:
	case FFI_TYPE_UINT8:
	case FFI_TYPE_SINT8:
	case FFI_TYPE_UINT16:
	case FFI_TYPE_SINT16:
	case FFI_TYPE_UINT32:
	case FFI_TYPE_SINT32:
	case FFI_TYPE_UINT64:
	case FFI_TYPE_SINT64:
	case FFI_TYPE_POINTER:
	do_pointer:
	  {
	    ffi_arg ext = extend_integer_type (a, t);
	    if (state.ngrn < N_X_ARG_REG)
	      context->x[state.ngrn++] = ext;
	    else
	      {
		void *d = allocate_to_stack (&state, stack, align, s);
		state.ngrn = N_X_ARG_REG;
		/* Apple targets store only the argument's own bytes;
		   elsewhere the widened value fills a full ffi_arg.  */
#ifdef __APPLE__
		memcpy(d, a, s);
#else
		*(ffi_arg *)d = ext;
#endif
	      }
	  }
	  break;

	case FFI_TYPE_FLOAT:
	case FFI_TYPE_DOUBLE:
	case FFI_TYPE_LONGDOUBLE:
	case FFI_TYPE_STRUCT:
	case FFI_TYPE_COMPLEX:
	  {
	    void *dest;

	    h = is_vfp_type (ty);
	    if (h)
	      {
		/* Number of elements encoded in the low bits of H.  */
		int elems = 4 - (h & 3);
#ifdef _M_ARM64
		if (cif->is_variadic)
		  {
		    /* Windows variadic calls: this path places the
		       elements in the general-purpose slots.  */
		    if (state.ngrn + elems <= N_X_ARG_REG)
		      {
			dest = &context->x[state.ngrn];
			state.ngrn += elems;
			extend_hfa_type(dest, a, h);
			break;
		      }
		    state.nsrn = N_X_ARG_REG;
		    dest = allocate_to_stack(&state, stack, ty->alignment, s);
		  }
		else
		  {
#endif
		    if (state.nsrn + elems <= N_V_ARG_REG)
		      {
			dest = &context->v[state.nsrn];
			state.nsrn += elems;
			extend_hfa_type (dest, a, h);
			break;
		      }
		    /* Does not fit: no further vector registers are
		       used, and the value goes on the stack packed.  */
		    state.nsrn = N_V_ARG_REG;
		    dest = allocate_to_stack (&state, stack, ty->alignment, s);
#ifdef _M_ARM64
		  }
#endif
	      }
	    else if (s > 16)
	      {
		/* Large composite: pass the address of the caller's
		   object instead, as an ordinary pointer argument.  The
		   slot is sized and aligned as a pointer, matching what
		   ffi_closure_SYSV_inner expects on the receiving
		   side.  */
		a = &avalue[i];
		t = FFI_TYPE_POINTER;
		s = sizeof (void *);
		align = sizeof (void *);
		goto do_pointer;
	      }
	    else
	      {
		/* Small composite: needs N consecutive general-purpose
		   registers, or goes entirely on the stack.  */
		size_t n = (s + 7) / 8;
		if (state.ngrn + n <= N_X_ARG_REG)
		  {
		    dest = &context->x[state.ngrn];
		    state.ngrn += (unsigned int)n;
		  }
		else
		  {
		    state.ngrn = N_X_ARG_REG;
		    dest = allocate_to_stack (&state, stack, ty->alignment, s);
		  }
	      }
	    memcpy (dest, a, s);
	  }
	  break;

	default:
	  abort();
	}

#if defined (__APPLE__)
      /* After the last fixed argument, everything variadic goes on the
	 stack: mark both register files as exhausted.  */
      if (i + 1 == cif->aarch64_nfixedargs)
	{
	  state.ngrn = N_X_ARG_REG;
	  state.nsrn = N_V_ARG_REG;
	  state.allocating_variadic = 1;
	}
#endif
    }

  ffi_call_SYSV (context, frame, fn, rvalue, flags, closure);

  /* The result was staged in scratch space; hand back only the bytes
     the return type actually occupies.  */
  if (flags & AARCH64_RET_NEED_COPY)
    memcpy (orig_rvalue, rvalue, rtype_size);
}
/* Call FN with the arguments AVALUE as described by CIF, storing the
   result at RVALUE.  Forwards to ffi_call_int with no closure
   pointer.  */
void
ffi_call (ffi_cif *cif, void (*fn) (void), void *rvalue, void **avalue)
{
  void *const no_closure = NULL;

  ffi_call_int (cif, fn, rvalue, avalue, no_closure);
}
#ifdef FFI_GO_CLOSURES
/* Like ffi_call, but additionally forwards CLOSURE to ffi_call_int.  */
void
ffi_call_go (ffi_cif *cif,
	     void (*fn) (void),
	     void *rvalue,
	     void **avalue,
	     void *closure)
{
  ffi_call_int (cif, fn, rvalue, avalue, closure);
}
#endif
#if FFI_CLOSURES
extern void ffi_closure_SYSV (void) FFI_HIDDEN;
extern void ffi_closure_SYSV_V (void) FFI_HIDDEN;
/* Initialise CLOSURE so that calling through CODELOC ends up invoking
   FUN with CIF and USER_DATA.

   Returns FFI_BAD_ABI if CIF does not use the FFI_SYSV ABI, FFI_OK
   otherwise.  The assembly entry point is the _V variant when the CIF
   has AARCH64_FLAG_ARG_V set (some argument is a VFP type).  */
ffi_status
ffi_prep_closure_loc (ffi_closure *closure,
ffi_cif* cif,
void (*fun)(ffi_cif*,void*,void**,void*),
void *user_data,
void *codeloc)
{
if (cif->abi != FFI_SYSV)
return FFI_BAD_ABI;
void (*start)(void);
if (cif->flags & AARCH64_FLAG_ARG_V)
start = ffi_closure_SYSV_V;
else
start = ffi_closure_SYSV;
#if FFI_EXEC_TRAMPOLINE_TABLE
#ifdef __MACH__
#if __has_feature(ptrauth_calls)
/* Authenticate the signed code pointer to get its raw address before
   doing arithmetic on it.  */
codeloc = ptrauth_auth_data(codeloc, ptrauth_key_function_pointer, 0);
#endif
/* The two-word configuration record for this trampoline sits a fixed
   distance (one or two maximum-size pages) below the code address.  */
#ifdef FFI_TRAMPOLINE_WHOLE_DYLIB
void **config = (void **)((uint8_t *)codeloc - 2*PAGE_MAX_SIZE);
#else
void **config = (void **)((uint8_t *)codeloc - PAGE_MAX_SIZE);
#endif
config[0] = closure;
config[1] = start;
#endif
#else
/* Three little-endian A64 instructions, padded to 16 bytes:
     ldr x16, <tramp + 16>   -- load the entry point stored below
     adr x17, <tramp + 0>    -- address of the trampoline itself
     br  x16  */
static const unsigned char trampoline[16] = {
0x90, 0x00, 0x00, 0x58,
0xf1, 0xff, 0xff, 0x10,
0x00, 0x02, 0x1f, 0xd6
};
char *tramp = closure->tramp;
memcpy (tramp, trampoline, sizeof(trampoline));
/* The entry point address is stored right after the 16 code bytes,
   where the ldr above reads it.  */
*(UINT64 *)(tramp + 16) = (uintptr_t)start;
ffi_clear_cache(tramp, tramp + FFI_TRAMPOLINE_SIZE);
/* Flush again through the address returned by
   ffi_data_to_code_pointer, which may differ from the address that
   was written through.  */
#ifdef _M_ARM64
unsigned char *tramp_code = tramp;
#else
unsigned char *tramp_code = ffi_data_to_code_pointer (tramp);
#endif
ffi_clear_cache (tramp_code, tramp_code + FFI_TRAMPOLINE_SIZE);
#endif
closure->cif = cif;
closure->fun = fun;
closure->user_data = user_data;
return FFI_OK;
}
/* Map a closure's code address CODELOC back to its ffi_closure.

   With the trampoline table, the closure pointer is read from the first
   word of the configuration record located one (or, for
   FFI_TRAMPOLINE_WHOLE_DYLIB, two) maximum-size pages below CODELOC.
   Otherwise CODELOC is itself returned as the closure.  */
ffi_closure *
ffi_find_closure_for_code_np(void *codeloc)
{
#if FFI_EXEC_TRAMPOLINE_TABLE
  uint8_t *code = (uint8_t *) codeloc;
# ifdef FFI_TRAMPOLINE_WHOLE_DYLIB
  void **slot = (void **) (code - 2*PAGE_MAX_SIZE);
# else
  void **slot = (void **) (code - PAGE_MAX_SIZE);
# endif

  return slot[0];
#else
  return (ffi_closure*)codeloc;
#endif
}
#ifdef FFI_GO_CLOSURES
extern void ffi_go_closure_SYSV (void) FFI_HIDDEN;
extern void ffi_go_closure_SYSV_V (void) FFI_HIDDEN;
/* Initialise a Go-style CLOSURE to invoke FUN with CIF.

   Returns FFI_BAD_ABI if CIF does not use the FFI_SYSV ABI, FFI_OK
   otherwise.  The entry point stored in CLOSURE->tramp is the _V
   variant when the CIF has AARCH64_FLAG_ARG_V set.  */
ffi_status
ffi_prep_go_closure (ffi_go_closure *closure, ffi_cif* cif,
		     void (*fun)(ffi_cif*,void*,void**,void*))
{
  void (*entry)(void);

  if (cif->abi != FFI_SYSV)
    return FFI_BAD_ABI;

  entry = (cif->flags & AARCH64_FLAG_ARG_V)
	  ? ffi_go_closure_SYSV_V
	  : ffi_go_closure_SYSV;

  closure->tramp = entry;
  closure->cif = cif;
  closure->fun = fun;

  return FFI_OK;
}
#endif
/* C half of closure entry: locate every incoming argument described by
   CIF inside the saved register image CONTEXT or the caller's stack
   area STACK, build the AVALUE pointer array, and call FUN with it and
   USER_DATA.

   RVALUE is the buffer passed to FUN for the result unless the CIF
   returns in memory (AARCH64_RET_IN_MEM), in which case STRUCT_RVALUE
   is used instead.  Returns cif->flags.

   Argument locations are assigned with the same register/stack
   bookkeeping (struct arg_state) that the outgoing call path uses.  */
int FFI_HIDDEN
ffi_closure_SYSV_inner (ffi_cif *cif,
void (*fun)(ffi_cif*,void*,void**,void*),
void *user_data,
struct call_context *context,
void *stack, void *rvalue, void *struct_rvalue)
{
void **avalue = (void**) alloca (cif->nargs * sizeof (void*));
int i, h, nargs, flags;
struct arg_state state;
arg_init (&state);
for (i = 0, nargs = cif->nargs; i < nargs; i++)
{
ffi_type *ty = cif->arg_types[i];
int t = ty->type;
size_t n, s = ty->size;
switch (t)
{
case FFI_TYPE_VOID:
FFI_ASSERT (0);
break;
/* Integers and pointers: next general-purpose slot, else the stack
   with size-based alignment.  */
case FFI_TYPE_INT:
case FFI_TYPE_UINT8:
case FFI_TYPE_SINT8:
case FFI_TYPE_UINT16:
case FFI_TYPE_SINT16:
case FFI_TYPE_UINT32:
case FFI_TYPE_SINT32:
case FFI_TYPE_UINT64:
case FFI_TYPE_SINT64:
case FFI_TYPE_POINTER:
avalue[i] = allocate_int_to_reg_or_stack (context, &state, stack, s);
break;
case FFI_TYPE_FLOAT:
case FFI_TYPE_DOUBLE:
case FFI_TYPE_LONGDOUBLE:
case FFI_TYPE_STRUCT:
case FFI_TYPE_COMPLEX:
h = is_vfp_type (ty);
if (h)
{
/* Element count is encoded in the low two bits of H.  */
n = 4 - (h & 3);
#ifdef _M_ARM64
if (cif->is_variadic)
{
/* Windows variadic: elements are taken from the general-purpose
   slots and packed in place.  */
if (state.ngrn + n <= N_X_ARG_REG)
{
void *reg = &context->x[state.ngrn];
state.ngrn += (unsigned int)n;
avalue[i] = compress_hfa_type(reg, reg, h);
}
else
{
state.ngrn = N_X_ARG_REG;
state.nsrn = N_V_ARG_REG;
avalue[i] = allocate_to_stack(&state, stack,
ty->alignment, s);
}
}
else
{
#endif
if (state.nsrn + n <= N_V_ARG_REG)
{
/* One element per 16-byte vector slot; pack them in place so FUN
   sees a contiguous value.  */
void *reg = &context->v[state.nsrn];
state.nsrn += (unsigned int)n;
avalue[i] = compress_hfa_type(reg, reg, h);
}
else
{
/* Did not fit: vector slots are exhausted from here on and the
   value is read from the stack.  */
state.nsrn = N_V_ARG_REG;
avalue[i] = allocate_to_stack(&state, stack,
ty->alignment, s);
}
#ifdef _M_ARM64
}
#endif
}
else if (s > 16)
{
/* Large composite: the slot holds a pointer to the object, so
   dereference it to obtain the argument address.  */
avalue[i] = *(void **)
allocate_int_to_reg_or_stack (context, &state, stack,
sizeof (void *));
}
else
{
/* Small composite: N consecutive general-purpose slots, or entirely
   on the stack.  */
n = (s + 7) / 8;
if (state.ngrn + n <= N_X_ARG_REG)
{
avalue[i] = &context->x[state.ngrn];
state.ngrn += (unsigned int)n;
}
else
{
state.ngrn = N_X_ARG_REG;
avalue[i] = allocate_to_stack(&state, stack,
ty->alignment, s);
}
}
break;
default:
abort();
}
#if defined (__APPLE__)
/* After the last fixed argument, all remaining (variadic) arguments
   are read from the stack.  */
if (i + 1 == cif->aarch64_nfixedargs)
{
state.ngrn = N_X_ARG_REG;
state.nsrn = N_V_ARG_REG;
state.allocating_variadic = 1;
}
#endif
}
flags = cif->flags;
if (flags & AARCH64_RET_IN_MEM)
rvalue = struct_rvalue;
fun (cif, rvalue, avalue, user_data);
return flags;
}
#endif
#endif