aarch64: Move return value handling into ffi_call_SYSV

This lets us pass return data directly to the caller of ffi_call
in most cases, rather than staging it in temporary storage first.
Author:    Richard Henderson
Committer: Richard Henderson
Date:      2014-10-22 17:06:19 -04:00
Commit:    4fe1aea121 (parent 325471ea6a)
3 changed files with 260 additions and 116 deletions
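
In outline: ffi_prep_cif_machdep now encodes the return disposition in a
flags word, and ffi_call_SYSV decodes that word itself through a jump
table, storing straight through the caller's rvalue pointer. A hedged C
rendering of what that table does (a simplified sketch under assumed
struct shapes, not commit code; the real dispatch is the assembly table
in sysv.S below):

    #include <ffi.h>
    #include <string.h>

    /* Sketch only: x[] stands in for the returned x0/x1, v[] for the
       returned vector registers v0-v3.  */
    struct ret_context { unsigned long x[2]; unsigned char v[4][16]; };

    static void
    store_return (int flags, void *rvalue, const struct ret_context *ctx)
    {
      switch (flags & AARCH64_RET_MASK)
        {
        case AARCH64_RET_VOID:              /* nothing to store */
          break;
        case AARCH64_RET_INT64:             /* x0 */
          memcpy (rvalue, &ctx->x[0], 8);
          break;
        case AARCH64_RET_INT128:            /* x0:x1 */
          memcpy (rvalue, ctx->x, 16);
          break;
        case AARCH64_RET_SINT8:             /* sign-extend into an ffi_arg */
          *(ffi_arg *) rvalue = (signed char) ctx->x[0];
          break;
        /* ... and so on: S1-S4, D1-D4, Q1-Q4 copy 1-4 vector regs.  */
        }
    }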


@@ -523,30 +523,90 @@ allocate_int_to_reg_or_stack (struct call_context *context,
ffi_status
ffi_prep_cif_machdep (ffi_cif *cif)
{
ffi_type *rtype = cif->rtype;
size_t bytes = cif->bytes;
int flags, aarch64_flags, i, n;
switch (rtype->type)
{
case FFI_TYPE_VOID:
flags = AARCH64_RET_VOID;
break;
case FFI_TYPE_UINT8:
flags = AARCH64_RET_UINT8;
break;
case FFI_TYPE_UINT16:
flags = AARCH64_RET_UINT16;
break;
case FFI_TYPE_UINT32:
flags = AARCH64_RET_UINT32;
break;
case FFI_TYPE_SINT8:
flags = AARCH64_RET_SINT8;
break;
case FFI_TYPE_SINT16:
flags = AARCH64_RET_SINT16;
break;
case FFI_TYPE_INT:
case FFI_TYPE_SINT32:
flags = AARCH64_RET_SINT32;
break;
case FFI_TYPE_SINT64:
case FFI_TYPE_UINT64:
flags = AARCH64_RET_INT64;
break;
case FFI_TYPE_POINTER:
flags = (sizeof(void *) == 4 ? AARCH64_RET_UINT32 : AARCH64_RET_INT64);
break;
case FFI_TYPE_FLOAT:
flags = AARCH64_RET_S1;
break;
case FFI_TYPE_DOUBLE:
flags = AARCH64_RET_D1;
break;
case FFI_TYPE_LONGDOUBLE:
flags = AARCH64_RET_Q1;
break;
case FFI_TYPE_STRUCT:
{
int h = is_hfa (rtype);
size_t s = rtype->size;
if (h)
flags = (h & 0xff) * 4 + 4 - (h >> 8);
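/* Worked example (illustration, assuming is_hfa packs the element
count in bits 8 and up and the element FFI type in the low byte):
a struct of two doubles gives h == (2 << 8) | FFI_TYPE_DOUBLE, so
flags == 3*4 + 4 - 2 == 14 == AARCH64_RET_D2. */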
else if (s > 16)
{
flags = AARCH64_RET_VOID | AARCH64_RET_IN_MEM;
bytes += 8;
}
else if (s == 16)
flags = AARCH64_RET_INT128;
else if (s == 8)
flags = AARCH64_RET_INT64;
else
flags = AARCH64_RET_INT128 | AARCH64_RET_NEED_COPY;
}
break;
default:
abort();
}
aarch64_flags = 0;
for (i = 0, n = cif->nargs; i < n; i++)
if (is_v_register_candidate (cif->arg_types[i]))
{
aarch64_flags = AARCH64_FLAG_ARG_V;
flags |= AARCH64_FLAG_ARG_V;
break;
}
/* Round the stack up to a multiple of the stack alignment requirement. */
cif->bytes = ALIGN(cif->bytes, 16);
/* Initialize our flags. We want to know whether this CIF will touch
a vector register; if so, we enable context save and load for those
registers, otherwise not. This is intended to be friendly to lazy
float context switching in the kernel. */
cif->aarch64_flags = 0;
if (is_v_register_candidate (cif->rtype))
{
cif->aarch64_flags |= AARCH64_FLAG_ARG_V;
}
else
{
int i;
for (i = 0; i < cif->nargs; i++)
if (is_v_register_candidate (cif->arg_types[i]))
{
cif->aarch64_flags |= AARCH64_FLAG_ARG_V;
break;
}
}
cif->bytes = ALIGN(bytes, 16);
cif->flags = flags;
cif->aarch64_flags = aarch64_flags;
#if defined (__APPLE__)
cif->aarch64_nfixedargs = 0;
#endif
@@ -555,51 +615,65 @@ ffi_prep_cif_machdep (ffi_cif *cif)
}
#if defined (__APPLE__)
/* Perform Apple-specific cif processing for variadic calls */
ffi_status ffi_prep_cif_machdep_var(ffi_cif *cif,
unsigned int nfixedargs,
unsigned int ntotalargs)
{
ffi_status status;
status = ffi_prep_cif_machdep (cif);
ffi_status status = ffi_prep_cif_machdep (cif);
cif->aarch64_nfixedargs = nfixedargs;
return status;
}
#endif /* __APPLE__ */
#endif
extern void ffi_call_SYSV (void *stack, void *frame,
void (*fn)(void), int flags) FFI_HIDDEN;
extern void ffi_call_SYSV (struct call_context *context, void *frame,
void (*fn)(void), void *rvalue, int flags)
FFI_HIDDEN;
/* Call a function with the provided arguments and capture the return
value. */
void
ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
ffi_call (ffi_cif *cif, void (*fn)(void), void *orig_rvalue, void **avalue)
{
struct call_context *context;
void *stack, *frame;
void *stack, *frame, *rvalue;
struct arg_state state;
size_t stack_bytes;
int i, nargs = cif->nargs;
int h, t;
size_t stack_bytes, rtype_size, rsize;
int i, nargs, flags;
ffi_type *rtype;
/* Allocate consecutive stack for everything we'll need. */
flags = cif->flags;
rtype = cif->rtype;
rtype_size = rtype->size;
stack_bytes = cif->bytes;
stack = alloca (stack_bytes + 32 + sizeof(struct call_context));
/* If the target function returns a structure via hidden pointer,
then we cannot allow a null rvalue. Otherwise, mash a null
rvalue to void return type. */
rsize = 0;
if (flags & AARCH64_RET_IN_MEM)
{
if (orig_rvalue == NULL)
rsize = rtype_size;
}
else if (orig_rvalue == NULL)
flags &= AARCH64_FLAG_ARG_V;
else if (flags & AARCH64_RET_NEED_COPY)
rsize = 16;
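/* (Illustrative note, not commit text: INT128|NEED_COPY means the
value comes back in x0/x1 and the stub stores all 16 bytes with an
stp, which could overrun a caller buffer smaller than 16 bytes;
hence the 16-byte bounce buffer, copied out below via rtype_size.) */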
/* Allocate consecutive stack for everything we'll need. */
context = alloca (sizeof(struct call_context) + stack_bytes + 32 + rsize);
stack = context + 1;
frame = stack + stack_bytes;
context = frame + 32;
rvalue = (rsize ? frame + 32 : orig_rvalue);
arg_init (&state);
for (i = 0; i < nargs; i++)
for (i = 0, nargs = cif->nargs; i < nargs; i++)
{
ffi_type *ty = cif->arg_types[i];
size_t s = ty->size;
void *a = avalue[i];
int h, t;
t = ty->type;
switch (t)
@@ -717,54 +791,10 @@ ffi_call (ffi_cif *cif, void (*fn)(void), void *rvalue, void **avalue)
#endif
}
rtype = cif->rtype;
if (is_register_candidate (rtype))
{
ffi_call_SYSV (stack, frame, fn, cif->aarch64_flags);
ffi_call_SYSV (context, frame, fn, rvalue, flags);
t = rtype->type;
switch (t)
{
case FFI_TYPE_INT:
case FFI_TYPE_UINT8:
case FFI_TYPE_SINT8:
case FFI_TYPE_UINT16:
case FFI_TYPE_SINT16:
case FFI_TYPE_UINT32:
case FFI_TYPE_SINT32:
case FFI_TYPE_POINTER:
case FFI_TYPE_UINT64:
case FFI_TYPE_SINT64:
*(ffi_arg *)rvalue = extend_integer_type (&context->x[0], t);
break;
case FFI_TYPE_FLOAT:
case FFI_TYPE_DOUBLE:
case FFI_TYPE_LONGDOUBLE:
compress_hfa_type (rvalue, &context->v[0], 0x100 + t);
break;
case FFI_TYPE_STRUCT:
h = is_hfa (cif->rtype);
if (h)
compress_hfa_type (rvalue, &context->v[0], h);
else
{
FFI_ASSERT (rtype->size <= 16);
memcpy (rvalue, &context->x[0], rtype->size);
}
break;
default:
FFI_ASSERT (0);
break;
}
}
else
{
context->x8 = (uintptr_t)rvalue;
ffi_call_SYSV (stack, frame, fn, cif->aarch64_flags);
}
if (flags & AARCH64_RET_NEED_COPY)
memcpy (orig_rvalue, rvalue, rtype_size);
}
static unsigned char trampoline [] =
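
An illustrative caller (hedged example using the standard libffi API,
not part of this commit): the ffi_call contract is unchanged, and a
small two-float HFA return like this one is now stored directly by the
assembly stub rather than copied after the fact.

    #include <ffi.h>
    #include <stdio.h>

    typedef struct { float x, y; } point;   /* an HFA of two floats */

    static point mkpoint (float x, float y)
    {
      point p = { x, y };
      return p;
    }

    int main (void)
    {
      ffi_cif cif;
      ffi_type *elts[] = { &ffi_type_float, &ffi_type_float, NULL };
      ffi_type point_type = { 0, 0, FFI_TYPE_STRUCT, elts };
      ffi_type *args[] = { &ffi_type_float, &ffi_type_float };
      float x = 1.0f, y = 2.0f;
      void *avalue[] = { &x, &y };
      point ret;

      if (ffi_prep_cif (&cif, FFI_DEFAULT_ABI, 2, &point_type, args) == FFI_OK)
        ffi_call (&cif, FFI_FN (mkpoint), &ret, avalue);
      printf ("%g %g\n", ret.x, ret.y);
      return 0;
    }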


@@ -18,7 +18,48 @@ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
#define AARCH64_FLAG_ARG_V_BIT 0
#define AARCH64_RET_VOID 0
#define AARCH64_RET_INT64 1
#define AARCH64_RET_INT128 2
#define AARCH64_RET_UNUSED3 3
#define AARCH64_RET_UNUSED4 4
#define AARCH64_RET_UNUSED5 5
#define AARCH64_RET_UNUSED6 6
#define AARCH64_RET_UNUSED7 7
/* Note that FFI_TYPE_FLOAT == 2, _DOUBLE == 3, _LONGDOUBLE == 4,
so _S4 through _Q1 are laid out as (TYPE * 4) + (4 - COUNT). */
#define AARCH64_RET_S4 8
#define AARCH64_RET_S3 9
#define AARCH64_RET_S2 10
#define AARCH64_RET_S1 11
#define AARCH64_RET_D4 12
#define AARCH64_RET_D3 13
#define AARCH64_RET_D2 14
#define AARCH64_RET_D1 15
#define AARCH64_RET_Q4 16
#define AARCH64_RET_Q3 17
#define AARCH64_RET_Q2 18
#define AARCH64_RET_Q1 19
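/* Spot-check of the layout note above: FLOAT == 2 gives
S2 == 2*4 + (4-2) == 10; DOUBLE == 3 gives D1 == 3*4 + (4-1) == 15;
LONGDOUBLE == 4 gives Q4 == 4*4 + (4-4) == 16. */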
/* Note that each of the sub-64-bit integers gets two entries. */
#define AARCH64_RET_UINT8 20
#define AARCH64_RET_UINT16 22
#define AARCH64_RET_UINT32 24
#define AARCH64_RET_SINT8 26
#define AARCH64_RET_SINT16 28
#define AARCH64_RET_SINT32 30
#define AARCH64_RET_MASK 31
#define AARCH64_RET_IN_MEM (1 << 5)
#define AARCH64_RET_NEED_COPY (1 << 6)
#define AARCH64_FLAG_ARG_V_BIT 7
#define AARCH64_FLAG_ARG_V (1 << AARCH64_FLAG_ARG_V_BIT)
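/* Composition example (illustrative): a struct returned via the x8
pointer, in a call that also passes vector args, carries
AARCH64_RET_VOID | AARCH64_RET_IN_MEM | AARCH64_FLAG_ARG_V == 0xa0. */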
#define N_X_ARG_REG 8


@@ -40,9 +40,9 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
#endif
.text
.align 2
.align 4
.globl CNAME(ffi_call_SYSV)
#ifdef __ELF__
.type CNAME(ffi_call_SYSV), #function
.hidden CNAME(ffi_call_SYSV)
@@ -50,14 +50,15 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
/* ffi_call_SYSV
extern void ffi_call_SYSV (void *stack, void *frame,
void (*fn)(void), int flags);
void (*fn)(void), void *rvalue, int flags);
Therefore on entry we have:
x0 stack
x1 frame
x2 fn
x3 flags
x3 rvalue
x4 flags
*/
cfi_startproc
@@ -71,43 +72,111 @@ CNAME(ffi_call_SYSV):
cfi_rel_offset (x29, 0)
cfi_rel_offset (x30, 8)
str w3, [x29, #16] /* save flags */
mov x9, x2 /* save fn */
mov x8, x3 /* install structure return */
stp x3, x4, [x29, #16] /* save rvalue and flags */
/* Load the vector argument passing registers, if necessary. */
tbz w3, #AARCH64_FLAG_ARG_V_BIT, 1f
ldp q0, q1, [x29, #32 + 0]
ldp q2, q3, [x29, #32 + 32]
ldp q4, q5, [x29, #32 + 64]
ldp q6, q7, [x29, #32 + 96]
tbz w4, #AARCH64_FLAG_ARG_V_BIT, 1f
ldp q0, q1, [sp, #0]
ldp q2, q3, [sp, #32]
ldp q4, q5, [sp, #64]
ldp q6, q7, [sp, #96]
1:
/* Load the core argument passing registers, including
the structure return pointer. */
ldp x0, x1, [x29, #32 + 16*N_V_ARG_REG + 0]
ldp x2, x3, [x29, #32 + 16*N_V_ARG_REG + 16]
ldp x4, x5, [x29, #32 + 16*N_V_ARG_REG + 32]
ldp x6, x7, [x29, #32 + 16*N_V_ARG_REG + 48]
ldr x8, [x29, #32 + 16*N_V_ARG_REG + 64]
ldp x0, x1, [sp, #16*N_V_ARG_REG + 0]
ldp x2, x3, [sp, #16*N_V_ARG_REG + 16]
ldp x4, x5, [sp, #16*N_V_ARG_REG + 32]
ldp x6, x7, [sp, #16*N_V_ARG_REG + 48]
/* Deallocate the context, leaving the stacked arguments. */
add sp, sp, #CALL_CONTEXT_SIZE
blr x9 /* call fn */
ldr w3, [x29, #16] /* reload flags */
ldp x3, x4, [x29, #16] /* reload rvalue and flags */
/* Partially deconstruct the stack frame. */
mov sp, x29
cfi_def_cfa_register (sp)
ldp x29, x30, [x29]
/* Save the core return registers. */
stp x0, x1, [sp, #32 + 16*N_V_ARG_REG]
/* Save the return value as directed. */
adr x5, 0f
and w4, w4, #AARCH64_RET_MASK
add x5, x5, x4, lsl #3
br x5
/* Save the vector return registers, if necessary. */
tbz w3, #AARCH64_FLAG_ARG_V_BIT, 1f
stp q0, q1, [sp, #32 + 0]
stp q2, q3, [sp, #32 + 32]
1:
/* All done. */
/* Note that each table entry is 2 insns, and thus 8 bytes. For
integer data, we are storing into an ffi_arg and therefore want to
extend to 64 bits; those types have two consecutive entries
allocated for them. */
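/* Example (illustrative, not commit text): flags & AARCH64_RET_MASK
== 15 selects AARCH64_RET_D1, branching to 0f + 15*8, the "str d0"
entry below. */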
.align 4
0: ret /* VOID */
nop
1: str x0, [x3] /* INT64 */
ret
2: stp x0, x1, [x3] /* INT128 */
ret
3: brk #1000 /* UNUSED */
ret
4: brk #1000 /* UNUSED */
ret
5: brk #1000 /* UNUSED */
ret
6: brk #1000 /* UNUSED */
ret
7: brk #1000 /* UNUSED */
ret
8: st4 { v0.s-v3.s }[0], [x3] /* S4 */
ret
9: st3 { v0.s-v2.s }[0], [x3] /* S3 */
ret
10: stp s0, s1, [x3] /* S2 */
ret
11: str s0, [x3] /* S1 */
ret
12: st4 { v0.d-v3.d }[0], [x3] /* D4 */
ret
13: st3 { v0.d-v2.d }[0], [x3] /* D3 */
ret
14: stp d0, d1, [x3] /* D2 */
ret
15: str d0, [x3] /* D1 */
ret
16: str q3, [x3, #48] /* Q4 */
nop
17: str q2, [x3, #32] /* Q3 */
nop
18: stp q0, q1, [x3] /* Q2 */
ret
19: str q0, [x3] /* Q1 */
ret
20: uxtb w0, w0 /* UINT8 */
str x0, [x3]
21: ret /* reserved */
nop
22: uxth w0, w0 /* UINT16 */
str x0, [x3]
23: ret /* reserved */
nop
24: mov w0, w0 /* UINT32 */
str x0, [x3]
25: ret /* reserved */
nop
26: sxtb x0, w0 /* SINT8 */
str x0, [x3]
27: ret /* reserved */
nop
28: sxth x0, w0 /* SINT16 */
str x0, [x3]
29: ret /* reserved */
nop
30: sxtw x0, w0 /* SINT32 */
str x0, [x3]
31: ret /* reserved */
nop
cfi_endproc
#ifdef __ELF__
@@ -154,9 +223,13 @@ CNAME(ffi_call_SYSV):
Voila! */
.text
.align 2
.align 4
.globl CNAME(ffi_closure_SYSV)
#ifdef __ELF__
.type CNAME(ffi_closure_SYSV), #function
.hidden CNAME(ffi_closure_SYSV)
#endif
cfi_startproc
CNAME(ffi_closure_SYSV):
stp x29, x30, [sp, #-16]!