x86_64: Decouple return types from FFI_TYPE constants

This lets us better support structure returns, and is
preparation for complex types.
commit 32c5683163
parent 2e9dc16556
Author: Richard Henderson
Date:   2014-10-28 11:17:35 -07:00
3 changed files with 203 additions and 197 deletions
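
The patch replaces the raw FFI_TYPE_* value that used to be stashed in cif->flags with a dedicated UNIX64_RET_* encoding (defined in the new src/x86/internal64.h): the low byte selects the return kind, bit 10 marks a return in memory, bit 11 marks that SSE argument registers are in use, and bits 12 and up carry the true size of a struct return. A minimal decoding sketch, illustration only (describe_flags is not part of the patch):

/* Illustration only: how a cif->flags word produced by the new
   ffi_prep_cif_machdep can be decoded.  Constants as in internal64.h.  */
#include <stdio.h>

#define UNIX64_FLAG_RET_IN_MEM (1 << 10)
#define UNIX64_FLAG_XMM_ARGS   (1 << 11)
#define UNIX64_SIZE_SHIFT      12

static void describe_flags (int flags)
{
  int kind = flags & 0xff;                                /* one of UNIX64_RET_* */
  int in_mem = (flags & UNIX64_FLAG_RET_IN_MEM) != 0;     /* return via hidden pointer */
  int xmm_args = (flags & UNIX64_FLAG_XMM_ARGS) != 0;     /* SSE registers carry arguments */
  unsigned size = (unsigned) flags >> UNIX64_SIZE_SHIFT;  /* struct size, if any */

  printf ("ret kind=%d in_mem=%d xmm_args=%d struct_size=%u\n",
          kind, in_mem, xmm_args, size);
}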

src/x86/ffi64.c

@@ -33,6 +33,7 @@
 #include <stdlib.h>
 #include <stdarg.h>
 #include <stdint.h>
+#include "internal64.h"
 
 #ifdef __x86_64__
@@ -191,7 +192,7 @@ classify_argument (ffi_type *type, enum x86_64_reg_class classes[],
         }
       else if (size <= 16)
         {
-          classes[0] = classes[1] = X86_64_INTEGERSI_CLASS;
+          classes[0] = classes[1] = X86_64_INTEGER_CLASS;
           return 2;
         }
       else
@@ -360,15 +361,55 @@ ffi_prep_cif_machdep (ffi_cif *cif)
   int gprcount, ssecount, i, avn, ngpr, nsse, flags;
   enum x86_64_reg_class classes[MAX_CLASSES];
   size_t bytes, n;
+  ffi_type *rtype;
 
   if (cif->abi != FFI_UNIX64)
     return FFI_BAD_ABI;
 
   gprcount = ssecount = 0;
 
-  flags = cif->rtype->type;
-  if (flags != FFI_TYPE_VOID)
+  rtype = cif->rtype;
+  switch (rtype->type)
     {
+    case FFI_TYPE_VOID:
+      flags = UNIX64_RET_VOID;
+      break;
+    case FFI_TYPE_UINT8:
+      flags = UNIX64_RET_UINT8;
+      break;
+    case FFI_TYPE_SINT8:
+      flags = UNIX64_RET_SINT8;
+      break;
+    case FFI_TYPE_UINT16:
+      flags = UNIX64_RET_UINT16;
+      break;
+    case FFI_TYPE_SINT16:
+      flags = UNIX64_RET_SINT16;
+      break;
+    case FFI_TYPE_UINT32:
+      flags = UNIX64_RET_UINT32;
+      break;
+    case FFI_TYPE_INT:
+    case FFI_TYPE_SINT32:
+      flags = UNIX64_RET_SINT32;
+      break;
+    case FFI_TYPE_UINT64:
+    case FFI_TYPE_SINT64:
+      flags = UNIX64_RET_INT64;
+      break;
+    case FFI_TYPE_POINTER:
+      flags = (sizeof(void *) == 4 ? UNIX64_RET_UINT32 : UNIX64_RET_INT64);
+      break;
+    case FFI_TYPE_FLOAT:
+      flags = UNIX64_RET_XMM32;
+      break;
+    case FFI_TYPE_DOUBLE:
+      flags = UNIX64_RET_XMM64;
+      break;
+    case FFI_TYPE_LONGDOUBLE:
+      flags = UNIX64_RET_X87;
+      break;
+    case FFI_TYPE_STRUCT:
       n = examine_argument (cif->rtype, classes, 1, &ngpr, &nsse);
       if (n == 0)
         {
@@ -376,22 +417,24 @@ ffi_prep_cif_machdep (ffi_cif *cif)
              memory is the first argument.  Allocate a register for it.  */
           gprcount++;
           /* We don't have to do anything in asm for the return.  */
-          flags = FFI_TYPE_VOID;
+          flags = UNIX64_RET_VOID | UNIX64_FLAG_RET_IN_MEM;
         }
-      else if (flags == FFI_TYPE_STRUCT)
+      else
         {
           /* Mark which registers the result appears in.  */
           _Bool sse0 = SSE_CLASS_P (classes[0]);
           _Bool sse1 = n == 2 && SSE_CLASS_P (classes[1]);
-          if (sse0 && !sse1)
-            flags |= 1 << 8;
-          else if (!sse0 && sse1)
-            flags |= 1 << 9;
-          else if (sse0 && sse1)
-            flags |= 1 << 10;
+          if (sse0)
+            flags = (sse1 ? UNIX64_RET_ST_XMM0_XMM1 : UNIX64_RET_ST_XMM0_RAX);
+          else
+            flags = (sse1 ? UNIX64_RET_ST_RAX_XMM0 : UNIX64_RET_ST_RAX_RDX);
           /* Mark the true size of the structure.  */
-          flags |= cif->rtype->size << 12;
+          flags |= rtype->size << UNIX64_SIZE_SHIFT;
         }
+      break;
+    default:
+      return FFI_BAD_TYPEDEF;
     }
 
   /* Go over all arguments and determine the way they should be passed.
@@ -418,9 +461,10 @@ ffi_prep_cif_machdep (ffi_cif *cif)
         }
     }
   if (ssecount)
-    flags |= 1 << 11;
+    flags |= UNIX64_FLAG_XMM_ARGS;
   cif->flags = flags;
-  cif->bytes = (unsigned)ALIGN (bytes, 8);
+  cif->bytes = ALIGN (bytes, 8);
 
   return FFI_OK;
 }
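
For the two-eightbyte struct returns above, the selection reduces to a four-way table. A restatement in plain C, illustration only (struct_ret_kind is a hypothetical helper, not part of the patch; the constants come from the new internal64.h):

/* Pick the UNIX64_RET_ST_* kind from the SSE-ness of the two eightbytes.  */
#include "internal64.h"

static int
struct_ret_kind (_Bool sse0, _Bool sse1)
{
  if (sse0)
    return sse1 ? UNIX64_RET_ST_XMM0_XMM1 : UNIX64_RET_ST_XMM0_RAX;
  return sse1 ? UNIX64_RET_ST_RAX_XMM0 : UNIX64_RET_ST_RAX_RDX;
}

/* Example: struct { double d; long l; } classifies as SSE then INTEGER,
   so sse0=1, sse1=0 and the result is UNIX64_RET_ST_XMM0_RAX.  */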
@@ -432,20 +476,22 @@ ffi_call_int (ffi_cif *cif, void (*fn)(void), void *rvalue,
   enum x86_64_reg_class classes[MAX_CLASSES];
   char *stack, *argp;
   ffi_type **arg_types;
-  int gprcount, ssecount, ngpr, nsse, i, avn;
-  _Bool ret_in_memory;
+  int gprcount, ssecount, ngpr, nsse, i, avn, flags;
   struct register_args *reg_args;
 
   /* Can't call 32-bit mode from 64-bit mode.  */
   FFI_ASSERT (cif->abi == FFI_UNIX64);
 
   /* If the return value is a struct and we don't have a return value
-     address then we need to make one.  Note the setting of flags to
-     VOID above in ffi_prep_cif_machdep.  */
-  ret_in_memory = (cif->rtype->type == FFI_TYPE_STRUCT
-                   && (cif->flags & 0xff) == FFI_TYPE_VOID);
-  if (rvalue == NULL && ret_in_memory)
-    rvalue = alloca (cif->rtype->size);
+     address then we need to make one.  Otherwise we can ignore it.  */
+  flags = cif->flags;
+  if (rvalue == NULL)
+    {
+      if (flags & UNIX64_FLAG_RET_IN_MEM)
+        rvalue = alloca (cif->rtype->size);
+      else
+        flags = UNIX64_RET_VOID;
+    }
 
   /* Allocate the space for the arguments, plus 4 words of temp space.  */
   stack = alloca (sizeof (struct register_args) + cif->bytes + 4*8);
@@ -458,7 +504,7 @@ ffi_call_int (ffi_cif *cif, void (*fn)(void), void *rvalue,
 
   /* If the return value is passed in memory, add the pointer as the
      first integer argument.  */
-  if (ret_in_memory)
+  if (flags & UNIX64_FLAG_RET_IN_MEM)
     reg_args->gpr[gprcount++] = (unsigned long) rvalue;
 
   avn = cif->nargs;
@@ -503,17 +549,17 @@ ffi_call_int (ffi_cif *cif, void (*fn)(void), void *rvalue,
           switch (arg_types[i]->type)
             {
             case FFI_TYPE_SINT8:
-              *(SINT64 *)&reg_args->gpr[gprcount] = (SINT64) *((SINT8 *) a);
+              reg_args->gpr[gprcount] = (SINT64) *((SINT8 *) a);
               break;
             case FFI_TYPE_SINT16:
-              *(SINT64 *)&reg_args->gpr[gprcount] = (SINT64) *((SINT16 *) a);
+              reg_args->gpr[gprcount] = (SINT64) *((SINT16 *) a);
               break;
             case FFI_TYPE_SINT32:
-              *(SINT64 *)&reg_args->gpr[gprcount] = (SINT64) *((SINT32 *) a);
+              reg_args->gpr[gprcount] = (SINT64) *((SINT32 *) a);
               break;
             default:
               reg_args->gpr[gprcount] = 0;
-              memcpy (&reg_args->gpr[gprcount], a, size < 8 ? size : 8);
+              memcpy (&reg_args->gpr[gprcount], a, size);
             }
           gprcount++;
           break;
@@ -533,7 +579,7 @@ ffi_call_int (ffi_cif *cif, void (*fn)(void), void *rvalue,
   reg_args->rax = ssecount;
 
   ffi_call_unix64 (stack, cif->bytes + sizeof (struct register_args),
-                   cif->flags, rvalue, fn);
+                   flags, rvalue, fn);
 }
 
 void
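
A caller-side example, illustration only, that exercises this two-register struct-return path through the public API (make_pair and the pair type are made up for the example; they are not part of libffi or of this patch):

#include <ffi.h>
#include <stdio.h>

/* A 16-byte struct whose first eightbyte is SSE and second is INTEGER:
   returned in xmm0:rax, i.e. UNIX64_RET_ST_XMM0_RAX internally.  */
typedef struct { double d; long l; } pair;

static pair make_pair (double d, long l) { pair p = { d, l }; return p; }

int main (void)
{
  ffi_cif cif;
  ffi_type *elems[3] = { &ffi_type_double, &ffi_type_slong, NULL };
  ffi_type pair_type = { 0, 0, FFI_TYPE_STRUCT, elems };
  ffi_type *args[2] = { &ffi_type_double, &ffi_type_slong };
  double d = 1.5; long l = 42;
  void *values[2] = { &d, &l };
  pair result;

  if (ffi_prep_cif (&cif, FFI_DEFAULT_ABI, 2, &pair_type, args) == FFI_OK)
    {
      ffi_call (&cif, FFI_FN (make_pair), &result, values);
      printf ("%g %ld\n", result.d, result.l);   /* prints 1.5 42 */
    }
  return 0;
}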
@@ -573,7 +619,7 @@ ffi_prep_closure_loc (ffi_closure* closure,
   if (cif->abi != FFI_UNIX64)
     return FFI_BAD_ABI;
 
-  if (cif->flags & (1 << 11))
+  if (cif->flags & UNIX64_FLAG_XMM_ARGS)
     dest = ffi_closure_unix64_sse;
   else
     dest = ffi_closure_unix64;
@@ -600,39 +646,17 @@ ffi_closure_unix64_inner(ffi_cif *cif,
   ffi_type **arg_types;
   long i, avn;
   int gprcount, ssecount, ngpr, nsse;
-  int ret;
-
-  avalue = alloca(cif->nargs * sizeof(void *));
-  gprcount = ssecount = 0;
-
-  ret = cif->rtype->type;
-  if (ret != FFI_TYPE_VOID)
-    {
-      enum x86_64_reg_class classes[MAX_CLASSES];
-      size_t n = examine_argument (cif->rtype, classes, 1, &ngpr, &nsse);
-      if (n == 0)
-        {
-          /* The return value goes in memory.  Arrange for the closure
-             return value to go directly back to the original caller.  */
-          rvalue = (void *) (unsigned long) reg_args->gpr[gprcount++];
-          /* We don't have to do anything in asm for the return.  */
-          ret = FFI_TYPE_VOID;
-        }
-      else if (ret == FFI_TYPE_STRUCT && n == 2)
-        {
-          /* Mark which register the second word of the structure goes in.  */
-          _Bool sse0 = SSE_CLASS_P (classes[0]);
-          _Bool sse1 = SSE_CLASS_P (classes[1]);
-          if (!sse0 && sse1)
-            ret |= 1 << 8;
-          else if (sse0 && !sse1)
-            ret |= 1 << 9;
-        }
-    }
+  int flags;
 
   avn = cif->nargs;
-  arg_types = cif->arg_types;
+  flags = cif->flags;
+  avalue = alloca(avn * sizeof(void *));
+  gprcount = ssecount = 0;
+
+  if (flags & UNIX64_FLAG_RET_IN_MEM)
+    rvalue = (void *)(uintptr_t)reg_args->gpr[gprcount++];
 
+  arg_types = cif->arg_types;
   for (i = 0; i < avn; ++i)
     {
       enum x86_64_reg_class classes[MAX_CLASSES];
@@ -693,7 +717,7 @@ ffi_closure_unix64_inner(ffi_cif *cif,
   fun (cif, rvalue, avalue, user_data);
 
   /* Tell assembly how to perform return type promotions.  */
-  return ret;
+  return flags;
 }
 
 extern void ffi_go_closure_unix64(void) FFI_HIDDEN;
@@ -706,7 +730,7 @@ ffi_prep_go_closure (ffi_go_closure* closure, ffi_cif* cif,
   if (cif->abi != FFI_UNIX64)
     return FFI_BAD_ABI;
 
-  closure->tramp = (cif->flags & (1 << 11)
+  closure->tramp = (cif->flags & UNIX64_FLAG_XMM_ARGS
                     ? ffi_go_closure_unix64_sse
                     : ffi_go_closure_unix64);
   closure->cif = cif;
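
A closure example, illustration only, built entirely on the public API: with float arguments, ffi_prep_cif_machdep sets UNIX64_FLAG_XMM_ARGS, so ffi_prep_closure_loc picks ffi_closure_unix64_sse, and the float return comes back through the UNIX64_RET_XMM32 path (add_f and main are made up for the example):

#include <ffi.h>
#include <stdio.h>

/* The closure target: receives the cif, a slot for the return value,
   pointers to the arguments, and the user data.  */
static void
add_f (ffi_cif *cif, void *ret, void **args, void *userdata)
{
  (void) cif; (void) userdata;
  *(float *) ret = *(float *) args[0] + *(float *) args[1];
}

int main (void)
{
  ffi_cif cif;
  ffi_type *args[2] = { &ffi_type_float, &ffi_type_float };
  void *code;
  ffi_closure *closure = ffi_closure_alloc (sizeof (ffi_closure), &code);
  float (*fn) (float, float);

  if (closure == NULL)
    return 1;
  if (ffi_prep_cif (&cif, FFI_DEFAULT_ABI, 2, &ffi_type_float, args) == FFI_OK
      && ffi_prep_closure_loc (closure, &cif, add_f, NULL, code) == FFI_OK)
    {
      fn = (float (*)(float, float)) code;
      printf ("%f\n", fn (1.25f, 2.5f));   /* prints 3.750000 */
    }
  ffi_closure_free (closure);
  return 0;
}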

src/x86/internal64.h (new file, 20 lines)

@@ -0,0 +1,20 @@
+#define UNIX64_RET_VOID		0
+#define UNIX64_RET_UINT8	1
+#define UNIX64_RET_UINT16	2
+#define UNIX64_RET_UINT32	3
+#define UNIX64_RET_SINT8	4
+#define UNIX64_RET_SINT16	5
+#define UNIX64_RET_SINT32	6
+#define UNIX64_RET_INT64	7
+#define UNIX64_RET_XMM32	8
+#define UNIX64_RET_XMM64	9
+#define UNIX64_RET_X87		10
+#define UNIX64_RET_ST_RAX_RDX	11
+#define UNIX64_RET_ST_XMM0_RAX	12
+#define UNIX64_RET_ST_RAX_XMM0	13
+#define UNIX64_RET_ST_XMM0_XMM1	14
+#define UNIX64_RET_LAST		14
+
+#define UNIX64_FLAG_RET_IN_MEM	(1 << 10)
+#define UNIX64_FLAG_XMM_ARGS	(1 << 11)
+#define UNIX64_SIZE_SHIFT	12
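
These constants double as indices into the dispatch tables in unix64.S: the new E macro pads every table entry to exactly 8 bytes (padding with 0x90, i.e. nop), so the assembly can jump to table + kind * 8 after checking the kind against UNIX64_RET_LAST. A rough C rendering of the store-side dispatch, illustration only (store_result is hypothetical and deliberately incomplete):

#include <stdlib.h>
#include "internal64.h"

/* Sketch: what the table dispatch in ffi_call_unix64 amounts to.
   Only the low byte of the flags word selects the routine.  */
static void
store_result (int flags, void *rvalue, long rax, double xmm0)
{
  int kind = flags & 0xff;
  if (kind > UNIX64_RET_LAST)
    abort ();                 /* mirrors the new "ja 9f ... call abort" path */
  switch (kind)
    {
    case UNIX64_RET_VOID:
      break;
    case UNIX64_RET_SINT8:
      *(long *) rvalue = (signed char) rax;   /* sign-extend, store 8 bytes */
      break;
    case UNIX64_RET_XMM64:
      *(double *) rvalue = xmm0;
      break;
    /* ...the remaining UNIX64_RET_* kinds follow the same pattern...  */
    }
}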

src/x86/unix64.S

@@ -31,9 +31,15 @@
 #include <fficonfig.h>
 #include <ffi.h>
 #include <ffi_cfi.h>
+#include "internal64.h"
 
 	.text
 
+	.macro E index
+	.align	8
+	.org	0b + \index * 8, 0x90
+	.endm
+
 /* ffi_call_unix64 (void *args, unsigned long bytes, unsigned flags,
 	    void *raddr, void (*fnaddr)(void));
@@ -41,7 +47,7 @@
    for this function.  This has been allocated by ffi_call.  We also
    deallocate some of the stack that has been alloca'd.  */
 
-	.align	2
+	.align	8
 	.globl	ffi_call_unix64
 	.type	ffi_call_unix64,@function
 	FFI_HIDDEN(ffi_call_unix64)
@@ -100,108 +106,80 @@ ffi_call_unix64:
 	cfi_restore(%rbp)
 
 	/* The first byte of the flags contains the FFI_TYPE.  */
+	cmpb	$UNIX64_RET_LAST, %cl
 	movzbl	%cl, %r10d
-	leaq	.Lstore_table(%rip), %r11
-	movslq	(%r11, %r10, 4), %r10
-	addq	%r11, %r10
+	leaq	0f(%rip), %r11
+	ja	9f
+	leaq	(%r11, %r10, 8), %r10
+
+	/* Prep for the structure cases: scratch area in redzone.  */
+	leaq	-20(%rsp), %rsi
 	jmp	*%r10
 
-	.section .rodata
-	.align	2
-.Lstore_table:
-	.long	.Lst_void-.Lstore_table	/* FFI_TYPE_VOID */
-	.long	.Lst_sint32-.Lstore_table	/* FFI_TYPE_INT */
-	.long	.Lst_float-.Lstore_table	/* FFI_TYPE_FLOAT */
-	.long	.Lst_double-.Lstore_table	/* FFI_TYPE_DOUBLE */
-	.long	.Lst_ldouble-.Lstore_table	/* FFI_TYPE_LONGDOUBLE */
-	.long	.Lst_uint8-.Lstore_table	/* FFI_TYPE_UINT8 */
-	.long	.Lst_sint8-.Lstore_table	/* FFI_TYPE_SINT8 */
-	.long	.Lst_uint16-.Lstore_table	/* FFI_TYPE_UINT16 */
-	.long	.Lst_sint16-.Lstore_table	/* FFI_TYPE_SINT16 */
-	.long	.Lst_uint32-.Lstore_table	/* FFI_TYPE_UINT32 */
-	.long	.Lst_sint32-.Lstore_table	/* FFI_TYPE_SINT32 */
-	.long	.Lst_int64-.Lstore_table	/* FFI_TYPE_UINT64 */
-	.long	.Lst_int64-.Lstore_table	/* FFI_TYPE_SINT64 */
-	.long	.Lst_struct-.Lstore_table	/* FFI_TYPE_STRUCT */
-	.long	.Lst_int64-.Lstore_table	/* FFI_TYPE_POINTER */
-	.previous
-
-	.align	2
-.Lst_void:
+	.align	8
+0:
+E UNIX64_RET_VOID
 	ret
-	.align	2
-
-.Lst_uint8:
-	movzbq	%al, %rax
+E UNIX64_RET_UINT8
+	movzbl	%al, %eax
 	movq	%rax, (%rdi)
 	ret
-	.align	2
-.Lst_sint8:
+E UNIX64_RET_UINT16
+	movzwl	%ax, %eax
+	movq	%rax, (%rdi)
+	ret
+E UNIX64_RET_UINT32
+	movl	%eax, %eax
+	movq	%rax, (%rdi)
+	ret
+E UNIX64_RET_SINT8
 	movsbq	%al, %rax
 	movq	%rax, (%rdi)
 	ret
-	.align	2
-.Lst_uint16:
-	movzwq	%ax, %rax
-	movq	%rax, (%rdi)
-	.align	2
-.Lst_sint16:
+E UNIX64_RET_SINT16
 	movswq	%ax, %rax
 	movq	%rax, (%rdi)
 	ret
-	.align	2
-.Lst_uint32:
-	movl	%eax, %eax
-	movq	%rax, (%rdi)
-	.align	2
-.Lst_sint32:
+E UNIX64_RET_SINT32
 	cltq
 	movq	%rax, (%rdi)
 	ret
-	.align	2
-.Lst_int64:
+E UNIX64_RET_INT64
 	movq	%rax, (%rdi)
 	ret
-
-	.align	2
-.Lst_float:
-	movss	%xmm0, (%rdi)
+E UNIX64_RET_XMM32
+	movd	%xmm0, (%rdi)
 	ret
-	.align	2
-.Lst_double:
-	movsd	%xmm0, (%rdi)
+E UNIX64_RET_XMM64
+	movq	%xmm0, (%rdi)
 	ret
-.Lst_ldouble:
+E UNIX64_RET_X87
 	fstpt	(%rdi)
 	ret
-
-	.align	2
-.Lst_struct:
-	leaq	-20(%rsp), %rsi		/* Scratch area in redzone.  */
-
-	/* We have to locate the values now, and since we don't want to
-	   write too much data into the user's return value, we spill the
-	   value to a 16 byte scratch area first.  Bits 8, 9, and 10
-	   control where the values are located.  Only one of the three
-	   bits will be set; see ffi_prep_cif_machdep for the pattern.  */
-	movd	%xmm0, %r10
-	movd	%xmm1, %r11
-	testl	$0x100, %ecx
-	cmovnz	%rax, %rdx
-	cmovnz	%r10, %rax
-	testl	$0x200, %ecx
-	cmovnz	%r10, %rdx
-	testl	$0x400, %ecx
-	cmovnz	%r10, %rax
-	cmovnz	%r11, %rdx
-	movq	%rax, (%rsi)
+E UNIX64_RET_ST_RAX_RDX
 	movq	%rdx, 8(%rsi)
-
-	/* Bits 12-31 contain the true size of the structure.  Copy from
-	   the scratch area to the true destination.  */
-	shrl	$12, %ecx
+	jmp	2f
+E UNIX64_RET_ST_XMM0_RAX
+	movq	%rax, 8(%rsi)
+	jmp	3f
+E UNIX64_RET_ST_RAX_XMM0
+	movq	%xmm0, 8(%rsi)
+	jmp	2f
+E UNIX64_RET_ST_XMM0_XMM1
+	movq	%xmm1, 8(%rsi)
+
+	.align	8
+3:	movq	%xmm0, (%rsi)
+	shrl	$UNIX64_SIZE_SHIFT, %ecx
 	rep movsb
 	ret
+	.align	8
+2:	movq	%rax, (%rsi)
+	shrl	$UNIX64_SIZE_SHIFT, %ecx
+	rep movsb
+	ret
+9:	call	abort@PLT
 
 /* Many times we can avoid loading any SSE registers at all.
    It's not worth an indirect jump to load the exact set of
@@ -292,83 +270,67 @@ ffi_closure_unix64:
 	cfi_adjust_cfa_offset(-ffi_closure_FS)
 
 	/* The first byte of the return value contains the FFI_TYPE.  */
+	cmpb	$UNIX64_RET_LAST, %al
 	movzbl	%al, %r10d
-	leaq	.Lload_table(%rip), %r11
-	movslq	(%r11, %r10, 4), %r10
-	addq	%r11, %r10
+	leaq	0f(%rip), %r11
+	ja	9f
+	leaq	(%r11, %r10, 8), %r10
 	jmp	*%r10
 
-	.section .rodata
-	.align	2
-.Lload_table:
-	.long	.Lld_void-.Lload_table		/* FFI_TYPE_VOID */
-	.long	.Lld_int32-.Lload_table		/* FFI_TYPE_INT */
-	.long	.Lld_float-.Lload_table		/* FFI_TYPE_FLOAT */
-	.long	.Lld_double-.Lload_table	/* FFI_TYPE_DOUBLE */
-	.long	.Lld_ldouble-.Lload_table	/* FFI_TYPE_LONGDOUBLE */
-	.long	.Lld_int8-.Lload_table		/* FFI_TYPE_UINT8 */
-	.long	.Lld_int8-.Lload_table		/* FFI_TYPE_SINT8 */
-	.long	.Lld_int16-.Lload_table		/* FFI_TYPE_UINT16 */
-	.long	.Lld_int16-.Lload_table		/* FFI_TYPE_SINT16 */
-	.long	.Lld_int32-.Lload_table		/* FFI_TYPE_UINT32 */
-	.long	.Lld_int32-.Lload_table		/* FFI_TYPE_SINT32 */
-	.long	.Lld_int64-.Lload_table		/* FFI_TYPE_UINT64 */
-	.long	.Lld_int64-.Lload_table		/* FFI_TYPE_SINT64 */
-	.long	.Lld_struct-.Lload_table	/* FFI_TYPE_STRUCT */
-	.long	.Lld_int64-.Lload_table		/* FFI_TYPE_POINTER */
-	.previous
-
-	.align	2
-.Lld_void:
+	.align	8
+0:
+E UNIX64_RET_VOID
 	ret
-
-	.align	2
-.Lld_int8:
+E UNIX64_RET_UINT8
 	movzbl	ffi_closure_RED_RVALUE(%rsp), %eax
 	ret
-	.align	2
-.Lld_int16:
+E UNIX64_RET_UINT16
 	movzwl	ffi_closure_RED_RVALUE(%rsp), %eax
 	ret
-	.align	2
-.Lld_int32:
+E UNIX64_RET_UINT32
 	movl	ffi_closure_RED_RVALUE(%rsp), %eax
 	ret
-	.align	2
-.Lld_int64:
+E UNIX64_RET_SINT8
+	movsbl	ffi_closure_RED_RVALUE(%rsp), %eax
+	ret
+E UNIX64_RET_SINT16
+	movswl	ffi_closure_RED_RVALUE(%rsp), %eax
+	ret
+E UNIX64_RET_SINT32
+	movl	ffi_closure_RED_RVALUE(%rsp), %eax
+	ret
+E UNIX64_RET_INT64
 	movq	ffi_closure_RED_RVALUE(%rsp), %rax
 	ret
-
-	.align	2
-.Lld_float:
-	movss	ffi_closure_RED_RVALUE(%rsp), %xmm0
+E UNIX64_RET_XMM32
+	movd	ffi_closure_RED_RVALUE(%rsp), %xmm0
 	ret
-	.align	2
-.Lld_double:
-	movsd	ffi_closure_RED_RVALUE(%rsp), %xmm0
+E UNIX64_RET_XMM64
+	movq	ffi_closure_RED_RVALUE(%rsp), %xmm0
 	ret
-	.align	2
-.Lld_ldouble:
+E UNIX64_RET_X87
 	fldt	ffi_closure_RED_RVALUE(%rsp)
 	ret
-
-	.align	2
-.Lld_struct:
-	/* There are four possibilities here, %rax/%rdx, %xmm0/%rax,
-	   %rax/%xmm0, %xmm0/%xmm1.  We collapse two by always loading
-	   both rdx and xmm1 with the second word.  For the remaining,
-	   bit 8 set means xmm0 gets the second word, and bit 9 means
-	   that rax gets the second word.  */
-	movq	ffi_closure_RED_RVALUE(%rsp), %rcx
+E UNIX64_RET_ST_RAX_RDX
 	movq	ffi_closure_RED_RVALUE+8(%rsp), %rdx
+	jmp	2f
+E UNIX64_RET_ST_XMM0_RAX
+	movq	ffi_closure_RED_RVALUE+8(%rsp), %rax
+	jmp	3f
+E UNIX64_RET_ST_RAX_XMM0
+	movq	ffi_closure_RED_RVALUE+8(%rsp), %xmm0
+	jmp	2f
+E UNIX64_RET_ST_XMM0_XMM1
 	movq	ffi_closure_RED_RVALUE+8(%rsp), %xmm1
-	testl	$0x100, %eax
-	cmovnz	%rdx, %rcx
-	movd	%rcx, %xmm0
-	testl	$0x200, %eax
-	movq	ffi_closure_RED_RVALUE(%rsp), %rax
-	cmovnz	%rdx, %rax
+
+	.align	8
+3:	movq	ffi_closure_RED_RVALUE(%rsp), %xmm0
 	ret
+	.align	8
+2:	movq	ffi_closure_RED_RVALUE(%rsp), %rax
+	ret
+9:	call	abort@PLT
 
 	cfi_endproc
 	.size	ffi_closure_unix64,.-ffi_closure_unix64