arm: Rewrite ffi_call

Use the trick to allocate the stack frame for ffi_call_SYSV
within ffi_call itself.
Author: Richard Henderson
Date:   2014-10-17 01:27:16 -04:00
Parent: a74a3aaddb
Commit: e7f15f60e8

4 changed files with 303 additions and 449 deletions
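The "trick" is that ffi_call now performs a single alloca covering the VFP load area, the outgoing argument words, the frame used by the assembly, and any return-value scratch, then hands the pieces to ffi_call_SYSV/ffi_call_VFP. A minimal C sketch of that layout follows (illustrative only; carve_frame is a made-up helper, not code from the patch):

#include <stddef.h>

struct call_frame { void *fp, *lr, *rvalue; int flags; };

/* Hypothetical helper, for illustration: carve one alloca'd block into
   the pieces ffi_call hands to the assembly.  "bytes" is cif->bytes,
   "vfp_size" is 64 when VFP argument registers are used, "rsize" is the
   scratch return-value size (0 when the caller's buffer is used).  */
static inline void
carve_frame (char *block, size_t vfp_size, size_t bytes, size_t rsize,
             char **vfp_space, char **stack, struct call_frame **frame,
             void **rvalue_scratch)
{
  *vfp_space = vfp_size ? block : NULL;        /* d0-d7 load area, if any */
  *stack = block + vfp_size;                   /* outgoing argument words */
  *frame = (struct call_frame *) (*stack + bytes);  /* saved fp/lr, rvalue, flags */
  *rvalue_scratch = rsize ? (void *) (*frame + 1) : NULL;
}

Because the frame sits at a known offset from the argument area, ffi_call_SYSV can simply install "stack" as sp and "frame" as fp instead of building its own frame.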

src/arm/ffi.c

@@ -30,16 +30,13 @@
 #include <ffi.h>
 #include <ffi_common.h>
 #include <stdlib.h>
+#include "internal.h"
 
 /* Forward declares. */
 static int vfp_type_p (const ffi_type *);
 static void layout_vfp_args (ffi_cif *);
 
-int ffi_prep_args_SYSV (char *stack, extended_cif *ecif, float *vfp_space);
-int ffi_prep_args_VFP (char *stack, extended_cif *ecif, float *vfp_space);
-
 static void *
 ffi_align (ffi_type *ty, void *p)
 {
@@ -98,53 +95,44 @@ ffi_put_arg (ffi_type *ty, void *src, void *dst)
   return ALIGN (z, 4);
 }
 
-/* ffi_prep_args is called by the assembly routine once stack space
-   has been allocated for the function's arguments
+/* ffi_prep_args is called once stack space has been allocated
+   for the function's arguments.
 
    The vfp_space parameter is the load area for VFP regs, the return
    value is cif->vfp_used (word bitset of VFP regs used for passing
   arguments). These are only used for the VFP hard-float ABI.
 */
-int
-ffi_prep_args_SYSV (char *stack, extended_cif *ecif, float *vfp_space)
+static void
+ffi_prep_args_SYSV (ffi_cif *cif, int flags, void *rvalue,
+                    void **avalue, char *argp)
 {
-  register unsigned int i;
-  register void **p_argv;
-  register char *argp;
-  register ffi_type **p_arg;
-
-  argp = stack;
+  ffi_type **arg_types = cif->arg_types;
+  int i, n;
 
-  if (ecif->cif->flags == FFI_TYPE_STRUCT)
+  if (flags == ARM_TYPE_STRUCT)
     {
-      *(void **) argp = ecif->rvalue;
+      *(void **) argp = rvalue;
       argp += 4;
     }
 
-  p_argv = ecif->avalue;
-
-  for (i = ecif->cif->nargs, p_arg = ecif->cif->arg_types;
-       (i != 0); i--, p_arg++, p_argv++)
+  for (i = 0, n = cif->nargs; i < n; i++)
     {
-      argp = ffi_align (*p_arg, argp);
-      argp += ffi_put_arg (*p_arg, *p_argv, argp);
+      ffi_type *ty = arg_types[i];
+      argp = ffi_align (ty, argp);
+      argp += ffi_put_arg (ty, avalue[i], argp);
     }
-
-  return 0;
 }
 
-int
-ffi_prep_args_VFP (char *stack, extended_cif * ecif, float *vfp_space)
+static void
+ffi_prep_args_VFP (ffi_cif *cif, int flags, void *rvalue,
+                   void **avalue, char *stack, char *vfp_space)
 {
-  register unsigned int i, vi = 0;
-  register void **p_argv;
-  register char *argp, *regp, *eo_regp;
-  register ffi_type **p_arg;
+  ffi_type **arg_types = cif->arg_types;
+  int i, n, vi = 0;
+  char *argp, *regp, *eo_regp;
   char stack_used = 0;
   char done_with_regs = 0;
 
-  /* Make sure we are using FFI_VFP. */
-  FFI_ASSERT (ecif->cif->abi == FFI_VFP);
-
   /* The first 4 words on the stack are used for values
      passed in core registers. */
   regp = stack;
@@ -152,37 +140,36 @@ ffi_prep_args_VFP (char *stack, extended_cif * ecif, float *vfp_space)
 
   /* If the function returns an FFI_TYPE_STRUCT in memory,
      that address is passed in r0 to the function. */
-  if (ecif->cif->flags == FFI_TYPE_STRUCT)
+  if (flags == ARM_TYPE_STRUCT)
     {
-      *(void **) regp = ecif->rvalue;
+      *(void **) regp = rvalue;
       regp += 4;
     }
 
-  p_argv = ecif->avalue;
-
-  for (i = ecif->cif->nargs, p_arg = ecif->cif->arg_types;
-       (i != 0); i--, p_arg++, p_argv++)
+  for (i = 0, n = cif->nargs; i < n; i++)
     {
-      int is_vfp_type = vfp_type_p (*p_arg);
+      ffi_type *ty = arg_types[i];
+      void *a = avalue[i];
+      int is_vfp_type = vfp_type_p (ty);
 
       /* Allocated in VFP registers. */
-      if (vi < ecif->cif->vfp_nargs && is_vfp_type)
+      if (vi < cif->vfp_nargs && is_vfp_type)
        {
-         char *vfp_slot = (char *) (vfp_space + ecif->cif->vfp_args[vi++]);
-         ffi_put_arg (*p_arg, *p_argv, vfp_slot);
+         char *vfp_slot = vfp_space + cif->vfp_args[vi++] * 4;
+         ffi_put_arg (ty, a, vfp_slot);
          continue;
        }
       /* Try allocating in core registers. */
       else if (!done_with_regs && !is_vfp_type)
        {
-         char *tregp = ffi_align (*p_arg, regp);
-         size_t size = (*p_arg)->size;
+         char *tregp = ffi_align (ty, regp);
+         size_t size = ty->size;
          size = (size < 4) ? 4 : size; // pad
          /* Check if there is space left in the aligned register
             area to place the argument. */
          if (tregp + size <= eo_regp)
            {
-             regp = tregp + ffi_put_arg (*p_arg, *p_argv, tregp);
+             regp = tregp + ffi_put_arg (ty, a, tregp);
              done_with_regs = (regp == argp);
              // ensure we did not write into the stack area
              FFI_ASSERT (regp <= argp);
@@ -195,88 +182,98 @@ ffi_prep_args_VFP (char *stack, extended_cif * ecif, float *vfp_space)
            {
              stack_used = 1;
              done_with_regs = 1;
-             argp = tregp + ffi_put_arg (*p_arg, *p_argv, tregp);
+             argp = tregp + ffi_put_arg (ty, a, tregp);
              FFI_ASSERT (eo_regp < argp);
              continue;
            }
        }
       /* Base case, arguments are passed on the stack */
       stack_used = 1;
-      argp = ffi_align (*p_arg, argp);
-      argp += ffi_put_arg (*p_arg, *p_argv, argp);
+      argp = ffi_align (ty, argp);
+      argp += ffi_put_arg (ty, a, argp);
     }
-
-  /* Indicate the VFP registers used. */
-  return ecif->cif->vfp_used;
 }
 
 /* Perform machine dependent cif processing */
 ffi_status
 ffi_prep_cif_machdep (ffi_cif *cif)
 {
+  int flags = 0, cabi = cif->abi;
+  size_t bytes;
+
   /* Round the stack up to a multiple of 8 bytes.  This isn't needed
      everywhere, but it is on some platforms, and it doesn't harm anything
      when it isn't needed. */
-  cif->bytes = (cif->bytes + 7) & ~7;
+  bytes = ALIGN (cif->bytes, 8);
 
-  /* Set the return type flag */
-  switch (cif->rtype->type)
-    {
-    case FFI_TYPE_VOID:
-    case FFI_TYPE_FLOAT:
-    case FFI_TYPE_DOUBLE:
-      cif->flags = (unsigned) cif->rtype->type;
-      break;
-
-    case FFI_TYPE_SINT64:
-    case FFI_TYPE_UINT64:
-      cif->flags = (unsigned) FFI_TYPE_SINT64;
-      break;
-
-    case FFI_TYPE_STRUCT:
-      if (cif->abi == FFI_VFP)
-	{
-	  int h = vfp_type_p (cif->rtype);
-	  if (h)
-	    {
-	      int ele_count = h >> 8;
-	      int type_code = h & 0xff;
-	      if (ele_count > 1)
-		{
-		  if (type_code == FFI_TYPE_FLOAT)
-		    type_code = FFI_TYPE_STRUCT_VFP_FLOAT;
-		  else
-		    type_code = FFI_TYPE_STRUCT_VFP_DOUBLE;
-		}
-	      cif->flags = type_code;
-	      break;
-	    }
-	}
-      if (cif->rtype->size <= 4)
-	{
-	  /* A Composite Type not larger than 4 bytes is returned in r0. */
-	  cif->flags = (unsigned) FFI_TYPE_INT;
-	}
-      else
-	{
-	  /* A Composite Type larger than 4 bytes, or whose size cannot
-	     be determined statically ... is stored in memory at an
-	     address passed [in r0]. */
-	  cif->flags = (unsigned) FFI_TYPE_STRUCT;
-	}
-      break;
-
-    default:
-      cif->flags = FFI_TYPE_INT;
-      break;
-    }
+  /* Minimum stack space is the 4 register arguments that we pop. */
+  if (bytes < 4*4)
+    bytes = 4*4;
+  cif->bytes = bytes;
 
   /* Map out the register placements of VFP register args.  The VFP
      hard-float calling conventions are slightly more sophisticated
      than the base calling conventions, so we do it here instead of
      in ffi_prep_args(). */
-  if (cif->abi == FFI_VFP)
+  if (cabi == FFI_VFP)
     layout_vfp_args (cif);
 
+  /* Set the return type flag */
+  switch (cif->rtype->type)
+    {
+    case FFI_TYPE_VOID:
+      flags = ARM_TYPE_VOID;
+      break;
+
+    case FFI_TYPE_INT:
+    case FFI_TYPE_UINT8:
+    case FFI_TYPE_SINT8:
+    case FFI_TYPE_UINT16:
+    case FFI_TYPE_SINT16:
+    case FFI_TYPE_UINT32:
+    case FFI_TYPE_SINT32:
+    case FFI_TYPE_POINTER:
+      flags = ARM_TYPE_INT;
+      break;
+
+    case FFI_TYPE_SINT64:
+    case FFI_TYPE_UINT64:
+      flags = ARM_TYPE_INT64;
+      break;
+
+    case FFI_TYPE_FLOAT:
+      flags = (cabi == FFI_VFP ? ARM_TYPE_VFP_S : ARM_TYPE_INT);
+      break;
+    case FFI_TYPE_DOUBLE:
+      flags = (cabi == FFI_VFP ? ARM_TYPE_VFP_D : ARM_TYPE_INT64);
+      break;
+
+    case FFI_TYPE_STRUCT:
+      if (cabi == FFI_VFP)
+	{
+	  int h = vfp_type_p (cif->rtype);
+	  flags = ARM_TYPE_VFP_N;
+	  if (h == 0x100 + FFI_TYPE_FLOAT)
+	    flags = ARM_TYPE_VFP_S;
+	  if (h == 0x100 + FFI_TYPE_DOUBLE)
+	    flags = ARM_TYPE_VFP_D;
+	  if (h != 0)
+	    break;
+	}
+      /* A Composite Type not larger than 4 bytes is returned in r0.
+	 A Composite Type larger than 4 bytes, or whose size cannot
+	 be determined statically ... is stored in memory at an
+	 address passed [in r0]. */
+      flags = (cif->rtype->size <= 4 ? ARM_TYPE_INT : ARM_TYPE_STRUCT);
+      break;
+
+    default:
+      abort();
+    }
+
+  cif->flags = flags;
+
   return FFI_OK;
 }
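A note on the vfp_type_p() encoding used above: a nonzero result packs the element count of a homogeneous floating-point aggregate into the high byte and the element type into the low byte, which is why 0x100 + FFI_TYPE_FLOAT / FFI_TYPE_DOUBLE identify one-element aggregates. A sketch of the decode in C (the function name is illustrative, not from the patch):

#include <ffi.h>
#include "internal.h"   /* ARM_TYPE_* codes added by this commit */

static int
flags_for_homogeneous (int h)
{
  int count = h >> 8;       /* number of elements */
  int elt   = h & 0xff;     /* FFI_TYPE_FLOAT or FFI_TYPE_DOUBLE */

  if (h == 0)
    return -1;              /* not a homogeneous FP aggregate */
  if (count == 1)
    return elt == FFI_TYPE_FLOAT ? ARM_TYPE_VFP_S : ARM_TYPE_VFP_D;
  return ARM_TYPE_VFP_N;    /* 2..4 elements, returned in d0-d3 */
}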
@@ -293,69 +290,83 @@ ffi_prep_cif_machdep_var (ffi_cif * cif,
 }
 
 /* Prototypes for assembly functions, in sysv.S. */
-extern void ffi_call_SYSV (void (*fn) (void), extended_cif *, unsigned,
-			   unsigned, unsigned *);
-extern void ffi_call_VFP (void (*fn) (void), extended_cif *, unsigned,
-			  unsigned, unsigned *);
+
+struct call_frame
+{
+  void *fp;
+  void *lr;
+  void *rvalue;
+  int flags;
+};
+
+extern void ffi_call_SYSV (void *stack, struct call_frame *,
+			   void (*fn) (void)) FFI_HIDDEN;
+extern void ffi_call_VFP (void *vfp_space, struct call_frame *,
+			  void (*fn) (void), unsigned vfp_used) FFI_HIDDEN;
 
 void
 ffi_call (ffi_cif * cif, void (*fn) (void), void *rvalue, void **avalue)
 {
-  extended_cif ecif;
+  int flags = cif->flags;
+  ffi_type *rtype = cif->rtype;
+  size_t bytes, rsize, vfp_size;
+  char *stack, *vfp_space, *new_rvalue;
+  struct call_frame *frame;
 
-  int small_struct = (cif->flags == FFI_TYPE_INT
-		      && cif->rtype->type == FFI_TYPE_STRUCT);
-  int vfp_struct = (cif->flags == FFI_TYPE_STRUCT_VFP_FLOAT
-		    || cif->flags == FFI_TYPE_STRUCT_VFP_DOUBLE);
-  unsigned int temp;
-
-  ecif.cif = cif;
-  ecif.avalue = avalue;
-
-  /* If the return value is a struct and we don't have a return
-     value address then we need to make one. */
-  if ((rvalue == NULL) && (cif->flags == FFI_TYPE_STRUCT))
+  rsize = 0;
+  if (rvalue == NULL)
     {
-      ecif.rvalue = alloca (cif->rtype->size);
+      /* If the return value is a struct and we don't have a return
+	 value address then we need to make one.  Otherwise the return
+	 value is in registers and we can ignore them. */
+      if (flags == ARM_TYPE_STRUCT)
+	rsize = rtype->size;
+      else
+	flags = ARM_TYPE_VOID;
     }
-  else if (small_struct)
-    ecif.rvalue = &temp;
-  else if (vfp_struct)
+  else if (flags == ARM_TYPE_VFP_N)
     {
       /* Largest case is double x 4. */
-      ecif.rvalue = alloca (32);
+      rsize = 32;
     }
-  else
-    ecif.rvalue = rvalue;
+  else if (flags == ARM_TYPE_INT && rtype->type == FFI_TYPE_STRUCT)
+    rsize = 4;
 
-  switch (cif->abi)
+  /* Largest case. */
+  vfp_size = (cif->abi == FFI_VFP && cif->vfp_used ? 8*8: 0);
+
+  bytes = cif->bytes;
+  stack = alloca (vfp_size + bytes + sizeof(struct call_frame) + rsize);
+
+  vfp_space = NULL;
+  if (vfp_size)
     {
-    case FFI_SYSV:
-      ffi_call_SYSV (fn, &ecif, cif->bytes, cif->flags, ecif.rvalue);
-      break;
-
-    case FFI_VFP:
-#ifdef __ARM_EABI__
-      ffi_call_VFP (fn, &ecif, cif->bytes, cif->flags, ecif.rvalue);
-      break;
-#endif
-
-    default:
-      FFI_ASSERT (0);
-      break;
+      vfp_space = stack;
+      stack += vfp_size;
     }
-  if (small_struct)
+
+  frame = (struct call_frame *)(stack + bytes);
+
+  new_rvalue = rvalue;
+  if (rsize)
+    new_rvalue = (void *)(frame + 1);
+
+  frame->rvalue = new_rvalue;
+  frame->flags = flags;
+
+  if (vfp_space)
     {
-      FFI_ASSERT (rvalue != NULL);
-      memcpy (rvalue, &temp, cif->rtype->size);
+      ffi_prep_args_VFP (cif, flags, new_rvalue, avalue, stack, vfp_space);
+      ffi_call_VFP (vfp_space, frame, fn, cif->vfp_used);
     }
-  else if (vfp_struct)
+  else
     {
-      FFI_ASSERT (rvalue != NULL);
-      memcpy (rvalue, ecif.rvalue, cif->rtype->size);
+      ffi_prep_args_SYSV (cif, flags, new_rvalue, avalue, stack);
+      ffi_call_SYSV (stack, frame, fn);
     }
+
+  if (rvalue && rvalue != new_rvalue)
+    memcpy (rvalue, new_rvalue, rtype->size);
 }
 
 /** private members **/
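The externally visible API is unchanged; only the private ffi_call_SYSV/ffi_call_VFP interface moved. For reference, a standard libffi usage sketch that exercises the rewritten path (ordinary public API, nothing specific to this commit):

#include <ffi.h>
#include <stdio.h>

static int add (int a, int b) { return a + b; }

int
main (void)
{
  ffi_cif cif;
  ffi_type *args[2] = { &ffi_type_sint, &ffi_type_sint };
  int a = 2, b = 3;
  void *avalue[2] = { &a, &b };
  ffi_arg result;

  /* ffi_prep_cif calls ffi_prep_cif_machdep, which now records the
     ARM_TYPE_* return code in cif->flags. */
  if (ffi_prep_cif (&cif, FFI_DEFAULT_ABI, 2, &ffi_type_sint, args) != FFI_OK)
    return 1;

  /* ffi_call now builds the whole frame for ffi_call_SYSV itself. */
  ffi_call (&cif, FFI_FN (add), &result, avalue);
  printf ("%d\n", (int) result);
  return 0;
}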

src/arm/ffitarget.h

@@ -53,7 +53,7 @@ typedef enum ffi_abi {
 #define FFI_EXTRA_CIF_FIELDS			\
   int vfp_used;					\
-  short vfp_reg_free, vfp_nargs;		\
+  unsigned short vfp_reg_free, vfp_nargs;	\
   signed char vfp_args[16]			\
 
 /* Internally used. */
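For context (not part of the patch): vfp_args[] holds, per VFP-allocated argument, the index of its first 4-byte slot in the d0-d7 load area, which is why the rewritten ffi_prep_args_VFP above multiplies by 4 now that vfp_space is a char * instead of a float *. A sketch:

#include <ffi.h>

/* Illustration only; "cif" carries the FFI_EXTRA_CIF_FIELDS above and
   "vfp_space" is the 64-byte d0-d7 load area set up by ffi_call. */
static char *
vfp_slot (ffi_cif *cif, char *vfp_space, int vi)
{
  /* vfp_args[vi] is a single-precision slot number (s0 == 0, s1 == 1, ...);
     doubles start on an even slot, as assigned by layout_vfp_args. */
  return vfp_space + cif->vfp_args[vi] * 4;
}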

src/arm/internal.h (new file)

@@ -0,0 +1,7 @@
+#define ARM_TYPE_VFP_S	0
+#define ARM_TYPE_VFP_D	1
+#define ARM_TYPE_VFP_N	2
+#define ARM_TYPE_INT64	3
+#define ARM_TYPE_INT	4
+#define ARM_TYPE_VOID	5
+#define ARM_TYPE_STRUCT	6
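These codes index the 8-bytes-per-entry store table at the end of the new ffi_call_SYSV (the "add pc, pc, r3, lsl #3" dispatch in sysv.S below); note that the ARM_TYPE_INT64 entry stores r1 and falls through into the ARM_TYPE_INT entry. Written out as C for clarity (a sketch; the function and its r0/r1 parameters are illustrative, not code from the patch):

#include <string.h>
#include "internal.h"

/* What each integer jump-table entry does with the return registers;
   the VFP_S/VFP_D/VFP_N entries store s0, d0, or d0-d3 instead. */
static void
store_int_return (int flags, void *rvalue, unsigned r0, unsigned r1)
{
  switch (flags)
    {
    case ARM_TYPE_INT64:
      memcpy ((char *) rvalue + 4, &r1, 4);   /* str r1, [r2, #4], then... */
      /* fall through, as the assembly does */
    case ARM_TYPE_INT:
      memcpy (rvalue, &r0, 4);                /* str r0, [r2] */
      break;
    case ARM_TYPE_VOID:
    case ARM_TYPE_STRUCT:
      break;  /* nothing to store; a struct was written through the
                 address passed in r0 at call time */
    }
}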

src/arm/sysv.S

@@ -28,219 +28,155 @@
 #define LIBFFI_ASM
 #include <fficonfig.h>
 #include <ffi.h>
-#ifdef HAVE_MACHINE_ASM_H
-#include <machine/asm.h>
-#else
-#ifdef __USER_LABEL_PREFIX__
-#define CONCAT1(a, b) CONCAT2(a, b)
-#define CONCAT2(a, b) a ## b
-
-/* Use the right prefix for global labels. */
-#define CNAME(x) CONCAT1 (__USER_LABEL_PREFIX__, x)
-#else
-#define CNAME(x) x
-#endif
-#ifdef __APPLE__
-#define ENTRY(x) .globl _##x; _##x:
-#else
-#define ENTRY(x) .globl CNAME(x); .type CNAME(x),%function; CNAME(x):
-#endif /* __APPLE__ */
-#endif
-
-#ifdef __ELF__
-#define LSYM(x) .x
-#else
-#define LSYM(x) x
-#endif
-
-/* Use the SOFTFP return value ABI on Mac OS X, as per the iOS ABI
-   Function Call Guide */
-#ifdef __APPLE__
-#define __SOFTFP__
-#endif
-
-/* We need a better way of testing for this, but for now, this is all
-   we can do. */
-@ This selects the minimum architecture level required.
-#define __ARM_ARCH__ 3
-
-#if defined(__ARM_ARCH_4__) || defined(__ARM_ARCH_4T__)
-# undef __ARM_ARCH__
-# define __ARM_ARCH__ 4
-#endif
-
-#if defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) \
-	|| defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) \
-	|| defined(__ARM_ARCH_5TEJ__)
-# undef __ARM_ARCH__
-# define __ARM_ARCH__ 5
-#endif
-
-#if defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \
-	|| defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) \
-	|| defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) \
-	|| defined(__ARM_ARCH_6M__)
-# undef __ARM_ARCH__
-# define __ARM_ARCH__ 6
-#endif
-
-#if defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \
-	|| defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) \
-	|| defined(__ARM_ARCH_7EM__)
-# undef __ARM_ARCH__
-# define __ARM_ARCH__ 7
-#endif
-
-#if __ARM_ARCH__ >= 5
-# define call_reg(x)	blx	x
-#elif defined (__ARM_ARCH_4T__)
-# define call_reg(x)	mov	lr, pc ; bx	x
-# if defined(__thumb__) || defined(__THUMB_INTERWORK__)
-#  define __INTERWORKING__
-# endif
-#else
-# define call_reg(x)	mov	lr, pc ; mov	pc, x
-#endif
+#include <ffi_cfi.h>
+#include "internal.h"
+
+/* GCC 4.8 provides __ARM_ARCH; construct it otherwise. */
+#ifndef __ARM_ARCH
+# if defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \
+     || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) \
+     || defined(__ARM_ARCH_7EM__)
+#  define __ARM_ARCH 7
+# elif defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \
+     || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) \
+     || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) \
+     || defined(__ARM_ARCH_6M__)
+#  define __ARM_ARCH 6
+# elif defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) \
+     || defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) \
+     || defined(__ARM_ARCH_5TEJ__)
+#  define __ARM_ARCH 5
+# else
+#  define __ARM_ARCH 4
+# endif
+#endif
 
 /* Conditionally compile unwinder directives. */
+.macro UNWIND text:vararg
 #ifdef __ARM_EABI__
-#define UNWIND
-#else
-#define UNWIND @
+	\text
 #endif
+.endm
 
-.syntax unified
-
-#if defined(__thumb__) && !defined(__THUMB_INTERWORK__)
-#define ARM_FUNC_START(name) \
-	.text; \
-	.align 2; \
-	.thumb; \
-	.thumb_func; \
-	ENTRY(name); \
-	bx	pc; \
-	nop; \
-	.arm; \
-	UNWIND .fnstart; \
-_L__##name:
-#else
-#define ARM_FUNC_START(name) \
-	.text; \
-	.align 2; \
-	.arm; \
-	ENTRY(name); \
-	UNWIND .fnstart
-#endif
-
-.macro	RETLDM	regs=, cond=, dirn=ia
-#if defined (__INTERWORKING__)
-	.ifc "\regs",""
-	ldr\cond	lr, [sp], #4
-	.else
-	ldm\cond\dirn	sp!, {\regs, lr}
-	.endif
-	bx\cond	lr
-#else
-	.ifc "\regs",""
-	ldr\cond	pc, [sp], #4
-	.else
-	ldm\cond\dirn	sp!, {\regs, pc}
-	.endif
-#endif
-.endm
-
-	@ r0:	ffi_prep_args
-	@ r1:	&ecif
-	@ r2:	cif->bytes
-	@ r3:	fig->flags
-	@ sp+0:	ecif.rvalue
-
-	@ This assumes we are using gas.
-ARM_FUNC_START(ffi_call_SYSV)
-	@ Save registers
-	stmfd	sp!, {r0-r3, fp, lr}
-	UNWIND .save	{r0-r3, fp, lr}
-	mov	fp, sp
-	UNWIND .setfp	fp, sp
-
-	@ Make room for all of the new args.
-	sub	sp, fp, r2
-
-	@ Place all of the ffi_prep_args in position
-	mov	r0, sp
-	@     r1 already set
-
-	@ Call ffi_prep_args(stack, &ecif)
-	bl	CNAME(ffi_prep_args_SYSV)
-
-	@ move first 4 parameters in registers
-	ldmia	sp, {r0-r3}
-
-	@ and adjust stack
-	sub	lr, fp, sp	@ cif->bytes == fp - sp
-	ldr	ip, [fp]	@ load fn() in advance
-	cmp	lr, #16
-	movhs	lr, #16
-	add	sp, sp, lr
-
-	@ call (fn) (...)
-	call_reg(ip)
-
-	@ Remove the space we pushed for the args
-	mov	sp, fp
-
-	@ Load r2 with the pointer to storage for the return value
-	ldr	r2, [sp, #24]
-
-	@ Load r3 with the return type code
-	ldr	r3, [sp, #12]
-
-	@ If the return value pointer is NULL, assume no return value.
-	cmp	r2, #0
-	beq	LSYM(Lepilogue)
-
-	@ return INT
-	cmp	r3, #FFI_TYPE_INT
-#if defined(__SOFTFP__) || defined(__ARM_EABI__)
-	cmpne	r3, #FFI_TYPE_FLOAT
-#endif
-	streq	r0, [r2]
-	beq	LSYM(Lepilogue)
-
-	@ return INT64
-	cmp	r3, #FFI_TYPE_SINT64
-#if defined(__SOFTFP__) || defined(__ARM_EABI__)
-	cmpne	r3, #FFI_TYPE_DOUBLE
-#endif
-	stmiaeq	r2, {r0, r1}
-
-#if !defined(__SOFTFP__) && !defined(__ARM_EABI__)
-	beq	LSYM(Lepilogue)
-
-	@ return FLOAT
-	cmp	r3, #FFI_TYPE_FLOAT
-	stfeqs	f0, [r2]
-	beq	LSYM(Lepilogue)
-
-	@ return DOUBLE or LONGDOUBLE
-	cmp	r3, #FFI_TYPE_DOUBLE
-	stfeqd	f0, [r2]
-#endif
-
-LSYM(Lepilogue):
-#if defined (__INTERWORKING__)
-	ldmia	sp!, {r0-r3, fp, lr}
-	bx	lr
-#else
-	ldmia	sp!, {r0-r3, fp, pc}
-#endif
-
-.ffi_call_SYSV_end:
-	UNWIND .fnend
-#ifdef __ELF__
-	.size	CNAME(ffi_call_SYSV),.ffi_call_SYSV_end-CNAME(ffi_call_SYSV)
-#endif
+#if defined(HAVE_AS_CFI_PSEUDO_OP) && defined(__ARM_EABI__)
+	.cfi_sections	.debug_frame
+#endif
+
+#define CONCAT(a, b)	CONCAT2(a, b)
+#define CONCAT2(a, b)	a ## b
+
+#ifdef __USER_LABEL_PREFIX__
+# define CNAME(X)	CONCAT (__USER_LABEL_PREFIX__, X)
+#else
+# define CNAME(X)	X
+#endif
+#ifdef __ELF__
+# define SIZE(X)	.size CNAME(X), . - CNAME(X)
+# define TYPE(X, Y)	.type CNAME(X), Y
+#else
+# define SIZE(X)
+# define TYPE(X, Y)
+#endif
+
+#define ARM_FUNC_START(name, gl)					\
+	.align	3;							\
+	.ifne	gl; .globl	CNAME(name); FFI_HIDDEN(CNAME(name)); .endif; \
+	TYPE(name, %function);						\
+	CNAME(name):
+
+#define ARM_FUNC_END(name) \
+	SIZE(name)
+
+/* Aid in defining a jump table with 8 bytes between entries. */
+.macro	E index
+	.if . - 0b - 8*\index
+	.error	"type table out of sync"
+	.endif
+.endm
+
+	.text
+	.syntax unified
+	.arm
+
+	/* We require interworking on LDM, which implies ARMv5T,
+	   which implies the existance of BLX. */
+	.arch	armv5t
+
+	/* Note that we use STC and LDC to encode VFP instructions,
+	   so that we do not need ".fpu vfp", nor get that added to
+	   the object file attributes.  These will not be executed
+	   unless the FFI_VFP abi is used. */
+
+	@ r0: stack
+	@ r1: frame
+	@ r2: fn
+	@ r3: vfp_used
+
+ARM_FUNC_START(ffi_call_VFP, 1)
+	UNWIND	.fnstart
+	cfi_startproc
+
+	cmp	r3, #3			@ load only d0 if possible
+	ldcle	p11, cr0, [r0]		@ vldrle d0, [sp]
+	ldcgt	p11, cr0, [r0], {16}	@ vldmgt sp, {d0-d7}
+	add	r0, r0, #64		@ discard the vfp register args
+	/* FALLTHRU */
+ARM_FUNC_END(ffi_call_VFP)
+
+ARM_FUNC_START(ffi_call_SYSV, 1)
+	stm	r1, {fp, lr}
+	mov	fp, r1
+
+	@ This is a bit of a lie wrt the origin of the unwind info, but
+	@ now we've got the usual frame pointer and two saved registers.
+	UNWIND	.save {fp,lr}
+	UNWIND	.setfp fp, sp
+	cfi_def_cfa(fp, 8)
+	cfi_rel_offset(fp, 0)
+	cfi_rel_offset(lr, 4)
+
+	mov	sp, r0		@ install the stack pointer
+	mov	lr, r2		@ move the fn pointer out of the way
+	ldmia	sp!, {r0-r3}	@ move first 4 parameters in registers.
+	blx	lr		@ call fn
+
+	@ Load r2 with the pointer to storage for the return value
+	@ Load r3 with the return type code
+	ldr	r2, [fp, #8]
+	ldr	r3, [fp, #12]
+
+	@ Deallocate the stack with the arguments.
+	mov	sp, fp
+	cfi_def_cfa_register(sp)
+
+	@ Store values stored in registers.
+	.align	3
+	add	pc, pc, r3, lsl #3
+	nop
+0:
+E ARM_TYPE_VFP_S
+	stc	p10, cr0, [r2]		@ vstr s0, [r2]
+	pop	{fp,pc}
+E ARM_TYPE_VFP_D
+	stc	p11, cr0, [r2]		@ vstr d0, [r2]
+	pop	{fp,pc}
+E ARM_TYPE_VFP_N
+	stc	p11, cr0, [r2], {8}	@ vstm r2, {d0-d3}
+	pop	{fp,pc}
+E ARM_TYPE_INT64
+	str	r1, [r2, #4]
+	nop
+E ARM_TYPE_INT
+	str	r0, [r2]
+	pop	{fp,pc}
+E ARM_TYPE_VOID
+	pop	{fp,pc}
+	nop
+E ARM_TYPE_STRUCT
+	pop	{fp,pc}
+
+	cfi_endproc
+	UNWIND	.fnend
+ARM_FUNC_END(ffi_call_SYSV)
 
 /*
@@ -251,7 +187,8 @@ LSYM(Lepilogue):
	     void *args;
 */
 
-ARM_FUNC_START(ffi_closure_SYSV)
+ARM_FUNC_START(ffi_closure_SYSV, 1)
+	UNWIND	.fnstart
	UNWIND	.pad #16
	add	ip, sp, #16
	stmfd	sp!, {ip, lr}
@@ -310,116 +247,16 @@ ARM_FUNC_START(ffi_closure_SYSV)
	ldfd	f0, [sp]
	b	.Lclosure_epilogue
 #endif
-.ffi_closure_SYSV_end:
	UNWIND	.fnend
-#ifdef __ELF__
-	.size	CNAME(ffi_closure_SYSV),.ffi_closure_SYSV_end-CNAME(ffi_closure_SYSV)
-#endif
+ARM_FUNC_END(ffi_closure_SYSV)
 
 /* Below are VFP hard-float ABI call and closure implementations.
    Add VFP FPU directive here. This is only compiled into the library
    under EABI. */
 #ifdef __ARM_EABI__
-	.fpu	vfp
-
-	@ r0:   fn
-	@ r1:   &ecif
-	@ r2:   cif->bytes
-	@ r3:   fig->flags
-	@ sp+0: ecif.rvalue
-
-ARM_FUNC_START(ffi_call_VFP)
-	@ Save registers
-	stmfd	sp!, {r0-r3, fp, lr}
-	UNWIND .save	{r0-r3, fp, lr}
-	mov	fp, sp
-	UNWIND .setfp	fp, sp
-
-	@ Make room for all of the new args.
-	sub	sp, sp, r2
-
-	@ Make room for loading VFP args
-	sub	sp, sp, #64
-
-	@ Place all of the ffi_prep_args in position
-	mov	r0, sp
-	@     r1 already set
-	sub	r2, fp, #64	@ VFP scratch space
-
-	@ Call ffi_prep_args(stack, &ecif, vfp_space)
-	bl	CNAME(ffi_prep_args_VFP)
-
-	@ Load VFP register args if needed
-	cmp	r0, #0
-	mov	ip, fp
-	beq	LSYM(Lbase_args)
-
-	@ Load only d0 if possible
-	cmp	r0, #3
-	sub	ip, fp, #64
-	flddle	d0, [ip]
-	fldmiadgt	ip, {d0-d7}
-
-LSYM(Lbase_args):
-	@ move first 4 parameters in registers
-	ldmia	sp, {r0-r3}
-
-	@ and adjust stack
-	sub	lr, ip, sp	@ cif->bytes == (fp - 64) - sp
-	ldr	ip, [fp]	@ load fn() in advance
-	cmp	lr, #16
-	movhs	lr, #16
-	add	sp, sp, lr
-
-	@ call (fn) (...)
-	call_reg(ip)
-
-	@ Remove the space we pushed for the args
-	mov	sp, fp
-
-	@ Load r2 with the pointer to storage for
-	@ the return value
-	ldr	r2, [sp, #24]
-
-	@ Load r3 with the return type code
-	ldr	r3, [sp, #12]
-
-	@ If the return value pointer is NULL,
-	@ assume no return value.
-	cmp	r2, #0
-	beq	LSYM(Lepilogue_vfp)
-
-	cmp	r3, #FFI_TYPE_INT
-	streq	r0, [r2]
-	beq	LSYM(Lepilogue_vfp)
-
-	cmp	r3, #FFI_TYPE_SINT64
-	stmeqia	r2, {r0, r1}
-	beq	LSYM(Lepilogue_vfp)
-
-	cmp	r3, #FFI_TYPE_FLOAT
-	fstseq	s0, [r2]
-	beq	LSYM(Lepilogue_vfp)
-
-	cmp	r3, #FFI_TYPE_DOUBLE
-	fstdeq	d0, [r2]
-	beq	LSYM(Lepilogue_vfp)
-
-	cmp	r3, #FFI_TYPE_STRUCT_VFP_FLOAT
-	cmpne	r3, #FFI_TYPE_STRUCT_VFP_DOUBLE
-	fstmiadeq	r2, {d0-d3}
-
-LSYM(Lepilogue_vfp):
-	RETLDM	"r0-r3,fp"
-
-.ffi_call_VFP_end:
-	UNWIND .fnend
-	.size	CNAME(ffi_call_VFP),.ffi_call_VFP_end-CNAME(ffi_call_VFP)
-
-ARM_FUNC_START(ffi_closure_VFP)
+ARM_FUNC_START(ffi_closure_VFP, 1)
+	UNWIND	.fnstart
	fstmfdd	sp!, {d0-d7}
	@ r0-r3, then d0-d7
	UNWIND	.pad #80
@@ -475,16 +312,15 @@ ARM_FUNC_START(ffi_closure_VFP)
 .Lretdouble_struct_vfp:
	fldmiad	sp, {d0-d3}
	b	.Lclosure_epilogue_vfp
-.ffi_closure_VFP_end:
	UNWIND	.fnend
-	.size	CNAME(ffi_closure_VFP),.ffi_closure_VFP_end-CNAME(ffi_closure_VFP)
+ARM_FUNC_END(ffi_closure_VFP)
 #endif
 
-ENTRY(ffi_arm_trampoline)
+ARM_FUNC_START(ffi_arm_trampoline, 1)
	stmfd	sp!, {r0-r3}
	ldr	r0, [pc]
	ldr	pc, [pc]
+ARM_FUNC_END(ffi_arm_trampoline)
 
 #if defined __ELF__ && defined __linux__
	.section	.note.GNU-stack,"",%progbits