arm: Rewrite ffi_call

Use the trick to allocate the stack frame for ffi_call_SYSV within ffi_call itself.
2014-10-17 01:27:16 -04:00
parent a74a3aaddb
commit e7f15f60e8
4 changed files with 303 additions and 449 deletions
--- a/src/arm/ffi.c
+++ b/src/arm/ffi.c
@@ -30,16 +30,13 @@

 #include <ffi.h>
 #include <ffi_common.h>
-
 #include <stdlib.h>
+#include "internal.h"

 /* Forward declares. */
 static int vfp_type_p (const ffi_type *);
 static void layout_vfp_args (ffi_cif *);

-int ffi_prep_args_SYSV (char *stack, extended_cif *ecif, float *vfp_space);
-int ffi_prep_args_VFP (char *stack, extended_cif *ecif, float *vfp_space);
-
 static void *
 ffi_align (ffi_type *ty, void *p)
 {
@@ -98,53 +95,44 @@ ffi_put_arg (ffi_type *ty, void *src, void *dst)
  return ALIGN (z, 4);
 }

-/* ffi_prep_args is called by the assembly routine once stack space
-   has been allocated for the function's arguments
+/* ffi_prep_args is called once stack space has been allocated
+   for the function's arguments.

   The vfp_space parameter is the load area for VFP regs, the return
   value is cif->vfp_used (word bitset of VFP regs used for passing
   arguments). These are only used for the VFP hard-float ABI.
 */
-int
-ffi_prep_args_SYSV (char *stack, extended_cif *ecif, float *vfp_space)
+static void
+ffi_prep_args_SYSV (ffi_cif *cif, int flags, void *rvalue,
+		    void **avalue, char *argp)
 {
-  register unsigned int i;
-  register void **p_argv;
-  register char *argp;
-  register ffi_type **p_arg;
-  argp = stack;
+  ffi_type **arg_types = cif->arg_types;
+  int i, n;

-  if (ecif->cif->flags == FFI_TYPE_STRUCT)
+  if (flags == ARM_TYPE_STRUCT)
    {
-      *(void **) argp = ecif->rvalue;
+      *(void **) argp = rvalue;
      argp += 4;
    }

-  p_argv = ecif->avalue;
-
-  for (i = ecif->cif->nargs, p_arg = ecif->cif->arg_types;
-       (i != 0); i--, p_arg++, p_argv++)
+  for (i = 0, n = cif->nargs; i < n; i++)
    {
-      argp = ffi_align (*p_arg, argp);
-      argp += ffi_put_arg (*p_arg, *p_argv, argp);
+      ffi_type *ty = arg_types[i];
+      argp = ffi_align (ty, argp);
+      argp += ffi_put_arg (ty, avalue[i], argp);
    }
-
-  return 0;
 }

-int
-ffi_prep_args_VFP (char *stack, extended_cif * ecif, float *vfp_space)
+static void
+ffi_prep_args_VFP (ffi_cif *cif, int flags, void *rvalue,
+                   void **avalue, char *stack, char *vfp_space)
 {
-  register unsigned int i, vi = 0;
-  register void **p_argv;
-  register char *argp, *regp, *eo_regp;
-  register ffi_type **p_arg;
+  ffi_type **arg_types = cif->arg_types;
+  int i, n, vi = 0;
+  char *argp, *regp, *eo_regp;
  char stack_used = 0;
  char done_with_regs = 0;

-  /* Make sure we are using FFI_VFP.  */
-  FFI_ASSERT (ecif->cif->abi == FFI_VFP);
-
  /* The first 4 words on the stack are used for values
     passed in core registers.  */
  regp = stack;
@@ -152,37 +140,36 @@ ffi_prep_args_VFP (char *stack, extended_cif * ecif, float *vfp_space)

  /* If the function returns an FFI_TYPE_STRUCT in memory,
     that address is passed in r0 to the function.  */
-  if (ecif->cif->flags == FFI_TYPE_STRUCT)
+  if (flags == ARM_TYPE_STRUCT)
    {
-      *(void **) regp = ecif->rvalue;
+      *(void **) regp = rvalue;
      regp += 4;
    }

-  p_argv = ecif->avalue;
-
-  for (i = ecif->cif->nargs, p_arg = ecif->cif->arg_types;
-       (i != 0); i--, p_arg++, p_argv++)
+  for (i = 0, n = cif->nargs; i < n; i++)
    {
-      int is_vfp_type = vfp_type_p (*p_arg);
+      ffi_type *ty = arg_types[i];
+      void *a = avalue[i];
+      int is_vfp_type = vfp_type_p (ty);

      /* Allocated in VFP registers. */
-      if (vi < ecif->cif->vfp_nargs && is_vfp_type)
+      if (vi < cif->vfp_nargs && is_vfp_type)
 	{
-	  char *vfp_slot = (char *) (vfp_space + ecif->cif->vfp_args[vi++]);
-	  ffi_put_arg (*p_arg, *p_argv, vfp_slot);
+	  char *vfp_slot = vfp_space + cif->vfp_args[vi++] * 4;
+	  ffi_put_arg (ty, a, vfp_slot);
 	  continue;
 	}
      /* Try allocating in core registers. */
      else if (!done_with_regs && !is_vfp_type)
 	{
-	  char *tregp = ffi_align (*p_arg, regp);
-	  size_t size = (*p_arg)->size;
+	  char *tregp = ffi_align (ty, regp);
+	  size_t size = ty->size;
 	  size = (size < 4) ? 4 : size;	// pad
 	  /* Check if there is space left in the aligned register
 	     area to place the argument.  */
 	  if (tregp + size <= eo_regp)
 	    {
-	      regp = tregp + ffi_put_arg (*p_arg, *p_argv, tregp);
+	      regp = tregp + ffi_put_arg (ty, a, tregp);
 	      done_with_regs = (regp == argp);
 	      // ensure we did not write into the stack area
 	      FFI_ASSERT (regp <= argp);
@@ -195,88 +182,98 @@ ffi_prep_args_VFP (char *stack, extended_cif * ecif, float *vfp_space)
 	    {
 	      stack_used = 1;
 	      done_with_regs = 1;
-	      argp = tregp + ffi_put_arg (*p_arg, *p_argv, tregp);
+	      argp = tregp + ffi_put_arg (ty, a, tregp);
 	      FFI_ASSERT (eo_regp < argp);
 	      continue;
 	    }
 	}
      /* Base case, arguments are passed on the stack */
      stack_used = 1;
-      argp = ffi_align (*p_arg, argp);
-      argp += ffi_put_arg (*p_arg, *p_argv, argp);
+      argp = ffi_align (ty, argp);
+      argp += ffi_put_arg (ty, a, argp);
    }
-  /* Indicate the VFP registers used. */
-  return ecif->cif->vfp_used;
 }

 /* Perform machine dependent cif processing */
 ffi_status
-ffi_prep_cif_machdep (ffi_cif * cif)
+ffi_prep_cif_machdep (ffi_cif *cif)
 {
+  int flags = 0, cabi = cif->abi;
+  size_t bytes;
+
  /* Round the stack up to a multiple of 8 bytes.  This isn't needed
     everywhere, but it is on some platforms, and it doesn't harm anything
     when it isn't needed.  */
-  cif->bytes = (cif->bytes + 7) & ~7;
+  bytes = ALIGN (cif->bytes, 8);

-  /* Set the return type flag */
-  switch (cif->rtype->type)
-    {
-    case FFI_TYPE_VOID:
-    case FFI_TYPE_FLOAT:
-    case FFI_TYPE_DOUBLE:
-      cif->flags = (unsigned) cif->rtype->type;
-      break;
-
-    case FFI_TYPE_SINT64:
-    case FFI_TYPE_UINT64:
-      cif->flags = (unsigned) FFI_TYPE_SINT64;
-      break;
-
-    case FFI_TYPE_STRUCT:
-      if (cif->abi == FFI_VFP)
-	{
-	  int h = vfp_type_p (cif->rtype);
-	  if (h)
-	    {
-	      int ele_count = h >> 8;
-	      int type_code = h & 0xff;
-	      if (ele_count > 1)
-		{
-		  if (type_code == FFI_TYPE_FLOAT)
-		    type_code = FFI_TYPE_STRUCT_VFP_FLOAT;
-		  else
-		    type_code = FFI_TYPE_STRUCT_VFP_DOUBLE;
-		}
-	      cif->flags = type_code;
-	      break;
-	    }
-	}
-      if (cif->rtype->size <= 4)
-	{
-	  /* A Composite Type not larger than 4 bytes is returned in r0.  */
-	  cif->flags = (unsigned) FFI_TYPE_INT;
-	}
-      else
-	{
-	  /* A Composite Type larger than 4 bytes, or whose size cannot
-	     be determined statically ... is stored in memory at an
-	     address passed [in r0].  */
-	  cif->flags = (unsigned) FFI_TYPE_STRUCT;
-	}
-      break;
-
-    default:
-      cif->flags = FFI_TYPE_INT;
-      break;
-    }
+  /* Minimum stack space is the 4 register arguments that we pop.  */
+  if (bytes < 4*4)
+    bytes = 4*4;
+  cif->bytes = bytes;

  /* Map out the register placements of VFP register args.  The VFP
     hard-float calling conventions are slightly more sophisticated
     than the base calling conventions, so we do it here instead of
     in ffi_prep_args(). */
-  if (cif->abi == FFI_VFP)
+  if (cabi == FFI_VFP)
    layout_vfp_args (cif);

+  /* Set the return type flag */
+  switch (cif->rtype->type)
+    {
+    case FFI_TYPE_VOID:
+      flags = ARM_TYPE_VOID;
+      break;
+
+    case FFI_TYPE_INT:
+    case FFI_TYPE_UINT8:
+    case FFI_TYPE_SINT8:
+    case FFI_TYPE_UINT16:
+    case FFI_TYPE_SINT16:
+    case FFI_TYPE_UINT32:
+    case FFI_TYPE_SINT32:
+    case FFI_TYPE_POINTER:
+      flags = ARM_TYPE_INT;
+      break;
+
+    case FFI_TYPE_SINT64:
+    case FFI_TYPE_UINT64:
+      flags = ARM_TYPE_INT64;
+      break;
+
+    case FFI_TYPE_FLOAT:
+      flags = (cabi == FFI_VFP ? ARM_TYPE_VFP_S : ARM_TYPE_INT);
+      break;
+    case FFI_TYPE_DOUBLE:
+      flags = (cabi == FFI_VFP ? ARM_TYPE_VFP_D : ARM_TYPE_INT64);
+      break;
+
+    case FFI_TYPE_STRUCT:
+      if (cabi == FFI_VFP)
+	{
+	  int h = vfp_type_p (cif->rtype);
+
+	  flags = ARM_TYPE_VFP_N;
+	  if (h == 0x100 + FFI_TYPE_FLOAT)
+	    flags = ARM_TYPE_VFP_S;
+	  if (h == 0x100 + FFI_TYPE_DOUBLE)
+	    flags = ARM_TYPE_VFP_D;
+	  if (h != 0)
+	      break;
+	}
+
+      /* A Composite Type not larger than 4 bytes is returned in r0.
+	 A Composite Type larger than 4 bytes, or whose size cannot
+	 be determined statically ... is stored in memory at an
+	 address passed [in r0].  */
+      flags = (cif->rtype->size <= 4 ? ARM_TYPE_INT : ARM_TYPE_STRUCT);
+      break;
+
+    default:
+      abort();
+    }
+  cif->flags = flags;
+
  return FFI_OK;
 }

@@ -293,69 +290,83 @@ ffi_prep_cif_machdep_var (ffi_cif * cif,
 }

 /* Prototypes for assembly functions, in sysv.S.  */
-extern void ffi_call_SYSV (void (*fn) (void), extended_cif *, unsigned,
-			   unsigned, unsigned *);
-extern void ffi_call_VFP (void (*fn) (void), extended_cif *, unsigned,
-			  unsigned, unsigned *);
+
+struct call_frame
+{
+  void *fp;
+  void *lr;
+  void *rvalue;
+  int flags;
+};
+
+extern void ffi_call_SYSV (void *stack, struct call_frame *,
+			   void (*fn) (void)) FFI_HIDDEN;
+extern void ffi_call_VFP (void *vfp_space, struct call_frame *,
+			   void (*fn) (void), unsigned vfp_used) FFI_HIDDEN;

 void
 ffi_call (ffi_cif * cif, void (*fn) (void), void *rvalue, void **avalue)
 {
-  extended_cif ecif;
+  int flags = cif->flags;
+  ffi_type *rtype = cif->rtype;
+  size_t bytes, rsize, vfp_size;
+  char *stack, *vfp_space, *new_rvalue;
+  struct call_frame *frame;

-  int small_struct = (cif->flags == FFI_TYPE_INT
-		      && cif->rtype->type == FFI_TYPE_STRUCT);
-  int vfp_struct = (cif->flags == FFI_TYPE_STRUCT_VFP_FLOAT
-		    || cif->flags == FFI_TYPE_STRUCT_VFP_DOUBLE);
-
-  unsigned int temp;
-
-  ecif.cif = cif;
-  ecif.avalue = avalue;
-
-  /* If the return value is a struct and we don't have a return
-     value address then we need to make one.  */
-
-  if ((rvalue == NULL) && (cif->flags == FFI_TYPE_STRUCT))
+  rsize = 0;
+  if (rvalue == NULL)
    {
-      ecif.rvalue = alloca (cif->rtype->size);
+      /* If the return value is a struct and we don't have a return
+	 value address then we need to make one.  Otherwise the return
+	 value is in registers and we can ignore them.  */
+      if (flags == ARM_TYPE_STRUCT)
+	rsize = rtype->size;
+      else
+	flags = ARM_TYPE_VOID;
    }
-  else if (small_struct)
-    ecif.rvalue = &temp;
-  else if (vfp_struct)
+  else if (flags == ARM_TYPE_VFP_N)
    {
      /* Largest case is double x 4. */
-      ecif.rvalue = alloca (32);
+      rsize = 32;
+    }
+  else if (flags == ARM_TYPE_INT && rtype->type == FFI_TYPE_STRUCT)
+    rsize = 4;
+
+  /* Largest case.  */
+  vfp_size = (cif->abi == FFI_VFP && cif->vfp_used ? 8*8: 0);
+
+  bytes = cif->bytes;
+  stack = alloca (vfp_size + bytes + sizeof(struct call_frame) + rsize);
+
+  vfp_space = NULL;
+  if (vfp_size)
+    {
+      vfp_space = stack;
+      stack += vfp_size;
+    }
+
+  frame = (struct call_frame *)(stack + bytes);
+
+  new_rvalue = rvalue;
+  if (rsize)
+    new_rvalue = (void *)(frame + 1);
+
+  frame->rvalue = new_rvalue;
+  frame->flags = flags;
+
+  if (vfp_space)
+    {
+      ffi_prep_args_VFP (cif, flags, new_rvalue, avalue, stack, vfp_space);
+      ffi_call_VFP (vfp_space, frame, fn, cif->vfp_used);
    }
  else
-    ecif.rvalue = rvalue;
-
-  switch (cif->abi)
    {
-    case FFI_SYSV:
-      ffi_call_SYSV (fn, &ecif, cif->bytes, cif->flags, ecif.rvalue);
-      break;
-
-    case FFI_VFP:
-#ifdef __ARM_EABI__
-      ffi_call_VFP (fn, &ecif, cif->bytes, cif->flags, ecif.rvalue);
-      break;
-#endif
-
-    default:
-      FFI_ASSERT (0);
-      break;
-    }
-  if (small_struct)
-    {
-      FFI_ASSERT (rvalue != NULL);
-      memcpy (rvalue, &temp, cif->rtype->size);
-    }
-  else if (vfp_struct)
-    {
-      FFI_ASSERT (rvalue != NULL);
-      memcpy (rvalue, ecif.rvalue, cif->rtype->size);
+      ffi_prep_args_SYSV (cif, flags, new_rvalue, avalue, stack);
+      ffi_call_SYSV (stack, frame, fn);
    }
+ 
+  if (rvalue && rvalue != new_rvalue)
+    memcpy (rvalue, new_rvalue, rtype->size);
 }

 /** private members **/
--- a/src/arm/ffitarget.h
+++ b/src/arm/ffitarget.h
@@ -53,7 +53,7 @@ typedef enum ffi_abi {

 #define FFI_EXTRA_CIF_FIELDS			\
  int vfp_used;					\
-  short vfp_reg_free, vfp_nargs;		\
+  unsigned short vfp_reg_free, vfp_nargs;	\
  signed char vfp_args[16]			\

 /* Internally used. */
--- a/src/arm/internal.h
+++ b/src/arm/internal.h
@@ -0,0 +1,7 @@
+#define ARM_TYPE_VFP_S	0
+#define ARM_TYPE_VFP_D	1
+#define ARM_TYPE_VFP_N	2
+#define ARM_TYPE_INT64	3
+#define ARM_TYPE_INT	4
+#define ARM_TYPE_VOID	5
+#define ARM_TYPE_STRUCT	6
--- a/src/arm/sysv.S
+++ b/src/arm/sysv.S
@@ -1,8 +1,8 @@
 /* -----------------------------------------------------------------------
   sysv.S - Copyright (c) 1998, 2008, 2011 Red Hat, Inc.
 	    Copyright (c) 2011 Plausible Labs Cooperative, Inc.
-   
-   ARM Foreign Function Interface 
+
+   ARM Foreign Function Interface

   Permission is hereby granted, free of charge, to any person obtaining
   a copy of this software and associated documentation files (the
@@ -28,219 +28,155 @@
 #define LIBFFI_ASM	
 #include <fficonfig.h>
 #include <ffi.h>
-#ifdef HAVE_MACHINE_ASM_H
-#include <machine/asm.h>
-#else
-#ifdef __USER_LABEL_PREFIX__
-#define CONCAT1(a, b) CONCAT2(a, b)
-#define CONCAT2(a, b) a ## b
+#include <ffi_cfi.h>
+#include "internal.h"

-/* Use the right prefix for global labels.  */
-#define CNAME(x) CONCAT1 (__USER_LABEL_PREFIX__, x)
-#else
-#define CNAME(x) x
-#endif
-#ifdef __APPLE__
-#define ENTRY(x) .globl _##x; _##x:
-#else
-#define ENTRY(x) .globl CNAME(x); .type CNAME(x),%function; CNAME(x):
-#endif /* __APPLE__ */
-#endif
-
-#ifdef __ELF__
-#define LSYM(x) .x
-#else
-#define LSYM(x) x
-#endif
-
-/* Use the SOFTFP return value ABI on Mac OS X, as per the iOS ABI
-  Function Call Guide */
-#ifdef __APPLE__
-#define __SOFTFP__
-#endif
-
-/* We need a better way of testing for this, but for now, this is all 
-   we can do.  */
-@ This selects the minimum architecture level required.
-#define __ARM_ARCH__ 3
-
-#if defined(__ARM_ARCH_4__) || defined(__ARM_ARCH_4T__)
-# undef __ARM_ARCH__
-# define __ARM_ARCH__ 4
-#endif
-        
-#if defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) \
-	|| defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) \
-	|| defined(__ARM_ARCH_5TEJ__)
-# undef __ARM_ARCH__
-# define __ARM_ARCH__ 5
-#endif
-
-#if defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \
+/* GCC 4.8 provides __ARM_ARCH; construct it otherwise.  */
+#ifndef __ARM_ARCH
+# if defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \
+     || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) \
+     || defined(__ARM_ARCH_7EM__)
+#  define __ARM_ARCH 7
+# elif defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \
        || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) \
        || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) \
 	|| defined(__ARM_ARCH_6M__)
-# undef __ARM_ARCH__
-# define __ARM_ARCH__ 6
-#endif
-
-#if defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \
-        || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) \
-	|| defined(__ARM_ARCH_7EM__)
-# undef __ARM_ARCH__
-# define __ARM_ARCH__ 7
-#endif
-
-#if __ARM_ARCH__ >= 5
-# define call_reg(x)	blx	x
-#elif defined (__ARM_ARCH_4T__)
-# define call_reg(x)	mov	lr, pc ; bx	x
-# if defined(__thumb__) || defined(__THUMB_INTERWORK__)
-#  define __INTERWORKING__
+#  define __ARM_ARCH 6
+# elif defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) \
+	|| defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) \
+	|| defined(__ARM_ARCH_5TEJ__)
+#  define __ARM_ARCH 5
+# else
+#  define __ARM_ARCH 4
 # endif
-#else
-# define call_reg(x)	mov	lr, pc ; mov	pc, x
 #endif

 /* Conditionally compile unwinder directives.  */
+.macro UNWIND text:vararg
 #ifdef __ARM_EABI__
-#define UNWIND
-#else
-#define UNWIND @
+	\text
 #endif	
-
-.syntax unified
-
-#if defined(__thumb__) && !defined(__THUMB_INTERWORK__)
-#define ARM_FUNC_START(name) \
-	.text; \
-	.align 2; \
-	.thumb; \
-	.thumb_func; \
-	ENTRY(name); \
-	bx pc; \
-	nop; \
-	.arm; \
-	UNWIND .fnstart; \
-_L__##name:
-#else
-#define ARM_FUNC_START(name) \
-	.text; \
-	.align 2; \
-	.arm; \
-	ENTRY(name); \
-	UNWIND .fnstart
+.endm
+#if defined(HAVE_AS_CFI_PSEUDO_OP) && defined(__ARM_EABI__)
+	.cfi_sections	.debug_frame
 #endif

-.macro	RETLDM	regs=, cond=, dirn=ia
-#if defined (__INTERWORKING__)
-	.ifc "\regs",""
-	ldr\cond	lr, [sp], #4
-	.else
-	ldm\cond\dirn	sp!, {\regs, lr}
-	.endif
-	bx\cond	lr
+#define CONCAT(a, b)	CONCAT2(a, b)
+#define CONCAT2(a, b)	a ## b
+
+#ifdef __USER_LABEL_PREFIX__
+# define CNAME(X)	CONCAT (__USER_LABEL_PREFIX__, X)
 #else
-	.ifc "\regs",""
-	ldr\cond	pc, [sp], #4
-	.else
-	ldm\cond\dirn	sp!, {\regs, pc}
-	.endif
+# define CNAME(X)	X
 #endif
+#ifdef __ELF__
+# define SIZE(X)	.size CNAME(X), . - CNAME(X)
+# define TYPE(X, Y)	.type CNAME(X), Y
+#else
+# define SIZE(X)
+# define TYPE(X, Y)
+#endif
+
+#define ARM_FUNC_START(name, gl) \
+	.align	3; \
+	.ifne gl; .globl CNAME(name); FFI_HIDDEN(CNAME(name)); .endif; \
+	TYPE(name, %function); \
+	CNAME(name):
+
+#define ARM_FUNC_END(name) \
+	SIZE(name)
+
+/* Aid in defining a jump table with 8 bytes between entries.  */
+.macro E index
+	.if . - 0b - 8*\index
+	.error "type table out of sync"
+	.endif
 .endm

-	@ r0:   ffi_prep_args
-	@ r1:   &ecif
-	@ r2:   cif->bytes
-	@ r3:   fig->flags
-	@ sp+0: ecif.rvalue
+	.text
+	.syntax unified
+	.arm

-	@ This assumes we are using gas.
-ARM_FUNC_START(ffi_call_SYSV)
-	@ Save registers
-        stmfd	sp!, {r0-r3, fp, lr}
-	UNWIND .save	{r0-r3, fp, lr}
-	mov	fp, sp
+	/* We require interworking on LDM, which implies ARMv5T,
+	   which implies the existance of BLX.  */
+	.arch	armv5t

-	UNWIND .setfp	fp, sp
+	/* Note that we use STC and LDC to encode VFP instructions,
+	   so that we do not need ".fpu vfp", nor get that added to
+	   the object file attributes.  These will not be executed
+	   unless the FFI_VFP abi is used.  */

-	@ Make room for all of the new args.
-	sub	sp, fp, r2
+	@ r0:   stack
+	@ r1:   frame
+	@ r2:   fn
+	@ r3:	vfp_used

-	@ Place all of the ffi_prep_args in position
-	mov	r0, sp
-	@     r1 already set
+ARM_FUNC_START(ffi_call_VFP, 1)
+	UNWIND	.fnstart
+	cfi_startproc

-	@ Call ffi_prep_args(stack, &ecif)
-	bl	CNAME(ffi_prep_args_SYSV)
+	cmp	r3, #3			@ load only d0 if possible
+	ldcle	p11, cr0, [r0]		@ vldrle d0, [sp]
+	ldcgt	p11, cr0, [r0], {16}	@ vldmgt sp, {d0-d7}
+	add	r0, r0, #64		@ discard the vfp register args
+	/* FALLTHRU */
+ARM_FUNC_END(ffi_call_VFP)

-	@ move first 4 parameters in registers
-	ldmia	sp, {r0-r3}
+ARM_FUNC_START(ffi_call_SYSV, 1)
+	stm	r1, {fp, lr}
+	mov	fp, r1

-	@ and adjust stack
-	sub	lr, fp, sp	@ cif->bytes == fp - sp
-	ldr	ip, [fp]	@ load fn() in advance
-	cmp	lr, #16
-	movhs	lr, #16
-	add	sp, sp, lr
+	@ This is a bit of a lie wrt the origin of the unwind info, but
+	@ now we've got the usual frame pointer and two saved registers.
+	UNWIND	.save {fp,lr}
+	UNWIND	.setfp fp, sp
+	cfi_def_cfa(fp, 8)
+	cfi_rel_offset(fp, 0)
+	cfi_rel_offset(lr, 4)

-	@ call (fn) (...)
-	call_reg(ip)
-	
-	@ Remove the space we pushed for the args
-	mov	sp, fp
+	mov	sp, r0		@ install the stack pointer
+	mov	lr, r2		@ move the fn pointer out of the way
+	ldmia	sp!, {r0-r3}	@ move first 4 parameters in registers.
+	blx	lr		@ call fn

 	@ Load r2 with the pointer to storage for the return value
-	ldr	r2, [sp, #24]
+	@ Load r3 with the return type code
+	ldr	r2, [fp, #8]
+	ldr	r3, [fp, #12]

-	@ Load r3 with the return type code 
-	ldr	r3, [sp, #12]
+	@ Deallocate the stack with the arguments.
+	mov	sp, fp
+	cfi_def_cfa_register(sp)

-	@ If the return value pointer is NULL, assume no return value.
-	cmp	r2, #0
-	beq	LSYM(Lepilogue)
+	@ Store values stored in registers.
+	.align	3
+	add	pc, pc, r3, lsl #3
+	nop
+0:
+E ARM_TYPE_VFP_S
+	stc	p10, cr0, [r2]		@ vstr s0, [r2]
+	pop	{fp,pc}
+E ARM_TYPE_VFP_D
+	stc	p11, cr0, [r2]		@ vstr d0, [r2]
+	pop	{fp,pc}
+E ARM_TYPE_VFP_N
+	stc	p11, cr0, [r2], {8}	@ vstm r2, {d0-d3}
+	pop	{fp,pc}
+E ARM_TYPE_INT64
+	str	r1, [r2, #4]
+	nop
+E ARM_TYPE_INT
+	str	r0, [r2]
+	pop	{fp,pc}
+E ARM_TYPE_VOID
+	pop	{fp,pc}
+	nop
+E ARM_TYPE_STRUCT
+	pop	{fp,pc}

-@ return INT
-	cmp	r3, #FFI_TYPE_INT
-#if defined(__SOFTFP__) || defined(__ARM_EABI__)
-	cmpne	r3, #FFI_TYPE_FLOAT
-#endif
-	streq	r0, [r2]
-	beq	LSYM(Lepilogue)
-
-	@ return INT64
-	cmp	r3, #FFI_TYPE_SINT64
-#if defined(__SOFTFP__) || defined(__ARM_EABI__)
-	cmpne	r3, #FFI_TYPE_DOUBLE
-#endif
-	stmiaeq	r2, {r0, r1}
-
-#if !defined(__SOFTFP__) && !defined(__ARM_EABI__)
-	beq	LSYM(Lepilogue)
-
-@ return FLOAT
-	cmp	r3, #FFI_TYPE_FLOAT
-	stfeqs	f0, [r2]
-	beq	LSYM(Lepilogue)
-
-@ return DOUBLE or LONGDOUBLE
-	cmp	r3, #FFI_TYPE_DOUBLE
-	stfeqd	f0, [r2]
-#endif
-
-LSYM(Lepilogue):
-#if defined (__INTERWORKING__)
-	ldmia   sp!, {r0-r3,fp, lr}
-	bx	lr
-#else
-	ldmia   sp!, {r0-r3,fp, pc}
-#endif
-
-.ffi_call_SYSV_end:
-	UNWIND .fnend
-#ifdef __ELF__
-        .size    CNAME(ffi_call_SYSV),.ffi_call_SYSV_end-CNAME(ffi_call_SYSV)
-#endif
+	cfi_endproc
+	UNWIND	.fnend
+ARM_FUNC_END(ffi_call_SYSV)


 /*
@@ -251,7 +187,8 @@ LSYM(Lepilogue):
  	     void *args;
 */

-ARM_FUNC_START(ffi_closure_SYSV)
+ARM_FUNC_START(ffi_closure_SYSV, 1)
+	UNWIND	.fnstart
 	UNWIND .pad #16
 	add	ip, sp, #16
 	stmfd	sp!, {ip, lr}
@@ -310,116 +247,16 @@ ARM_FUNC_START(ffi_closure_SYSV)
 	ldfd	f0, [sp]
 	b	.Lclosure_epilogue
 #endif
-
-.ffi_closure_SYSV_end:
 	UNWIND .fnend
-#ifdef __ELF__
-        .size    CNAME(ffi_closure_SYSV),.ffi_closure_SYSV_end-CNAME(ffi_closure_SYSV)
-#endif
+ARM_FUNC_END(ffi_closure_SYSV)


 /* Below are VFP hard-float ABI call and closure implementations.
   Add VFP FPU directive here. This is only compiled into the library
   under EABI.  */
 #ifdef __ARM_EABI__
-	.fpu	vfp
-
-	@ r0:   fn
-	@ r1:   &ecif
-	@ r2:   cif->bytes
-	@ r3:   fig->flags
-	@ sp+0: ecif.rvalue
-
-ARM_FUNC_START(ffi_call_VFP)
-	@ Save registers
-        stmfd	sp!, {r0-r3, fp, lr}
-	UNWIND .save	{r0-r3, fp, lr}
-	mov	fp, sp
-	UNWIND .setfp	fp, sp
-
-	@ Make room for all of the new args.
-	sub	sp, sp, r2
-
-	@ Make room for loading VFP args
-	sub	sp, sp, #64
-
-	@ Place all of the ffi_prep_args in position
-	mov	r0, sp
-	@     r1 already set
-	sub	r2, fp, #64   @ VFP scratch space
-
-	@ Call ffi_prep_args(stack, &ecif, vfp_space)
-	bl	CNAME(ffi_prep_args_VFP)
-
-	@ Load VFP register args if needed
-	cmp	r0, #0
-	mov	ip, fp
-	beq	LSYM(Lbase_args)
-
-	@ Load only d0 if possible
-	cmp	r0, #3
-	sub	ip, fp, #64
-	flddle	d0, [ip]
-	fldmiadgt	ip, {d0-d7}
-
-LSYM(Lbase_args):
-	@ move first 4 parameters in registers
-	ldmia	sp, {r0-r3}
-
-	@ and adjust stack
-	sub	lr, ip, sp	@ cif->bytes == (fp - 64) - sp
-	ldr	ip, [fp]	@ load fn() in advance
-        cmp	lr, #16
-	movhs	lr, #16
-        add	sp, sp, lr
-
-	@ call (fn) (...)
-	call_reg(ip)
-
-	@ Remove the space we pushed for the args
-	mov	sp, fp
-
-	@ Load r2 with the pointer to storage for
-	@ the return value
-	ldr	r2, [sp, #24]
-
-	@ Load r3 with the return type code 
-	ldr	r3, [sp, #12]
-
-	@ If the return value pointer is NULL,
-	@ assume no return value.
-	cmp	r2, #0
-	beq	LSYM(Lepilogue_vfp)
-
-	cmp	r3, #FFI_TYPE_INT
-	streq	r0, [r2]
-	beq	LSYM(Lepilogue_vfp)
-
-	cmp	r3, #FFI_TYPE_SINT64
-	stmeqia	r2, {r0, r1}
-	beq	LSYM(Lepilogue_vfp)
-
-	cmp	r3, #FFI_TYPE_FLOAT
-	fstseq	s0, [r2]
-	beq	LSYM(Lepilogue_vfp)
-	
-	cmp	r3, #FFI_TYPE_DOUBLE
-	fstdeq	d0, [r2]
-	beq	LSYM(Lepilogue_vfp)
-
-	cmp	r3, #FFI_TYPE_STRUCT_VFP_FLOAT
-	cmpne	r3, #FFI_TYPE_STRUCT_VFP_DOUBLE
-	fstmiadeq	r2, {d0-d3}
-
-LSYM(Lepilogue_vfp):
-	RETLDM	"r0-r3,fp"
-
-.ffi_call_VFP_end:
-	UNWIND .fnend
-        .size    CNAME(ffi_call_VFP),.ffi_call_VFP_end-CNAME(ffi_call_VFP)
-
-
-ARM_FUNC_START(ffi_closure_VFP)
+ARM_FUNC_START(ffi_closure_VFP, 1)
+	UNWIND	.fnstart
 	fstmfdd	sp!, {d0-d7}
 	@ r0-r3, then d0-d7
 	UNWIND .pad #80
@@ -475,16 +312,15 @@ ARM_FUNC_START(ffi_closure_VFP)
 .Lretdouble_struct_vfp:
 	fldmiad	sp, {d0-d3}
 	b	.Lclosure_epilogue_vfp
-
-.ffi_closure_VFP_end:
 	UNWIND .fnend
-        .size    CNAME(ffi_closure_VFP),.ffi_closure_VFP_end-CNAME(ffi_closure_VFP)
+ARM_FUNC_END(ffi_closure_VFP)
 #endif

-ENTRY(ffi_arm_trampoline)
+ARM_FUNC_START(ffi_arm_trampoline, 1)
 	stmfd sp!, {r0-r3}
 	ldr r0, [pc]
 	ldr pc, [pc]
+ARM_FUNC_END(ffi_arm_trampoline)

 #if defined __ELF__ && defined __linux__
 	.section	.note.GNU-stack,"",%progbits