arm: Rewrite ffi_closure

Move the push of the argument registers into ffi_closure_SYSV, reducing the size of the trampoline.
2014-10-17 02:07:32 -04:00
parent e7f15f60e8
commit a4b785ea69
6 changed files with 187 additions and 4833 deletions
--- a/Makefile.am
+++ b/Makefile.am
@@ -48,9 +48,9 @@ EXTRA_DIST = LICENSE ChangeLog.v1 ChangeLog.libgcj			\
 	 src/xtensa/ffitarget.h src/xtensa/ffi.c src/xtensa/sysv.S	\
 	 ChangeLog.libffi m4/libtool.m4 m4/lt~obsolete.m4		\
 	 m4/ltoptions.m4 m4/ltsugar.m4 m4/ltversion.m4			\
-	 m4/ltversion.m4 src/arm/gentramp.sh src/debug.c msvcc.sh	\
+	 m4/ltversion.m4 src/debug.c msvcc.sh				\
 	 generate-darwin-source-and-headers.py				\
-	 libffi.xcodeproj/project.pbxproj src/arm/trampoline.S		\
+	 libffi.xcodeproj/project.pbxproj				\
 	 libtool-ldflags ChangeLog.libffi-3.1
 info_TEXINFOS = doc/libffi.texi
@@ -190,9 +190,6 @@ nodist_libffi_la_SOURCES += src/arc/arcompact.S src/arc/ffi.c
 endif
 if ARM
 nodist_libffi_la_SOURCES += src/arm/sysv.S src/arm/ffi.c
 if FFI_EXEC_TRAMPOLINE_TABLE
 nodist_libffi_la_SOURCES += src/arm/trampoline.S
 endif
 endif
 if AVR32
 nodist_libffi_la_SOURCES += src/avr32/sysv.S src/avr32/ffi.c
--- a/src/arm/ffi.c
+++ b/src/arm/ffi.c
@@ -369,152 +369,82 @@ ffi_call (ffi_cif * cif, void (*fn) (void), void *rvalue, void **avalue)
    memcpy (rvalue, new_rvalue, rtype->size);
 }
-/** private members **/
+static void *
-
+ffi_prep_incoming_args_SYSV (ffi_cif *cif, void *rvalue,
-static void ffi_prep_incoming_args_SYSV (char *stack, void **ret,
+			     char *argp, void **avalue)
 					 void **args, ffi_cif *cif,
 					 float *vfp_stack);
 static void ffi_prep_incoming_args_VFP (char *stack, void **ret,
 					void **args, ffi_cif *cif,
 					float *vfp_stack);
 void ffi_closure_SYSV (ffi_closure *);
 void ffi_closure_VFP (ffi_closure *);
 /* This function is jumped to by the trampoline */
 unsigned int FFI_HIDDEN
 ffi_closure_inner (ffi_closure *closure,
 		   void **respp, void *args, void *vfp_args)
 {
-  // our various things...
+  ffi_type **arg_types = cif->arg_types;
-  ffi_cif *cif;
+  int i, n;
  void **arg_area;
-  cif = closure->cif;
+  if (cif->flags == ARM_TYPE_STRUCT)
  arg_area = (void **) alloca (cif->nargs * sizeof (void *));
  /* this call will initialize ARG_AREA, such that each
   * element in that array points to the corresponding
   * value on the stack; and if the function returns
   * a structure, it will re-set RESP to point to the
   * structure return address.  */
  if (cif->abi == FFI_VFP)
    ffi_prep_incoming_args_VFP (args, respp, arg_area, cif, vfp_args);
  else
    ffi_prep_incoming_args_SYSV (args, respp, arg_area, cif, vfp_args);
  (closure->fun) (cif, *respp, arg_area, closure->user_data);
  return cif->flags;
 }
 /*@-exportheader@*/
 static void
 ffi_prep_incoming_args_SYSV (char *stack, void **rvalue,
 			     void **avalue, ffi_cif *cif,
 			     /* Used only under VFP hard-float ABI. */
 			     float *vfp_stack)
 /*@=exportheader@*/
 {
  register unsigned int i;
  register void **p_argv;
  register char *argp;
  register ffi_type **p_arg;
  argp = stack;
  if (cif->flags == FFI_TYPE_STRUCT)
    {
-      *rvalue = *(void **) argp;
+      rvalue = *(void **) argp;
      argp += 4;
    }
-  p_argv = avalue;
+  for (i = 0, n = cif->nargs; i < n; i++)
  for (i = cif->nargs, p_arg = cif->arg_types; (i != 0); i--, p_arg++)
    {
-      size_t z;
+      ffi_type *ty = arg_types[i];
      size_t z = ty->size;
-      argp = ffi_align (*p_arg, argp);
+      argp = ffi_align (ty, argp);
-
+      avalue[i] = (void *) argp;
      z = (*p_arg)->size;
      /* because we're little endian, this is what it turns into.   */
      *p_argv = (void *) argp;
      p_argv++;
      argp += z;
    }
-  return;
+  return rvalue;
 }
-/*@-exportheader@*/
+static void *
-static void
+ffi_prep_incoming_args_VFP (ffi_cif *cif, void *rvalue, char *stack,
-ffi_prep_incoming_args_VFP (char *stack, void **rvalue,
+			    char *vfp_space, void **avalue)
 			    void **avalue, ffi_cif * cif,
 			    /* Used only under VFP hard-float ABI. */
 			    float *vfp_stack)
 /*@=exportheader@*/
 {
-  register unsigned int i, vi = 0;
+  ffi_type **arg_types = cif->arg_types;
-  register void **p_argv;
+  int i, n, vi = 0;
-  register char *argp, *regp, *eo_regp;
+  char *argp, *regp, *eo_regp;
  register ffi_type **p_arg;
  char done_with_regs = 0;
  char stack_used = 0;
  FFI_ASSERT (cif->abi == FFI_VFP);
  regp = stack;
  eo_regp = argp = regp + 16;
-  if (cif->flags == FFI_TYPE_STRUCT)
+  if (cif->flags == ARM_TYPE_STRUCT)
    {
-      *rvalue = *(void **) regp;
+      rvalue = *(void **) regp;
      regp += 4;
    }
-  p_argv = avalue;
+  for (i = 0, n = cif->nargs; i < n; i++)
  for (i = cif->nargs, p_arg = cif->arg_types; (i != 0); i--, p_arg++)
    {
-      int is_vfp_type = vfp_type_p (*p_arg);
+      ffi_type *ty = arg_types[i];
-      size_t z;
+      int is_vfp_type = vfp_type_p (ty);
      size_t z = ty->size;
      if (vi < cif->vfp_nargs && is_vfp_type)
 	{
-	  *p_argv++ = (void *) (vfp_stack + cif->vfp_args[vi++]);
+	  avalue[i] = vfp_space + cif->vfp_args[vi++] * 4;
 	  continue;
 	}
      else if (!done_with_regs && !is_vfp_type)
 	{
-	  char *tregp = ffi_align (*p_arg, regp);
+	  char *tregp = ffi_align (ty, regp);
 	  z = (*p_arg)->size;
 	  z = (z < 4) ? 4 : z;	// pad
-	  /* if the arguments either fits into the registers or uses registers
+	  /* If the arguments either fits into the registers or uses registers
-	   * and stack, while we haven't read other things from the stack */
+	     and stack, while we haven't read other things from the stack */
 	  if (tregp + z <= eo_regp || !stack_used)
 	    {
-	      /* because we're little endian, this is what it turns into. */
+	      /* Because we're little endian, this is what it turns into.  */
-	      *p_argv = (void *) tregp;
+	      avalue[i] = (void *) tregp;
 	      p_argv++;
 	      regp = tregp + z;
-	      // if we read past the last core register, make sure we have not read
+
-	      // from the stack before and continue reading after regp
+	      /* If we read past the last core register, make sure we
 		 have not read from the stack before and continue
 		 reading after regp.  */
 	      if (regp > eo_regp)
 		{
-		  if (stack_used)
+		  FFI_ASSERT (!stack_used);
 		    {
 		      abort ();	// we should never read past the end of the register
 		      // area if the stack is already in use
 		    }
 		  argp = regp;
 		}
 	      if (regp >= eo_regp)
@@ -525,26 +455,41 @@ ffi_prep_incoming_args_VFP (char *stack, void **rvalue,
 	      continue;
 	    }
 	}
      stack_used = 1;
-
+      argp = ffi_align (ty, argp);
-      argp = ffi_align (*p_arg, argp);
+      avalue[i] = (void *) argp;
      z = (*p_arg)->size;
      /* because we're little endian, this is what it turns into.   */
      *p_argv = (void *) argp;
      p_argv++;
      argp += z;
    }
-  return;
+  return rvalue;
 }
-/* How to make a trampoline.  */
+int FFI_HIDDEN
 ffi_closure_inner_SYSV (ffi_closure *closure, void *rvalue, char *argp)
 {
  ffi_cif *cif = closure->cif;
  void **avalue = (void **) alloca (cif->nargs * sizeof (void *));
-extern unsigned int ffi_arm_trampoline[3];
+  rvalue = ffi_prep_incoming_args_SYSV (cif, rvalue, argp, avalue);
  closure->fun (cif, rvalue, avalue, closure->user_data);
  return cif->flags;
 }
 /* C half of the VFP closure entry point, called from the
    ffi_closure_VFP assembly stub.  CLOSURE is the closure being
    invoked, RVALUE the buffer that receives the result, ARGP the
    saved core-register and stack argument area, and VFP_SPACE the
    saved VFP argument registers.  Returns cif->flags, which the
    assembly caller uses to choose how the result is loaded back
    into registers.  */
 int FFI_HIDDEN
 ffi_closure_inner_VFP (ffi_closure *closure, void *rvalue,
 		       char *argp, char *vfp_space)
 {
  ffi_cif *cif = closure->cif;
  /* One pointer slot per declared argument, allocated with alloca.  */
  void **avalue = (void **) alloca (cif->nargs * sizeof (void *));
  /* Fill AVALUE with the address of each incoming argument.  The
     helper hands back the result pointer to use, which it replaces
     with the caller-supplied address for struct returns.  */
  rvalue = ffi_prep_incoming_args_VFP (cif, rvalue, argp, vfp_space, avalue);
  /* Invoke the user's handler with the decoded arguments.  */
  closure->fun (cif, rvalue, avalue, closure->user_data);
  return cif->flags;
 }
 void ffi_closure_SYSV (void) FFI_HIDDEN;
 void ffi_closure_VFP (void) FFI_HIDDEN;
 #if FFI_EXEC_TRAMPOLINE_TABLE
@@ -788,19 +733,7 @@ ffi_closure_free (void *ptr)
 #else
-#define FFI_INIT_TRAMPOLINE(TRAMP,FUN,CTX)				\
+extern unsigned int ffi_arm_trampoline[2] FFI_HIDDEN;
 ({ unsigned char *__tramp = (unsigned char*)(TRAMP);			\
   unsigned int  __fun = (unsigned int)(FUN);				\
   unsigned int  __ctx = (unsigned int)(CTX);				\
   unsigned char *insns = (unsigned char *)(CTX);                       \
   memcpy (__tramp, ffi_arm_trampoline, sizeof ffi_arm_trampoline);     \
   *(unsigned int*) &__tramp[12] = __ctx;				\
   *(unsigned int*) &__tramp[16] = __fun;				\
   __clear_cache((&__tramp[0]), (&__tramp[19])); /* Clear data mapping.  */ \
   __clear_cache(insns, insns + 3 * sizeof (unsigned int));             \
                                                 /* Clear instruction   \
                                                    mapping.  */        \
 })
 #endif
@@ -812,15 +745,15 @@ ffi_prep_closure_loc (ffi_closure * closure,
 		      void (*fun) (ffi_cif *, void *, void **, void *),
 		      void *user_data, void *codeloc)
 {
-  void (*closure_func) (ffi_closure *) = NULL;
+  void (*closure_func) (void) = ffi_closure_SYSV;
-  if (cif->abi == FFI_SYSV)
+  if (cif->abi == FFI_VFP)
-    closure_func = &ffi_closure_SYSV;
+    {
-#ifdef __ARM_EABI__
+      /* We only need take the vfp path if there are vfp arguments.  */
-  else if (cif->abi == FFI_VFP)
+      if (cif->vfp_used)
-    closure_func = &ffi_closure_VFP;
+	closure_func = ffi_closure_VFP;
-#endif
+    }
-  else
+  else if (cif->abi != FFI_SYSV)
    return FFI_BAD_ABI;
 #if FFI_EXEC_TRAMPOLINE_TABLE
@@ -828,12 +761,15 @@ ffi_prep_closure_loc (ffi_closure * closure,
  config[0] = closure;
  config[1] = closure_func;
 #else
-  FFI_INIT_TRAMPOLINE (&closure->tramp[0], closure_func, codeloc);
+  memcpy (closure->tramp, ffi_arm_trampoline, 8);
  __clear_cache(closure->tramp, closure->tramp + 8);	/* clear data map */
  __clear_cache(codeloc, codeloc + 8);			/* clear insn map */
  *(void (**)(void))(closure->tramp + 8) = closure_func;
 #endif
  closure->cif = cif;
  closure->user_data = user_data;
  closure->fun = fun;
  closure->user_data = user_data;
  return FFI_OK;
 }
--- a/src/arm/ffitarget.h
+++ b/src/arm/ffitarget.h
@@ -65,7 +65,7 @@ typedef enum ffi_abi {
 /* ---- Definitions for closures ----------------------------------------- */
 #define FFI_CLOSURES 1
-#define FFI_TRAMPOLINE_SIZE 20
+#define FFI_TRAMPOLINE_SIZE 12
 #define FFI_NATIVE_RAW_API 0
 #endif
--- a/src/arm/gentramp.sh
+++ b/src/arm/gentramp.sh
@@ -1,118 +0,0 @@
 #!/bin/sh
 # -----------------------------------------------------------------------
 #  gentramp.sh - Copyright (c) 2010, Plausible Labs Cooperative, Inc.
 #  
 #  ARM Trampoline Page Generator
 #
 #  Permission is hereby granted, free of charge, to any person obtaining
 #  a copy of this software and associated documentation files (the
 #  ``Software''), to deal in the Software without restriction, including
 #  without limitation the rights to use, copy, modify, merge, publish,
 #  distribute, sublicense, and/or sell copies of the Software, and to
 #  permit persons to whom the Software is furnished to do so, subject to
 #  the following conditions:
 #
 #  The above copyright notice and this permission notice shall be included
 #  in all copies or substantial portions of the Software.
 #
 #  THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
 #  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 #  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 #  NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 #  HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 #  WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 #  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 #  DEALINGS IN THE SOFTWARE.
 #  -----------------------------------------------------------------------
 PROGNAME=$0
 # Each trampoline is exactly 3 instructions, or 12 bytes. If any of these values change,
 # the entire arm trampoline implementation must be updated to match, too.
 # Size of an individual trampoline, in bytes
 TRAMPOLINE_SIZE=12
 # Page size, in bytes
 PAGE_SIZE=4096
 # Compute the size of the reachable config page; The first 16 bytes of the config page
 # are unreachable due to our maximum pc-relative ldr offset.
 PAGE_AVAIL=`expr $PAGE_SIZE - 16`
 # Compute the number of available trampolines.
 TRAMPOLINE_COUNT=`expr $PAGE_AVAIL / $TRAMPOLINE_SIZE`
 header () {
    # Print the preamble of the generated assembly file to stdout: the
    # "generated code" banner, the license text, and the directives
    # that open the trampoline table.
    echo "# GENERATED CODE - DO NOT EDIT"
    echo "# This file was generated by $PROGNAME"
    echo ""
    # Write out the license header
 cat << EOF
 #  Copyright (c) 2010, Plausible Labs Cooperative, Inc.
 #  
 #  Permission is hereby granted, free of charge, to any person obtaining
 #  a copy of this software and associated documentation files (the
 #  ``Software''), to deal in the Software without restriction, including
 #  without limitation the rights to use, copy, modify, merge, publish,
 #  distribute, sublicense, and/or sell copies of the Software, and to
 #  permit persons to whom the Software is furnished to do so, subject to
 #  the following conditions:
 #
 #  The above copyright notice and this permission notice shall be included
 #  in all copies or substantial portions of the Software.
 #
 #  THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
 #  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 #  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 #  NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 #  HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 #  WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 #  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 #  DEALINGS IN THE SOFTWARE.
 #  -----------------------------------------------------------------------
 EOF
    # Write out the trampoline table, aligned to the page boundary
    echo ".text"
    echo ".align 12"
    echo ".globl _ffi_closure_trampoline_table_page"
    echo "_ffi_closure_trampoline_table_page:"
 }
 # WARNING - Don't modify the trampoline code size without also updating the relevant libffi code
 trampoline () {
    # Print one trampoline stub to stdout: three ARM instructions that
    # push r0-r3, load the context into r0 and load the jump target
    # into pc.
    cat << END
    // trampoline
    // Save to stack
    stmfd sp!, {r0-r3}
    // Load the context argument from the config page.
    // This places the first usable config value at _ffi_closure_trampoline_table-4080
    // This accounts for the above 4-byte stmfd instruction, plus 8 bytes constant when loading from pc.
    ldr r0, [pc, #-4092]
    // Load the jump address from the config page.
    ldr pc, [pc, #-4092]
 END
 }
 main () {
    # Entry point: emit the file header once, then TRAMPOLINE_COUNT
    # copies of the trampoline stub.
    # NOTE(review): `local` is not specified by POSIX sh; this relies
    # on /bin/sh providing it -- confirm for the target build hosts.
    # Write out the header
    header
    # Write out the trampolines
    local i=0
    while [ $i -lt ${TRAMPOLINE_COUNT} ]; do
        trampoline
        local i=`expr $i + 1`
    done
 }
 main
--- a/src/arm/sysv.S
+++ b/src/arm/sysv.S
@@ -189,139 +189,128 @@ ARM_FUNC_END(ffi_call_SYSV)
 ARM_FUNC_START(ffi_closure_SYSV, 1)
 	UNWIND	.fnstart
-	UNWIND .pad #16
+	cfi_startproc
-	add	ip, sp, #16
+	stmdb	sp!, {r0-r3}			@ save argument regs
-	stmfd	sp!, {ip, lr}
+	cfi_adjust_cfa_offset(16)
-	UNWIND .save	{r0, lr}
+	mov	r0, ip				@ load closure
-	add	r2, sp, #8
+	add	ip, sp, #16			@ compute entry sp
-	UNWIND .pad #16
+	sub	sp, sp, #32			@ allocate rvalue space
-	sub	sp, sp, #16
+	stmdb	sp!, {sp,lr}
 	str	sp, [sp, #8]
 	add	r1, sp, #8
 	bl	CNAME(ffi_closure_inner)
 	cmp	r0, #FFI_TYPE_INT
 	beq	.Lretint
-	cmp	r0, #FFI_TYPE_FLOAT
+	/* Remember that EABI unwind info only applies at call sites.
-#if defined(__SOFTFP__) || defined(__ARM_EABI__)
+	   We need do nothing except note the save of the stack pointer
-	beq	.Lretint
+	   and the link registers.  */
-#else
+	UNWIND	.save {sp,lr}
-	beq	.Lretfloat
+	cfi_adjust_cfa_offset(8)
-#endif
+	cfi_rel_offset(lr, 4)
-	cmp	r0, #FFI_TYPE_DOUBLE
+	add	r1, sp, #8			@ load respp
-#if defined(__SOFTFP__) || defined(__ARM_EABI__)
+	add	r2, sp, #8+32			@ load args
-	beq	.Lretlonglong
+	mov	r3, #0				@ load vfp_args
 #else
 	beq	.Lretdouble
 #endif
-	cmp	r0, #FFI_TYPE_LONGDOUBLE
+	bl	CNAME(ffi_closure_inner_SYSV)
 #if defined(__SOFTFP__) || defined(__ARM_EABI__)
 	beq	.Lretlonglong
 #else
 	beq	.Lretlongdouble
 #endif
-	cmp	r0, #FFI_TYPE_SINT64
+	@ Load values returned in registers.
-	beq	.Lretlonglong
+	add	r2, sp, #8			@ load respp
-.Lclosure_epilogue:
+	adr	r3, CNAME(ffi_closure_ret)
-	add	sp, sp, #16
+	add	pc, r3, r0, lsl #3
-	ldmfd	sp, {sp, pc}
+	cfi_endproc
-.Lretint:
+	UNWIND	.fnend
 	ldr	r0, [sp]
 	b	.Lclosure_epilogue
 .Lretlonglong:
 	ldr	r0, [sp]
 	ldr	r1, [sp, #4]
 	b	.Lclosure_epilogue
 #if !defined(__SOFTFP__) && !defined(__ARM_EABI__)
 .Lretfloat:
 	ldfs	f0, [sp]
 	b	.Lclosure_epilogue
 .Lretdouble:
 	ldfd	f0, [sp]
 	b	.Lclosure_epilogue
 .Lretlongdouble:
 	ldfd	f0, [sp]
 	b	.Lclosure_epilogue
 #endif
 	UNWIND .fnend
 ARM_FUNC_END(ffi_closure_SYSV)
 /* Below are VFP hard-float ABI call and closure implementations.
   Add VFP FPU directive here. This is only compiled into the library
   under EABI.  */
 #ifdef __ARM_EABI__
 ARM_FUNC_START(ffi_closure_VFP, 1)
 	UNWIND	.fnstart
-	fstmfdd	sp!, {d0-d7}
+	cfi_startproc
-	@ r0-r3, then d0-d7
+	stmdb	sp!, {r0-r3}			@ save argument regs
-	UNWIND .pad #80
+	cfi_adjust_cfa_offset(16)
-	add	ip, sp, #80
+	sub	sp, sp, #64+32			@ allocate vfp+rvalue space
-	stmfd	sp!, {ip, lr}
+	cfi_adjust_cfa_offset(64+32)
-	UNWIND .save	{r0, lr}
+	stc	p11, cr0, [sp], {16}		@ vstm sp, {d0-d7}
-	add	r2, sp, #72
+	mov	r0, ip				@ load closure
-	add	r3, sp, #8
+	add	ip, sp, #16+64+32		@ compute entry sp
-	UNWIND .pad #72
+	stmdb	sp!, {ip,lr}
 	sub	sp, sp, #72
 	str	sp, [sp, #64]
 	add	r1, sp, #64
 	bl	CNAME(ffi_closure_inner)
-	cmp	r0, #FFI_TYPE_INT
+	/* See above.  */
-	beq	.Lretint_vfp
+	UNWIND	.save {sp,lr}
 	cfi_adjust_cfa_offset(8)
 	cfi_rel_offset(sp, 0)
 	cfi_rel_offset(lr, 4)
-	cmp	r0, #FFI_TYPE_FLOAT
+	add	r1, sp, #8+64			@ load respp
-	beq	.Lretfloat_vfp
+	add	r2, sp, #8+64+32		@ load args
 	add	r3, sp, #8			@ load vfp_args
-	cmp	r0, #FFI_TYPE_DOUBLE
+	bl	CNAME(ffi_closure_inner_VFP)
 	cmpne	r0, #FFI_TYPE_LONGDOUBLE
 	beq	.Lretdouble_vfp
-	cmp	r0, #FFI_TYPE_SINT64
+	@ Load values returned in registers.
-	beq	.Lretlonglong_vfp
+	add	r2, sp, #8+64			@ load respp
-
+	adr	r3, CNAME(ffi_closure_ret)
-	cmp	r0, #FFI_TYPE_STRUCT_VFP_FLOAT
+	add	pc, r3, r0, lsl #3
-	beq	.Lretfloat_struct_vfp
+	cfi_endproc
-
+	UNWIND	.fnend
 	cmp	r0, #FFI_TYPE_STRUCT_VFP_DOUBLE
 	beq	.Lretdouble_struct_vfp
 .Lclosure_epilogue_vfp:
 	add	sp, sp, #72
 	ldmfd	sp, {sp, pc}
 .Lretfloat_vfp:
 	flds	s0, [sp]
 	b	.Lclosure_epilogue_vfp
 .Lretdouble_vfp:
 	fldd	d0, [sp]
 	b	.Lclosure_epilogue_vfp
 .Lretint_vfp:
 	ldr	r0, [sp]
 	b	.Lclosure_epilogue_vfp
 .Lretlonglong_vfp:
 	ldmia	sp, {r0, r1}
 	b	.Lclosure_epilogue_vfp
 .Lretfloat_struct_vfp:
 	fldmiad	sp, {d0-d1}
 	b	.Lclosure_epilogue_vfp
 .Lretdouble_struct_vfp:
 	fldmiad	sp, {d0-d3}
 	b	.Lclosure_epilogue_vfp
 	UNWIND .fnend
 ARM_FUNC_END(ffi_closure_VFP)
-#endif
+
 /* Load values returned in registers for both closure entry points.
   Note that we use LDM with SP in the register set.  This is deprecated
   by ARM, but not yet unpredictable.  */
 /* Table of result loaders shared by ffi_closure_SYSV and
    ffi_closure_VFP.  Both entry points branch here with
    "add pc, r3, r0, lsl #3", where r3 is the address of this table
    and r0 is the value returned by ffi_closure_inner_*.  Each entry
    is therefore reached at an 8-byte stride, and entries with a
    single instruction are padded with a nop.  On entry r2 points at
    the return-value buffer and sp at the two words stored by the
    entry point; "ldm sp, {sp,pc}" reloads sp and returns through the
    saved lr.
    NOTE(review): the E macro is defined outside this view; presumably
    it checks that each entry sits at the slot for the named type
    code -- confirm.  */
 ARM_FUNC_START(ffi_closure_ret, 0)
 	cfi_startproc
 	cfi_rel_offset(sp, 0)
 	cfi_rel_offset(lr, 4)
 0:
 E ARM_TYPE_VFP_S
 	ldc	p10, cr0, [r2]			@ vldr s0, [r2]
 	ldm	sp, {sp,pc}
 E ARM_TYPE_VFP_D
 	ldc	p11, cr0, [r2]			@ vldr d0, [r2]
 	ldm	sp, {sp,pc}
 E ARM_TYPE_VFP_N
 	ldc	p11, cr0, [r2], {8}		@ vldm r2, {d0-d3}
 	ldm	sp, {sp,pc}
 /* Loads the second word into r1, then falls through into the
    ARM_TYPE_INT entry, which loads the first word into r0 and
    returns.  */
 E ARM_TYPE_INT64
 	ldr	r1, [r2, #4]
 	nop
 E ARM_TYPE_INT
 	ldr	r0, [r2]
 	ldm	sp, {sp,pc}
 /* Nothing is loaded for the two entries below; they only return.  */
 E ARM_TYPE_VOID
 	ldm	sp, {sp,pc}
 	nop
 E ARM_TYPE_STRUCT
 	ldm	sp, {sp,pc}
 	cfi_endproc
 ARM_FUNC_END(ffi_closure_ret)
 #if FFI_EXEC_TRAMPOLINE_TABLE
 /* ??? The iOS support should be updated.  The first insn used to
   be STMFD, but that's been moved into ffi_closure_SYSV.  If the
   writable page is put after this one we can make use of the
   pc+8 feature of the architecture.  We can also reduce the size
   of the thunk to 8 and pack more of these into the page.
   In the meantime, simply replace the STMFD with a NOP so as to
   keep all the magic numbers the same within ffi.c.  */
 	.align	12
 ARM_FUNC_START(ffi_closure_trampoline_table_page)
 .rept	4096 / 12
 	nop
 	ldr	ip, [pc, #-4092]
 	ldr	pc, [pc, #-4092]
 .endr
 #else
 ARM_FUNC_START(ffi_arm_trampoline, 1)
-	stmfd sp!, {r0-r3}
+0:	adr	ip, 0b
-	ldr r0, [pc]
+	ldr	pc, 1f
-	ldr pc, [pc]
+1:	.long	0
 ARM_FUNC_END(ffi_arm_trampoline)
 #endif /* FFI_EXEC_TRAMPOLINE_TABLE */
 #if defined __ELF__ && defined __linux__
 	.section	.note.GNU-stack,"",%progbits
 #endif
--- a/src/arm/trampoline.S
+++ b/src/arm/trampoline.S