Merge pull request #164 from rth7680/darwin

Fix build on darwin
2015-01-10 09:22:55 -05:00
parent 9131039c93 b7f6d7aa9b
commit dd0b59a5cf
7 changed files with 899 additions and 1283 deletions
--- a/Makefile.am
+++ b/Makefile.am
@@ -144,7 +144,6 @@ EXTRA_libffi_la_SOURCES = \
 	src/x86/ffi.c src/x86/sysv.S					\
 	 src/x86/ffiw64.c src/x86/win64.S 				\
 	 src/x86/ffi64.c src/x86/unix64.S				\
-	 src/x86/darwin64.S src/x86/darwin.S				\
 	src/xtensa/ffi.c src/xtensa/sysv.S

 TARGET_OBJ = @TARGET_OBJ@
--- a/configure.host
+++ b/configure.host
@@ -84,7 +84,12 @@ case "${host}" in
 	;;

  i?86-*-darwin* | x86_64-*-darwin*)
-	TARGET=X86_DARWIN; TARGETDIR=x86
+	TARGETDIR=x86
+	if test $ac_cv_sizeof_size_t = 4; then
+	  TARGET=X86_DARWIN
+	else
+	  TARGET=X86_64
+	fi
 	;;

  i?86-*-* | x86_64-*-* | amd64-*)
@@ -237,7 +242,7 @@ case "${TARGET}" in
  POWERPC_FREEBSD)
 	SOURCES="ffi.c ffi_sysv.c sysv.S ppc_closure.S"
 	;;
-  X86 | X86_FREEBSD | X86_WIN32)
+  X86 | X86_DARWIN | X86_FREEBSD | X86_WIN32)
 	SOURCES="ffi.c sysv.S"
 	;;
  X86_64)
@@ -246,9 +251,6 @@ case "${TARGET}" in
  X86_WIN64)
 	SOURCES="ffiw64.c win64.S"
 	;;
-  X86_DARWIN)
-	SOURCES="ffi.c darwin.S ffi64.c darwin64.S"
-	;;
 esac

 # If we failed to configure SOURCES, we can't do anything.
--- a/src/x86/darwin.S
+++ b/src/x86/darwin.S
@@ -1,444 +0,0 @@
-/* -----------------------------------------------------------------------
-   darwin.S - Copyright (c) 1996, 1998, 2001, 2002, 2003, 2005  Red Hat, Inc.
-	Copyright (C) 2008  Free Software Foundation, Inc.
-
-   X86 Foreign Function Interface
-
-   Permission is hereby granted, free of charge, to any person obtaining
-   a copy of this software and associated documentation files (the
-   ``Software''), to deal in the Software without restriction, including
-   without limitation the rights to use, copy, modify, merge, publish,
-   distribute, sublicense, and/or sell copies of the Software, and to
-   permit persons to whom the Software is furnished to do so, subject to
-   the following conditions:
-
-   The above copyright notice and this permission notice shall be included
-   in all copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
-   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-   NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-   HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-   WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-   DEALINGS IN THE SOFTWARE.
-   -----------------------------------------------------------------------
-   */
-
-#ifndef __x86_64__
-
-#define LIBFFI_ASM	
-#include <fficonfig.h>
-#include <ffi.h>
-
-.text
-
-.globl _ffi_prep_args
-
-	.align 4
-.globl _ffi_call_SYSV
-
-_ffi_call_SYSV:
-.LFB1:
-        pushl %ebp
-.LCFI0:
-        movl  %esp,%ebp
-.LCFI1:
-        subl $8,%esp
-	/* Make room for all of the new args.  */
-	movl  16(%ebp),%ecx
-	subl  %ecx,%esp
-
-	movl  %esp,%eax
-
-	/* Place all of the ffi_prep_args in position  */
-	subl  $8,%esp
-	pushl 12(%ebp)
-	pushl %eax
-	call  *8(%ebp)
-
-	/* Return stack to previous state and call the function  */
-	addl  $16,%esp	
-
-	call  *28(%ebp)
-
-	/* Load %ecx with the return type code  */
-	movl  20(%ebp),%ecx	
-
-	/* Protect %esi.  We're going to pop it in the epilogue.  */
-	pushl %esi
-
-	/* If the return value pointer is NULL, assume no return value.  */
-	cmpl  $0,24(%ebp)
-	jne  0f
-
-	/* Even if there is no space for the return value, we are 
-	   obliged to handle floating-point values.  */
-	cmpl  $FFI_TYPE_FLOAT,%ecx
-	jne   noretval
-	fstp  %st(0)
-
-	jmp   epilogue
-0:
-	.align 4
-	call 1f
-.Lstore_table:
-	.long   noretval-.Lstore_table		/* FFI_TYPE_VOID */
-	.long   retint-.Lstore_table		/* FFI_TYPE_INT */
-	.long   retfloat-.Lstore_table		/* FFI_TYPE_FLOAT */
-	.long   retdouble-.Lstore_table		/* FFI_TYPE_DOUBLE */
-	.long   retlongdouble-.Lstore_table     /* FFI_TYPE_LONGDOUBLE */
-	.long   retuint8-.Lstore_table		/* FFI_TYPE_UINT8 */
-	.long   retsint8-.Lstore_table		/* FFI_TYPE_SINT8 */
-	.long   retuint16-.Lstore_table		/* FFI_TYPE_UINT16 */
-	.long   retsint16-.Lstore_table		/* FFI_TYPE_SINT16 */
-	.long   retint-.Lstore_table		/* FFI_TYPE_UINT32 */
-	.long   retint-.Lstore_table		/* FFI_TYPE_SINT32 */
-	.long   retint64-.Lstore_table		/* FFI_TYPE_UINT64 */
-	.long   retint64-.Lstore_table		/* FFI_TYPE_SINT64 */
-	.long   retstruct-.Lstore_table		/* FFI_TYPE_STRUCT */
-	.long   retint-.Lstore_table		/* FFI_TYPE_POINTER */
-	.long   retstruct1b-.Lstore_table	/* FFI_TYPE_SMALL_STRUCT_1B */
-	.long   retstruct2b-.Lstore_table	/* FFI_TYPE_SMALL_STRUCT_2B */
-1:
-	pop  %esi
-	add  (%esi, %ecx, 4), %esi
-	jmp  *%esi
-
-	/* Sign/zero extend as appropriate.  */
-retsint8:
-	movsbl  %al, %eax
-	jmp  retint
-
-retsint16:
-	movswl  %ax, %eax
-	jmp  retint
-
-retuint8:
-	movzbl  %al, %eax
-	jmp  retint
-
-retuint16:
-	movzwl  %ax, %eax
-	jmp  retint
-
-retfloat:
-	/* Load %ecx with the pointer to storage for the return value  */
-	movl  24(%ebp),%ecx
-	fstps (%ecx)
-	jmp   epilogue
-
-retdouble:
-	/* Load %ecx with the pointer to storage for the return value  */
-	movl  24(%ebp),%ecx
-	fstpl (%ecx)
-	jmp   epilogue
-
-retlongdouble:
-	/* Load %ecx with the pointer to storage for the return value  */
-	movl  24(%ebp),%ecx
-	fstpt (%ecx)
-	jmp   epilogue
-
-retint64:
-	/* Load %ecx with the pointer to storage for the return value  */
-	movl  24(%ebp),%ecx
-	movl  %eax,0(%ecx)
-	movl  %edx,4(%ecx)
-	jmp   epilogue
-
-retstruct1b:
-	/* Load %ecx with the pointer to storage for the return value  */
-	movl  24(%ebp),%ecx
-	movb  %al,0(%ecx)
-	jmp   epilogue
-
-retstruct2b:
-	/* Load %ecx with the pointer to storage for the return value  */
-	movl  24(%ebp),%ecx
-	movw  %ax,0(%ecx)
-	jmp   epilogue
-
-retint:
-	/* Load %ecx with the pointer to storage for the return value  */
-	movl  24(%ebp),%ecx
-	movl  %eax,0(%ecx)
-
-retstruct:
-	/* Nothing to do!  */
-
-noretval:
-epilogue:
-	popl %esi
-	movl %ebp,%esp
-	popl %ebp
-	ret
-
-.LFE1:
-.ffi_call_SYSV_end:
-
-	.align	4
-FFI_HIDDEN (ffi_closure_SYSV)
-.globl _ffi_closure_SYSV
-
-_ffi_closure_SYSV:
-.LFB2:
-	pushl	%ebp
-.LCFI2:
-	movl	%esp, %ebp
-.LCFI3:
-	subl	$40, %esp
-	leal	-24(%ebp), %edx
-	movl	%edx, -12(%ebp)	/* resp */
-	leal	8(%ebp), %edx
-	movl	%edx, 4(%esp)	/* args = __builtin_dwarf_cfa () */
-	leal	-12(%ebp), %edx
-	movl	%edx, (%esp)	/* &resp */
-	movl	%ebx, 8(%esp)
-.LCFI7:
-	call	L_ffi_closure_SYSV_inner$stub
-	movl	8(%esp), %ebx
-	movl	-12(%ebp), %ecx
-	cmpl	$FFI_TYPE_INT, %eax
-	je	.Lcls_retint
-
-	/* Handle FFI_TYPE_UINT8, FFI_TYPE_SINT8, FFI_TYPE_UINT16,
-	   FFI_TYPE_SINT16, FFI_TYPE_UINT32, FFI_TYPE_SINT32.  */
-	cmpl	$FFI_TYPE_UINT64, %eax
-	jge	0f
-	cmpl	$FFI_TYPE_UINT8, %eax
-	jge	.Lcls_retint
-
-0:	cmpl	$FFI_TYPE_FLOAT, %eax
-	je	.Lcls_retfloat
-	cmpl	$FFI_TYPE_DOUBLE, %eax
-	je	.Lcls_retdouble
-	cmpl	$FFI_TYPE_LONGDOUBLE, %eax
-	je	.Lcls_retldouble
-	cmpl	$FFI_TYPE_SINT64, %eax
-	je	.Lcls_retllong
-	cmpl	$FFI_TYPE_SMALL_STRUCT_1B, %eax
-	je	.Lcls_retstruct1b
-	cmpl	$FFI_TYPE_SMALL_STRUCT_2B, %eax
-	je	.Lcls_retstruct2b
-	cmpl	$FFI_TYPE_STRUCT, %eax
-	je	.Lcls_retstruct
-.Lcls_epilogue:
-	movl	%ebp, %esp
-	popl	%ebp
-	ret
-.Lcls_retint:
-	movl	(%ecx), %eax
-	jmp	.Lcls_epilogue
-.Lcls_retfloat:
-	flds	(%ecx)
-	jmp	.Lcls_epilogue
-.Lcls_retdouble:
-	fldl	(%ecx)
-	jmp	.Lcls_epilogue
-.Lcls_retldouble:
-	fldt	(%ecx)
-	jmp	.Lcls_epilogue
-.Lcls_retllong:
-	movl	(%ecx), %eax
-	movl	4(%ecx), %edx
-	jmp	.Lcls_epilogue
-.Lcls_retstruct1b:
-	movsbl	(%ecx), %eax
-	jmp	.Lcls_epilogue
-.Lcls_retstruct2b:
-	movswl	(%ecx), %eax
-	jmp	.Lcls_epilogue
-.Lcls_retstruct:
-	lea -8(%ebp),%esp
-	movl	%ebp, %esp
-	popl	%ebp
-	ret $4
-.LFE2:
-
-#if !FFI_NO_RAW_API
-
-#define RAW_CLOSURE_CIF_OFFSET ((FFI_TRAMPOLINE_SIZE + 3) & ~3)
-#define RAW_CLOSURE_FUN_OFFSET (RAW_CLOSURE_CIF_OFFSET + 4)
-#define RAW_CLOSURE_USER_DATA_OFFSET (RAW_CLOSURE_FUN_OFFSET + 4)
-#define CIF_FLAGS_OFFSET 20
-
-	.align	4
-FFI_HIDDEN (ffi_closure_raw_SYSV)
-.globl _ffi_closure_raw_SYSV
-
-_ffi_closure_raw_SYSV:
-.LFB3:
-	pushl	%ebp
-.LCFI4:
-	movl	%esp, %ebp
-.LCFI5:
-	pushl	%esi
-.LCFI6:
-	subl	$36, %esp
-	movl	RAW_CLOSURE_CIF_OFFSET(%eax), %esi	 /* closure->cif */
-	movl	RAW_CLOSURE_USER_DATA_OFFSET(%eax), %edx /* closure->user_data */
-	movl	%edx, 12(%esp)	/* user_data */
-	leal	8(%ebp), %edx	/* __builtin_dwarf_cfa () */
-	movl	%edx, 8(%esp)	/* raw_args */
-	leal	-24(%ebp), %edx
-	movl	%edx, 4(%esp)	/* &res */
-	movl	%esi, (%esp)	/* cif */
-	call	*RAW_CLOSURE_FUN_OFFSET(%eax)		 /* closure->fun */
-	movl	CIF_FLAGS_OFFSET(%esi), %eax		 /* rtype */
-	cmpl	$FFI_TYPE_INT, %eax
-	je	.Lrcls_retint
-
-	/* Handle FFI_TYPE_UINT8, FFI_TYPE_SINT8, FFI_TYPE_UINT16,
-	   FFI_TYPE_SINT16, FFI_TYPE_UINT32, FFI_TYPE_SINT32.  */
-	cmpl	$FFI_TYPE_UINT64, %eax
-	jge	0f
-	cmpl	$FFI_TYPE_UINT8, %eax
-	jge	.Lrcls_retint
-0:
-	cmpl	$FFI_TYPE_FLOAT, %eax
-	je	.Lrcls_retfloat
-	cmpl	$FFI_TYPE_DOUBLE, %eax
-	je	.Lrcls_retdouble
-	cmpl	$FFI_TYPE_LONGDOUBLE, %eax
-	je	.Lrcls_retldouble
-	cmpl	$FFI_TYPE_SINT64, %eax
-	je	.Lrcls_retllong
-.Lrcls_epilogue:
-	addl	$36, %esp
-	popl	%esi
-	popl	%ebp
-	ret
-.Lrcls_retint:
-	movl	-24(%ebp), %eax
-	jmp	.Lrcls_epilogue
-.Lrcls_retfloat:
-	flds	-24(%ebp)
-	jmp	.Lrcls_epilogue
-.Lrcls_retdouble:
-	fldl	-24(%ebp)
-	jmp	.Lrcls_epilogue
-.Lrcls_retldouble:
-	fldt	-24(%ebp)
-	jmp	.Lrcls_epilogue
-.Lrcls_retllong:
-	movl	-24(%ebp), %eax
-	movl	-20(%ebp), %edx
-	jmp	.Lrcls_epilogue
-.LFE3:
-#endif
-
-.section __IMPORT,__jump_table,symbol_stubs,self_modifying_code+pure_instructions,5
-L_ffi_closure_SYSV_inner$stub:
-	.indirect_symbol _ffi_closure_SYSV_inner
-	hlt ; hlt ; hlt ; hlt ; hlt
-
-
-.section __TEXT,__eh_frame,coalesced,no_toc+strip_static_syms+live_support
-EH_frame1:
-	.set	L$set$0,LECIE1-LSCIE1
-	.long	L$set$0
-LSCIE1:
-	.long	0x0
-	.byte	0x1
-	.ascii "zR\0"
-	.byte	0x1
-	.byte	0x7c
-	.byte	0x8
-	.byte	0x1
-	.byte	0x10
-	.byte	0xc
-	.byte	0x5
-	.byte	0x4
-	.byte	0x88
-	.byte	0x1
-	.align 2
-LECIE1:
-.globl _ffi_call_SYSV.eh
-_ffi_call_SYSV.eh:
-LSFDE1:
-	.set	L$set$1,LEFDE1-LASFDE1
-	.long	L$set$1
-LASFDE1:
-	.long	LASFDE1-EH_frame1
-	.long	.LFB1-.
-	.set L$set$2,.LFE1-.LFB1
-	.long L$set$2
-	.byte	0x0
-	.byte	0x4
-	.set L$set$3,.LCFI0-.LFB1
-	.long L$set$3
-	.byte	0xe
-	.byte	0x8
-	.byte	0x84
-	.byte	0x2
-	.byte	0x4
-	.set L$set$4,.LCFI1-.LCFI0
-	.long L$set$4
-	.byte	0xd
-	.byte	0x4
-	.align 2
-LEFDE1:
-.globl _ffi_closure_SYSV.eh
-_ffi_closure_SYSV.eh:
-LSFDE2:
-	.set	L$set$5,LEFDE2-LASFDE2
-	.long	L$set$5
-LASFDE2:
-	.long	LASFDE2-EH_frame1
-	.long	.LFB2-.
-	.set L$set$6,.LFE2-.LFB2
-	.long L$set$6
-	.byte	0x0
-	.byte	0x4
-	.set L$set$7,.LCFI2-.LFB2
-	.long L$set$7
-	.byte	0xe
-	.byte	0x8
-	.byte	0x84
-	.byte	0x2
-	.byte	0x4
-	.set L$set$8,.LCFI3-.LCFI2
-	.long L$set$8
-	.byte	0xd
-	.byte	0x4
-	.align 2
-LEFDE2:
-
-#if !FFI_NO_RAW_API
-
-.globl _ffi_closure_raw_SYSV.eh
-_ffi_closure_raw_SYSV.eh:
-LSFDE3:
-	.set	L$set$10,LEFDE3-LASFDE3
-	.long	L$set$10
-LASFDE3:
-	.long	LASFDE3-EH_frame1
-	.long	.LFB3-.
-	.set L$set$11,.LFE3-.LFB3
-	.long L$set$11
-	.byte	0x0
-	.byte	0x4
-	.set L$set$12,.LCFI4-.LFB3
-	.long L$set$12
-	.byte	0xe
-	.byte	0x8
-	.byte	0x84
-	.byte	0x2
-	.byte	0x4
-	.set L$set$13,.LCFI5-.LCFI4
-	.long L$set$13
-	.byte	0xd
-	.byte	0x4
-	.byte	0x4
-	.set L$set$14,.LCFI6-.LCFI5
-	.long L$set$14
-	.byte	0x85
-	.byte	0x3
-	.align 2
-LEFDE3:
-
-#endif
-
-#endif /* ifndef __x86_64__ */
--- a/src/x86/darwin64.S
+++ b/src/x86/darwin64.S
@@ -1,416 +0,0 @@
-/* -----------------------------------------------------------------------
-   darwin64.S - Copyright (c) 2006 Free Software Foundation, Inc.
-	        Copyright (c) 2008 Red Hat, Inc.
-   derived from unix64.S
-
-   x86-64 Foreign Function Interface for Darwin.
-
-   Permission is hereby granted, free of charge, to any person obtaining
-   a copy of this software and associated documentation files (the
-   ``Software''), to deal in the Software without restriction, including
-   without limitation the rights to use, copy, modify, merge, publish,
-   distribute, sublicense, and/or sell copies of the Software, and to
-   permit persons to whom the Software is furnished to do so, subject to
-   the following conditions:
-
-   The above copyright notice and this permission notice shall be included
-   in all copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND, EXPRESS
-   OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-   IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY CLAIM, DAMAGES OR
-   OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
-   ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-   OTHER DEALINGS IN THE SOFTWARE.
-   ----------------------------------------------------------------------- */
-
-#ifdef __x86_64__
-#define LIBFFI_ASM
-#include <fficonfig.h>
-#include <ffi.h>
-
-	.file "darwin64.S"
-.text
-
-/* ffi_call_unix64 (void *args, unsigned long bytes, unsigned flags,
-		    void *raddr, void (*fnaddr)(void));
-
-   Bit o trickiness here -- ARGS+BYTES is the base of the stack frame
-   for this function.  This has been allocated by ffi_call.  We also
-   deallocate some of the stack that has been alloca'd.  */
-
-	.align	3
-	.globl	_ffi_call_unix64
-
-_ffi_call_unix64:
-LUW0:
-	movq	(%rsp), %r10		/* Load return address.  */
-	leaq	(%rdi, %rsi), %rax	/* Find local stack base.  */
-	movq	%rdx, (%rax)		/* Save flags.  */
-	movq	%rcx, 8(%rax)		/* Save raddr.  */
-	movq	%rbp, 16(%rax)		/* Save old frame pointer.  */
-	movq	%r10, 24(%rax)		/* Relocate return address.  */
-	movq	%rax, %rbp		/* Finalize local stack frame.  */
-LUW1:
-	movq	%rdi, %r10		/* Save a copy of the register area. */
-	movq	%r8, %r11		/* Save a copy of the target fn.  */
-	movl	%r9d, %eax		/* Set number of SSE registers.  */
-
-	/* Load up all argument registers.  */
-	movq	(%r10), %rdi
-	movq	8(%r10), %rsi
-	movq	16(%r10), %rdx
-	movq	24(%r10), %rcx
-	movq	32(%r10), %r8
-	movq	40(%r10), %r9
-	testl	%eax, %eax
-	jnz	Lload_sse
-Lret_from_load_sse:
-
-	/* Deallocate the reg arg area.  */
-	leaq	176(%r10), %rsp
-
-	/* Call the user function.  */
-	call	*%r11
-
-	/* Deallocate stack arg area; local stack frame in redzone.  */
-	leaq	24(%rbp), %rsp
-
-	movq	0(%rbp), %rcx		/* Reload flags.  */
-	movq	8(%rbp), %rdi		/* Reload raddr.  */
-	movq	16(%rbp), %rbp		/* Reload old frame pointer.  */
-LUW2:
-
-	/* The first byte of the flags contains the FFI_TYPE.  */
-	movzbl	%cl, %r10d
-	leaq	Lstore_table(%rip), %r11
-	movslq	(%r11, %r10, 4), %r10
-	addq	%r11, %r10
-	jmp	*%r10
-
-Lstore_table:
-	.long	Lst_void-Lstore_table		/* FFI_TYPE_VOID */
-	.long	Lst_sint32-Lstore_table		/* FFI_TYPE_INT */
-	.long	Lst_float-Lstore_table		/* FFI_TYPE_FLOAT */
-	.long	Lst_double-Lstore_table		/* FFI_TYPE_DOUBLE */
-	.long	Lst_ldouble-Lstore_table	/* FFI_TYPE_LONGDOUBLE */
-	.long	Lst_uint8-Lstore_table		/* FFI_TYPE_UINT8 */
-	.long	Lst_sint8-Lstore_table		/* FFI_TYPE_SINT8 */
-	.long	Lst_uint16-Lstore_table		/* FFI_TYPE_UINT16 */
-	.long	Lst_sint16-Lstore_table		/* FFI_TYPE_SINT16 */
-	.long	Lst_uint32-Lstore_table		/* FFI_TYPE_UINT32 */
-	.long	Lst_sint32-Lstore_table		/* FFI_TYPE_SINT32 */
-	.long	Lst_int64-Lstore_table		/* FFI_TYPE_UINT64 */
-	.long	Lst_int64-Lstore_table		/* FFI_TYPE_SINT64 */
-	.long	Lst_struct-Lstore_table		/* FFI_TYPE_STRUCT */
-	.long	Lst_int64-Lstore_table		/* FFI_TYPE_POINTER */
-
-	.text
-	.align	3
-Lst_void:
-	ret
-	.align	3
-Lst_uint8:
-	movzbq	%al, %rax
-	movq	%rax, (%rdi)
-	ret
-	.align	3
-Lst_sint8:
-	movsbq	%al, %rax
-	movq	%rax, (%rdi)
-	ret
-	.align	3
-Lst_uint16:
-	movzwq	%ax, %rax
-	movq	%rax, (%rdi)
-	.align	3
-Lst_sint16:
-	movswq	%ax, %rax
-	movq	%rax, (%rdi)
-	ret
-	.align	3
-Lst_uint32:
-	movl	%eax, %eax
-	movq	%rax, (%rdi)
-	.align	3
-Lst_sint32:
-	cltq
-	movq	%rax, (%rdi)
-	ret
-	.align	3
-Lst_int64:
-	movq	%rax, (%rdi)
-	ret
-	.align	3
-Lst_float:
-	movss	%xmm0, (%rdi)
-	ret
-	.align	3
-Lst_double:
-	movsd	%xmm0, (%rdi)
-	ret
-Lst_ldouble:
-	fstpt	(%rdi)
-	ret
-	.align	3
-Lst_struct:
-	leaq	-20(%rsp), %rsi		/* Scratch area in redzone.  */
-
-	/* We have to locate the values now, and since we don't want to
-	   write too much data into the user's return value, we spill the
-	   value to a 16 byte scratch area first.  Bits 8, 9, and 10
-	   control where the values are located.  Only one of the three
-	   bits will be set; see ffi_prep_cif_machdep for the pattern.  */
-	movd	%xmm0, %r10
-	movd	%xmm1, %r11
-	testl	$0x100, %ecx
-	cmovnz	%rax, %rdx
-	cmovnz	%r10, %rax
-	testl	$0x200, %ecx
-	cmovnz	%r10, %rdx
-	testl	$0x400, %ecx
-	cmovnz	%r10, %rax
-	cmovnz	%r11, %rdx
-	movq	%rax, (%rsi)
-	movq	%rdx, 8(%rsi)
-
-	/* Bits 12-31 contain the true size of the structure.  Copy from
-	   the scratch area to the true destination.  */
-	shrl	$12, %ecx
-	rep movsb
-	ret
-
-	/* Many times we can avoid loading any SSE registers at all.
-	   It's not worth an indirect jump to load the exact set of
-	   SSE registers needed; zero or all is a good compromise.  */
-	.align	3
-LUW3:
-Lload_sse:
-	movdqa	48(%r10), %xmm0
-	movdqa	64(%r10), %xmm1
-	movdqa	80(%r10), %xmm2
-	movdqa	96(%r10), %xmm3
-	movdqa	112(%r10), %xmm4
-	movdqa	128(%r10), %xmm5
-	movdqa	144(%r10), %xmm6
-	movdqa	160(%r10), %xmm7
-	jmp	Lret_from_load_sse
-
-LUW4:
-	.align	3
-	.globl	_ffi_closure_unix64
-
-_ffi_closure_unix64:
-LUW5:
-	/* The carry flag is set by the trampoline iff SSE registers
-	   are used.  Don't clobber it before the branch instruction.  */
-	leaq    -200(%rsp), %rsp
-LUW6:
-	movq	%rdi, (%rsp)
-	movq    %rsi, 8(%rsp)
-	movq    %rdx, 16(%rsp)
-	movq    %rcx, 24(%rsp)
-	movq    %r8, 32(%rsp)
-	movq    %r9, 40(%rsp)
-	jc      Lsave_sse
-Lret_from_save_sse:
-
-	movq	%r10, %rdi
-	leaq	176(%rsp), %rsi
-	movq	%rsp, %rdx
-	leaq	208(%rsp), %rcx
-	call	_ffi_closure_unix64_inner
-
-	/* Deallocate stack frame early; return value is now in redzone.  */
-	addq	$200, %rsp
-LUW7:
-
-	/* The first byte of the return value contains the FFI_TYPE.  */
-	movzbl	%al, %r10d
-	leaq	Lload_table(%rip), %r11
-	movslq	(%r11, %r10, 4), %r10
-	addq	%r11, %r10
-	jmp	*%r10
-
-Lload_table:
-	.long	Lld_void-Lload_table		/* FFI_TYPE_VOID */
-	.long	Lld_int32-Lload_table		/* FFI_TYPE_INT */
-	.long	Lld_float-Lload_table		/* FFI_TYPE_FLOAT */
-	.long	Lld_double-Lload_table		/* FFI_TYPE_DOUBLE */
-	.long	Lld_ldouble-Lload_table		/* FFI_TYPE_LONGDOUBLE */
-	.long	Lld_int8-Lload_table		/* FFI_TYPE_UINT8 */
-	.long	Lld_int8-Lload_table		/* FFI_TYPE_SINT8 */
-	.long	Lld_int16-Lload_table		/* FFI_TYPE_UINT16 */
-	.long	Lld_int16-Lload_table		/* FFI_TYPE_SINT16 */
-	.long	Lld_int32-Lload_table		/* FFI_TYPE_UINT32 */
-	.long	Lld_int32-Lload_table		/* FFI_TYPE_SINT32 */
-	.long	Lld_int64-Lload_table		/* FFI_TYPE_UINT64 */
-	.long	Lld_int64-Lload_table		/* FFI_TYPE_SINT64 */
-	.long	Lld_struct-Lload_table		/* FFI_TYPE_STRUCT */
-	.long	Lld_int64-Lload_table		/* FFI_TYPE_POINTER */
-
-	.text
-	.align	3
-Lld_void:
-	ret
-	.align	3
-Lld_int8:
-	movzbl	-24(%rsp), %eax
-	ret
-	.align	3
-Lld_int16:
-	movzwl	-24(%rsp), %eax
-	ret
-	.align	3
-Lld_int32:
-	movl	-24(%rsp), %eax
-	ret
-	.align	3
-Lld_int64:
-	movq	-24(%rsp), %rax
-	ret
-	.align	3
-Lld_float:
-	movss	-24(%rsp), %xmm0
-	ret
-	.align	3
-Lld_double:
-	movsd	-24(%rsp), %xmm0
-	ret
-	.align	3
-Lld_ldouble:
-	fldt	-24(%rsp)
-	ret
-	.align	3
-Lld_struct:
-	/* There are four possibilities here, %rax/%rdx, %xmm0/%rax,
-	   %rax/%xmm0, %xmm0/%xmm1.  We collapse two by always loading
-	   both rdx and xmm1 with the second word.  For the remaining,
-	   bit 8 set means xmm0 gets the second word, and bit 9 means
-	   that rax gets the second word.  */
-	movq	-24(%rsp), %rcx
-	movq	-16(%rsp), %rdx
-	movq	-16(%rsp), %xmm1
-	testl	$0x100, %eax
-	cmovnz	%rdx, %rcx
-	movd	%rcx, %xmm0
-	testl	$0x200, %eax
-	movq	-24(%rsp), %rax
-	cmovnz	%rdx, %rax
-	ret
-
-	/* See the comment above Lload_sse; the same logic applies here.  */
-	.align	3
-LUW8:
-Lsave_sse:
-	movdqa	%xmm0, 48(%rsp)
-	movdqa	%xmm1, 64(%rsp)
-	movdqa	%xmm2, 80(%rsp)
-	movdqa	%xmm3, 96(%rsp)
-	movdqa	%xmm4, 112(%rsp)
-	movdqa	%xmm5, 128(%rsp)
-	movdqa	%xmm6, 144(%rsp)
-	movdqa	%xmm7, 160(%rsp)
-	jmp	Lret_from_save_sse
-
-LUW9:
-.section __TEXT,__eh_frame,coalesced,no_toc+strip_static_syms+live_support
-EH_frame1:
-	.set	L$set$0,LECIE1-LSCIE1		/* CIE Length */
-	.long	L$set$0
-LSCIE1:
-	.long	0x0		/* CIE Identifier Tag */
-	.byte	0x1		/* CIE Version */
-	.ascii	"zR\0"		/* CIE Augmentation */
-	.byte	0x1		/* uleb128 0x1; CIE Code Alignment Factor */
-	.byte	0x78		/* sleb128 -8; CIE Data Alignment Factor */
-	.byte	0x10		/* CIE RA Column */
-	.byte	0x1		/* uleb128 0x1; Augmentation size */
-	.byte	0x10		/* FDE Encoding (pcrel sdata4) */
-	.byte	0xc		/* DW_CFA_def_cfa, %rsp offset 8 */
-	.byte	0x7		/* uleb128 0x7 */
-	.byte	0x8		/* uleb128 0x8 */
-	.byte	0x90		/* DW_CFA_offset, column 0x10 */
-	.byte	0x1
-	.align	3
-LECIE1:
-	.globl _ffi_call_unix64.eh
-_ffi_call_unix64.eh:
-LSFDE1:
-	.set	L$set$1,LEFDE1-LASFDE1	/* FDE Length */
-	.long	L$set$1
-LASFDE1:
-	.long	LASFDE1-EH_frame1	/* FDE CIE offset */
-	.quad	LUW0-.			/* FDE initial location */
-	.set	L$set$2,LUW4-LUW0	/* FDE address range */
-	.quad	L$set$2
-	.byte	0x0			/* Augmentation size */
-	.byte	0x4			/* DW_CFA_advance_loc4 */
-	.set	L$set$3,LUW1-LUW0
-	.long	L$set$3
-
-	/* New stack frame based off rbp.  This is a itty bit of unwind
-	   trickery in that the CFA *has* changed.  There is no easy way
-	   to describe it correctly on entry to the function.  Fortunately,
-	   it doesn't matter too much since at all points we can correctly
-	   unwind back to ffi_call.  Note that the location to which we
-	   moved the return address is (the new) CFA-8, so from the
-	   perspective of the unwind info, it hasn't moved.  */
-	.byte	0xc			/* DW_CFA_def_cfa, %rbp offset 32 */
-	.byte	0x6
-	.byte	0x20
-	.byte	0x80+6			/* DW_CFA_offset, %rbp offset 2*-8 */
-	.byte	0x2
-	.byte	0xa			/* DW_CFA_remember_state */
-
-	.byte	0x4			/* DW_CFA_advance_loc4 */
-	.set	L$set$4,LUW2-LUW1
-	.long	L$set$4
-	.byte	0xc			/* DW_CFA_def_cfa, %rsp offset 8 */
-	.byte	0x7
-	.byte	0x8
-	.byte	0xc0+6			/* DW_CFA_restore, %rbp */
-
-	.byte	0x4			/* DW_CFA_advance_loc4 */
-	.set	L$set$5,LUW3-LUW2
-	.long	L$set$5
-	.byte	0xb			/* DW_CFA_restore_state */
-
-	.align	3
-LEFDE1:
-	.globl _ffi_closure_unix64.eh
-_ffi_closure_unix64.eh:
-LSFDE3:
-	.set	L$set$6,LEFDE3-LASFDE3	/* FDE Length */
-	.long	L$set$6
-LASFDE3:
-	.long	LASFDE3-EH_frame1	/* FDE CIE offset */
-	.quad	LUW5-.			/* FDE initial location */
-	.set	L$set$7,LUW9-LUW5	/* FDE address range */
-	.quad	L$set$7
-	.byte	0x0			/* Augmentation size */
-
-	.byte	0x4			/* DW_CFA_advance_loc4 */
-	.set	L$set$8,LUW6-LUW5
-	.long	L$set$8
-	.byte	0xe			/* DW_CFA_def_cfa_offset */
-	.byte	208,1			/* uleb128 208 */
-	.byte	0xa			/* DW_CFA_remember_state */
-
-	.byte	0x4			/* DW_CFA_advance_loc4 */
-	.set	L$set$9,LUW7-LUW6
-	.long	L$set$9
-	.byte	0xe			/* DW_CFA_def_cfa_offset */
-	.byte	0x8
-
-	.byte	0x4			/* DW_CFA_advance_loc4 */
-	.set	L$set$10,LUW8-LUW7
-	.long	L$set$10
-	.byte	0xb			/* DW_CFA_restore_state */
-
-	.align	3
-LEFDE3:
-	.subsections_via_symbols
-
-#endif /* __x86_64__ */
--- a/src/x86/ffi.c
+++ b/src/x86/ffi.c
@@ -332,13 +332,28 @@ ffi_call_int (ffi_cif *cif, void (*fn)(void), void *rvalue,
      else
 	{
 	  size_t za = ALIGN (z, FFI_SIZEOF_ARG);
+	  size_t align = FFI_SIZEOF_ARG;
+
+	  /* Alignment rules for arguments are quite complex.  Vectors and
+	     structures with 16 byte alignment get it.  Note that long double
+	     on Darwin does have 16 byte alignment, and does not get this
+	     alignment if passed directly; a structure with a long double
+	     inside, however, would get 16 byte alignment.  Since libffi does
+	     not support vectors, we need non concern ourselves with other
+	     cases.  */
+	  if (t == FFI_TYPE_STRUCT && ty->alignment >= 16)
+	    align = 16;
+	    
 	  if (dir < 0)
 	    {
+	      /* ??? These reverse argument ABIs are probably too old
+		 to have cared about alignment.  Someone should check.  */
 	      argp -= za;
 	      memcpy (argp, valp, z);
 	    }
 	  else
 	    {
+	      argp = (char *)ALIGN (argp, align);
 	      memcpy (argp, valp, z);
 	      argp += za;
 	    }
@@ -419,8 +434,9 @@ ffi_closure_inner (struct closure_frame *frame, char *stack)
  arg_types = cif->arg_types;
  for (i = 0; i < n; ++i)
    {
-      size_t z = arg_types[i]->size;
-      int t = arg_types[i]->type;
+      ffi_type *ty = arg_types[i];
+      size_t z = ty->size;
+      int t = ty->type;
      void *valp;

      if (z <= FFI_SIZEOF_ARG && t != FFI_TYPE_STRUCT)
@@ -441,13 +457,22 @@ ffi_closure_inner (struct closure_frame *frame, char *stack)
      else
 	{
 	  size_t za = ALIGN (z, FFI_SIZEOF_ARG);
+	  size_t align = FFI_SIZEOF_ARG;
+
+	  /* See the comment in ffi_call_int.  */
+	  if (t == FFI_TYPE_STRUCT && ty->alignment >= 16)
+	    align = 16;
+
 	  if (dir < 0)
 	    {
+	      /* ??? These reverse argument ABIs are probably too old
+		 to have cared about alignment.  Someone should check.  */
 	      argp -= za;
 	      valp = argp;
 	    }
 	  else
 	    {
+	      argp = (char *)ALIGN (argp, align);
 	      valp = argp;
 	      argp += za;
 	    }
--- a/src/x86/sysv.S
+++ b/src/x86/sysv.S
--- a/src/x86/unix64.S
+++ b/src/x86/unix64.S
@@ -30,21 +30,41 @@
 #define LIBFFI_ASM	
 #include <fficonfig.h>
 #include <ffi.h>
-#include <ffi_cfi.h>
 #include "internal64.h"

 	.text

+#define C2(X, Y)  X ## Y
+#define C1(X, Y)  C2(X, Y)
+#ifdef __USER_LABEL_PREFIX__
+# define C(X)     C1(__USER_LABEL_PREFIX__, X)
+#else
+# define C(X)     X
+#endif
+
+#ifdef __APPLE__
+# define L(X)     C1(L, X)
+#else
+# define L(X)     C1(.L, X)
+#endif
+
+#ifdef __ELF__
+# define PLT(X)	  X@PLT
+# define ENDF(X)  .type	X,@function; .size X, . - X
+#else
+# define PLT(X)	  X
+# define ENDF(X)
+#endif
+
 /* This macro allows the safe creation of jump tables without an
   actual table.  The entry points into the table are all 8 bytes.
   The use of ORG asserts that we're at the correct location.  */
 /* ??? The clang assembler doesn't handle .org with symbolic expressions.  */
-.macro E index
-	.align	8
-#ifndef __clang__
-	.org	0b + \index * 8, 0x90
+#if defined(__clang__) || defined(__APPLE__)
+# define E(BASE, X)	.balign 8
+#else
+# define E(BASE, X)	.balign 8; .org BASE + X * 8
 #endif
-.endm

 /* ffi_call_unix64 (void *args, unsigned long bytes, unsigned flags,
 	            void *raddr, void (*fnaddr)(void));
@@ -53,13 +73,12 @@
   for this function.  This has been allocated by ffi_call.  We also
   deallocate some of the stack that has been alloca'd.  */

-	.align	8
-	.globl	ffi_call_unix64
-	.type	ffi_call_unix64,@function
-	FFI_HIDDEN(ffi_call_unix64)
+	.balign	8
+	.globl	C(ffi_call_unix64)
+	FFI_HIDDEN(C(ffi_call_unix64))

-ffi_call_unix64:
-	cfi_startproc
+C(ffi_call_unix64):
+L(UW0):
 	movq	(%rsp), %r10		/* Load return address.  */
 	leaq	(%rdi, %rsi), %rax	/* Find local stack base.  */
 	movq	%rdx, (%rax)		/* Save flags.  */
@@ -75,8 +94,9 @@ ffi_call_unix64:
 	   unwind back to ffi_call.  Note that the location to which we
 	   moved the return address is (the new) CFA-8, so from the
 	   perspective of the unwind info, it hasn't moved.  */
-	cfi_def_cfa(%rbp, 32)
-	cfi_rel_offset(%rbp, 16)
+L(UW1):
+	/* cfi_def_cfa(%rbp, 32) */
+	/* cfi_rel_offset(%rbp, 16) */

 	movq	%rdi, %r10		/* Save a copy of the register area. */
 	movq	%r8, %r11		/* Save a copy of the target fn.  */
@@ -91,8 +111,8 @@ ffi_call_unix64:
 	movq	0x28(%r10), %r9
 	movl	0xb0(%r10), %eax
 	testl	%eax, %eax
-	jnz	.Lload_sse
-.Lret_from_load_sse:
+	jnz	L(load_sse)
+L(ret_from_load_sse):

 	/* Deallocate the reg arg area, except for r10, then load via pop.  */
 	leaq	0xb8(%r10), %rsp
@@ -107,94 +127,98 @@ ffi_call_unix64:
 	movq	0(%rbp), %rcx		/* Reload flags.  */
 	movq	8(%rbp), %rdi		/* Reload raddr.  */
 	movq	16(%rbp), %rbp		/* Reload old frame pointer.  */
-	cfi_remember_state
-	cfi_def_cfa(%rsp, 8)
-	cfi_restore(%rbp)
+L(UW2):
+	/* cfi_remember_state */
+	/* cfi_def_cfa(%rsp, 8) */
+	/* cfi_restore(%rbp) */

 	/* The first byte of the flags contains the FFI_TYPE.  */
 	cmpb	$UNIX64_RET_LAST, %cl
 	movzbl	%cl, %r10d
-	leaq	0f(%rip), %r11
-	ja	9f
+	leaq	L(store_table)(%rip), %r11
+	ja	L(sa)
 	leaq	(%r11, %r10, 8), %r10

 	/* Prep for the structure cases: scratch area in redzone.  */
 	leaq	-20(%rsp), %rsi
 	jmp	*%r10

-	.align	8
-0:
-E UNIX64_RET_VOID
+	.balign	8
+L(store_table):
+E(L(store_table), UNIX64_RET_VOID)
 	ret
-E UNIX64_RET_UINT8
+E(L(store_table), UNIX64_RET_UINT8)
 	movzbl	%al, %eax
 	movq	%rax, (%rdi)
 	ret
-E UNIX64_RET_UINT16
+E(L(store_table), UNIX64_RET_UINT16)
 	movzwl	%ax, %eax
 	movq	%rax, (%rdi)
 	ret
-E UNIX64_RET_UINT32
+E(L(store_table), UNIX64_RET_UINT32)
 	movl	%eax, %eax
 	movq	%rax, (%rdi)
 	ret
-E UNIX64_RET_SINT8
+E(L(store_table), UNIX64_RET_SINT8)
 	movsbq	%al, %rax
 	movq	%rax, (%rdi)
 	ret
-E UNIX64_RET_SINT16
+E(L(store_table), UNIX64_RET_SINT16)
 	movswq	%ax, %rax
 	movq	%rax, (%rdi)
 	ret
-E UNIX64_RET_SINT32
+E(L(store_table), UNIX64_RET_SINT32)
 	cltq
 	movq	%rax, (%rdi)
 	ret
-E UNIX64_RET_INT64
+E(L(store_table), UNIX64_RET_INT64)
 	movq	%rax, (%rdi)
 	ret
-E UNIX64_RET_XMM32
+E(L(store_table), UNIX64_RET_XMM32)
 	movd	%xmm0, (%rdi)
 	ret
-E UNIX64_RET_XMM64
+E(L(store_table), UNIX64_RET_XMM64)
 	movq	%xmm0, (%rdi)
 	ret
-E UNIX64_RET_X87
+E(L(store_table), UNIX64_RET_X87)
 	fstpt	(%rdi)
 	ret
-E UNIX64_RET_X87_2
+E(L(store_table), UNIX64_RET_X87_2)
 	fstpt	(%rdi)
 	fstpt	16(%rdi)
 	ret
-E UNIX64_RET_ST_XMM0_RAX
+E(L(store_table), UNIX64_RET_ST_XMM0_RAX)
 	movq	%rax, 8(%rsi)
-	jmp	3f
-E UNIX64_RET_ST_RAX_XMM0
+	jmp	L(s3)
+E(L(store_table), UNIX64_RET_ST_RAX_XMM0)
 	movq	%xmm0, 8(%rsi)
-	jmp	2f
-E UNIX64_RET_ST_XMM0_XMM1
+	jmp	L(s2)
+E(L(store_table), UNIX64_RET_ST_XMM0_XMM1)
 	movq	%xmm1, 8(%rsi)
-	jmp	3f
-E UNIX64_RET_ST_RAX_RDX
+	jmp	L(s3)
+E(L(store_table), UNIX64_RET_ST_RAX_RDX)
 	movq	%rdx, 8(%rsi)
-2:	movq	%rax, (%rsi)
+L(s2):
+	movq	%rax, (%rsi)
 	shrl	$UNIX64_SIZE_SHIFT, %ecx
 	rep movsb
 	ret
-	.align 8
-3:	movq	%xmm0, (%rsi)
+	.balign 8
+L(s3):
+	movq	%xmm0, (%rsi)
 	shrl	$UNIX64_SIZE_SHIFT, %ecx
 	rep movsb
 	ret

-9:	call	abort@PLT
+L(sa):	call	PLT(C(abort))

 	/* Many times we can avoid loading any SSE registers at all.
 	   It's not worth an indirect jump to load the exact set of
 	   SSE registers needed; zero or all is a good compromise.  */
-	.align 2
-	cfi_restore_state
-.Lload_sse:
+	.balign 2
+L(UW3):
+	/* cfi_restore_state */
+L(load_sse):
 	movdqa	0x30(%r10), %xmm0
 	movdqa	0x40(%r10), %xmm1
 	movdqa	0x50(%r10), %xmm2
@@ -203,10 +227,10 @@ E UNIX64_RET_ST_RAX_RDX
 	movdqa	0x80(%r10), %xmm5
 	movdqa	0x90(%r10), %xmm6
 	movdqa	0xa0(%r10), %xmm7
-	jmp	.Lret_from_load_sse
+	jmp	L(ret_from_load_sse)

-	cfi_endproc
-	.size    ffi_call_unix64,.-ffi_call_unix64
+L(UW4):
+ENDF(C(ffi_call_unix64))

 /* 6 general registers, 8 vector registers,
   32 bytes of rvalue, 8 bytes of alignment.  */
@@ -218,16 +242,15 @@ E UNIX64_RET_ST_RAX_RDX
 /* The location of rvalue within the red zone after deallocating the frame.  */
 #define ffi_closure_RED_RVALUE	(ffi_closure_OFS_RVALUE - ffi_closure_FS)

-	.align	2
-	.globl	ffi_closure_unix64_sse
-	.type	ffi_closure_unix64_sse,@function
-	FFI_HIDDEN(ffi_closure_unix64_sse)
+	.balign	2
+	.globl	C(ffi_closure_unix64_sse)
+	FFI_HIDDEN(C(ffi_closure_unix64_sse))

-ffi_closure_unix64_sse:
-	cfi_startproc
+C(ffi_closure_unix64_sse):
+L(UW5):
 	subq	$ffi_closure_FS, %rsp
-	/* Note clang bug 21515: adjust_cfa_offset error across endproc.  */
-	cfi_def_cfa_offset(ffi_closure_FS + 8)
+L(UW6):
+	/* cfi_adjust_cfa_offset(ffi_closure_FS) */

 	movdqa	%xmm0, ffi_closure_OFS_V+0x00(%rsp)
 	movdqa	%xmm1, ffi_closure_OFS_V+0x10(%rsp)
@@ -237,22 +260,21 @@ ffi_closure_unix64_sse:
 	movdqa	%xmm5, ffi_closure_OFS_V+0x50(%rsp)
 	movdqa	%xmm6, ffi_closure_OFS_V+0x60(%rsp)
 	movdqa	%xmm7, ffi_closure_OFS_V+0x70(%rsp)
-	jmp	0f
+	jmp	L(sse_entry1)

-	cfi_endproc
-	.size	ffi_closure_unix64_sse,.-ffi_closure_unix64_sse
+L(UW7):
+ENDF(C(ffi_closure_unix64_sse))

-	.align	2
-	.globl	ffi_closure_unix64
-	.type	ffi_closure_unix64,@function
-	FFI_HIDDEN(ffi_closure_unix64)
+	.balign	2
+	.globl	C(ffi_closure_unix64)
+	FFI_HIDDEN(C(ffi_closure_unix64))

-ffi_closure_unix64:
-	cfi_startproc
+C(ffi_closure_unix64):
+L(UW8):
 	subq	$ffi_closure_FS, %rsp
-	/* Note clang bug 21515: adjust_cfa_offset error across endproc.  */
-	cfi_def_cfa_offset(ffi_closure_FS + 8)
-0:
+L(UW9):
+	/* cfi_adjust_cfa_offset(ffi_closure_FS) */
+L(sse_entry1):
 	movq	%rdi, ffi_closure_OFS_G+0x00(%rsp)
 	movq    %rsi, ffi_closure_OFS_G+0x08(%rsp)
 	movq    %rdx, ffi_closure_OFS_G+0x10(%rsp)
@@ -269,95 +291,97 @@ ffi_closure_unix64:
 	movq	FFI_TRAMPOLINE_SIZE+8(%r10), %rsi	/* Load fun */
 	movq	FFI_TRAMPOLINE_SIZE+16(%r10), %rdx	/* Load user_data */
 #endif
-.Ldo_closure:
+L(do_closure):
 	leaq	ffi_closure_OFS_RVALUE(%rsp), %rcx	/* Load rvalue */
 	movq	%rsp, %r8				/* Load reg_args */
 	leaq	ffi_closure_FS+8(%rsp), %r9		/* Load argp */
-	call	ffi_closure_unix64_inner
+	call	C(ffi_closure_unix64_inner)

 	/* Deallocate stack frame early; return value is now in redzone.  */
 	addq	$ffi_closure_FS, %rsp
-	cfi_adjust_cfa_offset(-ffi_closure_FS)
+L(UW10):
+	/* cfi_adjust_cfa_offset(-ffi_closure_FS) */

 	/* The first byte of the return value contains the FFI_TYPE.  */
 	cmpb	$UNIX64_RET_LAST, %al
 	movzbl	%al, %r10d
-	leaq	0f(%rip), %r11
-	ja	9f
+	leaq	L(load_table)(%rip), %r11
+	ja	L(la)
 	leaq	(%r11, %r10, 8), %r10
 	leaq	ffi_closure_RED_RVALUE(%rsp), %rsi
 	jmp	*%r10

-	.align	8
-0:
-E UNIX64_RET_VOID
+	.balign	8
+L(load_table):
+E(L(load_table), UNIX64_RET_VOID)
 	ret
-E UNIX64_RET_UINT8
+E(L(load_table), UNIX64_RET_UINT8)
 	movzbl	(%rsi), %eax
 	ret
-E UNIX64_RET_UINT16
+E(L(load_table), UNIX64_RET_UINT16)
 	movzwl	(%rsi), %eax
 	ret
-E UNIX64_RET_UINT32
+E(L(load_table), UNIX64_RET_UINT32)
 	movl	(%rsi), %eax
 	ret
-E UNIX64_RET_SINT8
+E(L(load_table), UNIX64_RET_SINT8)
 	movsbl	(%rsi), %eax
 	ret
-E UNIX64_RET_SINT16
+E(L(load_table), UNIX64_RET_SINT16)
 	movswl	(%rsi), %eax
 	ret
-E UNIX64_RET_SINT32
+E(L(load_table), UNIX64_RET_SINT32)
 	movl	(%rsi), %eax
 	ret
-E UNIX64_RET_INT64
+E(L(load_table), UNIX64_RET_INT64)
 	movq	(%rsi), %rax
 	ret
-E UNIX64_RET_XMM32
+E(L(load_table), UNIX64_RET_XMM32)
 	movd	(%rsi), %xmm0
 	ret
-E UNIX64_RET_XMM64
+E(L(load_table), UNIX64_RET_XMM64)
 	movq	(%rsi), %xmm0
 	ret
-E UNIX64_RET_X87
+E(L(load_table), UNIX64_RET_X87)
 	fldt	(%rsi)
 	ret
-E UNIX64_RET_X87_2
+E(L(load_table), UNIX64_RET_X87_2)
 	fldt	16(%rsi)
 	fldt	(%rsi)
 	ret
-E UNIX64_RET_ST_XMM0_RAX
+E(L(load_table), UNIX64_RET_ST_XMM0_RAX)
 	movq	8(%rsi), %rax
-	jmp	3f
-E UNIX64_RET_ST_RAX_XMM0
+	jmp	L(l3)
+E(L(load_table), UNIX64_RET_ST_RAX_XMM0)
 	movq	8(%rsi), %xmm0
-	jmp	2f
-E UNIX64_RET_ST_XMM0_XMM1
+	jmp	L(l2)
+E(L(load_table), UNIX64_RET_ST_XMM0_XMM1)
 	movq	8(%rsi), %xmm1
-	jmp	3f
-E UNIX64_RET_ST_RAX_RDX
+	jmp	L(l3)
+E(L(load_table), UNIX64_RET_ST_RAX_RDX)
 	movq	8(%rsi), %rdx
-2:	movq	(%rsi), %rax
+L(l2):
+	movq	(%rsi), %rax
 	ret
-	.align	8
-3:	movq	(%rsi), %xmm0
+	.balign	8
+L(l3):
+	movq	(%rsi), %xmm0
 	ret

-9:	call	abort@PLT
+L(la):	call	PLT(C(abort))

-	cfi_endproc
-	.size	ffi_closure_unix64,.-ffi_closure_unix64
+L(UW11):
+ENDF(C(ffi_closure_unix64))

-	.align	2
-	.globl	ffi_go_closure_unix64_sse
-	.type	ffi_go_closure_unix64_sse,@function
-	FFI_HIDDEN(ffi_go_closure_unix64_sse)
+	.balign	2
+	.globl	C(ffi_go_closure_unix64_sse)
+	FFI_HIDDEN(C(ffi_go_closure_unix64_sse))

-ffi_go_closure_unix64_sse:
-	cfi_startproc
+C(ffi_go_closure_unix64_sse):
+L(UW12):
 	subq	$ffi_closure_FS, %rsp
-	/* Note clang bug 21515: adjust_cfa_offset error across endproc.  */
-	cfi_def_cfa_offset(ffi_closure_FS + 8)
+L(UW13):
+	/* cfi_adjust_cfa_offset(ffi_closure_FS) */

 	movdqa	%xmm0, ffi_closure_OFS_V+0x00(%rsp)
 	movdqa	%xmm1, ffi_closure_OFS_V+0x10(%rsp)
@@ -367,22 +391,21 @@ ffi_go_closure_unix64_sse:
 	movdqa	%xmm5, ffi_closure_OFS_V+0x50(%rsp)
 	movdqa	%xmm6, ffi_closure_OFS_V+0x60(%rsp)
 	movdqa	%xmm7, ffi_closure_OFS_V+0x70(%rsp)
-	jmp	0f
+	jmp	L(sse_entry2)

-	cfi_endproc
-	.size	ffi_go_closure_unix64_sse,.-ffi_go_closure_unix64_sse
+L(UW14):
+ENDF(C(ffi_go_closure_unix64_sse))

-	.align	2
-	.globl	ffi_go_closure_unix64
-	.type	ffi_go_closure_unix64,@function
-	FFI_HIDDEN(ffi_go_closure_unix64)
+	.balign	2
+	.globl	C(ffi_go_closure_unix64)
+	FFI_HIDDEN(C(ffi_go_closure_unix64))

-ffi_go_closure_unix64:
-	cfi_startproc
+C(ffi_go_closure_unix64):
+L(UW15):
 	subq	$ffi_closure_FS, %rsp
-	/* Note clang bug 21515: adjust_cfa_offset error across endproc.  */
-	cfi_def_cfa_offset(ffi_closure_FS + 8)
-0:
+L(UW16):
+	/* cfi_adjust_cfa_offset(ffi_closure_FS) */
+L(sse_entry2):
 	movq	%rdi, ffi_closure_OFS_G+0x00(%rsp)
 	movq    %rsi, ffi_closure_OFS_G+0x08(%rsp)
 	movq    %rdx, ffi_closure_OFS_G+0x10(%rsp)
@@ -399,10 +422,123 @@ ffi_go_closure_unix64:
 	movq	16(%r10), %rsi		/* Load fun */
 	movq	%r10, %rdx		/* Load closure (user_data) */
 #endif
-	jmp	.Ldo_closure
+	jmp	L(do_closure)

-	cfi_endproc
-	.size	ffi_go_closure_unix64,.-ffi_go_closure_unix64
+L(UW17):
+ENDF(C(ffi_go_closure_unix64))
+
+/* Sadly, OSX cctools-as doesn't understand .cfi directives at all.  */
+
+#ifdef __APPLE__
+.section __TEXT,__eh_frame,coalesced,no_toc+strip_static_syms+live_support
+EHFrame0:
+#elif defined(HAVE_AS_X86_64_UNWIND_SECTION_TYPE)
+.section .eh_frame,"a",@unwind
+#else
+.section .eh_frame,"a",@progbits
+#endif
+
+#ifdef HAVE_AS_X86_PCREL
+# define PCREL(X)	X - .
+#else
+# define PCREL(X)	X@rel
+#endif
+
+/* Simplify advancing between labels.  Assume DW_CFA_advance_loc1 fits.  */
+#define ADV(N, P)	.byte 2, L(N)-L(P)
+
+	.balign 8
+L(CIE):
+	.set	L(set0),L(ECIE)-L(SCIE)
+	.long	L(set0)			/* CIE Length */
+L(SCIE):
+	.long	0			/* CIE Identifier Tag */
+	.byte	1			/* CIE Version */
+	.ascii	"zR\0"			/* CIE Augmentation */
+	.byte	1			/* CIE Code Alignment Factor */
+	.byte	0x78			/* CIE Data Alignment Factor */
+	.byte	0x10			/* CIE RA Column */
+	.byte	1			/* Augmentation size */
+	.byte	0x1b			/* FDE Encoding (pcrel sdata4) */
+	.byte	0xc, 7, 8		/* DW_CFA_def_cfa, %rsp offset 8 */
+	.byte	0x80+16, 1		/* DW_CFA_offset, %rip offset 1*-8 */
+	.balign 8
+L(ECIE):
+
+	.set	L(set1),L(EFDE1)-L(SFDE1)
+	.long	L(set1)			/* FDE Length */
+L(SFDE1):
+	.long	L(SFDE1)-L(CIE)		/* FDE CIE offset */
+	.long	PCREL(L(UW0))		/* Initial location */
+	.long	L(UW4)-L(UW0)		/* Address range */
+	.byte	0			/* Augmentation size */
+	ADV(UW1, UW0)
+	.byte	0xc, 6, 32		/* DW_CFA_def_cfa, %rbp 32 */
+	.byte	0x80+6, 2		/* DW_CFA_offset, %rbp 2*-8 */
+	ADV(UW2, UW1)
+	.byte	0xa			/* DW_CFA_remember_state */
+	.byte	0xc, 7, 8		/* DW_CFA_def_cfa, %rsp 8 */
+	.byte	0xc0+6			/* DW_CFA_restore, %rbp */
+	ADV(UW3, UW2)
+	.byte	0xb			/* DW_CFA_restore_state */
+	.balign	8
+L(EFDE1):
+
+	.set	L(set2),L(EFDE2)-L(SFDE2)
+	.long	L(set2)			/* FDE Length */
+L(SFDE2):
+	.long	L(SFDE2)-L(CIE)		/* FDE CIE offset */
+	.long	PCREL(L(UW5))		/* Initial location */
+	.long	L(UW7)-L(UW5)		/* Address range */
+	.byte	0			/* Augmentation size */
+	ADV(UW6, UW5)
+	.byte	0xe			/* DW_CFA_def_cfa_offset */
+	.byte	ffi_closure_FS + 8, 1	/* uleb128, assuming 128 <= FS < 255 */
+	.balign	8
+L(EFDE2):
+
+	.set	L(set3),L(EFDE3)-L(SFDE3)
+	.long	L(set3)			/* FDE Length */
+L(SFDE3):
+	.long	L(SFDE3)-L(CIE)		/* FDE CIE offset */
+	.long	PCREL(L(UW8))		/* Initial location */
+	.long	L(UW11)-L(UW8)		/* Address range */
+	.byte	0			/* Augmentation size */
+	ADV(UW9, UW8)
+	.byte	0xe			/* DW_CFA_def_cfa_offset */
+	.byte	ffi_closure_FS + 8, 1	/* uleb128, assuming 128 <= FS < 255 */
+	ADV(UW10, UW9)
+	.byte	0xe, 8			/* DW_CFA_def_cfa_offset 8 */
+L(EFDE3):
+
+	.set	L(set4),L(EFDE4)-L(SFDE4)
+	.long	L(set4)			/* FDE Length */
+L(SFDE4):
+	.long	L(SFDE4)-L(CIE)		/* FDE CIE offset */
+	.long	PCREL(L(UW12))		/* Initial location */
+	.long	L(UW14)-L(UW12)		/* Address range */
+	.byte	0			/* Augmentation size */
+	ADV(UW13, UW12)
+	.byte	0xe			/* DW_CFA_def_cfa_offset */
+	.byte	ffi_closure_FS + 8, 1	/* uleb128, assuming 128 <= FS < 255 */
+	.balign	8
+L(EFDE4):
+
+	.set	L(set5),L(EFDE5)-L(SFDE5)
+	.long	L(set5)			/* FDE Length */
+L(SFDE5):
+	.long	L(SFDE5)-L(CIE)		/* FDE CIE offset */
+	.long	PCREL(L(UW15))		/* Initial location */
+	.long	L(UW17)-L(UW15)		/* Address range */
+	.byte	0			/* Augmentation size */
+	ADV(UW16, UW15)
+	.byte	0xe			/* DW_CFA_def_cfa_offset */
+	.byte	ffi_closure_FS + 8, 1	/* uleb128, assuming 128 <= FS < 255 */
+	.balign	8
+L(EFDE5):
+#ifdef __APPLE__
+	.subsections_via_symbols
+#endif

 #endif /* __x86_64__ */
 #if defined __ELF__ && defined __linux__