x86: Best guess at update for Darwin

2014-11-22 20:02:43 +01:00
parent 58bf7d65d8
commit 9f112619c1
6 changed files with 103 additions and 928 deletions
--- a/Makefile.am
+++ b/Makefile.am
@@ -144,7 +144,6 @@ EXTRA_libffi_la_SOURCES = \
 	src/x86/ffi.c src/x86/sysv.S					\
 	 src/x86/ffiw64.c src/x86/win64.S 				\
 	 src/x86/ffi64.c src/x86/unix64.S				\
 	 src/x86/darwin64.S src/x86/darwin.S				\
 	src/xtensa/ffi.c src/xtensa/sysv.S
 TARGET_OBJ = @TARGET_OBJ@
--- a/configure.host
+++ b/configure.host
@@ -84,7 +84,12 @@ case "${host}" in
 	;;
  i?86-*-darwin* | x86_64-*-darwin*)
-	TARGET=X86_DARWIN; TARGETDIR=x86
+	TARGETDIR=x86
 	if test $ac_cv_sizeof_size_t = 4; then
 	  TARGET=X86_DARWIN
 	else
 	  TARGET=X86_64
 	fi
 	;;
  i?86-*-* | x86_64-*-* | amd64-*)
@@ -237,7 +242,7 @@ case "${TARGET}" in
  POWERPC_FREEBSD)
 	SOURCES="ffi.c ffi_sysv.c sysv.S ppc_closure.S"
 	;;
-  X86 | X86_FREEBSD | X86_WIN32)
+  X86 | X86_DARWIN | X86_FREEBSD | X86_WIN32)
 	SOURCES="ffi.c sysv.S"
 	;;
  X86_64)
@@ -246,9 +251,6 @@ case "${TARGET}" in
  X86_WIN64)
 	SOURCES="ffiw64.c win64.S"
 	;;
  X86_DARWIN)
 	SOURCES="ffi.c darwin.S ffi64.c darwin64.S"
 	;;
 esac
 # If we failed to configure SOURCES, we can't do anything.
--- a/src/x86/darwin.S
+++ b/src/x86/darwin.S
@@ -1,444 +0,0 @@
 /* -----------------------------------------------------------------------
   darwin.S - Copyright (c) 1996, 1998, 2001, 2002, 2003, 2005  Red Hat, Inc.
 	Copyright (C) 2008  Free Software Foundation, Inc.
   X86 Foreign Function Interface
   Permission is hereby granted, free of charge, to any person obtaining
   a copy of this software and associated documentation files (the
   ``Software''), to deal in the Software without restriction, including
   without limitation the rights to use, copy, modify, merge, publish,
   distribute, sublicense, and/or sell copies of the Software, and to
   permit persons to whom the Software is furnished to do so, subject to
   the following conditions:
   The above copyright notice and this permission notice shall be included
   in all copies or substantial portions of the Software.
   THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
   WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
   DEALINGS IN THE SOFTWARE.
   -----------------------------------------------------------------------
   */
 #ifndef __x86_64__
 #define LIBFFI_ASM	
 #include <fficonfig.h>
 #include <ffi.h>
 /* _ffi_call_SYSV: reserve an outgoing argument area on the stack, have a
    callback fill it, call the target function and store the result
    according to the return type code.

    Stack arguments, as they are read relative to %ebp below:
       8(%ebp)  callback that fills the argument area; it is called with
                (argument area, 12(%ebp))
      12(%ebp)  second argument forwarded to that callback
      16(%ebp)  number of bytes to reserve for the outgoing arguments
      20(%ebp)  return type code (FFI_TYPE_*), used to index .Lstore_table
      24(%ebp)  pointer to the return value storage, or NULL
      28(%ebp)  address of the function to call  */
 .text
 .globl _ffi_prep_args
 	.align 4
 .globl _ffi_call_SYSV
 _ffi_call_SYSV:
 .LFB1:
        pushl %ebp
 .LCFI0:
        movl  %esp,%ebp
 .LCFI1:
        subl $8,%esp
 	/* Make room for all of the new args.  */
 	movl  16(%ebp),%ecx
 	subl  %ecx,%esp
 	movl  %esp,%eax
 	/* Place all of the ffi_prep_args in position.  The 8 bytes of
 	   padding plus the two pushes add up to the 16 bytes that are
 	   released again after the callback returns.  */
 	subl  $8,%esp
 	pushl 12(%ebp)
 	pushl %eax
 	call  *8(%ebp)
 	/* Return stack to previous state and call the function  */
 	addl  $16,%esp	
 	call  *28(%ebp)
 	/* Load %ecx with the return type code  */
 	movl  20(%ebp),%ecx	
 	/* Protect %esi.  We're going to pop it in the epilogue.  */
 	pushl %esi
 	/* If the return value pointer is NULL, assume no return value.  */
 	cmpl  $0,24(%ebp)
 	jne  0f
 	/* Even if there is no space for the return value, we are 
 	   obliged to handle floating-point values.  Only FFI_TYPE_FLOAT
 	   is checked here; for it the x87 stack top is popped.  */
 	cmpl  $FFI_TYPE_FLOAT,%ecx
 	jne   noretval
 	fstp  %st(0)
 	jmp   epilogue
 0:
 	.align 4
 	/* Position-independent dispatch: the call pushes the address of
 	   .Lstore_table, which is popped into %esi at label 1.  */
 	call 1f
 	/* One 32-bit entry per FFI_TYPE_* code; each entry is the offset
 	   of its handler from .Lstore_table.  */
 .Lstore_table:
 	.long   noretval-.Lstore_table		/* FFI_TYPE_VOID */
 	.long   retint-.Lstore_table		/* FFI_TYPE_INT */
 	.long   retfloat-.Lstore_table		/* FFI_TYPE_FLOAT */
 	.long   retdouble-.Lstore_table		/* FFI_TYPE_DOUBLE */
 	.long   retlongdouble-.Lstore_table     /* FFI_TYPE_LONGDOUBLE */
 	.long   retuint8-.Lstore_table		/* FFI_TYPE_UINT8 */
 	.long   retsint8-.Lstore_table		/* FFI_TYPE_SINT8 */
 	.long   retuint16-.Lstore_table		/* FFI_TYPE_UINT16 */
 	.long   retsint16-.Lstore_table		/* FFI_TYPE_SINT16 */
 	.long   retint-.Lstore_table		/* FFI_TYPE_UINT32 */
 	.long   retint-.Lstore_table		/* FFI_TYPE_SINT32 */
 	.long   retint64-.Lstore_table		/* FFI_TYPE_UINT64 */
 	.long   retint64-.Lstore_table		/* FFI_TYPE_SINT64 */
 	.long   retstruct-.Lstore_table		/* FFI_TYPE_STRUCT */
 	.long   retint-.Lstore_table		/* FFI_TYPE_POINTER */
 	.long   retstruct1b-.Lstore_table	/* FFI_TYPE_SMALL_STRUCT_1B */
 	.long   retstruct2b-.Lstore_table	/* FFI_TYPE_SMALL_STRUCT_2B */
 1:
 	/* %esi = table base; add the entry selected by the type code in
 	   %ecx to obtain the handler address.  */
 	pop  %esi
 	add  (%esi, %ecx, 4), %esi
 	jmp  *%esi
 	/* Sign/zero extend as appropriate.  The narrow integer handlers
 	   widen %eax and then store it as a full 32-bit word via retint.  */
 retsint8:
 	movsbl  %al, %eax
 	jmp  retint
 retsint16:
 	movswl  %ax, %eax
 	jmp  retint
 retuint8:
 	movzbl  %al, %eax
 	jmp  retint
 retuint16:
 	movzwl  %ax, %eax
 	jmp  retint
 retfloat:
 	/* Load %ecx with the pointer to storage for the return value  */
 	movl  24(%ebp),%ecx
 	fstps (%ecx)
 	jmp   epilogue
 retdouble:
 	/* Load %ecx with the pointer to storage for the return value  */
 	movl  24(%ebp),%ecx
 	fstpl (%ecx)
 	jmp   epilogue
 retlongdouble:
 	/* Load %ecx with the pointer to storage for the return value  */
 	movl  24(%ebp),%ecx
 	fstpt (%ecx)
 	jmp   epilogue
 retint64:
 	/* Load %ecx with the pointer to storage for the return value  */
 	movl  24(%ebp),%ecx
 	movl  %eax,0(%ecx)
 	movl  %edx,4(%ecx)
 	jmp   epilogue
 retstruct1b:
 	/* Load %ecx with the pointer to storage for the return value  */
 	movl  24(%ebp),%ecx
 	movb  %al,0(%ecx)
 	jmp   epilogue
 retstruct2b:
 	/* Load %ecx with the pointer to storage for the return value  */
 	movl  24(%ebp),%ecx
 	movw  %ax,0(%ecx)
 	jmp   epilogue
 retint:
 	/* Load %ecx with the pointer to storage for the return value  */
 	movl  24(%ebp),%ecx
 	movl  %eax,0(%ecx)
 	/* Falls through into the shared epilogue.  */
 retstruct:
 	/* Nothing to do!  */
 noretval:
 epilogue:
 	/* Restore the %esi pushed before the dispatch, then drop the
 	   whole frame by resetting %esp from %ebp.  */
 	popl %esi
 	movl %ebp,%esp
 	popl %ebp
 	ret
 .LFE1:
 .ffi_call_SYSV_end:
 	.align	4
 /* _ffi_closure_SYSV: closure entry point.  Builds a small frame, calls
    ffi_closure_SYSV_inner through its symbol stub, then loads the result
    into the return register(s) selected by the type code that the call
    returns in %eax.

    Frame after "subl $40, %esp" (so %esp == %ebp - 40):
      -24(%ebp)  start of the result buffer
      -12(%ebp)  resp, initialised to point at the result buffer
        8(%esp)  saved %ebx
        4(%esp)  args: address of the first word above the return address
        0(%esp)  &resp
    NOTE(review): the closure itself is not passed on the stack here;
    presumably it arrives in a register set up by the trampoline --
    confirm against the C side.  */
 FFI_HIDDEN (ffi_closure_SYSV)
 .globl _ffi_closure_SYSV
 _ffi_closure_SYSV:
 .LFB2:
 	pushl	%ebp
 .LCFI2:
 	movl	%esp, %ebp
 .LCFI3:
 	subl	$40, %esp
 	leal	-24(%ebp), %edx
 	movl	%edx, -12(%ebp)	/* resp */
 	leal	8(%ebp), %edx
 	movl	%edx, 4(%esp)	/* args = __builtin_dwarf_cfa () */
 	leal	-12(%ebp), %edx
 	movl	%edx, (%esp)	/* &resp */
 	movl	%ebx, 8(%esp)	/* keep %ebx across the call */
 .LCFI7:
 	call	L_ffi_closure_SYSV_inner$stub
 	movl	8(%esp), %ebx
 	movl	-12(%ebp), %ecx	/* %ecx = resp, where the result lives */
 	cmpl	$FFI_TYPE_INT, %eax
 	je	.Lcls_retint
 	/* Handle FFI_TYPE_UINT8, FFI_TYPE_SINT8, FFI_TYPE_UINT16,
 	   FFI_TYPE_SINT16, FFI_TYPE_UINT32, FFI_TYPE_SINT32: every code
 	   in [FFI_TYPE_UINT8, FFI_TYPE_UINT64) is loaded as a 32-bit int.  */
 	cmpl	$FFI_TYPE_UINT64, %eax
 	jge	0f
 	cmpl	$FFI_TYPE_UINT8, %eax
 	jge	.Lcls_retint
 0:	cmpl	$FFI_TYPE_FLOAT, %eax
 	je	.Lcls_retfloat
 	cmpl	$FFI_TYPE_DOUBLE, %eax
 	je	.Lcls_retdouble
 	cmpl	$FFI_TYPE_LONGDOUBLE, %eax
 	je	.Lcls_retldouble
 	cmpl	$FFI_TYPE_SINT64, %eax
 	je	.Lcls_retllong
 	cmpl	$FFI_TYPE_SMALL_STRUCT_1B, %eax
 	je	.Lcls_retstruct1b
 	cmpl	$FFI_TYPE_SMALL_STRUCT_2B, %eax
 	je	.Lcls_retstruct2b
 	cmpl	$FFI_TYPE_STRUCT, %eax
 	je	.Lcls_retstruct
 	/* Any other type code loads nothing and just returns.  */
 .Lcls_epilogue:
 	movl	%ebp, %esp
 	popl	%ebp
 	ret
 .Lcls_retint:
 	movl	(%ecx), %eax
 	jmp	.Lcls_epilogue
 .Lcls_retfloat:
 	flds	(%ecx)
 	jmp	.Lcls_epilogue
 .Lcls_retdouble:
 	fldl	(%ecx)
 	jmp	.Lcls_epilogue
 .Lcls_retldouble:
 	fldt	(%ecx)
 	jmp	.Lcls_epilogue
 .Lcls_retllong:
 	movl	(%ecx), %eax
 	movl	4(%ecx), %edx
 	jmp	.Lcls_epilogue
 .Lcls_retstruct1b:
 	movsbl	(%ecx), %eax
 	jmp	.Lcls_epilogue
 .Lcls_retstruct2b:
 	movswl	(%ecx), %eax
 	jmp	.Lcls_epilogue
 .Lcls_retstruct:
 	/* Same frame teardown as .Lcls_epilogue, but "ret $4" also pops
 	   4 bytes of caller-pushed arguments (presumably the hidden
 	   struct-return pointer -- confirm against the Darwin i386 ABI).
 	   A former "lea -8(%ebp),%esp" here was dead: %esp was reloaded
 	   from %ebp by the very next instruction.  */
 	movl	%ebp, %esp
 	popl	%ebp
 	ret $4
 .LFE2:
 #if !FFI_NO_RAW_API
 /* Byte offsets of the fields that follow the trampoline inside the
    closure object addressed by %eax on entry; the trampoline size is
    rounded up to a multiple of 4.  */
 #define RAW_CLOSURE_CIF_OFFSET ((FFI_TRAMPOLINE_SIZE + 3) & ~3)
 #define RAW_CLOSURE_FUN_OFFSET (RAW_CLOSURE_CIF_OFFSET + 4)
 #define RAW_CLOSURE_USER_DATA_OFFSET (RAW_CLOSURE_FUN_OFFSET + 4)
 /* Offset of the flags word inside the cif; it is read below as the
    return type code.  */
 #define CIF_FLAGS_OFFSET 20
 	.align	4
 /* _ffi_closure_raw_SYSV: raw closure entry point.  On entry %eax
    addresses the closure.  Calls
        closure->fun (cif, &res, raw_args, user_data)
    with res at -24(%ebp) and raw_args pointing at the first word above
    the return address, then loads res into the return register(s)
    selected by the cif flags.  */
 FFI_HIDDEN (ffi_closure_raw_SYSV)
 .globl _ffi_closure_raw_SYSV
 _ffi_closure_raw_SYSV:
 .LFB3:
 	pushl	%ebp
 .LCFI4:
 	movl	%esp, %ebp
 .LCFI5:
 	pushl	%esi
 .LCFI6:
 	subl	$36, %esp
 	/* The cif pointer is kept in %esi so that it is still available
 	   after the call, when the flags are read from it.  */
 	movl	RAW_CLOSURE_CIF_OFFSET(%eax), %esi	 /* closure->cif */
 	movl	RAW_CLOSURE_USER_DATA_OFFSET(%eax), %edx /* closure->user_data */
 	movl	%edx, 12(%esp)	/* user_data */
 	leal	8(%ebp), %edx	/* __builtin_dwarf_cfa () */
 	movl	%edx, 8(%esp)	/* raw_args */
 	leal	-24(%ebp), %edx
 	movl	%edx, 4(%esp)	/* &res */
 	movl	%esi, (%esp)	/* cif */
 	call	*RAW_CLOSURE_FUN_OFFSET(%eax)		 /* closure->fun */
 	movl	CIF_FLAGS_OFFSET(%esi), %eax		 /* rtype */
 	cmpl	$FFI_TYPE_INT, %eax
 	je	.Lrcls_retint
 	/* Handle FFI_TYPE_UINT8, FFI_TYPE_SINT8, FFI_TYPE_UINT16,
 	   FFI_TYPE_SINT16, FFI_TYPE_UINT32, FFI_TYPE_SINT32: every code
 	   in [FFI_TYPE_UINT8, FFI_TYPE_UINT64) is loaded as a 32-bit int.  */
 	cmpl	$FFI_TYPE_UINT64, %eax
 	jge	0f
 	cmpl	$FFI_TYPE_UINT8, %eax
 	jge	.Lrcls_retint
 0:
 	cmpl	$FFI_TYPE_FLOAT, %eax
 	je	.Lrcls_retfloat
 	cmpl	$FFI_TYPE_DOUBLE, %eax
 	je	.Lrcls_retdouble
 	cmpl	$FFI_TYPE_LONGDOUBLE, %eax
 	je	.Lrcls_retldouble
 	cmpl	$FFI_TYPE_SINT64, %eax
 	je	.Lrcls_retllong
 	/* Any other type code loads nothing and just returns.  */
 .Lrcls_epilogue:
 	/* Undo the 36-byte local area, then restore %esi and %ebp.  */
 	addl	$36, %esp
 	popl	%esi
 	popl	%ebp
 	ret
 .Lrcls_retint:
 	movl	-24(%ebp), %eax
 	jmp	.Lrcls_epilogue
 .Lrcls_retfloat:
 	flds	-24(%ebp)
 	jmp	.Lrcls_epilogue
 .Lrcls_retdouble:
 	fldl	-24(%ebp)
 	jmp	.Lrcls_epilogue
 .Lrcls_retldouble:
 	fldt	-24(%ebp)
 	jmp	.Lrcls_epilogue
 .Lrcls_retllong:
 	movl	-24(%ebp), %eax
 	movl	-20(%ebp), %edx
 	jmp	.Lrcls_epilogue
 .LFE3:
 #endif
 /* Symbol stub through which _ffi_closure_SYSV calls
    _ffi_closure_SYSV_inner.  The section directive declares 5-byte
    stubs, matching the five hlt placeholder bytes below.
    NOTE(review): the section is marked self_modifying_code, so the
    placeholders are presumably overwritten with a jump at bind time --
    confirm against the Mach-O / dyld documentation.  */
 .section __IMPORT,__jump_table,symbol_stubs,self_modifying_code+pure_instructions,5
 L_ffi_closure_SYSV_inner$stub:
 	.indirect_symbol _ffi_closure_SYSV_inner
 	hlt ; hlt ; hlt ; hlt ; hlt
 /* Hand-written unwind tables (__eh_frame) for the three functions in
    this file: one CIE followed by one FDE per function.  Register
    columns are given by number only.
    NOTE(review): which machine register each column denotes depends on
    the Darwin i386 unwind numbering -- confirm before editing.  */
 .section __TEXT,__eh_frame,coalesced,no_toc+strip_static_syms+live_support
 EH_frame1:
 	.set	L$set$0,LECIE1-LSCIE1	/* CIE Length */
 	.long	L$set$0
 LSCIE1:
 	.long	0x0	/* CIE Identifier Tag */
 	.byte	0x1	/* CIE Version */
 	.ascii "zR\0"	/* CIE Augmentation */
 	.byte	0x1	/* uleb128 0x1; CIE Code Alignment Factor */
 	.byte	0x7c	/* sleb128 -4; CIE Data Alignment Factor */
 	.byte	0x8	/* CIE RA Column */
 	.byte	0x1	/* uleb128 0x1; Augmentation size */
 	.byte	0x10	/* FDE Encoding (pcrel) */
 	.byte	0xc	/* DW_CFA_def_cfa */
 	.byte	0x5	/* uleb128 0x5; register column */
 	.byte	0x4	/* uleb128 0x4; offset */
 	.byte	0x88	/* DW_CFA_offset, column 0x8 */
 	.byte	0x1	/* uleb128 0x1; i.e. saved at CFA-4 */
 	.align 2
 LECIE1:
 /* FDE for _ffi_call_SYSV, covering .LFB1 up to .LFE1.  */
 .globl _ffi_call_SYSV.eh
 _ffi_call_SYSV.eh:
 LSFDE1:
 	.set	L$set$1,LEFDE1-LASFDE1	/* FDE Length */
 	.long	L$set$1
 LASFDE1:
 	.long	LASFDE1-EH_frame1	/* FDE CIE offset */
 	.long	.LFB1-.	/* FDE initial location */
 	.set L$set$2,.LFE1-.LFB1	/* FDE address range */
 	.long L$set$2
 	.byte	0x0	/* Augmentation size */
 	.byte	0x4	/* DW_CFA_advance_loc4 */
 	.set L$set$3,.LCFI0-.LFB1
 	.long L$set$3
 	.byte	0xe	/* DW_CFA_def_cfa_offset */
 	.byte	0x8	/* uleb128 0x8 */
 	.byte	0x84	/* DW_CFA_offset, column 0x4 */
 	.byte	0x2	/* uleb128 0x2; i.e. saved at CFA-8 */
 	.byte	0x4	/* DW_CFA_advance_loc4 */
 	.set L$set$4,.LCFI1-.LCFI0
 	.long L$set$4
 	.byte	0xd	/* DW_CFA_def_cfa_register */
 	.byte	0x4	/* uleb128 0x4 */
 	.align 2
 LEFDE1:
 /* FDE for _ffi_closure_SYSV, covering .LFB2 up to .LFE2.  */
 .globl _ffi_closure_SYSV.eh
 _ffi_closure_SYSV.eh:
 LSFDE2:
 	.set	L$set$5,LEFDE2-LASFDE2	/* FDE Length */
 	.long	L$set$5
 LASFDE2:
 	.long	LASFDE2-EH_frame1	/* FDE CIE offset */
 	.long	.LFB2-.	/* FDE initial location */
 	.set L$set$6,.LFE2-.LFB2	/* FDE address range */
 	.long L$set$6
 	.byte	0x0	/* Augmentation size */
 	.byte	0x4	/* DW_CFA_advance_loc4 */
 	.set L$set$7,.LCFI2-.LFB2
 	.long L$set$7
 	.byte	0xe	/* DW_CFA_def_cfa_offset */
 	.byte	0x8	/* uleb128 0x8 */
 	.byte	0x84	/* DW_CFA_offset, column 0x4 */
 	.byte	0x2	/* uleb128 0x2; i.e. saved at CFA-8 */
 	.byte	0x4	/* DW_CFA_advance_loc4 */
 	.set L$set$8,.LCFI3-.LCFI2
 	.long L$set$8
 	.byte	0xd	/* DW_CFA_def_cfa_register */
 	.byte	0x4	/* uleb128 0x4 */
 	.align 2
 LEFDE2:
 #if !FFI_NO_RAW_API
 /* FDE for _ffi_closure_raw_SYSV, covering .LFB3 up to .LFE3.  */
 .globl _ffi_closure_raw_SYSV.eh
 _ffi_closure_raw_SYSV.eh:
 LSFDE3:
 	.set	L$set$10,LEFDE3-LASFDE3	/* FDE Length */
 	.long	L$set$10
 LASFDE3:
 	.long	LASFDE3-EH_frame1	/* FDE CIE offset */
 	.long	.LFB3-.	/* FDE initial location */
 	.set L$set$11,.LFE3-.LFB3	/* FDE address range */
 	.long L$set$11
 	.byte	0x0	/* Augmentation size */
 	.byte	0x4	/* DW_CFA_advance_loc4 */
 	.set L$set$12,.LCFI4-.LFB3
 	.long L$set$12
 	.byte	0xe	/* DW_CFA_def_cfa_offset */
 	.byte	0x8	/* uleb128 0x8 */
 	.byte	0x84	/* DW_CFA_offset, column 0x4 */
 	.byte	0x2	/* uleb128 0x2; i.e. saved at CFA-8 */
 	.byte	0x4	/* DW_CFA_advance_loc4 */
 	.set L$set$13,.LCFI5-.LCFI4
 	.long L$set$13
 	.byte	0xd	/* DW_CFA_def_cfa_register */
 	.byte	0x4	/* uleb128 0x4 */
 	.byte	0x4	/* DW_CFA_advance_loc4 */
 	.set L$set$14,.LCFI6-.LCFI5
 	.long L$set$14
 	/* NOTE(review): the instruction just before .LCFI6 pushes %esi;
 	   confirm that column 0x5 is the intended register number.  */
 	.byte	0x85	/* DW_CFA_offset, column 0x5 */
 	.byte	0x3	/* uleb128 0x3; i.e. saved at CFA-12 */
 	.align 2
 LEFDE3:
 #endif
 #endif /* ifndef __x86_64__ */
--- a/src/x86/darwin64.S
+++ b/src/x86/darwin64.S
@@ -1,416 +0,0 @@
 /* -----------------------------------------------------------------------
   darwin64.S - Copyright (c) 2006 Free Software Foundation, Inc.
 	        Copyright (c) 2008 Red Hat, Inc.
   derived from unix64.S
   x86-64 Foreign Function Interface for Darwin.
   Permission is hereby granted, free of charge, to any person obtaining
   a copy of this software and associated documentation files (the
   ``Software''), to deal in the Software without restriction, including
   without limitation the rights to use, copy, modify, merge, publish,
   distribute, sublicense, and/or sell copies of the Software, and to
   permit persons to whom the Software is furnished to do so, subject to
   the following conditions:
   The above copyright notice and this permission notice shall be included
   in all copies or substantial portions of the Software.
   THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND, EXPRESS
   OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
   IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY CLAIM, DAMAGES OR
   OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
   ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
   OTHER DEALINGS IN THE SOFTWARE.
   ----------------------------------------------------------------------- */
 #ifdef __x86_64__
 #define LIBFFI_ASM
 #include <fficonfig.h>
 #include <ffi.h>
 	.file "darwin64.S"
 .text
 /* ffi_call_unix64 (void *args, unsigned long bytes, unsigned flags,
 		    void *raddr, void (*fnaddr)(void));
    A bit of trickiness here -- ARGS+BYTES is the base of the stack frame
    for this function.  This has been allocated by ffi_call.  We also
    deallocate some of the stack that has been alloca'd.

    Register use on entry, as read by the code below:
      %rdi  args: register area (six integer slots at 0..40, eight
            16-byte SSE slots at 48..160); the stack is switched to
            args+176 before the call
      %rsi  bytes
      %rdx  flags: the low byte is the FFI_TYPE_* return code; for
            struct returns bits 8-10 select the source registers and
            bits 12-31 hold the structure size
      %rcx  raddr: where the return value is stored
      %r8   fnaddr: the function to call
      %r9d  copied to %eax as the SSE register count; non-zero means
            the SSE argument registers are loaded  */
 	.align	3
 	.globl	_ffi_call_unix64
 _ffi_call_unix64:
 LUW0:
 	movq	(%rsp), %r10		/* Load return address.  */
 	leaq	(%rdi, %rsi), %rax	/* Find local stack base.  */
 	movq	%rdx, (%rax)		/* Save flags.  */
 	movq	%rcx, 8(%rax)		/* Save raddr.  */
 	movq	%rbp, 16(%rax)		/* Save old frame pointer.  */
 	movq	%r10, 24(%rax)		/* Relocate return address.  */
 	movq	%rax, %rbp		/* Finalize local stack frame.  */
 LUW1:
 	movq	%rdi, %r10		/* Save a copy of the register area. */
 	movq	%r8, %r11		/* Save a copy of the target fn.  */
 	movl	%r9d, %eax		/* Set number of SSE registers.  */
 	/* Load up all argument registers.  */
 	movq	(%r10), %rdi
 	movq	8(%r10), %rsi
 	movq	16(%r10), %rdx
 	movq	24(%r10), %rcx
 	movq	32(%r10), %r8
 	movq	40(%r10), %r9
 	testl	%eax, %eax
 	jnz	Lload_sse
 Lret_from_load_sse:
 	/* Deallocate the reg arg area.  */
 	leaq	176(%r10), %rsp
 	/* Call the user function.  */
 	call	*%r11
 	/* Deallocate stack arg area; local stack frame in redzone.  */
 	leaq	24(%rbp), %rsp
 	movq	0(%rbp), %rcx		/* Reload flags.  */
 	movq	8(%rbp), %rdi		/* Reload raddr.  */
 	movq	16(%rbp), %rbp		/* Reload old frame pointer.  */
 LUW2:
 	/* The first byte of the flags contains the FFI_TYPE.  Each table
 	   entry is the offset of its handler from Lstore_table.  */
 	movzbl	%cl, %r10d
 	leaq	Lstore_table(%rip), %r11
 	movslq	(%r11, %r10, 4), %r10
 	addq	%r11, %r10
 	jmp	*%r10
 Lstore_table:
 	.long	Lst_void-Lstore_table		/* FFI_TYPE_VOID */
 	.long	Lst_sint32-Lstore_table		/* FFI_TYPE_INT */
 	.long	Lst_float-Lstore_table		/* FFI_TYPE_FLOAT */
 	.long	Lst_double-Lstore_table		/* FFI_TYPE_DOUBLE */
 	.long	Lst_ldouble-Lstore_table	/* FFI_TYPE_LONGDOUBLE */
 	.long	Lst_uint8-Lstore_table		/* FFI_TYPE_UINT8 */
 	.long	Lst_sint8-Lstore_table		/* FFI_TYPE_SINT8 */
 	.long	Lst_uint16-Lstore_table		/* FFI_TYPE_UINT16 */
 	.long	Lst_sint16-Lstore_table		/* FFI_TYPE_SINT16 */
 	.long	Lst_uint32-Lstore_table		/* FFI_TYPE_UINT32 */
 	.long	Lst_sint32-Lstore_table		/* FFI_TYPE_SINT32 */
 	.long	Lst_int64-Lstore_table		/* FFI_TYPE_UINT64 */
 	.long	Lst_int64-Lstore_table		/* FFI_TYPE_SINT64 */
 	.long	Lst_struct-Lstore_table		/* FFI_TYPE_STRUCT */
 	.long	Lst_int64-Lstore_table		/* FFI_TYPE_POINTER */
 	.text
 	/* Store handlers.  Each one widens the value to 64 bits where
 	   needed, writes it through raddr (%rdi) and returns to the
 	   caller of ffi_call_unix64.  */
 	.align	3
 Lst_void:
 	ret
 	.align	3
 Lst_uint8:
 	movzbq	%al, %rax
 	movq	%rax, (%rdi)
 	ret
 	.align	3
 Lst_sint8:
 	movsbq	%al, %rax
 	movq	%rax, (%rdi)
 	ret
 	.align	3
 Lst_uint16:
 	movzwq	%ax, %rax
 	movq	%rax, (%rdi)
 	/* Must return here: falling through into Lst_sint16 would
 	   sign-extend and overwrite the zero-extended value.  */
 	ret
 	.align	3
 Lst_sint16:
 	movswq	%ax, %rax
 	movq	%rax, (%rdi)
 	ret
 	.align	3
 Lst_uint32:
 	movl	%eax, %eax		/* 32-bit move zero-extends into %rax.  */
 	movq	%rax, (%rdi)
 	/* Must return here: falling through into Lst_sint32 would
 	   sign-extend and overwrite the zero-extended value.  */
 	ret
 	.align	3
 Lst_sint32:
 	cltq
 	movq	%rax, (%rdi)
 	ret
 	.align	3
 Lst_int64:
 	movq	%rax, (%rdi)
 	ret
 	.align	3
 Lst_float:
 	movss	%xmm0, (%rdi)
 	ret
 	.align	3
 Lst_double:
 	movsd	%xmm0, (%rdi)
 	ret
 Lst_ldouble:
 	fstpt	(%rdi)
 	ret
 	.align	3
 Lst_struct:
 	leaq	-20(%rsp), %rsi		/* Scratch area in redzone.  */
 	/* We have to locate the values now, and since we don't want to
 	   write too much data into the user's return value, we spill the
 	   value to a 16 byte scratch area first.  Bits 8, 9, and 10
 	   control where the values are located.  Only one of the three
 	   bits will be set; see ffi_prep_cif_machdep for the pattern.  */
 	movd	%xmm0, %r10
 	movd	%xmm1, %r11
 	testl	$0x100, %ecx
 	cmovnz	%rax, %rdx
 	cmovnz	%r10, %rax
 	testl	$0x200, %ecx
 	cmovnz	%r10, %rdx
 	testl	$0x400, %ecx
 	cmovnz	%r10, %rax
 	cmovnz	%r11, %rdx
 	movq	%rax, (%rsi)
 	movq	%rdx, 8(%rsi)
 	/* Bits 12-31 contain the true size of the structure.  Copy from
 	   the scratch area to the true destination.  */
 	shrl	$12, %ecx
 	rep movsb
 	ret
 	/* Many times we can avoid loading any SSE registers at all.
 	   It's not worth an indirect jump to load the exact set of
 	   SSE registers needed; zero or all is a good compromise.  */
 	.align	3
 LUW3:
 Lload_sse:
 	movdqa	48(%r10), %xmm0
 	movdqa	64(%r10), %xmm1
 	movdqa	80(%r10), %xmm2
 	movdqa	96(%r10), %xmm3
 	movdqa	112(%r10), %xmm4
 	movdqa	128(%r10), %xmm5
 	movdqa	144(%r10), %xmm6
 	movdqa	160(%r10), %xmm7
 	jmp	Lret_from_load_sse
 LUW4:
 	.align	3
 /* _ffi_closure_unix64: closure entry point.  Spills the argument
    registers into a 200-byte frame, calls _ffi_closure_unix64_inner,
    then loads the result into the return register(s) selected by the
    value that call returns in %eax.

    Frame layout, relative to %rsp after the 200-byte adjustment:
        0..40   the six integer argument registers
       48..160  %xmm0-%xmm7, saved only when the carry flag is set
      176       return value area, passed to the inner function in %rsi
      208       first word above the return address, passed in %rcx
    NOTE(review): %r10 is forwarded unchanged as the first argument;
    presumably the trampoline loads it with the closure pointer --
    confirm against the C side.  */
 	.globl	_ffi_closure_unix64
 _ffi_closure_unix64:
 LUW5:
 	/* The carry flag is set by the trampoline iff SSE registers
 	   are used.  Don't clobber it before the branch instruction.  */
 	leaq    -200(%rsp), %rsp
 LUW6:
 	movq	%rdi, (%rsp)
 	movq    %rsi, 8(%rsp)
 	movq    %rdx, 16(%rsp)
 	movq    %rcx, 24(%rsp)
 	movq    %r8, 32(%rsp)
 	movq    %r9, 40(%rsp)
 	jc      Lsave_sse
 Lret_from_save_sse:
 	movq	%r10, %rdi
 	leaq	176(%rsp), %rsi
 	movq	%rsp, %rdx
 	leaq	208(%rsp), %rcx
 	call	_ffi_closure_unix64_inner
 	/* Deallocate stack frame early; return value is now in redzone.
 	   The area that was at 176(%rsp) is at -24(%rsp) from here on.  */
 	addq	$200, %rsp
 LUW7:
 	/* The first byte of the return value contains the FFI_TYPE.  Each
 	   table entry is the offset of its handler from Lload_table.  */
 	movzbl	%al, %r10d
 	leaq	Lload_table(%rip), %r11
 	movslq	(%r11, %r10, 4), %r10
 	addq	%r11, %r10
 	jmp	*%r10
 Lload_table:
 	.long	Lld_void-Lload_table		/* FFI_TYPE_VOID */
 	.long	Lld_int32-Lload_table		/* FFI_TYPE_INT */
 	.long	Lld_float-Lload_table		/* FFI_TYPE_FLOAT */
 	.long	Lld_double-Lload_table		/* FFI_TYPE_DOUBLE */
 	.long	Lld_ldouble-Lload_table		/* FFI_TYPE_LONGDOUBLE */
 	.long	Lld_int8-Lload_table		/* FFI_TYPE_UINT8 */
 	.long	Lld_int8-Lload_table		/* FFI_TYPE_SINT8 */
 	.long	Lld_int16-Lload_table		/* FFI_TYPE_UINT16 */
 	.long	Lld_int16-Lload_table		/* FFI_TYPE_SINT16 */
 	.long	Lld_int32-Lload_table		/* FFI_TYPE_UINT32 */
 	.long	Lld_int32-Lload_table		/* FFI_TYPE_SINT32 */
 	.long	Lld_int64-Lload_table		/* FFI_TYPE_UINT64 */
 	.long	Lld_int64-Lload_table		/* FFI_TYPE_SINT64 */
 	.long	Lld_struct-Lload_table		/* FFI_TYPE_STRUCT */
 	.long	Lld_int64-Lload_table		/* FFI_TYPE_POINTER */
 	.text
 	/* Load handlers.  Each one reads the result from the red zone at
 	   -24(%rsp) and returns.  The 8- and 16-bit handlers zero-extend
 	   for both the signed and the unsigned type codes.  */
 	.align	3
 Lld_void:
 	ret
 	.align	3
 Lld_int8:
 	movzbl	-24(%rsp), %eax
 	ret
 	.align	3
 Lld_int16:
 	movzwl	-24(%rsp), %eax
 	ret
 	.align	3
 Lld_int32:
 	movl	-24(%rsp), %eax
 	ret
 	.align	3
 Lld_int64:
 	movq	-24(%rsp), %rax
 	ret
 	.align	3
 Lld_float:
 	movss	-24(%rsp), %xmm0
 	ret
 	.align	3
 Lld_double:
 	movsd	-24(%rsp), %xmm0
 	ret
 	.align	3
 Lld_ldouble:
 	fldt	-24(%rsp)
 	ret
 	.align	3
 Lld_struct:
 	/* There are four possibilities here, %rax/%rdx, %xmm0/%rax,
 	   %rax/%xmm0, %xmm0/%xmm1.  We collapse two by always loading
 	   both rdx and xmm1 with the second word.  For the remaining,
 	   bit 8 set means xmm0 gets the second word, and bit 9 means
 	   that rax gets the second word.  */
 	movq	-24(%rsp), %rcx
 	movq	-16(%rsp), %rdx
 	movq	-16(%rsp), %xmm1
 	testl	$0x100, %eax
 	cmovnz	%rdx, %rcx
 	movd	%rcx, %xmm0
 	/* The flags in %eax are tested before %rax is overwritten; the
 	   movq below leaves the condition codes intact for the cmovnz.  */
 	testl	$0x200, %eax
 	movq	-24(%rsp), %rax
 	cmovnz	%rdx, %rax
 	ret
 	/* See the comment above Lload_sse; the same logic applies here.  */
 	.align	3
 LUW8:
 Lsave_sse:
 	movdqa	%xmm0, 48(%rsp)
 	movdqa	%xmm1, 64(%rsp)
 	movdqa	%xmm2, 80(%rsp)
 	movdqa	%xmm3, 96(%rsp)
 	movdqa	%xmm4, 112(%rsp)
 	movdqa	%xmm5, 128(%rsp)
 	movdqa	%xmm6, 144(%rsp)
 	movdqa	%xmm7, 160(%rsp)
 	jmp	Lret_from_save_sse
 LUW9:
 /* Hand-written unwind tables (__eh_frame): one CIE followed by one FDE
    for each of _ffi_call_unix64 and _ffi_closure_unix64.  */
 .section __TEXT,__eh_frame,coalesced,no_toc+strip_static_syms+live_support
 EH_frame1:
 	.set	L$set$0,LECIE1-LSCIE1		/* CIE Length */
 	.long	L$set$0
 LSCIE1:
 	.long	0x0		/* CIE Identifier Tag */
 	.byte	0x1		/* CIE Version */
 	.ascii	"zR\0"		/* CIE Augmentation */
 	.byte	0x1		/* uleb128 0x1; CIE Code Alignment Factor */
 	.byte	0x78		/* sleb128 -8; CIE Data Alignment Factor */
 	.byte	0x10		/* CIE RA Column */
 	.byte	0x1		/* uleb128 0x1; Augmentation size */
 	.byte	0x10		/* FDE Encoding (pcrel sdata4) */
 	.byte	0xc		/* DW_CFA_def_cfa, %rsp offset 8 */
 	.byte	0x7		/* uleb128 0x7 */
 	.byte	0x8		/* uleb128 0x8 */
 	.byte	0x90		/* DW_CFA_offset, column 0x10 */
 	.byte	0x1		/* uleb128 0x1; i.e. saved at CFA-8 */
 	.align	3
 LECIE1:
 	.globl _ffi_call_unix64.eh
 _ffi_call_unix64.eh:
 LSFDE1:
 	.set	L$set$1,LEFDE1-LASFDE1	/* FDE Length */
 	.long	L$set$1
 LASFDE1:
 	.long	LASFDE1-EH_frame1	/* FDE CIE offset */
 	.quad	LUW0-.			/* FDE initial location */
 	.set	L$set$2,LUW4-LUW0	/* FDE address range */
 	.quad	L$set$2
 	.byte	0x0			/* Augmentation size */
 	.byte	0x4			/* DW_CFA_advance_loc4 */
 	.set	L$set$3,LUW1-LUW0
 	.long	L$set$3
 	/* New stack frame based off rbp.  This is an itty bit of unwind
 	   trickery in that the CFA *has* changed.  There is no easy way
 	   to describe it correctly on entry to the function.  Fortunately,
 	   it doesn't matter too much since at all points we can correctly
 	   unwind back to ffi_call.  Note that the location to which we
 	   moved the return address is (the new) CFA-8, so from the
 	   perspective of the unwind info, it hasn't moved.  */
 	.byte	0xc			/* DW_CFA_def_cfa, %rbp offset 32 */
 	.byte	0x6			/* uleb128 0x6; register column */
 	.byte	0x20			/* uleb128 0x20; offset 32 */
 	.byte	0x80+6			/* DW_CFA_offset, %rbp offset 2*-8 */
 	.byte	0x2			/* uleb128 0x2 */
 	.byte	0xa			/* DW_CFA_remember_state */
 	.byte	0x4			/* DW_CFA_advance_loc4 */
 	.set	L$set$4,LUW2-LUW1
 	.long	L$set$4
 	.byte	0xc			/* DW_CFA_def_cfa, %rsp offset 8 */
 	.byte	0x7			/* uleb128 0x7; register column */
 	.byte	0x8			/* uleb128 0x8; offset 8 */
 	.byte	0xc0+6			/* DW_CFA_restore, %rbp */
 	.byte	0x4			/* DW_CFA_advance_loc4 */
 	.set	L$set$5,LUW3-LUW2
 	.long	L$set$5
 	.byte	0xb			/* DW_CFA_restore_state */
 	.align	3
 LEFDE1:
 	.globl _ffi_closure_unix64.eh
 _ffi_closure_unix64.eh:
 LSFDE3:
 	.set	L$set$6,LEFDE3-LASFDE3	/* FDE Length */
 	.long	L$set$6
 LASFDE3:
 	.long	LASFDE3-EH_frame1	/* FDE CIE offset */
 	.quad	LUW5-.			/* FDE initial location */
 	.set	L$set$7,LUW9-LUW5	/* FDE address range */
 	.quad	L$set$7
 	.byte	0x0			/* Augmentation size */
 	.byte	0x4			/* DW_CFA_advance_loc4 */
 	.set	L$set$8,LUW6-LUW5
 	.long	L$set$8
 	/* 208 = the 200-byte frame plus the 8-byte return address.  */
 	.byte	0xe			/* DW_CFA_def_cfa_offset */
 	.byte	208,1			/* uleb128 208 */
 	.byte	0xa			/* DW_CFA_remember_state */
 	.byte	0x4			/* DW_CFA_advance_loc4 */
 	.set	L$set$9,LUW7-LUW6
 	.long	L$set$9
 	.byte	0xe			/* DW_CFA_def_cfa_offset */
 	.byte	0x8			/* uleb128 0x8; frame released at LUW7 */
 	.byte	0x4			/* DW_CFA_advance_loc4 */
 	.set	L$set$10,LUW8-LUW7
 	.long	L$set$10
 	.byte	0xb			/* DW_CFA_restore_state */
 	.align	3
 LEFDE3:
 	.subsections_via_symbols
 #endif /* __x86_64__ */
--- a/src/x86/sysv.S
+++ b/src/x86/sysv.S
@@ -112,7 +112,7 @@ ffi_call_i386:
 	andl	$X86_RET_TYPE_MASK, %ecx
 #ifdef __PIC__
-	call	__x86.get_pc_thunk.bx
+	call	C(__x86.get_pc_thunk.bx)
 1:	leal	0f-1b(%ebx, %ecx, 8), %ebx
 #else
 	leal	0f(,%ecx, 8), %ebx
@@ -212,31 +212,6 @@ ENDF(ffi_call_i386)
 	movl	%ecx, 32(%esp);						\
 	movl	%eax, 36(%esp)
 #ifdef __PIC__
 /* We're going to always load the got register here, even if .hidden says
   we're going to avoid the PLT call.  We'll use the got register in
   FFI_CLOSURE_MASK_AND_JUMP.  */
 # if defined HAVE_HIDDEN_VISIBILITY_ATTRIBUTE
 #  define PLT(X) X
 # else
 #  define PLT(X) X@PLT
 # endif
 # define FFI_CLOSURE_CALL_INNER						\
 	movl	%esp, %ecx;			/* load closure_data */	\
 	leal	closure_FS+4(%esp), %edx;	/* load incoming stack */ \
 	movl	%ebx, 40(%esp);			/* save ebx */		\
 	cfi_rel_offset(%ebx, 40);					\
 	call	__x86.get_pc_thunk.bx;		/* load got register */	\
 	addl	$C(_GLOBAL_OFFSET_TABLE_), %ebx;			\
 	call	PLT(ffi_closure_inner)
 #define FFI_CLOSURE_MASK_AND_JUMP					\
 	andl	$X86_RET_TYPE_MASK, %eax;				\
 	leal	0f@GOTOFF(%ebx, %eax, 8), %eax;				\
 	movl	40(%esp), %ebx;			/* restore ebx */	\
 	cfi_restore(%ebx);						\
 	jmp	*%eax
 #else
 # define FFI_CLOSURE_CALL_INNER						\
 	movl	%esp, %ecx;			/* load closure_data */	\
 	leal	closure_FS+4(%esp), %edx;	/* load incoming stack */ \
@@ -245,6 +220,33 @@ ENDF(ffi_call_i386)
 	andl	$X86_RET_TYPE_MASK, %eax;				\
 	leal	0f(, %eax, 8), %eax;					\
 	jmp	*%eax
 #ifdef __PIC__
 # if defined X86_DARWIN || defined HAVE_HIDDEN_VISIBILITY_ATTRIBUTE
 #  undef FFI_CLOSURE_MASK_AND_JUMP
 #  define FFI_CLOSURE_MASK_AND_JUMP					\
 	andl	$X86_RET_TYPE_MASK, %eax;				\
 	call	C(__x86.get_pc_thunk.dx);				\
 1:	leal	0f-1b(%edx, %eax, 8), %eax;				\
 	jmp	*%eax
 # else
 #  undef FFI_CLOSURE_CALL_INNER
 #  define FFI_CLOSURE_CALL_INNER					\
 	movl	%esp, %ecx;			/* load closure_data */	\
 	leal	closure_FS+4(%esp), %edx;	/* load incoming stack */ \
 	movl	%ebx, 40(%esp);			/* save ebx */		\
 	cfi_rel_offset(%ebx, 40);					\
 	call	C(__x86.get_pc_thunk.bx);	/* load got register */	\
 1:	addl	$C(_GLOBAL_OFFSET_TABLE_), %ebx;			\
 	call	ffi_closure_inner@PLT
 #  undef FFI_CLOSURE_MASK_AND_JUMP
 #  define FFI_CLOSURE_MASK_AND_JUMP					\
 	andl	$X86_RET_TYPE_MASK, %eax;				\
 	leal	0f@GOTOFF(%ebx, %eax, 8), %eax;				\
 	movl	40(%esp), %ebx;			/* restore ebx */	\
 	cfi_restore(%ebx);						\
 	jmp	*%eax
 # endif /* DARWIN || HIDDEN */
 #endif /* __PIC__ */
 #define FFI_GO_CLOSURE(suffix, chain, t1, t2)				\
@@ -511,7 +513,7 @@ C(ffi_closure_raw_SYSV):
 	movl	20(%ebx), %eax				/* load cif->flags */
 	andl	$X86_RET_TYPE_MASK, %eax
 #ifdef __PIC__
-	call	__x86.get_pc_thunk.bx
+	call	C(__x86.get_pc_thunk.bx)
 1:	leal	0f-1b(%ebx, %eax, 8), %eax
 #else
 	leal	0f(,%eax, 8), %eax
@@ -615,7 +617,7 @@ C(ffi_closure_raw_THISCALL):
 	movl	20(%ebx), %eax				/* load cif->flags */
 	andl	$X86_RET_TYPE_MASK, %eax
 #ifdef __PIC__
-	call	__x86.get_pc_thunk.bx
+	call	C(__x86.get_pc_thunk.bx)
 1:	leal	0f-1b(%ebx, %eax, 8), %eax
 #else
 	leal	0f(,%eax, 8), %eax
@@ -685,20 +687,41 @@ ENDF(C(ffi_closure_raw_THISCALL))
 #endif /* !FFI_NO_RAW_API */
 #ifdef X86_DARWIN
 # define COMDAT(X)							\
        .section __TEXT,__textcoal_nt,coalesced,pure_instructions;	\
        .weak_definition X;						\
        .private_extern X
 #elif defined __ELF__
 # define COMDAT(X)							\
 	.section .text.X,"axG",@progbits,X,comdat;			\
 	.globl	X;							\
 	FFI_HIDDEN(X)
 #else
 # define COMDAT(X)
 #endif
 #if defined(__PIC__)
-	.section .text.__x86.get_pc_thunk.bx,"axG",@progbits,__x86.get_pc_thunk.bx,comdat
+	COMDAT(C(__x86.get_pc_thunk.bx))
-	.globl	__x86.get_pc_thunk.bx
+C(__x86.get_pc_thunk.bx):
 	.hidden	__x86.get_pc_thunk.bx
 	.type	__x86.get_pc_thunk.bx,@function
 __x86.get_pc_thunk.bx:
 	cfi_startproc
 	movl	(%esp), %ebx
 	ret
 	cfi_endproc
-	.size	__x86.get_pc_thunk.bx, . - __x86.get_pc_thunk.bx
+ENDF(C(__x86.get_pc_thunk.bx))
 # if defined X86_DARWIN || defined HAVE_HIDDEN_VISIBILITY_ATTRIBUTE
 	COMDAT(C(__x86.get_pc_thunk.dx))
 C(__x86.get_pc_thunk.dx):
 	cfi_startproc
 	movl	(%esp), %edx
 	ret
 	cfi_endproc
 ENDF(C(__x86.get_pc_thunk.dx))
 #endif /* DARWIN || HIDDEN */
 #endif /* __PIC__ */
 #endif /* ifndef __x86_64__ */
 #if defined __ELF__ && defined __linux__
 	.section	.note.GNU-stack,"",@progbits
 #endif
--- a/src/x86/unix64.S
+++ b/src/x86/unix64.S
@@ -35,6 +35,22 @@
 	.text
 #define C2(X, Y)  X ## Y
 #define C1(X, Y)  C2(X, Y)
 #ifdef __USER_LABEL_PREFIX__
 # define C(X)     C1(__USER_LABEL_PREFIX__, X)
 #else
 # define C(X)     X
 #endif
 #ifdef __ELF__
 # define PLT(X)	  X@PLT
 # define ENDF(X)  .type	X,@function; .size X, . - X
 #else
 # define PLT(X)	  X
 # define ENDF(X)
 #endif
 /* This macro allows the safe creation of jump tables without an
   actual table.  The entry points into the table are all 8 bytes.
   The use of ORG asserts that we're at the correct location.  */
@@ -54,11 +70,10 @@
   deallocate some of the stack that has been alloca'd.  */
 	.align	8
-	.globl	ffi_call_unix64
+	.globl	C(ffi_call_unix64)
-	.type	ffi_call_unix64,@function
+	FFI_HIDDEN(C(ffi_call_unix64))
 	FFI_HIDDEN(ffi_call_unix64)
-ffi_call_unix64:
+C(ffi_call_unix64):
 	cfi_startproc
 	movq	(%rsp), %r10		/* Load return address.  */
 	leaq	(%rdi, %rsi), %rax	/* Find local stack base.  */
@@ -187,7 +202,7 @@ E UNIX64_RET_ST_RAX_RDX
 	rep movsb
 	ret
-9:	call	abort@PLT
+9:	call	PLT(C(abort))
 	/* Many times we can avoid loading any SSE registers at all.
 	   It's not worth an indirect jump to load the exact set of
@@ -206,7 +221,7 @@ E UNIX64_RET_ST_RAX_RDX
 	jmp	.Lret_from_load_sse
 	cfi_endproc
-	.size    ffi_call_unix64,.-ffi_call_unix64
+ENDF(C(ffi_call_unix64))
 /* 6 general registers, 8 vector registers,
   32 bytes of rvalue, 8 bytes of alignment.  */
@@ -219,11 +234,10 @@ E UNIX64_RET_ST_RAX_RDX
 #define ffi_closure_RED_RVALUE	(ffi_closure_OFS_RVALUE - ffi_closure_FS)
 	.align	2
-	.globl	ffi_closure_unix64_sse
+	.globl	C(ffi_closure_unix64_sse)
-	.type	ffi_closure_unix64_sse,@function
+	FFI_HIDDEN(C(ffi_closure_unix64_sse))
 	FFI_HIDDEN(ffi_closure_unix64_sse)
-ffi_closure_unix64_sse:
+C(ffi_closure_unix64_sse):
 	cfi_startproc
 	subq	$ffi_closure_FS, %rsp
 	/* Note clang bug 21515: adjust_cfa_offset error across endproc.  */
@@ -240,14 +254,13 @@ ffi_closure_unix64_sse:
 	jmp	0f
 	cfi_endproc
-	.size	ffi_closure_unix64_sse,.-ffi_closure_unix64_sse
+ENDF(C(ffi_closure_unix64_sse))
 	.align	2
-	.globl	ffi_closure_unix64
+	.globl	C(ffi_closure_unix64)
-	.type	ffi_closure_unix64,@function
+	FFI_HIDDEN(C(ffi_closure_unix64))
 	FFI_HIDDEN(ffi_closure_unix64)
-ffi_closure_unix64:
+C(ffi_closure_unix64):
 	cfi_startproc
 	subq	$ffi_closure_FS, %rsp
 	/* Note clang bug 21515: adjust_cfa_offset error across endproc.  */
@@ -273,7 +286,7 @@ ffi_closure_unix64:
 	leaq	ffi_closure_OFS_RVALUE(%rsp), %rcx	/* Load rvalue */
 	movq	%rsp, %r8				/* Load reg_args */
 	leaq	ffi_closure_FS+8(%rsp), %r9		/* Load argp */
-	call	ffi_closure_unix64_inner
+	call	C(ffi_closure_unix64_inner)
 	/* Deallocate stack frame early; return value is now in redzone.  */
 	addq	$ffi_closure_FS, %rsp
@@ -343,17 +356,16 @@ E UNIX64_RET_ST_RAX_RDX
 3:	movq	(%rsi), %xmm0
 	ret
-9:	call	abort@PLT
+9:	call	PLT(C(abort))
 	cfi_endproc
-	.size	ffi_closure_unix64,.-ffi_closure_unix64
+ENDF(C(ffi_closure_unix64))
 	.align	2
-	.globl	ffi_go_closure_unix64_sse
+	.globl	C(ffi_go_closure_unix64_sse)
-	.type	ffi_go_closure_unix64_sse,@function
+	FFI_HIDDEN(C(ffi_go_closure_unix64_sse))
 	FFI_HIDDEN(ffi_go_closure_unix64_sse)
-ffi_go_closure_unix64_sse:
+C(ffi_go_closure_unix64_sse):
 	cfi_startproc
 	subq	$ffi_closure_FS, %rsp
 	/* Note clang bug 21515: adjust_cfa_offset error across endproc.  */
@@ -370,14 +382,13 @@ ffi_go_closure_unix64_sse:
 	jmp	0f
 	cfi_endproc
-	.size	ffi_go_closure_unix64_sse,.-ffi_go_closure_unix64_sse
+ENDF(C(ffi_go_closure_unix64_sse))
 	.align	2
-	.globl	ffi_go_closure_unix64
+	.globl	C(ffi_go_closure_unix64)
-	.type	ffi_go_closure_unix64,@function
+	FFI_HIDDEN(C(ffi_go_closure_unix64))
 	FFI_HIDDEN(ffi_go_closure_unix64)
-ffi_go_closure_unix64:
+C(ffi_go_closure_unix64):
 	cfi_startproc
 	subq	$ffi_closure_FS, %rsp
 	/* Note clang bug 21515: adjust_cfa_offset error across endproc.  */
@@ -402,7 +413,7 @@ ffi_go_closure_unix64:
 	jmp	.Ldo_closure
 	cfi_endproc
-	.size	ffi_go_closure_unix64,.-ffi_go_closure_unix64
+ENDF(C(ffi_go_closure_unix64))
 #endif /* __x86_64__ */
 #if defined __ELF__ && defined __linux__