Import OpenSSL 1.0.2k (as of svn r86089)

Author: Zachary Ware
Date: 2017-05-22 14:30:47 -05:00
parent d239d63057
commit ccd3ab4aff
2486 changed files with 943951 additions and 0 deletions

crypto/bn/asm/README (new file, 27 lines)

@@ -0,0 +1,27 @@
<OBSOLETE>
All assembler in this directory is just a version of the file
crypto/bn/bn_asm.c.
Quite a few of these files are just the assembler output from gcc, since on
quite a few machines they are 2 times faster than the system compiler's output.
For the x86, I have hand-written assembler because of the bad job all
compilers seem to do on it. This normally gives a 2 times speed-up in the RSA
routines.
For the DEC Alpha, I also hand-wrote the assembler (except the division, which
is just the output from the C compiler pasted on the end of the file).
On the 2 Alpha C compilers I had access to, it was not possible to do
64b x 64b -> 128b calculations (both the long and the long long data types
were 64 bits). So the hand assembler gives access to the 128-bit result and
a 2 times speed-up :-).
There are 3 versions of assembler for the HP PA-RISC.
pa-risc.s is the original one, which works fine and was generated using gcc :-)
pa-risc2W.s and pa-risc2.s are 64- and 32-bit PA-RISC 2.0 implementations
by Chris Ruemmler from HP (with some help from the HP C compiler).
</OBSOLETE>
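
As a side note on the Alpha limitation described above: without a double-width multiply, a compiler limited to 64-bit types has to assemble the 128-bit product from 32-bit halves, roughly as in the following C sketch (an illustration only, not code from bn_asm.c); the hand-written assembler gets the same result in two instructions, mulq and umulh.

#include <stdint.h>

/* Illustrative 64x64 -> 128-bit multiply built from 32-bit halves,
 * i.e. what a C compiler without a 128-bit type is reduced to. */
void mul64x64_128(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
{
    uint64_t a_lo = (uint32_t)a, a_hi = a >> 32;
    uint64_t b_lo = (uint32_t)b, b_hi = b >> 32;

    uint64_t p0 = a_lo * b_lo;                    /* bits   0..63  */
    uint64_t p1 = a_lo * b_hi;                    /* bits  32..95  */
    uint64_t p2 = a_hi * b_lo;                    /* bits  32..95  */
    uint64_t p3 = a_hi * b_hi;                    /* bits  64..127 */

    uint64_t mid = (p0 >> 32) + (uint32_t)p1 + (uint32_t)p2;

    *lo = (mid << 32) | (uint32_t)p0;
    *hi = p3 + (p1 >> 32) + (p2 >> 32) + (mid >> 32);
}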

crypto/bn/asm/alpha-mont.pl (new file, 321 lines)

@@ -0,0 +1,321 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# On 21264 RSA sign performance improves by 70/35/20/15 percent for
# 512/1024/2048/4096 bit key lengths. This is against vendor-compiler
# code built with '-tune host' and in-line assembler. Other
# benchmarks improve by 15-20%. To anchor it to something else, the
# code provides approximately the same performance per GHz as AMD64.
# I.e. if you compare 1GHz 21264 and 2GHz Opteron, you'll observe ~2x
# difference.
# int bn_mul_mont(
$rp="a0"; # BN_ULONG *rp,
$ap="a1"; # const BN_ULONG *ap,
$bp="a2"; # const BN_ULONG *bp,
$np="a3"; # const BN_ULONG *np,
$n0="a4"; # const BN_ULONG *n0,
$num="a5"; # int num);
$lo0="t0";
$hi0="t1";
$lo1="t2";
$hi1="t3";
$aj="t4";
$bi="t5";
$nj="t6";
$tp="t7";
$alo="t8";
$ahi="t9";
$nlo="t10";
$nhi="t11";
$tj="t12";
$i="s3";
$j="s4";
$m1="s5";
$code=<<___;
#ifdef __linux__
#include <asm/regdef.h>
#else
#include <asm.h>
#include <regdef.h>
#endif
.text
.set noat
.set noreorder
.globl bn_mul_mont
.align 5
.ent bn_mul_mont
bn_mul_mont:
lda sp,-48(sp)
stq ra,0(sp)
stq s3,8(sp)
stq s4,16(sp)
stq s5,24(sp)
stq fp,32(sp)
mov sp,fp
.mask 0x0400f000,-48
.frame fp,48,ra
.prologue 0
.align 4
.set reorder
sextl $num,$num
mov 0,v0
cmplt $num,4,AT
bne AT,.Lexit
ldq $hi0,0($ap) # ap[0]
s8addq $num,16,AT
ldq $aj,8($ap)
subq sp,AT,sp
ldq $bi,0($bp) # bp[0]
lda AT,-4096(zero) # mov -4096,AT
ldq $n0,0($n0)
and sp,AT,sp
mulq $hi0,$bi,$lo0
ldq $hi1,0($np) # np[0]
umulh $hi0,$bi,$hi0
ldq $nj,8($np)
mulq $lo0,$n0,$m1
mulq $hi1,$m1,$lo1
umulh $hi1,$m1,$hi1
addq $lo1,$lo0,$lo1
cmpult $lo1,$lo0,AT
addq $hi1,AT,$hi1
mulq $aj,$bi,$alo
mov 2,$j
umulh $aj,$bi,$ahi
mov sp,$tp
mulq $nj,$m1,$nlo
s8addq $j,$ap,$aj
umulh $nj,$m1,$nhi
s8addq $j,$np,$nj
.align 4
.L1st:
.set noreorder
ldq $aj,0($aj)
addl $j,1,$j
ldq $nj,0($nj)
lda $tp,8($tp)
addq $alo,$hi0,$lo0
mulq $aj,$bi,$alo
cmpult $lo0,$hi0,AT
addq $nlo,$hi1,$lo1
mulq $nj,$m1,$nlo
addq $ahi,AT,$hi0
cmpult $lo1,$hi1,v0
cmplt $j,$num,$tj
umulh $aj,$bi,$ahi
addq $nhi,v0,$hi1
addq $lo1,$lo0,$lo1
s8addq $j,$ap,$aj
umulh $nj,$m1,$nhi
cmpult $lo1,$lo0,v0
addq $hi1,v0,$hi1
s8addq $j,$np,$nj
stq $lo1,-8($tp)
nop
unop
bne $tj,.L1st
.set reorder
addq $alo,$hi0,$lo0
addq $nlo,$hi1,$lo1
cmpult $lo0,$hi0,AT
cmpult $lo1,$hi1,v0
addq $ahi,AT,$hi0
addq $nhi,v0,$hi1
addq $lo1,$lo0,$lo1
cmpult $lo1,$lo0,v0
addq $hi1,v0,$hi1
stq $lo1,0($tp)
addq $hi1,$hi0,$hi1
cmpult $hi1,$hi0,AT
stq $hi1,8($tp)
stq AT,16($tp)
mov 1,$i
.align 4
.Louter:
s8addq $i,$bp,$bi
ldq $hi0,0($ap)
ldq $aj,8($ap)
ldq $bi,0($bi)
ldq $hi1,0($np)
ldq $nj,8($np)
ldq $tj,0(sp)
mulq $hi0,$bi,$lo0
umulh $hi0,$bi,$hi0
addq $lo0,$tj,$lo0
cmpult $lo0,$tj,AT
addq $hi0,AT,$hi0
mulq $lo0,$n0,$m1
mulq $hi1,$m1,$lo1
umulh $hi1,$m1,$hi1
addq $lo1,$lo0,$lo1
cmpult $lo1,$lo0,AT
mov 2,$j
addq $hi1,AT,$hi1
mulq $aj,$bi,$alo
mov sp,$tp
umulh $aj,$bi,$ahi
mulq $nj,$m1,$nlo
s8addq $j,$ap,$aj
umulh $nj,$m1,$nhi
.align 4
.Linner:
.set noreorder
ldq $tj,8($tp) #L0
nop #U1
ldq $aj,0($aj) #L1
s8addq $j,$np,$nj #U0
ldq $nj,0($nj) #L0
nop #U1
addq $alo,$hi0,$lo0 #L1
lda $tp,8($tp)
mulq $aj,$bi,$alo #U1
cmpult $lo0,$hi0,AT #L0
addq $nlo,$hi1,$lo1 #L1
addl $j,1,$j
mulq $nj,$m1,$nlo #U1
addq $ahi,AT,$hi0 #L0
addq $lo0,$tj,$lo0 #L1
cmpult $lo1,$hi1,v0 #U0
umulh $aj,$bi,$ahi #U1
cmpult $lo0,$tj,AT #L0
addq $lo1,$lo0,$lo1 #L1
addq $nhi,v0,$hi1 #U0
umulh $nj,$m1,$nhi #U1
s8addq $j,$ap,$aj #L0
cmpult $lo1,$lo0,v0 #L1
cmplt $j,$num,$tj #U0 # borrow $tj
addq $hi0,AT,$hi0 #L0
addq $hi1,v0,$hi1 #U1
stq $lo1,-8($tp) #L1
bne $tj,.Linner #U0
.set reorder
ldq $tj,8($tp)
addq $alo,$hi0,$lo0
addq $nlo,$hi1,$lo1
cmpult $lo0,$hi0,AT
cmpult $lo1,$hi1,v0
addq $ahi,AT,$hi0
addq $nhi,v0,$hi1
addq $lo0,$tj,$lo0
cmpult $lo0,$tj,AT
addq $hi0,AT,$hi0
ldq $tj,16($tp)
addq $lo1,$lo0,$j
cmpult $j,$lo0,v0
addq $hi1,v0,$hi1
addq $hi1,$hi0,$lo1
stq $j,0($tp)
cmpult $lo1,$hi0,$hi1
addq $lo1,$tj,$lo1
cmpult $lo1,$tj,AT
addl $i,1,$i
addq $hi1,AT,$hi1
stq $lo1,8($tp)
cmplt $i,$num,$tj # borrow $tj
stq $hi1,16($tp)
bne $tj,.Louter
s8addq $num,sp,$tj # &tp[num]
mov $rp,$bp # put rp aside
mov sp,$tp
mov sp,$ap
mov 0,$hi0 # clear borrow bit
.align 4
.Lsub: ldq $lo0,0($tp)
ldq $lo1,0($np)
lda $tp,8($tp)
lda $np,8($np)
subq $lo0,$lo1,$lo1 # tp[i]-np[i]
cmpult $lo0,$lo1,AT
subq $lo1,$hi0,$lo0
cmpult $lo1,$lo0,$hi0
or $hi0,AT,$hi0
stq $lo0,0($rp)
cmpult $tp,$tj,v0
lda $rp,8($rp)
bne v0,.Lsub
subq $hi1,$hi0,$hi0 # handle upmost overflow bit
mov sp,$tp
mov $bp,$rp # restore rp
and sp,$hi0,$ap
bic $bp,$hi0,$bp
bis $bp,$ap,$ap # ap=borrow?tp:rp
.align 4
.Lcopy: ldq $aj,0($ap) # copy or in-place refresh
lda $tp,8($tp)
lda $rp,8($rp)
lda $ap,8($ap)
stq zero,-8($tp) # zap tp
cmpult $tp,$tj,AT
stq $aj,-8($rp)
bne AT,.Lcopy
mov 1,v0
.Lexit:
.set noreorder
mov fp,sp
/*ldq ra,0(sp)*/
ldq s3,8(sp)
ldq s4,16(sp)
ldq s5,24(sp)
ldq fp,32(sp)
lda sp,48(sp)
ret (ra)
.end bn_mul_mont
.ascii "Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
print $code;
close STDOUT;

crypto/bn/asm/armv4-gf2m.pl (new file, 289 lines)

@@ -0,0 +1,289 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# May 2011
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication
# used in bn_gf2m.c. It's a rather mechanical, low-hanging-fruit port
# from C for the time being... Except that it has two code paths: pure
# integer code suitable for any ARMv4 and later CPU, and NEON code
# suitable for ARMv7. The pure integer 1x1 multiplication subroutine runs
# in ~45 cycles on a dual-issue core such as Cortex A8, which is ~50%
# faster than compiler-generated code. For ECDH and ECDSA verify (but
# not for ECDSA sign) it means a 25%-45% improvement depending on key
# length, more for longer keys. Even though NEON 1x1 multiplication
# runs in even fewer cycles, ~30, the improvement is measurable only on
# longer keys. One has to optimize code elsewhere to get the NEON glow...
#
# April 2014
#
# Double bn_GF2m_mul_2x2 performance by using algorithm from paper
# referred below, which improves ECDH and ECDSA verify benchmarks
# by 18-40%.
#
# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
# Polynomial Multiplication on ARM Processors using the NEON Engine.
#
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$code=<<___;
#include "arm_arch.h"
.text
.code 32
___
################
# private interface to mul_1x1_ialu
#
$a="r1";
$b="r0";
($a0,$a1,$a2,$a12,$a4,$a14)=
($hi,$lo,$t0,$t1, $i0,$i1 )=map("r$_",(4..9),12);
$mask="r12";
$code.=<<___;
.type mul_1x1_ialu,%function
.align 5
mul_1x1_ialu:
mov $a0,#0
bic $a1,$a,#3<<30 @ a1=a&0x3fffffff
str $a0,[sp,#0] @ tab[0]=0
add $a2,$a1,$a1 @ a2=a1<<1
str $a1,[sp,#4] @ tab[1]=a1
eor $a12,$a1,$a2 @ a1^a2
str $a2,[sp,#8] @ tab[2]=a2
mov $a4,$a1,lsl#2 @ a4=a1<<2
str $a12,[sp,#12] @ tab[3]=a1^a2
eor $a14,$a1,$a4 @ a1^a4
str $a4,[sp,#16] @ tab[4]=a4
eor $a0,$a2,$a4 @ a2^a4
str $a14,[sp,#20] @ tab[5]=a1^a4
eor $a12,$a12,$a4 @ a1^a2^a4
str $a0,[sp,#24] @ tab[6]=a2^a4
and $i0,$mask,$b,lsl#2
str $a12,[sp,#28] @ tab[7]=a1^a2^a4
and $i1,$mask,$b,lsr#1
ldr $lo,[sp,$i0] @ tab[b & 0x7]
and $i0,$mask,$b,lsr#4
ldr $t1,[sp,$i1] @ tab[b >> 3 & 0x7]
and $i1,$mask,$b,lsr#7
ldr $t0,[sp,$i0] @ tab[b >> 6 & 0x7]
eor $lo,$lo,$t1,lsl#3 @ stall
mov $hi,$t1,lsr#29
ldr $t1,[sp,$i1] @ tab[b >> 9 & 0x7]
and $i0,$mask,$b,lsr#10
eor $lo,$lo,$t0,lsl#6
eor $hi,$hi,$t0,lsr#26
ldr $t0,[sp,$i0] @ tab[b >> 12 & 0x7]
and $i1,$mask,$b,lsr#13
eor $lo,$lo,$t1,lsl#9
eor $hi,$hi,$t1,lsr#23
ldr $t1,[sp,$i1] @ tab[b >> 15 & 0x7]
and $i0,$mask,$b,lsr#16
eor $lo,$lo,$t0,lsl#12
eor $hi,$hi,$t0,lsr#20
ldr $t0,[sp,$i0] @ tab[b >> 18 & 0x7]
and $i1,$mask,$b,lsr#19
eor $lo,$lo,$t1,lsl#15
eor $hi,$hi,$t1,lsr#17
ldr $t1,[sp,$i1] @ tab[b >> 21 & 0x7]
and $i0,$mask,$b,lsr#22
eor $lo,$lo,$t0,lsl#18
eor $hi,$hi,$t0,lsr#14
ldr $t0,[sp,$i0] @ tab[b >> 24 & 0x7]
and $i1,$mask,$b,lsr#25
eor $lo,$lo,$t1,lsl#21
eor $hi,$hi,$t1,lsr#11
ldr $t1,[sp,$i1] @ tab[b >> 27 & 0x7]
tst $a,#1<<30
and $i0,$mask,$b,lsr#28
eor $lo,$lo,$t0,lsl#24
eor $hi,$hi,$t0,lsr#8
ldr $t0,[sp,$i0] @ tab[b >> 30 ]
eorne $lo,$lo,$b,lsl#30
eorne $hi,$hi,$b,lsr#2
tst $a,#1<<31
eor $lo,$lo,$t1,lsl#27
eor $hi,$hi,$t1,lsr#5
eorne $lo,$lo,$b,lsl#31
eorne $hi,$hi,$b,lsr#1
eor $lo,$lo,$t0,lsl#30
eor $hi,$hi,$t0,lsr#2
mov pc,lr
.size mul_1x1_ialu,.-mul_1x1_ialu
___
################
# void bn_GF2m_mul_2x2(BN_ULONG *r,
# BN_ULONG a1,BN_ULONG a0,
# BN_ULONG b1,BN_ULONG b0); # r[3..0]=a1a0·b1b0
{
$code.=<<___;
.global bn_GF2m_mul_2x2
.type bn_GF2m_mul_2x2,%function
.align 5
bn_GF2m_mul_2x2:
#if __ARM_MAX_ARCH__>=7
ldr r12,.LOPENSSL_armcap
.Lpic: ldr r12,[pc,r12]
tst r12,#1
bne .LNEON
#endif
___
$ret="r10"; # reassigned 1st argument
$code.=<<___;
stmdb sp!,{r4-r10,lr}
mov $ret,r0 @ reassign 1st argument
mov $b,r3 @ $b=b1
ldr r3,[sp,#32] @ load b0
mov $mask,#7<<2
sub sp,sp,#32 @ allocate tab[8]
bl mul_1x1_ialu @ a1·b1
str $lo,[$ret,#8]
str $hi,[$ret,#12]
eor $b,$b,r3 @ flip b0 and b1
eor $a,$a,r2 @ flip a0 and a1
eor r3,r3,$b
eor r2,r2,$a
eor $b,$b,r3
eor $a,$a,r2
bl mul_1x1_ialu @ a0·b0
str $lo,[$ret]
str $hi,[$ret,#4]
eor $a,$a,r2
eor $b,$b,r3
bl mul_1x1_ialu @ (a1+a0)·(b1+b0)
___
@r=map("r$_",(6..9));
$code.=<<___;
ldmia $ret,{@r[0]-@r[3]}
eor $lo,$lo,$hi
eor $hi,$hi,@r[1]
eor $lo,$lo,@r[0]
eor $hi,$hi,@r[2]
eor $lo,$lo,@r[3]
eor $hi,$hi,@r[3]
str $hi,[$ret,#8]
eor $lo,$lo,$hi
add sp,sp,#32 @ destroy tab[8]
str $lo,[$ret,#4]
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r10,pc}
#else
ldmia sp!,{r4-r10,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
___
}
{
my ($r,$t0,$t1,$t2,$t3)=map("q$_",(0..3,8..12));
my ($a,$b,$k48,$k32,$k16)=map("d$_",(26..31));
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon
.align 5
.LNEON:
ldr r12, [sp] @ 5th argument
vmov.32 $a, r2, r1
vmov.32 $b, r12, r3
vmov.i64 $k48, #0x0000ffffffffffff
vmov.i64 $k32, #0x00000000ffffffff
vmov.i64 $k16, #0x000000000000ffff
vext.8 $t0#lo, $a, $a, #1 @ A1
vmull.p8 $t0, $t0#lo, $b @ F = A1*B
vext.8 $r#lo, $b, $b, #1 @ B1
vmull.p8 $r, $a, $r#lo @ E = A*B1
vext.8 $t1#lo, $a, $a, #2 @ A2
vmull.p8 $t1, $t1#lo, $b @ H = A2*B
vext.8 $t3#lo, $b, $b, #2 @ B2
vmull.p8 $t3, $a, $t3#lo @ G = A*B2
vext.8 $t2#lo, $a, $a, #3 @ A3
veor $t0, $t0, $r @ L = E + F
vmull.p8 $t2, $t2#lo, $b @ J = A3*B
vext.8 $r#lo, $b, $b, #3 @ B3
veor $t1, $t1, $t3 @ M = G + H
vmull.p8 $r, $a, $r#lo @ I = A*B3
veor $t0#lo, $t0#lo, $t0#hi @ t0 = (L) (P0 + P1) << 8
vand $t0#hi, $t0#hi, $k48
vext.8 $t3#lo, $b, $b, #4 @ B4
veor $t1#lo, $t1#lo, $t1#hi @ t1 = (M) (P2 + P3) << 16
vand $t1#hi, $t1#hi, $k32
vmull.p8 $t3, $a, $t3#lo @ K = A*B4
veor $t2, $t2, $r @ N = I + J
veor $t0#lo, $t0#lo, $t0#hi
veor $t1#lo, $t1#lo, $t1#hi
veor $t2#lo, $t2#lo, $t2#hi @ t2 = (N) (P4 + P5) << 24
vand $t2#hi, $t2#hi, $k16
vext.8 $t0, $t0, $t0, #15
veor $t3#lo, $t3#lo, $t3#hi @ t3 = (K) (P6 + P7) << 32
vmov.i64 $t3#hi, #0
vext.8 $t1, $t1, $t1, #14
veor $t2#lo, $t2#lo, $t2#hi
vmull.p8 $r, $a, $b @ D = A*B
vext.8 $t3, $t3, $t3, #12
vext.8 $t2, $t2, $t2, #13
veor $t0, $t0, $t1
veor $t2, $t2, $t3
veor $r, $r, $t0
veor $r, $r, $t2
vst1.32 {$r}, [r0]
ret @ bx lr
#endif
___
}
$code.=<<___;
.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
#if __ARM_MAX_ARCH__>=7
.align 5
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-(.Lpic+8)
#endif
.asciz "GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 5
#if __ARM_MAX_ARCH__>=7
.comm OPENSSL_armcap_P,4,4
#endif
___
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/geo;
s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
s/\bret\b/bx lr/go or
s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
print $_,"\n";
}
close STDOUT; # enforce flush
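The bn_GF2m_mul_2x2 code above is a Karatsuba-style split over GF(2): three calls to mul_1x1_ialu compute a1·b1, a0·b0 and (a1^a0)·(b1^b0), and the ldmia/eor block recombines them. The C sketch below shows the same structure (an illustration only: 32-bit words are assumed as on ARMv4, and mul_1x1 here is a naive shift-and-XOR carry-less multiply rather than the 8-entry stack table the assembler builds).

#include <stdint.h>

/* Naive carry-less 32x32 -> 64-bit multiply over GF(2). */
static void mul_1x1(uint32_t a, uint32_t b, uint32_t *hi, uint32_t *lo)
{
    uint64_t r = 0;
    for (int i = 0; i < 32; i++)
        if ((b >> i) & 1)
            r ^= (uint64_t)a << i;                /* XOR instead of ADD: GF(2) */
    *hi = (uint32_t)(r >> 32);
    *lo = (uint32_t)r;
}

/* r[3..0] = (a1*x^32 + a0) * (b1*x^32 + b0) over GF(2), Karatsuba style. */
void gf2m_mul_2x2_sketch(uint32_t r[4], uint32_t a1, uint32_t a0,
                         uint32_t b1, uint32_t b0)
{
    uint32_t hh_hi, hh_lo, ll_hi, ll_lo, m_hi, m_lo;

    mul_1x1(a1, b1, &hh_hi, &hh_lo);              /* high product: a1*b1     */
    mul_1x1(a0, b0, &ll_hi, &ll_lo);              /* low product:  a0*b0     */
    mul_1x1(a1 ^ a0, b1 ^ b0, &m_hi, &m_lo);      /* middle: (a1+a0)*(b1+b0) */

    m_lo ^= hh_lo ^ ll_lo;                        /* subtract (= XOR) high+low */
    m_hi ^= hh_hi ^ ll_hi;

    r[0] = ll_lo;
    r[1] = ll_hi ^ m_lo;
    r[2] = hh_lo ^ m_hi;
    r[3] = hh_hi;
}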

crypto/bn/asm/armv4-mont.pl (new file, 676 lines)

@@ -0,0 +1,676 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# January 2007.
# Montgomery multiplication for ARMv4.
#
# Performance improvement naturally varies among CPU implementations
# and compilers. The code was observed to provide +65-35% improvement
# [depending on key length, less for longer keys] on ARM920T, and
# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
# base and compiler generated code with in-lined umull and even umlal
# instructions. The latter means that this code didn't really have an
# "advantage" of utilizing some "secret" instruction.
#
# The code is interoperable with the Thumb ISA and is rather compact, less
# than 1/2KB. A Windows CE port would be trivial, as it's exclusively
# about decorations; the ABI and instruction syntax are identical.
# November 2013
#
# Add NEON code path, which handles lengths divisible by 8. RSA/DSA
# performance improvement on Cortex-A8 is ~45-100% depending on key
# length, more for longer keys. On Cortex-A15 the span is ~10-105%.
# On Snapdragon S4 improvement was measured to vary from ~70% to
# incredible ~380%, yes, 4.8x faster, for RSA4096 sign. But this is
# rather because original integer-only code seems to perform
# suboptimally on S4. Situation on Cortex-A9 is unfortunately
# different. It's being looked into, but the trouble is that
# performance for vectors longer than 256 bits is actually a couple
# of percent worse than for integer-only code. The code is chosen
# for execution on all NEON-capable processors, because gain on
# others outweighs the marginal loss on Cortex-A9.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$num="r0"; # starts as num argument, but holds &tp[num-1]
$ap="r1";
$bp="r2"; $bi="r2"; $rp="r2";
$np="r3";
$tp="r4";
$aj="r5";
$nj="r6";
$tj="r7";
$n0="r8";
########### # r9 is reserved by ELF as platform specific, e.g. TLS pointer
$alo="r10"; # sl, gcc uses it to keep @GOT
$ahi="r11"; # fp
$nlo="r12"; # ip
########### # r13 is stack pointer
$nhi="r14"; # lr
########### # r15 is program counter
#### argument block layout relative to &tp[num-1], a.k.a. $num
$_rp="$num,#12*4";
# ap permanently resides in r1
$_bp="$num,#13*4";
# np permanently resides in r3
$_n0="$num,#14*4";
$_num="$num,#15*4"; $_bpend=$_num;
$code=<<___;
#include "arm_arch.h"
.text
.code 32
#if __ARM_MAX_ARCH__>=7
.align 5
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-bn_mul_mont
#endif
.global bn_mul_mont
.type bn_mul_mont,%function
.align 5
bn_mul_mont:
ldr ip,[sp,#4] @ load num
stmdb sp!,{r0,r2} @ sp points at argument block
#if __ARM_MAX_ARCH__>=7
tst ip,#7
bne .Lialu
adr r0,bn_mul_mont
ldr r2,.LOPENSSL_armcap
ldr r0,[r0,r2]
tst r0,#1 @ NEON available?
ldmia sp, {r0,r2}
beq .Lialu
add sp,sp,#8
b bn_mul8x_mont_neon
.align 4
.Lialu:
#endif
cmp ip,#2
mov $num,ip @ load num
movlt r0,#0
addlt sp,sp,#2*4
blt .Labrt
stmdb sp!,{r4-r12,lr} @ save 10 registers
mov $num,$num,lsl#2 @ rescale $num for byte count
sub sp,sp,$num @ alloca(4*num)
sub sp,sp,#4 @ +extra dword
sub $num,$num,#4 @ "num=num-1"
add $tp,$bp,$num @ &bp[num-1]
add $num,sp,$num @ $num to point at &tp[num-1]
ldr $n0,[$_n0] @ &n0
ldr $bi,[$bp] @ bp[0]
ldr $aj,[$ap],#4 @ ap[0],ap++
ldr $nj,[$np],#4 @ np[0],np++
ldr $n0,[$n0] @ *n0
str $tp,[$_bpend] @ save &bp[num]
umull $alo,$ahi,$aj,$bi @ ap[0]*bp[0]
str $n0,[$_n0] @ save n0 value
mul $n0,$alo,$n0 @ "tp[0]"*n0
mov $nlo,#0
umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"t[0]"
mov $tp,sp
.L1st:
ldr $aj,[$ap],#4 @ ap[j],ap++
mov $alo,$ahi
ldr $nj,[$np],#4 @ np[j],np++
mov $ahi,#0
umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0]
mov $nhi,#0
umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
adds $nlo,$nlo,$alo
str $nlo,[$tp],#4 @ tp[j-1]=,tp++
adc $nlo,$nhi,#0
cmp $tp,$num
bne .L1st
adds $nlo,$nlo,$ahi
ldr $tp,[$_bp] @ restore bp
mov $nhi,#0
ldr $n0,[$_n0] @ restore n0
adc $nhi,$nhi,#0
str $nlo,[$num] @ tp[num-1]=
str $nhi,[$num,#4] @ tp[num]=
.Louter:
sub $tj,$num,sp @ "original" $num-1 value
sub $ap,$ap,$tj @ "rewind" ap to &ap[1]
ldr $bi,[$tp,#4]! @ *(++bp)
sub $np,$np,$tj @ "rewind" np to &np[1]
ldr $aj,[$ap,#-4] @ ap[0]
ldr $alo,[sp] @ tp[0]
ldr $nj,[$np,#-4] @ np[0]
ldr $tj,[sp,#4] @ tp[1]
mov $ahi,#0
umlal $alo,$ahi,$aj,$bi @ ap[0]*bp[i]+tp[0]
str $tp,[$_bp] @ save bp
mul $n0,$alo,$n0
mov $nlo,#0
umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"tp[0]"
mov $tp,sp
.Linner:
ldr $aj,[$ap],#4 @ ap[j],ap++
adds $alo,$ahi,$tj @ +=tp[j]
ldr $nj,[$np],#4 @ np[j],np++
mov $ahi,#0
umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i]
mov $nhi,#0
umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
adc $ahi,$ahi,#0
ldr $tj,[$tp,#8] @ tp[j+1]
adds $nlo,$nlo,$alo
str $nlo,[$tp],#4 @ tp[j-1]=,tp++
adc $nlo,$nhi,#0
cmp $tp,$num
bne .Linner
adds $nlo,$nlo,$ahi
mov $nhi,#0
ldr $tp,[$_bp] @ restore bp
adc $nhi,$nhi,#0
ldr $n0,[$_n0] @ restore n0
adds $nlo,$nlo,$tj
ldr $tj,[$_bpend] @ restore &bp[num]
adc $nhi,$nhi,#0
str $nlo,[$num] @ tp[num-1]=
str $nhi,[$num,#4] @ tp[num]=
cmp $tp,$tj
bne .Louter
ldr $rp,[$_rp] @ pull rp
add $num,$num,#4 @ $num to point at &tp[num]
sub $aj,$num,sp @ "original" num value
mov $tp,sp @ "rewind" $tp
mov $ap,$tp @ "borrow" $ap
sub $np,$np,$aj @ "rewind" $np to &np[0]
subs $tj,$tj,$tj @ "clear" carry flag
.Lsub: ldr $tj,[$tp],#4
ldr $nj,[$np],#4
sbcs $tj,$tj,$nj @ tp[j]-np[j]
str $tj,[$rp],#4 @ rp[j]=
teq $tp,$num @ preserve carry
bne .Lsub
sbcs $nhi,$nhi,#0 @ upmost carry
mov $tp,sp @ "rewind" $tp
sub $rp,$rp,$aj @ "rewind" $rp
and $ap,$tp,$nhi
bic $np,$rp,$nhi
orr $ap,$ap,$np @ ap=borrow?tp:rp
.Lcopy: ldr $tj,[$ap],#4 @ copy or in-place refresh
str sp,[$tp],#4 @ zap tp
str $tj,[$rp],#4
cmp $tp,$num
bne .Lcopy
add sp,$num,#4 @ skip over tp[num+1]
ldmia sp!,{r4-r12,lr} @ restore registers
add sp,sp,#2*4 @ skip over {r0,r2}
mov r0,#1
.Labrt:
#if __ARM_ARCH__>=5
ret @ bx lr
#else
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size bn_mul_mont,.-bn_mul_mont
___
{
sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
my ($A0,$A1,$A2,$A3)=map("d$_",(0..3));
my ($N0,$N1,$N2,$N3)=map("d$_",(4..7));
my ($Z,$Temp)=("q4","q5");
my ($A0xB,$A1xB,$A2xB,$A3xB,$A4xB,$A5xB,$A6xB,$A7xB)=map("q$_",(6..13));
my ($Bi,$Ni,$M0)=map("d$_",(28..31));
my $zero=&Dlo($Z);
my $temp=&Dlo($Temp);
my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5));
my ($tinptr,$toutptr,$inner,$outer)=map("r$_",(6..9));
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch armv7-a
.fpu neon
.type bn_mul8x_mont_neon,%function
.align 5
bn_mul8x_mont_neon:
mov ip,sp
stmdb sp!,{r4-r11}
vstmdb sp!,{d8-d15} @ ABI specification says so
ldmia ip,{r4-r5} @ load rest of parameter block
sub $toutptr,sp,#16
vld1.32 {${Bi}[0]}, [$bptr,:32]!
sub $toutptr,$toutptr,$num,lsl#4
vld1.32 {$A0-$A3}, [$aptr]! @ can't specify :32 :-(
and $toutptr,$toutptr,#-64
vld1.32 {${M0}[0]}, [$n0,:32]
mov sp,$toutptr @ alloca
veor $zero,$zero,$zero
subs $inner,$num,#8
vzip.16 $Bi,$zero
vmull.u32 $A0xB,$Bi,${A0}[0]
vmull.u32 $A1xB,$Bi,${A0}[1]
vmull.u32 $A2xB,$Bi,${A1}[0]
vshl.i64 $temp,`&Dhi("$A0xB")`,#16
vmull.u32 $A3xB,$Bi,${A1}[1]
vadd.u64 $temp,$temp,`&Dlo("$A0xB")`
veor $zero,$zero,$zero
vmul.u32 $Ni,$temp,$M0
vmull.u32 $A4xB,$Bi,${A2}[0]
vld1.32 {$N0-$N3}, [$nptr]!
vmull.u32 $A5xB,$Bi,${A2}[1]
vmull.u32 $A6xB,$Bi,${A3}[0]
vzip.16 $Ni,$zero
vmull.u32 $A7xB,$Bi,${A3}[1]
bne .LNEON_1st
@ special case for num=8, everything is in register bank...
vmlal.u32 $A0xB,$Ni,${N0}[0]
sub $outer,$num,#1
vmlal.u32 $A1xB,$Ni,${N0}[1]
vmlal.u32 $A2xB,$Ni,${N1}[0]
vmlal.u32 $A3xB,$Ni,${N1}[1]
vmlal.u32 $A4xB,$Ni,${N2}[0]
vmov $Temp,$A0xB
vmlal.u32 $A5xB,$Ni,${N2}[1]
vmov $A0xB,$A1xB
vmlal.u32 $A6xB,$Ni,${N3}[0]
vmov $A1xB,$A2xB
vmlal.u32 $A7xB,$Ni,${N3}[1]
vmov $A2xB,$A3xB
vmov $A3xB,$A4xB
vshr.u64 $temp,$temp,#16
vmov $A4xB,$A5xB
vmov $A5xB,$A6xB
vadd.u64 $temp,$temp,`&Dhi("$Temp")`
vmov $A6xB,$A7xB
veor $A7xB,$A7xB
vshr.u64 $temp,$temp,#16
b .LNEON_outer8
.align 4
.LNEON_outer8:
vld1.32 {${Bi}[0]}, [$bptr,:32]!
veor $zero,$zero,$zero
vzip.16 $Bi,$zero
vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
vmlal.u32 $A0xB,$Bi,${A0}[0]
vmlal.u32 $A1xB,$Bi,${A0}[1]
vmlal.u32 $A2xB,$Bi,${A1}[0]
vshl.i64 $temp,`&Dhi("$A0xB")`,#16
vmlal.u32 $A3xB,$Bi,${A1}[1]
vadd.u64 $temp,$temp,`&Dlo("$A0xB")`
veor $zero,$zero,$zero
subs $outer,$outer,#1
vmul.u32 $Ni,$temp,$M0
vmlal.u32 $A4xB,$Bi,${A2}[0]
vmlal.u32 $A5xB,$Bi,${A2}[1]
vmlal.u32 $A6xB,$Bi,${A3}[0]
vzip.16 $Ni,$zero
vmlal.u32 $A7xB,$Bi,${A3}[1]
vmlal.u32 $A0xB,$Ni,${N0}[0]
vmlal.u32 $A1xB,$Ni,${N0}[1]
vmlal.u32 $A2xB,$Ni,${N1}[0]
vmlal.u32 $A3xB,$Ni,${N1}[1]
vmlal.u32 $A4xB,$Ni,${N2}[0]
vmov $Temp,$A0xB
vmlal.u32 $A5xB,$Ni,${N2}[1]
vmov $A0xB,$A1xB
vmlal.u32 $A6xB,$Ni,${N3}[0]
vmov $A1xB,$A2xB
vmlal.u32 $A7xB,$Ni,${N3}[1]
vmov $A2xB,$A3xB
vmov $A3xB,$A4xB
vshr.u64 $temp,$temp,#16
vmov $A4xB,$A5xB
vmov $A5xB,$A6xB
vadd.u64 $temp,$temp,`&Dhi("$Temp")`
vmov $A6xB,$A7xB
veor $A7xB,$A7xB
vshr.u64 $temp,$temp,#16
bne .LNEON_outer8
vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
mov $toutptr,sp
vshr.u64 $temp,`&Dlo("$A0xB")`,#16
mov $inner,$num
vadd.u64 `&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
add $tinptr,sp,#16
vshr.u64 $temp,`&Dhi("$A0xB")`,#16
vzip.16 `&Dlo("$A0xB")`,`&Dhi("$A0xB")`
b .LNEON_tail2
.align 4
.LNEON_1st:
vmlal.u32 $A0xB,$Ni,${N0}[0]
vld1.32 {$A0-$A3}, [$aptr]!
vmlal.u32 $A1xB,$Ni,${N0}[1]
subs $inner,$inner,#8
vmlal.u32 $A2xB,$Ni,${N1}[0]
vmlal.u32 $A3xB,$Ni,${N1}[1]
vmlal.u32 $A4xB,$Ni,${N2}[0]
vld1.32 {$N0-$N1}, [$nptr]!
vmlal.u32 $A5xB,$Ni,${N2}[1]
vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]!
vmlal.u32 $A6xB,$Ni,${N3}[0]
vmlal.u32 $A7xB,$Ni,${N3}[1]
vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]!
vmull.u32 $A0xB,$Bi,${A0}[0]
vld1.32 {$N2-$N3}, [$nptr]!
vmull.u32 $A1xB,$Bi,${A0}[1]
vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]!
vmull.u32 $A2xB,$Bi,${A1}[0]
vmull.u32 $A3xB,$Bi,${A1}[1]
vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]!
vmull.u32 $A4xB,$Bi,${A2}[0]
vmull.u32 $A5xB,$Bi,${A2}[1]
vmull.u32 $A6xB,$Bi,${A3}[0]
vmull.u32 $A7xB,$Bi,${A3}[1]
bne .LNEON_1st
vmlal.u32 $A0xB,$Ni,${N0}[0]
add $tinptr,sp,#16
vmlal.u32 $A1xB,$Ni,${N0}[1]
sub $aptr,$aptr,$num,lsl#2 @ rewind $aptr
vmlal.u32 $A2xB,$Ni,${N1}[0]
vld1.64 {$Temp}, [sp,:128]
vmlal.u32 $A3xB,$Ni,${N1}[1]
sub $outer,$num,#1
vmlal.u32 $A4xB,$Ni,${N2}[0]
vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]!
vmlal.u32 $A5xB,$Ni,${N2}[1]
vshr.u64 $temp,$temp,#16
vld1.64 {$A0xB}, [$tinptr, :128]!
vmlal.u32 $A6xB,$Ni,${N3}[0]
vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]!
vmlal.u32 $A7xB,$Ni,${N3}[1]
vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]!
vadd.u64 $temp,$temp,`&Dhi("$Temp")`
veor $Z,$Z,$Z
vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]!
vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]!
vst1.64 {$Z}, [$toutptr,:128]
vshr.u64 $temp,$temp,#16
b .LNEON_outer
.align 4
.LNEON_outer:
vld1.32 {${Bi}[0]}, [$bptr,:32]!
sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr
vld1.32 {$A0-$A3}, [$aptr]!
veor $zero,$zero,$zero
mov $toutptr,sp
vzip.16 $Bi,$zero
sub $inner,$num,#8
vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
vmlal.u32 $A0xB,$Bi,${A0}[0]
vld1.64 {$A3xB-$A4xB},[$tinptr,:256]!
vmlal.u32 $A1xB,$Bi,${A0}[1]
vmlal.u32 $A2xB,$Bi,${A1}[0]
vld1.64 {$A5xB-$A6xB},[$tinptr,:256]!
vmlal.u32 $A3xB,$Bi,${A1}[1]
vshl.i64 $temp,`&Dhi("$A0xB")`,#16
veor $zero,$zero,$zero
vadd.u64 $temp,$temp,`&Dlo("$A0xB")`
vld1.64 {$A7xB},[$tinptr,:128]!
vmul.u32 $Ni,$temp,$M0
vmlal.u32 $A4xB,$Bi,${A2}[0]
vld1.32 {$N0-$N3}, [$nptr]!
vmlal.u32 $A5xB,$Bi,${A2}[1]
vmlal.u32 $A6xB,$Bi,${A3}[0]
vzip.16 $Ni,$zero
vmlal.u32 $A7xB,$Bi,${A3}[1]
.LNEON_inner:
vmlal.u32 $A0xB,$Ni,${N0}[0]
vld1.32 {$A0-$A3}, [$aptr]!
vmlal.u32 $A1xB,$Ni,${N0}[1]
subs $inner,$inner,#8
vmlal.u32 $A2xB,$Ni,${N1}[0]
vmlal.u32 $A3xB,$Ni,${N1}[1]
vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]!
vmlal.u32 $A4xB,$Ni,${N2}[0]
vld1.64 {$A0xB}, [$tinptr, :128]!
vmlal.u32 $A5xB,$Ni,${N2}[1]
vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]!
vmlal.u32 $A6xB,$Ni,${N3}[0]
vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]!
vmlal.u32 $A7xB,$Ni,${N3}[1]
vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]!
vmlal.u32 $A0xB,$Bi,${A0}[0]
vld1.64 {$A3xB-$A4xB}, [$tinptr, :256]!
vmlal.u32 $A1xB,$Bi,${A0}[1]
vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]!
vmlal.u32 $A2xB,$Bi,${A1}[0]
vld1.64 {$A5xB-$A6xB}, [$tinptr, :256]!
vmlal.u32 $A3xB,$Bi,${A1}[1]
vld1.32 {$N0-$N3}, [$nptr]!
vmlal.u32 $A4xB,$Bi,${A2}[0]
vld1.64 {$A7xB}, [$tinptr, :128]!
vmlal.u32 $A5xB,$Bi,${A2}[1]
vmlal.u32 $A6xB,$Bi,${A3}[0]
vmlal.u32 $A7xB,$Bi,${A3}[1]
bne .LNEON_inner
vmlal.u32 $A0xB,$Ni,${N0}[0]
add $tinptr,sp,#16
vmlal.u32 $A1xB,$Ni,${N0}[1]
sub $aptr,$aptr,$num,lsl#2 @ rewind $aptr
vmlal.u32 $A2xB,$Ni,${N1}[0]
vld1.64 {$Temp}, [sp,:128]
vmlal.u32 $A3xB,$Ni,${N1}[1]
subs $outer,$outer,#1
vmlal.u32 $A4xB,$Ni,${N2}[0]
vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]!
vmlal.u32 $A5xB,$Ni,${N2}[1]
vld1.64 {$A0xB}, [$tinptr, :128]!
vshr.u64 $temp,$temp,#16
vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]!
vmlal.u32 $A6xB,$Ni,${N3}[0]
vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]!
vmlal.u32 $A7xB,$Ni,${N3}[1]
vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]!
vadd.u64 $temp,$temp,`&Dhi("$Temp")`
vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]!
vshr.u64 $temp,$temp,#16
bne .LNEON_outer
mov $toutptr,sp
mov $inner,$num
.LNEON_tail:
vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
vld1.64 {$A3xB-$A4xB}, [$tinptr, :256]!
vshr.u64 $temp,`&Dlo("$A0xB")`,#16
vadd.u64 `&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
vld1.64 {$A5xB-$A6xB}, [$tinptr, :256]!
vshr.u64 $temp,`&Dhi("$A0xB")`,#16
vld1.64 {$A7xB}, [$tinptr, :128]!
vzip.16 `&Dlo("$A0xB")`,`&Dhi("$A0xB")`
.LNEON_tail2:
vadd.u64 `&Dlo("$A1xB")`,`&Dlo("$A1xB")`,$temp
vst1.32 {`&Dlo("$A0xB")`[0]}, [$toutptr, :32]!
vshr.u64 $temp,`&Dlo("$A1xB")`,#16
vadd.u64 `&Dhi("$A1xB")`,`&Dhi("$A1xB")`,$temp
vshr.u64 $temp,`&Dhi("$A1xB")`,#16
vzip.16 `&Dlo("$A1xB")`,`&Dhi("$A1xB")`
vadd.u64 `&Dlo("$A2xB")`,`&Dlo("$A2xB")`,$temp
vst1.32 {`&Dlo("$A1xB")`[0]}, [$toutptr, :32]!
vshr.u64 $temp,`&Dlo("$A2xB")`,#16
vadd.u64 `&Dhi("$A2xB")`,`&Dhi("$A2xB")`,$temp
vshr.u64 $temp,`&Dhi("$A2xB")`,#16
vzip.16 `&Dlo("$A2xB")`,`&Dhi("$A2xB")`
vadd.u64 `&Dlo("$A3xB")`,`&Dlo("$A3xB")`,$temp
vst1.32 {`&Dlo("$A2xB")`[0]}, [$toutptr, :32]!
vshr.u64 $temp,`&Dlo("$A3xB")`,#16
vadd.u64 `&Dhi("$A3xB")`,`&Dhi("$A3xB")`,$temp
vshr.u64 $temp,`&Dhi("$A3xB")`,#16
vzip.16 `&Dlo("$A3xB")`,`&Dhi("$A3xB")`
vadd.u64 `&Dlo("$A4xB")`,`&Dlo("$A4xB")`,$temp
vst1.32 {`&Dlo("$A3xB")`[0]}, [$toutptr, :32]!
vshr.u64 $temp,`&Dlo("$A4xB")`,#16
vadd.u64 `&Dhi("$A4xB")`,`&Dhi("$A4xB")`,$temp
vshr.u64 $temp,`&Dhi("$A4xB")`,#16
vzip.16 `&Dlo("$A4xB")`,`&Dhi("$A4xB")`
vadd.u64 `&Dlo("$A5xB")`,`&Dlo("$A5xB")`,$temp
vst1.32 {`&Dlo("$A4xB")`[0]}, [$toutptr, :32]!
vshr.u64 $temp,`&Dlo("$A5xB")`,#16
vadd.u64 `&Dhi("$A5xB")`,`&Dhi("$A5xB")`,$temp
vshr.u64 $temp,`&Dhi("$A5xB")`,#16
vzip.16 `&Dlo("$A5xB")`,`&Dhi("$A5xB")`
vadd.u64 `&Dlo("$A6xB")`,`&Dlo("$A6xB")`,$temp
vst1.32 {`&Dlo("$A5xB")`[0]}, [$toutptr, :32]!
vshr.u64 $temp,`&Dlo("$A6xB")`,#16
vadd.u64 `&Dhi("$A6xB")`,`&Dhi("$A6xB")`,$temp
vld1.64 {$A0xB}, [$tinptr, :128]!
vshr.u64 $temp,`&Dhi("$A6xB")`,#16
vzip.16 `&Dlo("$A6xB")`,`&Dhi("$A6xB")`
vadd.u64 `&Dlo("$A7xB")`,`&Dlo("$A7xB")`,$temp
vst1.32 {`&Dlo("$A6xB")`[0]}, [$toutptr, :32]!
vshr.u64 $temp,`&Dlo("$A7xB")`,#16
vadd.u64 `&Dhi("$A7xB")`,`&Dhi("$A7xB")`,$temp
vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]!
vshr.u64 $temp,`&Dhi("$A7xB")`,#16
vzip.16 `&Dlo("$A7xB")`,`&Dhi("$A7xB")`
subs $inner,$inner,#8
vst1.32 {`&Dlo("$A7xB")`[0]}, [$toutptr, :32]!
bne .LNEON_tail
vst1.32 {${temp}[0]}, [$toutptr, :32] @ top-most bit
sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr
subs $aptr,sp,#0 @ clear carry flag
add $bptr,sp,$num,lsl#2
.LNEON_sub:
ldmia $aptr!, {r4-r7}
ldmia $nptr!, {r8-r11}
sbcs r8, r4,r8
sbcs r9, r5,r9
sbcs r10,r6,r10
sbcs r11,r7,r11
teq $aptr,$bptr @ preserves carry
stmia $rptr!, {r8-r11}
bne .LNEON_sub
ldr r10, [$aptr] @ load top-most bit
veor q0,q0,q0
sub r11,$bptr,sp @ this is num*4
veor q1,q1,q1
mov $aptr,sp
sub $rptr,$rptr,r11 @ rewind $rptr
mov $nptr,$bptr @ second 3/4th of frame
sbcs r10,r10,#0 @ result is carry flag
.LNEON_copy_n_zap:
ldmia $aptr!, {r4-r7}
ldmia $rptr, {r8-r11}
movcc r8, r4
vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
movcc r9, r5
movcc r10,r6
vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
movcc r11,r7
ldmia $aptr, {r4-r7}
stmia $rptr!, {r8-r11}
sub $aptr,$aptr,#16
ldmia $rptr, {r8-r11}
movcc r8, r4
vst1.64 {q0-q1}, [$aptr,:256]! @ wipe
movcc r9, r5
movcc r10,r6
vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
movcc r11,r7
teq $aptr,$bptr @ preserves carry
stmia $rptr!, {r8-r11}
bne .LNEON_copy_n_zap
sub sp,ip,#96
vldmia sp!,{d8-d15}
ldmia sp!,{r4-r11}
ret @ bx lr
.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
#endif
___
}
$code.=<<___;
.asciz "Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
#if __ARM_MAX_ARCH__>=7
.comm OPENSSL_armcap_P,4,4
#endif
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx lr/gm;
print $code;
close STDOUT;
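Both the integer and NEON paths above end the same way: np is subtracted from the scratch vector with a running borrow, the final borrow is turned into an all-ones or all-zero mask, and the result is taken from either the original or the subtracted vector without a data-dependent branch (the and/bic/orr and movcc sequences). A C sketch of that selection step follows; it is an illustration only, with 32-bit words, and it leaves out the top carry word tp[num] that the real code folds into the final borrow.

#include <stdint.h>

/* rp = (tp < np) ? tp : tp - np, selected with a mask instead of a branch. */
static void mont_final_copy(uint32_t *rp, const uint32_t *tp,
                            const uint32_t *np, int num)
{
    uint32_t sub[num];                            /* candidate result: tp - np */
    uint32_t borrow = 0;

    for (int i = 0; i < num; i++) {
        uint64_t d = (uint64_t)tp[i] - np[i] - borrow;
        sub[i] = (uint32_t)d;
        borrow = (uint32_t)(d >> 63);             /* 1 if the subtraction wrapped */
    }

    uint32_t mask = 0u - borrow;                  /* all ones when tp < np */
    for (int i = 0; i < num; i++)
        rp[i] = (tp[i] & mask) | (sub[i] & ~mask);
}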

crypto/bn/asm/bn-586.pl (new file, 774 lines)

@@ -0,0 +1,774 @@
#!/usr/local/bin/perl
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
&asm_init($ARGV[0],$0);
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
&external_label("OPENSSL_ia32cap_P") if ($sse2);
&bn_mul_add_words("bn_mul_add_words");
&bn_mul_words("bn_mul_words");
&bn_sqr_words("bn_sqr_words");
&bn_div_words("bn_div_words");
&bn_add_words("bn_add_words");
&bn_sub_words("bn_sub_words");
&bn_sub_part_words("bn_sub_part_words");
&asm_finish();
sub bn_mul_add_words
{
local($name)=@_;
&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
$r="eax";
$a="edx";
$c="ecx";
if ($sse2) {
&picmeup("eax","OPENSSL_ia32cap_P");
&bt(&DWP(0,"eax"),26);
&jnc(&label("maw_non_sse2"));
&mov($r,&wparam(0));
&mov($a,&wparam(1));
&mov($c,&wparam(2));
&movd("mm0",&wparam(3)); # mm0 = w
&pxor("mm1","mm1"); # mm1 = carry_in
&jmp(&label("maw_sse2_entry"));
&set_label("maw_sse2_unrolled",16);
&movd("mm3",&DWP(0,$r,"",0)); # mm3 = r[0]
&paddq("mm1","mm3"); # mm1 = carry_in + r[0]
&movd("mm2",&DWP(0,$a,"",0)); # mm2 = a[0]
&pmuludq("mm2","mm0"); # mm2 = w*a[0]
&movd("mm4",&DWP(4,$a,"",0)); # mm4 = a[1]
&pmuludq("mm4","mm0"); # mm4 = w*a[1]
&movd("mm6",&DWP(8,$a,"",0)); # mm6 = a[2]
&pmuludq("mm6","mm0"); # mm6 = w*a[2]
&movd("mm7",&DWP(12,$a,"",0)); # mm7 = a[3]
&pmuludq("mm7","mm0"); # mm7 = w*a[3]
&paddq("mm1","mm2"); # mm1 = carry_in + r[0] + w*a[0]
&movd("mm3",&DWP(4,$r,"",0)); # mm3 = r[1]
&paddq("mm3","mm4"); # mm3 = r[1] + w*a[1]
&movd("mm5",&DWP(8,$r,"",0)); # mm5 = r[2]
&paddq("mm5","mm6"); # mm5 = r[2] + w*a[2]
&movd("mm4",&DWP(12,$r,"",0)); # mm4 = r[3]
&paddq("mm7","mm4"); # mm7 = r[3] + w*a[3]
&movd(&DWP(0,$r,"",0),"mm1");
&movd("mm2",&DWP(16,$a,"",0)); # mm2 = a[4]
&pmuludq("mm2","mm0"); # mm2 = w*a[4]
&psrlq("mm1",32); # mm1 = carry0
&movd("mm4",&DWP(20,$a,"",0)); # mm4 = a[5]
&pmuludq("mm4","mm0"); # mm4 = w*a[5]
&paddq("mm1","mm3"); # mm1 = carry0 + r[1] + w*a[1]
&movd("mm6",&DWP(24,$a,"",0)); # mm6 = a[6]
&pmuludq("mm6","mm0"); # mm6 = w*a[6]
&movd(&DWP(4,$r,"",0),"mm1");
&psrlq("mm1",32); # mm1 = carry1
&movd("mm3",&DWP(28,$a,"",0)); # mm3 = a[7]
&add($a,32);
&pmuludq("mm3","mm0"); # mm3 = w*a[7]
&paddq("mm1","mm5"); # mm1 = carry1 + r[2] + w*a[2]
&movd("mm5",&DWP(16,$r,"",0)); # mm5 = r[4]
&paddq("mm2","mm5"); # mm2 = r[4] + w*a[4]
&movd(&DWP(8,$r,"",0),"mm1");
&psrlq("mm1",32); # mm1 = carry2
&paddq("mm1","mm7"); # mm1 = carry2 + r[3] + w*a[3]
&movd("mm5",&DWP(20,$r,"",0)); # mm5 = r[5]
&paddq("mm4","mm5"); # mm4 = r[5] + w*a[5]
&movd(&DWP(12,$r,"",0),"mm1");
&psrlq("mm1",32); # mm1 = carry3
&paddq("mm1","mm2"); # mm1 = carry3 + r[4] + w*a[4]
&movd("mm5",&DWP(24,$r,"",0)); # mm5 = r[6]
&paddq("mm6","mm5"); # mm6 = r[6] + w*a[6]
&movd(&DWP(16,$r,"",0),"mm1");
&psrlq("mm1",32); # mm1 = carry4
&paddq("mm1","mm4"); # mm1 = carry4 + r[5] + w*a[5]
&movd("mm5",&DWP(28,$r,"",0)); # mm5 = r[7]
&paddq("mm3","mm5"); # mm3 = r[7] + w*a[7]
&movd(&DWP(20,$r,"",0),"mm1");
&psrlq("mm1",32); # mm1 = carry5
&paddq("mm1","mm6"); # mm1 = carry5 + r[6] + w*a[6]
&movd(&DWP(24,$r,"",0),"mm1");
&psrlq("mm1",32); # mm1 = carry6
&paddq("mm1","mm3"); # mm1 = carry6 + r[7] + w*a[7]
&movd(&DWP(28,$r,"",0),"mm1");
&lea($r,&DWP(32,$r));
&psrlq("mm1",32); # mm1 = carry_out
&sub($c,8);
&jz(&label("maw_sse2_exit"));
&set_label("maw_sse2_entry");
&test($c,0xfffffff8);
&jnz(&label("maw_sse2_unrolled"));
&set_label("maw_sse2_loop",4);
&movd("mm2",&DWP(0,$a)); # mm2 = a[i]
&movd("mm3",&DWP(0,$r)); # mm3 = r[i]
&pmuludq("mm2","mm0"); # a[i] *= w
&lea($a,&DWP(4,$a));
&paddq("mm1","mm3"); # carry += r[i]
&paddq("mm1","mm2"); # carry += a[i]*w
&movd(&DWP(0,$r),"mm1"); # r[i] = carry_low
&sub($c,1);
&psrlq("mm1",32); # carry = carry_high
&lea($r,&DWP(4,$r));
&jnz(&label("maw_sse2_loop"));
&set_label("maw_sse2_exit");
&movd("eax","mm1"); # c = carry_out
&emms();
&ret();
&set_label("maw_non_sse2",16);
}
# function_begin prologue
&push("ebp");
&push("ebx");
&push("esi");
&push("edi");
&comment("");
$Low="eax";
$High="edx";
$a="ebx";
$w="ebp";
$r="edi";
$c="esi";
&xor($c,$c); # clear carry
&mov($r,&wparam(0)); #
&mov("ecx",&wparam(2)); #
&mov($a,&wparam(1)); #
&and("ecx",0xfffffff8); # num / 8
&mov($w,&wparam(3)); #
&push("ecx"); # Up the stack for a tmp variable
&jz(&label("maw_finish"));
&set_label("maw_loop",16);
for ($i=0; $i<32; $i+=4)
{
&comment("Round $i");
&mov("eax",&DWP($i,$a)); # *a
&mul($w); # *a * w
&add("eax",$c); # L(t)+= c
&adc("edx",0); # H(t)+=carry
&add("eax",&DWP($i,$r)); # L(t)+= *r
&adc("edx",0); # H(t)+=carry
&mov(&DWP($i,$r),"eax"); # *r= L(t);
&mov($c,"edx"); # c= H(t);
}
&comment("");
&sub("ecx",8);
&lea($a,&DWP(32,$a));
&lea($r,&DWP(32,$r));
&jnz(&label("maw_loop"));
&set_label("maw_finish",0);
&mov("ecx",&wparam(2)); # get num
&and("ecx",7);
&jnz(&label("maw_finish2")); # helps branch prediction
&jmp(&label("maw_end"));
&set_label("maw_finish2",1);
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov("eax",&DWP($i*4,$a)); # *a
&mul($w); # *a * w
&add("eax",$c); # L(t)+=c
&adc("edx",0); # H(t)+=carry
&add("eax",&DWP($i*4,$r)); # L(t)+= *r
&adc("edx",0); # H(t)+=carry
&dec("ecx") if ($i != 7-1);
&mov(&DWP($i*4,$r),"eax"); # *r= L(t);
&mov($c,"edx"); # c= H(t);
&jz(&label("maw_end")) if ($i != 7-1);
}
&set_label("maw_end",0);
&mov("eax",$c);
&pop("ecx"); # clear variable from
&function_end($name);
}
sub bn_mul_words
{
local($name)=@_;
&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
$r="eax";
$a="edx";
$c="ecx";
if ($sse2) {
&picmeup("eax","OPENSSL_ia32cap_P");
&bt(&DWP(0,"eax"),26);
&jnc(&label("mw_non_sse2"));
&mov($r,&wparam(0));
&mov($a,&wparam(1));
&mov($c,&wparam(2));
&movd("mm0",&wparam(3)); # mm0 = w
&pxor("mm1","mm1"); # mm1 = carry = 0
&set_label("mw_sse2_loop",16);
&movd("mm2",&DWP(0,$a)); # mm2 = a[i]
&pmuludq("mm2","mm0"); # a[i] *= w
&lea($a,&DWP(4,$a));
&paddq("mm1","mm2"); # carry += a[i]*w
&movd(&DWP(0,$r),"mm1"); # r[i] = carry_low
&sub($c,1);
&psrlq("mm1",32); # carry = carry_high
&lea($r,&DWP(4,$r));
&jnz(&label("mw_sse2_loop"));
&movd("eax","mm1"); # return carry
&emms();
&ret();
&set_label("mw_non_sse2",16);
}
# function_begin prologue
&push("ebp");
&push("ebx");
&push("esi");
&push("edi");
&comment("");
$Low="eax";
$High="edx";
$a="ebx";
$w="ecx";
$r="edi";
$c="esi";
$num="ebp";
&xor($c,$c); # clear carry
&mov($r,&wparam(0)); #
&mov($a,&wparam(1)); #
&mov($num,&wparam(2)); #
&mov($w,&wparam(3)); #
&and($num,0xfffffff8); # num / 8
&jz(&label("mw_finish"));
&set_label("mw_loop",0);
for ($i=0; $i<32; $i+=4)
{
&comment("Round $i");
&mov("eax",&DWP($i,$a,"",0)); # *a
&mul($w); # *a * w
&add("eax",$c); # L(t)+=c
# XXX
&adc("edx",0); # H(t)+=carry
&mov(&DWP($i,$r,"",0),"eax"); # *r= L(t);
&mov($c,"edx"); # c= H(t);
}
&comment("");
&add($a,32);
&add($r,32);
&sub($num,8);
&jz(&label("mw_finish"));
&jmp(&label("mw_loop"));
&set_label("mw_finish",0);
&mov($num,&wparam(2)); # get num
&and($num,7);
&jnz(&label("mw_finish2"));
&jmp(&label("mw_end"));
&set_label("mw_finish2",1);
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov("eax",&DWP($i*4,$a,"",0));# *a
&mul($w); # *a * w
&add("eax",$c); # L(t)+=c
# XXX
&adc("edx",0); # H(t)+=carry
&mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t);
&mov($c,"edx"); # c= H(t);
&dec($num) if ($i != 7-1);
&jz(&label("mw_end")) if ($i != 7-1);
}
&set_label("mw_end",0);
&mov("eax",$c);
&function_end($name);
}
sub bn_sqr_words
{
local($name)=@_;
&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
$r="eax";
$a="edx";
$c="ecx";
if ($sse2) {
&picmeup("eax","OPENSSL_ia32cap_P");
&bt(&DWP(0,"eax"),26);
&jnc(&label("sqr_non_sse2"));
&mov($r,&wparam(0));
&mov($a,&wparam(1));
&mov($c,&wparam(2));
&set_label("sqr_sse2_loop",16);
&movd("mm0",&DWP(0,$a)); # mm0 = a[i]
&pmuludq("mm0","mm0"); # a[i] *= a[i]
&lea($a,&DWP(4,$a)); # a++
&movq(&QWP(0,$r),"mm0"); # r[i] = a[i]*a[i]
&sub($c,1);
&lea($r,&DWP(8,$r)); # r += 2
&jnz(&label("sqr_sse2_loop"));
&emms();
&ret();
&set_label("sqr_non_sse2",16);
}
# function_begin prologue
&push("ebp");
&push("ebx");
&push("esi");
&push("edi");
&comment("");
$r="esi";
$a="edi";
$num="ebx";
&mov($r,&wparam(0)); #
&mov($a,&wparam(1)); #
&mov($num,&wparam(2)); #
&and($num,0xfffffff8); # num / 8
&jz(&label("sw_finish"));
&set_label("sw_loop",0);
for ($i=0; $i<32; $i+=4)
{
&comment("Round $i");
&mov("eax",&DWP($i,$a,"",0)); # *a
# XXX
&mul("eax"); # *a * *a
&mov(&DWP($i*2,$r,"",0),"eax"); #
&mov(&DWP($i*2+4,$r,"",0),"edx");#
}
&comment("");
&add($a,32);
&add($r,64);
&sub($num,8);
&jnz(&label("sw_loop"));
&set_label("sw_finish",0);
&mov($num,&wparam(2)); # get num
&and($num,7);
&jz(&label("sw_end"));
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov("eax",&DWP($i*4,$a,"",0)); # *a
# XXX
&mul("eax"); # *a * *a
&mov(&DWP($i*8,$r,"",0),"eax"); #
&dec($num) if ($i != 7-1);
&mov(&DWP($i*8+4,$r,"",0),"edx");
&jz(&label("sw_end")) if ($i != 7-1);
}
&set_label("sw_end",0);
&function_end($name);
}
sub bn_div_words
{
local($name)=@_;
&function_begin_B($name,"");
&mov("edx",&wparam(0)); #
&mov("eax",&wparam(1)); #
&mov("ecx",&wparam(2)); #
&div("ecx");
&ret();
&function_end_B($name);
}
sub bn_add_words
{
local($name)=@_;
&function_begin($name,"");
&comment("");
$a="esi";
$b="edi";
$c="eax";
$r="ebx";
$tmp1="ecx";
$tmp2="edx";
$num="ebp";
&mov($r,&wparam(0)); # get r
&mov($a,&wparam(1)); # get a
&mov($b,&wparam(2)); # get b
&mov($num,&wparam(3)); # get num
&xor($c,$c); # clear carry
&and($num,0xfffffff8); # num / 8
&jz(&label("aw_finish"));
&set_label("aw_loop",0);
for ($i=0; $i<8; $i++)
{
&comment("Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0)); # *b
&add($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&add($tmp1,$tmp2);
&adc($c,0);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
}
&comment("");
&add($a,32);
&add($b,32);
&add($r,32);
&sub($num,8);
&jnz(&label("aw_loop"));
&set_label("aw_finish",0);
&mov($num,&wparam(3)); # get num
&and($num,7);
&jz(&label("aw_end"));
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0));# *b
&add($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&add($tmp1,$tmp2);
&adc($c,0);
&dec($num) if ($i != 6);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&jz(&label("aw_end")) if ($i != 6);
}
&set_label("aw_end",0);
# &mov("eax",$c); # $c is "eax"
&function_end($name);
}
sub bn_sub_words
{
local($name)=@_;
&function_begin($name,"");
&comment("");
$a="esi";
$b="edi";
$c="eax";
$r="ebx";
$tmp1="ecx";
$tmp2="edx";
$num="ebp";
&mov($r,&wparam(0)); # get r
&mov($a,&wparam(1)); # get a
&mov($b,&wparam(2)); # get b
&mov($num,&wparam(3)); # get num
&xor($c,$c); # clear carry
&and($num,0xfffffff8); # num / 8
&jz(&label("aw_finish"));
&set_label("aw_loop",0);
for ($i=0; $i<8; $i++)
{
&comment("Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0)); # *b
&sub($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&sub($tmp1,$tmp2);
&adc($c,0);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
}
&comment("");
&add($a,32);
&add($b,32);
&add($r,32);
&sub($num,8);
&jnz(&label("aw_loop"));
&set_label("aw_finish",0);
&mov($num,&wparam(3)); # get num
&and($num,7);
&jz(&label("aw_end"));
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0));# *b
&sub($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&sub($tmp1,$tmp2);
&adc($c,0);
&dec($num) if ($i != 6);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&jz(&label("aw_end")) if ($i != 6);
}
&set_label("aw_end",0);
# &mov("eax",$c); # $c is "eax"
&function_end($name);
}
sub bn_sub_part_words
{
local($name)=@_;
&function_begin($name,"");
&comment("");
$a="esi";
$b="edi";
$c="eax";
$r="ebx";
$tmp1="ecx";
$tmp2="edx";
$num="ebp";
&mov($r,&wparam(0)); # get r
&mov($a,&wparam(1)); # get a
&mov($b,&wparam(2)); # get b
&mov($num,&wparam(3)); # get num
&xor($c,$c); # clear carry
&and($num,0xfffffff8); # num / 8
&jz(&label("aw_finish"));
&set_label("aw_loop",0);
for ($i=0; $i<8; $i++)
{
&comment("Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0)); # *b
&sub($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&sub($tmp1,$tmp2);
&adc($c,0);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
}
&comment("");
&add($a,32);
&add($b,32);
&add($r,32);
&sub($num,8);
&jnz(&label("aw_loop"));
&set_label("aw_finish",0);
&mov($num,&wparam(3)); # get num
&and($num,7);
&jz(&label("aw_end"));
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov($tmp1,&DWP(0,$a,"",0)); # *a
&mov($tmp2,&DWP(0,$b,"",0));# *b
&sub($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&sub($tmp1,$tmp2);
&adc($c,0);
&mov(&DWP(0,$r,"",0),$tmp1); # *r
&add($a, 4);
&add($b, 4);
&add($r, 4);
&dec($num) if ($i != 6);
&jz(&label("aw_end")) if ($i != 6);
}
&set_label("aw_end",0);
&cmp(&wparam(4),0);
&je(&label("pw_end"));
&mov($num,&wparam(4)); # get dl
&cmp($num,0);
&je(&label("pw_end"));
&jge(&label("pw_pos"));
&comment("pw_neg");
&mov($tmp2,0);
&sub($tmp2,$num);
&mov($num,$tmp2);
&and($num,0xfffffff8); # num / 8
&jz(&label("pw_neg_finish"));
&set_label("pw_neg_loop",0);
for ($i=0; $i<8; $i++)
{
&comment("dl<0 Round $i");
&mov($tmp1,0);
&mov($tmp2,&DWP($i*4,$b,"",0)); # *b
&sub($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&sub($tmp1,$tmp2);
&adc($c,0);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
}
&comment("");
&add($b,32);
&add($r,32);
&sub($num,8);
&jnz(&label("pw_neg_loop"));
&set_label("pw_neg_finish",0);
&mov($tmp2,&wparam(4)); # get dl
&mov($num,0);
&sub($num,$tmp2);
&and($num,7);
&jz(&label("pw_end"));
for ($i=0; $i<7; $i++)
{
&comment("dl<0 Tail Round $i");
&mov($tmp1,0);
&mov($tmp2,&DWP($i*4,$b,"",0));# *b
&sub($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&sub($tmp1,$tmp2);
&adc($c,0);
&dec($num) if ($i != 6);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&jz(&label("pw_end")) if ($i != 6);
}
&jmp(&label("pw_end"));
&set_label("pw_pos",0);
&and($num,0xfffffff8); # num / 8
&jz(&label("pw_pos_finish"));
&set_label("pw_pos_loop",0);
for ($i=0; $i<8; $i++)
{
&comment("dl>0 Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&sub($tmp1,$c);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&jnc(&label("pw_nc".$i));
}
&comment("");
&add($a,32);
&add($r,32);
&sub($num,8);
&jnz(&label("pw_pos_loop"));
&set_label("pw_pos_finish",0);
&mov($num,&wparam(4)); # get dl
&and($num,7);
&jz(&label("pw_end"));
for ($i=0; $i<7; $i++)
{
&comment("dl>0 Tail Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&sub($tmp1,$c);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&jnc(&label("pw_tail_nc".$i));
&dec($num) if ($i != 6);
&jz(&label("pw_end")) if ($i != 6);
}
&mov($c,1);
&jmp(&label("pw_end"));
&set_label("pw_nc_loop",0);
for ($i=0; $i<8; $i++)
{
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&set_label("pw_nc".$i,0);
}
&comment("");
&add($a,32);
&add($r,32);
&sub($num,8);
&jnz(&label("pw_nc_loop"));
&mov($num,&wparam(4)); # get dl
&and($num,7);
&jz(&label("pw_nc_end"));
for ($i=0; $i<7; $i++)
{
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&set_label("pw_tail_nc".$i,0);
&dec($num) if ($i != 6);
&jz(&label("pw_nc_end")) if ($i != 6);
}
&set_label("pw_nc_end",0);
&mov($c,0);
&set_label("pw_end",0);
# &mov("eax",$c); # $c is "eax"
&function_end($name);
}
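All three paths of bn_mul_add_words above (the unrolled SSE2 block, the SSE2 loop and the integer tail) implement the same primitive: r[i] becomes the low 32 bits of r[i] + a[i]*w + carry, the high 32 bits become the next carry, and the final carry is returned in eax. In plain C the routine is essentially the following (an illustration, not the portable bn_asm.c source).

#include <stdint.h>

uint32_t mul_add_words_sketch(uint32_t *r, const uint32_t *a, int num, uint32_t w)
{
    uint64_t carry = 0;                           /* plays the role of edx:eax / mm1 */

    for (int i = 0; i < num; i++) {
        carry += (uint64_t)a[i] * w + r[i];       /* a[i]*w + r[i] + carry_in */
        r[i] = (uint32_t)carry;                   /* low half back into r[i]  */
        carry >>= 32;                             /* high half carries on     */
    }
    return (uint32_t)carry;                       /* final carry (eax)        */
}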

crypto/bn/asm/co-586.pl (new file, 287 lines)

@@ -0,0 +1,287 @@
#!/usr/local/bin/perl
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
&asm_init($ARGV[0],$0);
&bn_mul_comba("bn_mul_comba8",8);
&bn_mul_comba("bn_mul_comba4",4);
&bn_sqr_comba("bn_sqr_comba8",8);
&bn_sqr_comba("bn_sqr_comba4",4);
&asm_finish();
sub mul_add_c
{
local($a,$ai,$b,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
# pos == -1 if eax and edx are pre-loaded, 0 to load from next
# words, and 1 if load return value
&comment("mul a[$ai]*b[$bi]");
# "eax" and "edx" will always be pre-loaded.
# &mov("eax",&DWP($ai*4,$a,"",0)) ;
# &mov("edx",&DWP($bi*4,$b,"",0));
&mul("edx");
&add($c0,"eax");
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # laod next a
&mov("eax",&wparam(0)) if $pos > 0; # load r[]
###
&adc($c1,"edx");
&mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 0; # laod next b
&mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 1; # laod next b
###
&adc($c2,0);
# is pos > 1, it means it is the last loop
&mov(&DWP($i*4,"eax","",0),$c0) if $pos > 0; # save r[];
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # laod next a
}
sub sqr_add_c
{
local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
# pos == -1 if eax and edx are pre-loaded, 0 to load from next
# words, and 1 if load return value
&comment("sqr a[$ai]*a[$bi]");
# "eax" and "edx" will always be pre-loaded.
# &mov("eax",&DWP($ai*4,$a,"",0)) ;
# &mov("edx",&DWP($bi*4,$b,"",0));
if ($ai == $bi)
{ &mul("eax");}
else
{ &mul("edx");}
&add($c0,"eax");
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
###
&adc($c1,"edx");
&mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos == 1) && ($na != $nb);
###
&adc($c2,0);
# if pos > 1, it means it is the last loop
&mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
}
sub sqr_add_c2
{
local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
# pos == -1 if eax and edx are pre-loaded, 0 to load from next
# words, and 1 if load return value
&comment("sqr a[$ai]*a[$bi]");
# "eax" and "edx" will always be pre-loaded.
# &mov("eax",&DWP($ai*4,$a,"",0)) ;
# &mov("edx",&DWP($bi*4,$a,"",0));
if ($ai == $bi)
{ &mul("eax");}
else
{ &mul("edx");}
&add("eax","eax");
###
&adc("edx","edx");
###
&adc($c2,0);
&add($c0,"eax");
&adc($c1,"edx");
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
&adc($c2,0);
&mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
&mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos <= 1) && ($na != $nb);
###
}
sub bn_mul_comba
{
local($name,$num)=@_;
local($a,$b,$c0,$c1,$c2);
local($i,$as,$ae,$bs,$be,$ai,$bi);
local($tot,$end);
&function_begin_B($name,"");
$c0="ebx";
$c1="ecx";
$c2="ebp";
$a="esi";
$b="edi";
$as=0;
$ae=0;
$bs=0;
$be=0;
$tot=$num+$num-1;
&push("esi");
&mov($a,&wparam(1));
&push("edi");
&mov($b,&wparam(2));
&push("ebp");
&push("ebx");
&xor($c0,$c0);
&mov("eax",&DWP(0,$a,"",0)); # load the first word
&xor($c1,$c1);
&mov("edx",&DWP(0,$b,"",0)); # load the first second
for ($i=0; $i<$tot; $i++)
{
$ai=$as;
$bi=$bs;
$end=$be+1;
&comment("################## Calculate word $i");
for ($j=$bs; $j<$end; $j++)
{
&xor($c2,$c2) if ($j == $bs);
if (($j+1) == $end)
{
$v=1;
$v=2 if (($i+1) == $tot);
}
else
{ $v=0; }
if (($j+1) != $end)
{
$na=($ai-1);
$nb=($bi+1);
}
else
{
$na=$as+($i < ($num-1));
$nb=$bs+($i >= ($num-1));
}
#printf STDERR "[$ai,$bi] -> [$na,$nb]\n";
&mul_add_c($a,$ai,$b,$bi,$c0,$c1,$c2,$v,$i,$na,$nb);
if ($v)
{
&comment("saved r[$i]");
# &mov("eax",&wparam(0));
# &mov(&DWP($i*4,"eax","",0),$c0);
($c0,$c1,$c2)=($c1,$c2,$c0);
}
$ai--;
$bi++;
}
$as++ if ($i < ($num-1));
$ae++ if ($i >= ($num-1));
$bs++ if ($i >= ($num-1));
$be++ if ($i < ($num-1));
}
&comment("save r[$i]");
# &mov("eax",&wparam(0));
&mov(&DWP($i*4,"eax","",0),$c0);
&pop("ebx");
&pop("ebp");
&pop("edi");
&pop("esi");
&ret();
&function_end_B($name);
}
sub bn_sqr_comba
{
local($name,$num)=@_;
local($r,$a,$c0,$c1,$c2)=@_;
local($i,$as,$ae,$bs,$be,$ai,$bi);
local($b,$tot,$end,$half);
&function_begin_B($name,"");
$c0="ebx";
$c1="ecx";
$c2="ebp";
$a="esi";
$r="edi";
&push("esi");
&push("edi");
&push("ebp");
&push("ebx");
&mov($r,&wparam(0));
&mov($a,&wparam(1));
&xor($c0,$c0);
&xor($c1,$c1);
&mov("eax",&DWP(0,$a,"",0)); # load the first word
$as=0;
$ae=0;
$bs=0;
$be=0;
$tot=$num+$num-1;
for ($i=0; $i<$tot; $i++)
{
$ai=$as;
$bi=$bs;
$end=$be+1;
&comment("############### Calculate word $i");
for ($j=$bs; $j<$end; $j++)
{
&xor($c2,$c2) if ($j == $bs);
if (($ai-1) < ($bi+1))
{
$v=1;
$v=2 if ($i+1) == $tot;
}
else
{ $v=0; }
if (!$v)
{
$na=$ai-1;
$nb=$bi+1;
}
else
{
$na=$as+($i < ($num-1));
$nb=$bs+($i >= ($num-1));
}
if ($ai == $bi)
{
&sqr_add_c($r,$a,$ai,$bi,
$c0,$c1,$c2,$v,$i,$na,$nb);
}
else
{
&sqr_add_c2($r,$a,$ai,$bi,
$c0,$c1,$c2,$v,$i,$na,$nb);
}
if ($v)
{
&comment("saved r[$i]");
#&mov(&DWP($i*4,$r,"",0),$c0);
($c0,$c1,$c2)=($c1,$c2,$c0);
last;
}
$ai--;
$bi++;
}
$as++ if ($i < ($num-1));
$ae++ if ($i >= ($num-1));
$bs++ if ($i >= ($num-1));
$be++ if ($i < ($num-1));
}
&mov(&DWP($i*4,$r,"",0),$c0);
&pop("ebx");
&pop("ebp");
&pop("edi");
&pop("esi");
&ret();
&function_end_B($name);
}
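The generated comba routines work column by column: every product a[j]*b[i-j] that belongs to output word i is added into a three-word accumulator held in ebx/ecx/ebp (c0, c1, c2), word i is emitted from c0, and the accumulator then rotates. A C sketch of the 4-word multiply follows (an illustration only; the Perl above fully unrolls both loops and pre-loads eax/edx across iterations).

#include <stdint.h>

#define NUM 4

void mul_comba4_sketch(uint32_t r[2 * NUM],
                       const uint32_t a[NUM], const uint32_t b[NUM])
{
    uint32_t c0 = 0, c1 = 0, c2 = 0;              /* 96-bit column accumulator */

    for (int i = 0; i < 2 * NUM - 1; i++) {
        int jlo = i < NUM ? 0 : i - NUM + 1;
        int jhi = i < NUM ? i : NUM - 1;

        for (int j = jlo; j <= jhi; j++) {        /* all a[j]*b[i-j] in column i */
            uint64_t t = (uint64_t)a[j] * b[i - j];
            uint64_t s = (uint64_t)c0 + (uint32_t)t;
            c0 = (uint32_t)s;
            s = (uint64_t)c1 + (uint32_t)(t >> 32) + (uint32_t)(s >> 32);
            c1 = (uint32_t)s;
            c2 += (uint32_t)(s >> 32);
        }
        r[i] = c0;                                /* emit column i, rotate carries */
        c0 = c1; c1 = c2; c2 = 0;
    }
    r[2 * NUM - 1] = c0;                          /* top word */
}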

crypto/bn/asm/ia64-mont.pl (new file, 851 lines)

@@ -0,0 +1,851 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# January 2010
#
# "Teaser" Montgomery multiplication module for IA-64. There are
# several possibilities for improvement:
#
# - modulo-scheduling outer loop would eliminate quite a number of
# stalls after ldf8, xma and getf.sig outside inner loop and
# improve shorter key performance;
# - shorter vector support [with input vectors being fetched only
# once] should be added;
# - 2x unroll with help of n0[1] would make the code scalable on
# "wider" IA-64, "wider" than Itanium 2 that is, which is not of
# acute interest, because upcoming Tukwila's individual cores are
# reportedly based on Itanium 2 design;
# - dedicated squaring procedure(?);
#
# January 2010
#
# Shorter vector support is implemented by zero-padding ap and np
# vectors up to 8 elements, or 512 bits. This means that 256-bit
# inputs will be processed only 2 times faster than 512-bit inputs,
# not 4 [as one would expect, because algorithm complexity is n^2].
# The reason for padding is that inputs shorter than 512 bits won't
# be processed faster anyway, because minimal critical path of the
# core loop happens to match 512-bit timing. Either way, it resulted
# in >100% improvement of 512-bit RSA sign benchmark and 50% - of
# 1024-bit one [in comparison to original version of *this* module].
#
# So far 'openssl speed rsa dsa' output on 900MHz Itanium 2 *with*
# this module is:
# sign verify sign/s verify/s
# rsa 512 bits 0.000290s 0.000024s 3452.8 42031.4
# rsa 1024 bits 0.000793s 0.000058s 1261.7 17172.0
# rsa 2048 bits 0.005908s 0.000148s 169.3 6754.0
# rsa 4096 bits 0.033456s 0.000469s 29.9 2133.6
# dsa 512 bits 0.000253s 0.000198s 3949.9 5057.0
# dsa 1024 bits 0.000585s 0.000607s 1708.4 1647.4
# dsa 2048 bits 0.001453s 0.001703s 688.1 587.4
#
# ... and *without* (but still with ia64.S):
#
# rsa 512 bits 0.000670s 0.000041s 1491.8 24145.5
# rsa 1024 bits 0.001988s 0.000080s 502.9 12499.3
# rsa 2048 bits 0.008702s 0.000189s 114.9 5293.9
# rsa 4096 bits 0.043860s 0.000533s 22.8 1875.9
# dsa 512 bits 0.000441s 0.000427s 2265.3 2340.6
# dsa 1024 bits 0.000823s 0.000867s 1215.6 1153.2
# dsa 2048 bits 0.001894s 0.002179s 528.1 458.9
#
# As can be seen, RSA sign performance improves by 130-30%
# (less for longer keys), while verify improves by 74-13%.
# DSA performance improves by 115-30%.
if ($^O eq "hpux") {
$ADDP="addp4";
for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); }
} else { $ADDP="add"; }
$code=<<___;
.explicit
.text
// int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap,
// const BN_ULONG *bp,const BN_ULONG *np,
// const BN_ULONG *n0p,int num);
.align 64
.global bn_mul_mont#
.proc bn_mul_mont#
bn_mul_mont:
.prologue
.body
{ .mmi; cmp4.le p6,p7=2,r37;;
(p6) cmp4.lt.unc p8,p9=8,r37
mov ret0=r0 };;
{ .bbb;
(p9) br.cond.dptk.many bn_mul_mont_8
(p8) br.cond.dpnt.many bn_mul_mont_general
(p7) br.ret.spnt.many b0 };;
.endp bn_mul_mont#
prevfs=r2; prevpr=r3; prevlc=r10; prevsp=r11;
rptr=r8; aptr=r9; bptr=r14; nptr=r15;
tptr=r16; // &tp[0]
tp_1=r17; // &tp[-1]
num=r18; len=r19; lc=r20;
topbit=r21; // carry bit from tmp[num]
n0=f6;
m0=f7;
bi=f8;
.align 64
.local bn_mul_mont_general#
.proc bn_mul_mont_general#
bn_mul_mont_general:
.prologue
{ .mmi; .save ar.pfs,prevfs
alloc prevfs=ar.pfs,6,2,0,8
$ADDP aptr=0,in1
.save ar.lc,prevlc
mov prevlc=ar.lc }
{ .mmi; .vframe prevsp
mov prevsp=sp
$ADDP bptr=0,in2
.save pr,prevpr
mov prevpr=pr };;
.body
.rotf alo[6],nlo[4],ahi[8],nhi[6]
.rotr a[3],n[3],t[2]
{ .mmi; ldf8 bi=[bptr],8 // (*bp++)
ldf8 alo[4]=[aptr],16 // ap[0]
$ADDP r30=8,in1 };;
{ .mmi; ldf8 alo[3]=[r30],16 // ap[1]
ldf8 alo[2]=[aptr],16 // ap[2]
$ADDP in4=0,in4 };;
{ .mmi; ldf8 alo[1]=[r30] // ap[3]
ldf8 n0=[in4] // n0
$ADDP rptr=0,in0 }
{ .mmi; $ADDP nptr=0,in3
mov r31=16
zxt4 num=in5 };;
{ .mmi; ldf8 nlo[2]=[nptr],8 // np[0]
shladd len=num,3,r0
shladd r31=num,3,r31 };;
{ .mmi; ldf8 nlo[1]=[nptr],8 // np[1]
add lc=-5,num
sub r31=sp,r31 };;
{ .mfb; and sp=-16,r31 // alloca
xmpy.hu ahi[2]=alo[4],bi // ap[0]*bp[0]
nop.b 0 }
{ .mfb; nop.m 0
xmpy.lu alo[4]=alo[4],bi
brp.loop.imp .L1st_ctop,.L1st_cend-16
};;
{ .mfi; nop.m 0
xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[0]
add tp_1=8,sp }
{ .mfi; nop.m 0
xma.lu alo[3]=alo[3],bi,ahi[2]
mov pr.rot=0x20001f<<16
// ------^----- (p40) at first (p23)
// ----------^^ p[16:20]=1
};;
{ .mfi; nop.m 0
xmpy.lu m0=alo[4],n0 // (ap[0]*bp[0])*n0
mov ar.lc=lc }
{ .mfi; nop.m 0
fcvt.fxu.s1 nhi[1]=f0
mov ar.ec=8 };;
.align 32
.L1st_ctop:
.pred.rel "mutex",p40,p42
{ .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++)
(p18) xma.hu ahi[0]=alo[2],bi,ahi[1]
(p40) add n[2]=n[2],a[2] } // (p23) }
{ .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++)(p16)
(p18) xma.lu alo[2]=alo[2],bi,ahi[1]
(p42) add n[2]=n[2],a[2],1 };; // (p23)
{ .mfi; (p21) getf.sig a[0]=alo[5]
(p20) xma.hu nhi[0]=nlo[2],m0,nhi[1]
(p42) cmp.leu p41,p39=n[2],a[2] } // (p23)
{ .mfi; (p23) st8 [tp_1]=n[2],8
(p20) xma.lu nlo[2]=nlo[2],m0,nhi[1]
(p40) cmp.ltu p41,p39=n[2],a[2] } // (p23)
{ .mmb; (p21) getf.sig n[0]=nlo[3]
(p16) nop.m 0
br.ctop.sptk .L1st_ctop };;
.L1st_cend:
{ .mmi; getf.sig a[0]=ahi[6] // (p24)
getf.sig n[0]=nhi[4]
add num=-1,num };; // num--
{ .mmi; .pred.rel "mutex",p40,p42
(p40) add n[0]=n[0],a[0]
(p42) add n[0]=n[0],a[0],1
sub aptr=aptr,len };; // rewind
{ .mmi; .pred.rel "mutex",p40,p42
(p40) cmp.ltu p41,p39=n[0],a[0]
(p42) cmp.leu p41,p39=n[0],a[0]
sub nptr=nptr,len };;
{ .mmi; .pred.rel "mutex",p39,p41
(p39) add topbit=r0,r0
(p41) add topbit=r0,r0,1
nop.i 0 }
{ .mmi; st8 [tp_1]=n[0]
add tptr=16,sp
add tp_1=8,sp };;
.Louter:
{ .mmi; ldf8 bi=[bptr],8 // (*bp++)
ldf8 ahi[3]=[tptr] // tp[0]
add r30=8,aptr };;
{ .mmi; ldf8 alo[4]=[aptr],16 // ap[0]
ldf8 alo[3]=[r30],16 // ap[1]
add r31=8,nptr };;
{ .mfb; ldf8 alo[2]=[aptr],16 // ap[2]
xma.hu ahi[2]=alo[4],bi,ahi[3] // ap[0]*bp[i]+tp[0]
brp.loop.imp .Linner_ctop,.Linner_cend-16
}
{ .mfb; ldf8 alo[1]=[r30] // ap[3]
xma.lu alo[4]=alo[4],bi,ahi[3]
clrrrb.pr };;
{ .mfi; ldf8 nlo[2]=[nptr],16 // np[0]
xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[i]
nop.i 0 }
{ .mfi; ldf8 nlo[1]=[r31] // np[1]
xma.lu alo[3]=alo[3],bi,ahi[2]
mov pr.rot=0x20101f<<16
// ------^----- (p40) at first (p23)
// --------^--- (p30) at first (p22)
// ----------^^ p[16:20]=1
};;
{ .mfi; st8 [tptr]=r0 // tp[0] is already accounted
xmpy.lu m0=alo[4],n0 // (ap[0]*bp[i]+tp[0])*n0
mov ar.lc=lc }
{ .mfi;
fcvt.fxu.s1 nhi[1]=f0
mov ar.ec=8 };;
// This loop spins in 4*(n+7) ticks on Itanium 2 and should spin in
// 7*(n+7) ticks on Itanium (the one codenamed Merced). The factor of 7
// in the latter case accounts for a two-tick pipeline stall, which means
// that its performance would be ~20% lower than optimal. No
// attempt was made to address this, because the original Itanium is
// hardly represented in the wild...
.align 32
.Linner_ctop:
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p30,p32
{ .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++)
(p18) xma.hu ahi[0]=alo[2],bi,ahi[1]
(p40) add n[2]=n[2],a[2] } // (p23)
{ .mfi; (p16) nop.m 0
(p18) xma.lu alo[2]=alo[2],bi,ahi[1]
(p42) add n[2]=n[2],a[2],1 };; // (p23)
{ .mfi; (p21) getf.sig a[0]=alo[5]
(p16) nop.f 0
(p40) cmp.ltu p41,p39=n[2],a[2] } // (p23)
{ .mfi; (p21) ld8 t[0]=[tptr],8
(p16) nop.f 0
(p42) cmp.leu p41,p39=n[2],a[2] };; // (p23)
{ .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++)
(p20) xma.hu nhi[0]=nlo[2],m0,nhi[1]
(p30) add a[1]=a[1],t[1] } // (p22)
{ .mfi; (p16) nop.m 0
(p20) xma.lu nlo[2]=nlo[2],m0,nhi[1]
(p32) add a[1]=a[1],t[1],1 };; // (p22)
{ .mmi; (p21) getf.sig n[0]=nlo[3]
(p16) nop.m 0
(p30) cmp.ltu p31,p29=a[1],t[1] } // (p22)
{ .mmb; (p23) st8 [tp_1]=n[2],8
(p32) cmp.leu p31,p29=a[1],t[1] // (p22)
br.ctop.sptk .Linner_ctop };;
.Linner_cend:
{ .mmi; getf.sig a[0]=ahi[6] // (p24)
getf.sig n[0]=nhi[4]
nop.i 0 };;
{ .mmi; .pred.rel "mutex",p31,p33
(p31) add a[0]=a[0],topbit
(p33) add a[0]=a[0],topbit,1
mov topbit=r0 };;
{ .mfi; .pred.rel "mutex",p31,p33
(p31) cmp.ltu p32,p30=a[0],topbit
(p33) cmp.leu p32,p30=a[0],topbit
}
{ .mfi; .pred.rel "mutex",p40,p42
(p40) add n[0]=n[0],a[0]
(p42) add n[0]=n[0],a[0],1
};;
{ .mmi; .pred.rel "mutex",p44,p46
(p40) cmp.ltu p41,p39=n[0],a[0]
(p42) cmp.leu p41,p39=n[0],a[0]
(p32) add topbit=r0,r0,1 }
{ .mmi; st8 [tp_1]=n[0],8
cmp4.ne p6,p0=1,num
sub aptr=aptr,len };; // rewind
{ .mmi; sub nptr=nptr,len
(p41) add topbit=r0,r0,1
add tptr=16,sp }
{ .mmb; add tp_1=8,sp
add num=-1,num // num--
(p6) br.cond.sptk.many .Louter };;
{ .mbb; add lc=4,lc
brp.loop.imp .Lsub_ctop,.Lsub_cend-16
clrrrb.pr };;
{ .mii; nop.m 0
mov pr.rot=0x10001<<16
// ------^---- (p33) at first (p17)
mov ar.lc=lc }
{ .mii; nop.m 0
mov ar.ec=3
nop.i 0 };;
.Lsub_ctop:
.pred.rel "mutex",p33,p35
{ .mfi; (p16) ld8 t[0]=[tptr],8 // t=*(tp++)
(p16) nop.f 0
(p33) sub n[1]=t[1],n[1] } // (p17)
{ .mfi; (p16) ld8 n[0]=[nptr],8 // n=*(np++)
(p16) nop.f 0
(p35) sub n[1]=t[1],n[1],1 };; // (p17)
{ .mib; (p18) st8 [rptr]=n[2],8 // *(rp++)=r
(p33) cmp.gtu p34,p32=n[1],t[1] // (p17)
(p18) nop.b 0 }
{ .mib; (p18) nop.m 0
(p35) cmp.geu p34,p32=n[1],t[1] // (p17)
br.ctop.sptk .Lsub_ctop };;
.Lsub_cend:
{ .mmb; .pred.rel "mutex",p34,p36
(p34) sub topbit=topbit,r0 // (p19)
(p36) sub topbit=topbit,r0,1
brp.loop.imp .Lcopy_ctop,.Lcopy_cend-16
}
{ .mmb; sub rptr=rptr,len // rewind
sub tptr=tptr,len
clrrrb.pr };;
{ .mmi; and aptr=tptr,topbit
andcm bptr=rptr,topbit
mov pr.rot=1<<16 };;
{ .mii; or nptr=aptr,bptr
mov ar.lc=lc
mov ar.ec=3 };;
.Lcopy_ctop:
{ .mmb; (p16) ld8 n[0]=[nptr],8
(p18) st8 [tptr]=r0,8
(p16) nop.b 0 }
{ .mmb; (p16) nop.m 0
(p18) st8 [rptr]=n[2],8
br.ctop.sptk .Lcopy_ctop };;
.Lcopy_cend:
{ .mmi; mov ret0=1 // signal "handled"
rum 1<<5 // clear um.mfh
mov ar.lc=prevlc }
{ .mib; .restore sp
mov sp=prevsp
mov pr=prevpr,0x1ffff
br.ret.sptk.many b0 };;
.endp bn_mul_mont_general#
a1=r16; a2=r17; a3=r18; a4=r19; a5=r20; a6=r21; a7=r22; a8=r23;
n1=r24; n2=r25; n3=r26; n4=r27; n5=r28; n6=r29; n7=r30; n8=r31;
t0=r15;
ai0=f8; ai1=f9; ai2=f10; ai3=f11; ai4=f12; ai5=f13; ai6=f14; ai7=f15;
ni0=f16; ni1=f17; ni2=f18; ni3=f19; ni4=f20; ni5=f21; ni6=f22; ni7=f23;
.align 64
.skip 48 // aligns loop body
.local bn_mul_mont_8#
.proc bn_mul_mont_8#
bn_mul_mont_8:
.prologue
{ .mmi; .save ar.pfs,prevfs
alloc prevfs=ar.pfs,6,2,0,8
.vframe prevsp
mov prevsp=sp
.save ar.lc,prevlc
mov prevlc=ar.lc }
{ .mmi; add r17=-6*16,sp
add sp=-7*16,sp
.save pr,prevpr
mov prevpr=pr };;
{ .mmi; .save.gf 0,0x10
stf.spill [sp]=f16,-16
.save.gf 0,0x20
stf.spill [r17]=f17,32
add r16=-5*16,prevsp};;
{ .mmi; .save.gf 0,0x40
stf.spill [r16]=f18,32
.save.gf 0,0x80
stf.spill [r17]=f19,32
$ADDP aptr=0,in1 };;
{ .mmi; .save.gf 0,0x100
stf.spill [r16]=f20,32
.save.gf 0,0x200
stf.spill [r17]=f21,32
$ADDP r29=8,in1 };;
{ .mmi; .save.gf 0,0x400
stf.spill [r16]=f22
.save.gf 0,0x800
stf.spill [r17]=f23
$ADDP rptr=0,in0 };;
.body
.rotf bj[8],mj[2],tf[2],alo[10],ahi[10],nlo[10],nhi[10]
.rotr t[8]
// load input vectors padding them to 8 elements
{ .mmi; ldf8 ai0=[aptr],16 // ap[0]
ldf8 ai1=[r29],16 // ap[1]
$ADDP bptr=0,in2 }
{ .mmi; $ADDP r30=8,in2
$ADDP nptr=0,in3
$ADDP r31=8,in3 };;
{ .mmi; ldf8 bj[7]=[bptr],16 // bp[0]
ldf8 bj[6]=[r30],16 // bp[1]
cmp4.le p4,p5=3,in5 }
{ .mmi; ldf8 ni0=[nptr],16 // np[0]
ldf8 ni1=[r31],16 // np[1]
cmp4.le p6,p7=4,in5 };;
{ .mfi; (p4)ldf8 ai2=[aptr],16 // ap[2]
(p5)fcvt.fxu ai2=f0
cmp4.le p8,p9=5,in5 }
{ .mfi; (p6)ldf8 ai3=[r29],16 // ap[3]
(p7)fcvt.fxu ai3=f0
cmp4.le p10,p11=6,in5 }
{ .mfi; (p4)ldf8 bj[5]=[bptr],16 // bp[2]
(p5)fcvt.fxu bj[5]=f0
cmp4.le p12,p13=7,in5 }
{ .mfi; (p6)ldf8 bj[4]=[r30],16 // bp[3]
(p7)fcvt.fxu bj[4]=f0
cmp4.le p14,p15=8,in5 }
{ .mfi; (p4)ldf8 ni2=[nptr],16 // np[2]
(p5)fcvt.fxu ni2=f0
addp4 r28=-1,in5 }
{ .mfi; (p6)ldf8 ni3=[r31],16 // np[3]
(p7)fcvt.fxu ni3=f0
$ADDP in4=0,in4 };;
{ .mfi; ldf8 n0=[in4]
fcvt.fxu tf[1]=f0
nop.i 0 }
{ .mfi; (p8)ldf8 ai4=[aptr],16 // ap[4]
(p9)fcvt.fxu ai4=f0
mov t[0]=r0 }
{ .mfi; (p10)ldf8 ai5=[r29],16 // ap[5]
(p11)fcvt.fxu ai5=f0
mov t[1]=r0 }
{ .mfi; (p8)ldf8 bj[3]=[bptr],16 // bp[4]
(p9)fcvt.fxu bj[3]=f0
mov t[2]=r0 }
{ .mfi; (p10)ldf8 bj[2]=[r30],16 // bp[5]
(p11)fcvt.fxu bj[2]=f0
mov t[3]=r0 }
{ .mfi; (p8)ldf8 ni4=[nptr],16 // np[4]
(p9)fcvt.fxu ni4=f0
mov t[4]=r0 }
{ .mfi; (p10)ldf8 ni5=[r31],16 // np[5]
(p11)fcvt.fxu ni5=f0
mov t[5]=r0 };;
{ .mfi; (p12)ldf8 ai6=[aptr],16 // ap[6]
(p13)fcvt.fxu ai6=f0
mov t[6]=r0 }
{ .mfi; (p14)ldf8 ai7=[r29],16 // ap[7]
(p15)fcvt.fxu ai7=f0
mov t[7]=r0 }
{ .mfi; (p12)ldf8 bj[1]=[bptr],16 // bp[6]
(p13)fcvt.fxu bj[1]=f0
mov ar.lc=r28 }
{ .mfi; (p14)ldf8 bj[0]=[r30],16 // bp[7]
(p15)fcvt.fxu bj[0]=f0
mov ar.ec=1 }
{ .mfi; (p12)ldf8 ni6=[nptr],16 // np[6]
(p13)fcvt.fxu ni6=f0
mov pr.rot=1<<16 }
{ .mfb; (p14)ldf8 ni7=[r31],16 // np[7]
(p15)fcvt.fxu ni7=f0
brp.loop.imp .Louter_8_ctop,.Louter_8_cend-16
};;
// The loop is scheduled for 32*n ticks on Itanium 2. An actual attempt
// to measure with the help of the Interval Time Counter indicated that the
// factor is a tad higher: 33 or 34, if not 35. Exact measurement and
// addressing the issue are problematic, because I don't have access
// to a platform-specific instruction-level profiler. On Itanium it
// should run in 56*n ticks, because of higher xma latency...
.Louter_8_ctop:
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
{ .mfi; (p16) nop.m 0 // 0:
(p16) xma.hu ahi[0]=ai0,bj[7],tf[1] // ap[0]*b[i]+t[0]
(p40) add a3=a3,n3 } // (p17) a3+=n3
{ .mfi; (p42) add a3=a3,n3,1
(p16) xma.lu alo[0]=ai0,bj[7],tf[1]
(p16) nop.i 0 };;
{ .mii; (p17) getf.sig a7=alo[8] // 1:
(p48) add t[6]=t[6],a3 // (p17) t[6]+=a3
(p50) add t[6]=t[6],a3,1 };;
{ .mfi; (p17) getf.sig a8=ahi[8] // 2:
(p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0
(p40) cmp.ltu p43,p41=a3,n3 }
{ .mfi; (p42) cmp.leu p43,p41=a3,n3
(p17) xma.lu nlo[7]=ni6,mj[1],nhi[6]
(p16) nop.i 0 };;
{ .mii; (p17) getf.sig n5=nlo[6] // 3:
(p48) cmp.ltu p51,p49=t[6],a3
(p50) cmp.leu p51,p49=t[6],a3 };;
.pred.rel "mutex",p41,p43
.pred.rel "mutex",p49,p51
{ .mfi; (p16) nop.m 0 // 4:
(p16) xma.hu ahi[1]=ai1,bj[7],ahi[0] // ap[1]*b[i]
(p41) add a4=a4,n4 } // (p17) a4+=n4
{ .mfi; (p43) add a4=a4,n4,1
(p16) xma.lu alo[1]=ai1,bj[7],ahi[0]
(p16) nop.i 0 };;
{ .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4
(p16) xmpy.lu mj[0]=alo[0],n0 // (ap[0]*b[i]+t[0])*n0
(p51) add t[5]=t[5],a4,1 };;
{ .mfi; (p16) nop.m 0 // 6:
(p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0
(p41) cmp.ltu p42,p40=a4,n4 }
{ .mfi; (p43) cmp.leu p42,p40=a4,n4
(p17) xma.lu nlo[8]=ni7,mj[1],nhi[7]
(p16) nop.i 0 };;
{ .mii; (p17) getf.sig n6=nlo[7] // 7:
(p49) cmp.ltu p50,p48=t[5],a4
(p51) cmp.leu p50,p48=t[5],a4 };;
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
{ .mfi; (p16) nop.m 0 // 8:
(p16) xma.hu ahi[2]=ai2,bj[7],ahi[1] // ap[2]*b[i]
(p40) add a5=a5,n5 } // (p17) a5+=n5
{ .mfi; (p42) add a5=a5,n5,1
(p16) xma.lu alo[2]=ai2,bj[7],ahi[1]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig a1=alo[1] // 9:
(p48) add t[4]=t[4],a5 // (p17) t[4]+=a5
(p50) add t[4]=t[4],a5,1 };;
{ .mfi; (p16) nop.m 0 // 10:
(p16) xma.hu nhi[0]=ni0,mj[0],alo[0] // np[0]*m0
(p40) cmp.ltu p43,p41=a5,n5 }
{ .mfi; (p42) cmp.leu p43,p41=a5,n5
(p16) xma.lu nlo[0]=ni0,mj[0],alo[0]
(p16) nop.i 0 };;
{ .mii; (p17) getf.sig n7=nlo[8] // 11:
(p48) cmp.ltu p51,p49=t[4],a5
(p50) cmp.leu p51,p49=t[4],a5 };;
.pred.rel "mutex",p41,p43
.pred.rel "mutex",p49,p51
{ .mfi; (p17) getf.sig n8=nhi[8] // 12:
(p16) xma.hu ahi[3]=ai3,bj[7],ahi[2] // ap[3]*b[i]
(p41) add a6=a6,n6 } // (p17) a6+=n6
{ .mfi; (p43) add a6=a6,n6,1
(p16) xma.lu alo[3]=ai3,bj[7],ahi[2]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig a2=alo[2] // 13:
(p49) add t[3]=t[3],a6 // (p17) t[3]+=a6
(p51) add t[3]=t[3],a6,1 };;
{ .mfi; (p16) nop.m 0 // 14:
(p16) xma.hu nhi[1]=ni1,mj[0],nhi[0] // np[1]*m0
(p41) cmp.ltu p42,p40=a6,n6 }
{ .mfi; (p43) cmp.leu p42,p40=a6,n6
(p16) xma.lu nlo[1]=ni1,mj[0],nhi[0]
(p16) nop.i 0 };;
{ .mii; (p16) nop.m 0 // 15:
(p49) cmp.ltu p50,p48=t[3],a6
(p51) cmp.leu p50,p48=t[3],a6 };;
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
{ .mfi; (p16) nop.m 0 // 16:
(p16) xma.hu ahi[4]=ai4,bj[7],ahi[3] // ap[4]*b[i]
(p40) add a7=a7,n7 } // (p17) a7+=n7
{ .mfi; (p42) add a7=a7,n7,1
(p16) xma.lu alo[4]=ai4,bj[7],ahi[3]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig a3=alo[3] // 17:
(p48) add t[2]=t[2],a7 // (p17) t[2]+=a7
(p50) add t[2]=t[2],a7,1 };;
{ .mfi; (p16) nop.m 0 // 18:
(p16) xma.hu nhi[2]=ni2,mj[0],nhi[1] // np[2]*m0
(p40) cmp.ltu p43,p41=a7,n7 }
{ .mfi; (p42) cmp.leu p43,p41=a7,n7
(p16) xma.lu nlo[2]=ni2,mj[0],nhi[1]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig n1=nlo[1] // 19:
(p48) cmp.ltu p51,p49=t[2],a7
(p50) cmp.leu p51,p49=t[2],a7 };;
.pred.rel "mutex",p41,p43
.pred.rel "mutex",p49,p51
{ .mfi; (p16) nop.m 0 // 20:
(p16) xma.hu ahi[5]=ai5,bj[7],ahi[4] // ap[5]*b[i]
(p41) add a8=a8,n8 } // (p17) a8+=n8
{ .mfi; (p43) add a8=a8,n8,1
(p16) xma.lu alo[5]=ai5,bj[7],ahi[4]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig a4=alo[4] // 21:
(p49) add t[1]=t[1],a8 // (p17) t[1]+=a8
(p51) add t[1]=t[1],a8,1 };;
{ .mfi; (p16) nop.m 0 // 22:
(p16) xma.hu nhi[3]=ni3,mj[0],nhi[2] // np[3]*m0
(p41) cmp.ltu p42,p40=a8,n8 }
{ .mfi; (p43) cmp.leu p42,p40=a8,n8
(p16) xma.lu nlo[3]=ni3,mj[0],nhi[2]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig n2=nlo[2] // 23:
(p49) cmp.ltu p50,p48=t[1],a8
(p51) cmp.leu p50,p48=t[1],a8 };;
{ .mfi; (p16) nop.m 0 // 24:
(p16) xma.hu ahi[6]=ai6,bj[7],ahi[5] // ap[6]*b[i]
(p16) add a1=a1,n1 } // (p16) a1+=n1
{ .mfi; (p16) nop.m 0
(p16) xma.lu alo[6]=ai6,bj[7],ahi[5]
(p17) mov t[0]=r0 };;
{ .mii; (p16) getf.sig a5=alo[5] // 25:
(p16) add t0=t[7],a1 // (p16) t[7]+=a1
(p42) add t[0]=t[0],r0,1 };;
{ .mfi; (p16) setf.sig tf[0]=t0 // 26:
(p16) xma.hu nhi[4]=ni4,mj[0],nhi[3] // np[4]*m0
(p50) add t[0]=t[0],r0,1 }
{ .mfi; (p16) cmp.ltu.unc p42,p40=a1,n1
(p16) xma.lu nlo[4]=ni4,mj[0],nhi[3]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig n3=nlo[3] // 27:
(p16) cmp.ltu.unc p50,p48=t0,a1
(p16) nop.i 0 };;
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
{ .mfi; (p16) nop.m 0 // 28:
(p16) xma.hu ahi[7]=ai7,bj[7],ahi[6] // ap[7]*b[i]
(p40) add a2=a2,n2 } // (p16) a2+=n2
{ .mfi; (p42) add a2=a2,n2,1
(p16) xma.lu alo[7]=ai7,bj[7],ahi[6]
(p16) nop.i 0 };;
{ .mii; (p16) getf.sig a6=alo[6] // 29:
(p48) add t[6]=t[6],a2 // (p16) t[6]+=a2
(p50) add t[6]=t[6],a2,1 };;
{ .mfi; (p16) nop.m 0 // 30:
(p16) xma.hu nhi[5]=ni5,mj[0],nhi[4] // np[5]*m0
(p40) cmp.ltu p41,p39=a2,n2 }
{ .mfi; (p42) cmp.leu p41,p39=a2,n2
(p16) xma.lu nlo[5]=ni5,mj[0],nhi[4]
(p16) nop.i 0 };;
{ .mfi; (p16) getf.sig n4=nlo[4] // 31:
(p16) nop.f 0
(p48) cmp.ltu p49,p47=t[6],a2 }
{ .mfb; (p50) cmp.leu p49,p47=t[6],a2
(p16) nop.f 0
br.ctop.sptk.many .Louter_8_ctop };;
.Louter_8_cend:
// above loop has to execute one more time, without (p16), which is
// replaced with merged move of np[8] to GPR bank
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
{ .mmi; (p0) getf.sig n1=ni0 // 0:
(p40) add a3=a3,n3 // (p17) a3+=n3
(p42) add a3=a3,n3,1 };;
{ .mii; (p17) getf.sig a7=alo[8] // 1:
(p48) add t[6]=t[6],a3 // (p17) t[6]+=a3
(p50) add t[6]=t[6],a3,1 };;
{ .mfi; (p17) getf.sig a8=ahi[8] // 2:
(p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0
(p40) cmp.ltu p43,p41=a3,n3 }
{ .mfi; (p42) cmp.leu p43,p41=a3,n3
(p17) xma.lu nlo[7]=ni6,mj[1],nhi[6]
(p0) nop.i 0 };;
{ .mii; (p17) getf.sig n5=nlo[6] // 3:
(p48) cmp.ltu p51,p49=t[6],a3
(p50) cmp.leu p51,p49=t[6],a3 };;
.pred.rel "mutex",p41,p43
.pred.rel "mutex",p49,p51
{ .mmi; (p0) getf.sig n2=ni1 // 4:
(p41) add a4=a4,n4 // (p17) a4+=n4
(p43) add a4=a4,n4,1 };;
{ .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4
(p0) nop.f 0
(p51) add t[5]=t[5],a4,1 };;
{ .mfi; (p0) getf.sig n3=ni2 // 6:
(p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0
(p41) cmp.ltu p42,p40=a4,n4 }
{ .mfi; (p43) cmp.leu p42,p40=a4,n4
(p17) xma.lu nlo[8]=ni7,mj[1],nhi[7]
(p0) nop.i 0 };;
{ .mii; (p17) getf.sig n6=nlo[7] // 7:
(p49) cmp.ltu p50,p48=t[5],a4
(p51) cmp.leu p50,p48=t[5],a4 };;
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
{ .mii; (p0) getf.sig n4=ni3 // 8:
(p40) add a5=a5,n5 // (p17) a5+=n5
(p42) add a5=a5,n5,1 };;
{ .mii; (p0) nop.m 0 // 9:
(p48) add t[4]=t[4],a5 // (p17) t[4]+=a5
(p50) add t[4]=t[4],a5,1 };;
{ .mii; (p0) nop.m 0 // 10:
(p40) cmp.ltu p43,p41=a5,n5
(p42) cmp.leu p43,p41=a5,n5 };;
{ .mii; (p17) getf.sig n7=nlo[8] // 11:
(p48) cmp.ltu p51,p49=t[4],a5
(p50) cmp.leu p51,p49=t[4],a5 };;
.pred.rel "mutex",p41,p43
.pred.rel "mutex",p49,p51
{ .mii; (p17) getf.sig n8=nhi[8] // 12:
(p41) add a6=a6,n6 // (p17) a6+=n6
(p43) add a6=a6,n6,1 };;
{ .mii; (p0) getf.sig n5=ni4 // 13:
(p49) add t[3]=t[3],a6 // (p17) t[3]+=a6
(p51) add t[3]=t[3],a6,1 };;
{ .mii; (p0) nop.m 0 // 14:
(p41) cmp.ltu p42,p40=a6,n6
(p43) cmp.leu p42,p40=a6,n6 };;
{ .mii; (p0) getf.sig n6=ni5 // 15:
(p49) cmp.ltu p50,p48=t[3],a6
(p51) cmp.leu p50,p48=t[3],a6 };;
.pred.rel "mutex",p40,p42
.pred.rel "mutex",p48,p50
{ .mii; (p0) nop.m 0 // 16:
(p40) add a7=a7,n7 // (p17) a7+=n7
(p42) add a7=a7,n7,1 };;
{ .mii; (p0) nop.m 0 // 17:
(p48) add t[2]=t[2],a7 // (p17) t[2]+=a7
(p50) add t[2]=t[2],a7,1 };;
{ .mii; (p0) nop.m 0 // 18:
(p40) cmp.ltu p43,p41=a7,n7
(p42) cmp.leu p43,p41=a7,n7 };;
{ .mii; (p0) getf.sig n7=ni6 // 19:
(p48) cmp.ltu p51,p49=t[2],a7
(p50) cmp.leu p51,p49=t[2],a7 };;
.pred.rel "mutex",p41,p43
.pred.rel "mutex",p49,p51
{ .mii; (p0) nop.m 0 // 20:
(p41) add a8=a8,n8 // (p17) a8+=n8
(p43) add a8=a8,n8,1 };;
{ .mmi; (p0) nop.m 0 // 21:
(p49) add t[1]=t[1],a8 // (p17) t[1]+=a8
(p51) add t[1]=t[1],a8,1 }
{ .mmi; (p17) mov t[0]=r0
(p41) cmp.ltu p42,p40=a8,n8
(p43) cmp.leu p42,p40=a8,n8 };;
{ .mmi; (p0) getf.sig n8=ni7 // 22:
(p49) cmp.ltu p50,p48=t[1],a8
(p51) cmp.leu p50,p48=t[1],a8 }
{ .mmi; (p42) add t[0]=t[0],r0,1
(p0) add r16=-7*16,prevsp
(p0) add r17=-6*16,prevsp };;
// subtract np[8] from carrybit|tmp[8]
// carrybit|tmp[8] layout upon exit from above loop is:
// t[0]|t[1]|t[2]|t[3]|t[4]|t[5]|t[6]|t[7]|t0 (least significant)
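// In C terms this tail is roughly the following (an illustrative sketch
// only, shown for the full 8-limb case; shorter inputs store fewer words
// via the early exits below): subtract np[0..7] from the nine-word value
// carry|tmp[0..7] with borrow propagation, then publish either the
// difference or tmp itself, depending on whether the subtraction
// underflowed.
//
//	BN_ULONG d[8], borrow = 0;
//	for (j = 0; j < 8; j++) {
//		BN_ULONG t = tmp[j] - borrow;
//		borrow = (tmp[j] < borrow) | (t < np[j]);
//		d[j] = t - np[j];
//	}
//	src = (carry >= borrow) ? d : tmp;	/* carry is the topmost word */
//	memcpy(rp, src, 8 * sizeof(BN_ULONG));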
{ .mmi; (p50)add t[0]=t[0],r0,1
add r18=-5*16,prevsp
sub n1=t0,n1 };;
{ .mmi; cmp.gtu p34,p32=n1,t0;;
.pred.rel "mutex",p32,p34
(p32)sub n2=t[7],n2
(p34)sub n2=t[7],n2,1 };;
{ .mii; (p32)cmp.gtu p35,p33=n2,t[7]
(p34)cmp.geu p35,p33=n2,t[7];;
.pred.rel "mutex",p33,p35
(p33)sub n3=t[6],n3 }
{ .mmi; (p35)sub n3=t[6],n3,1;;
(p33)cmp.gtu p34,p32=n3,t[6]
(p35)cmp.geu p34,p32=n3,t[6] };;
.pred.rel "mutex",p32,p34
{ .mii; (p32)sub n4=t[5],n4
(p34)sub n4=t[5],n4,1;;
(p32)cmp.gtu p35,p33=n4,t[5] }
{ .mmi; (p34)cmp.geu p35,p33=n4,t[5];;
.pred.rel "mutex",p33,p35
(p33)sub n5=t[4],n5
(p35)sub n5=t[4],n5,1 };;
{ .mii; (p33)cmp.gtu p34,p32=n5,t[4]
(p35)cmp.geu p34,p32=n5,t[4];;
.pred.rel "mutex",p32,p34
(p32)sub n6=t[3],n6 }
{ .mmi; (p34)sub n6=t[3],n6,1;;
(p32)cmp.gtu p35,p33=n6,t[3]
(p34)cmp.geu p35,p33=n6,t[3] };;
.pred.rel "mutex",p33,p35
{ .mii; (p33)sub n7=t[2],n7
(p35)sub n7=t[2],n7,1;;
(p33)cmp.gtu p34,p32=n7,t[2] }
{ .mmi; (p35)cmp.geu p34,p32=n7,t[2];;
.pred.rel "mutex",p32,p34
(p32)sub n8=t[1],n8
(p34)sub n8=t[1],n8,1 };;
{ .mii; (p32)cmp.gtu p35,p33=n8,t[1]
(p34)cmp.geu p35,p33=n8,t[1];;
.pred.rel "mutex",p33,p35
(p33)sub a8=t[0],r0 }
{ .mmi; (p35)sub a8=t[0],r0,1;;
(p33)cmp.gtu p34,p32=a8,t[0]
(p35)cmp.geu p34,p32=a8,t[0] };;
// save the result, either tmp[num] or tmp[num]-np[num]
.pred.rel "mutex",p32,p34
{ .mmi; (p32)st8 [rptr]=n1,8
(p34)st8 [rptr]=t0,8
add r19=-4*16,prevsp};;
{ .mmb; (p32)st8 [rptr]=n2,8
(p34)st8 [rptr]=t[7],8
(p5)br.cond.dpnt.few .Ldone };;
{ .mmb; (p32)st8 [rptr]=n3,8
(p34)st8 [rptr]=t[6],8
(p7)br.cond.dpnt.few .Ldone };;
{ .mmb; (p32)st8 [rptr]=n4,8
(p34)st8 [rptr]=t[5],8
(p9)br.cond.dpnt.few .Ldone };;
{ .mmb; (p32)st8 [rptr]=n5,8
(p34)st8 [rptr]=t[4],8
(p11)br.cond.dpnt.few .Ldone };;
{ .mmb; (p32)st8 [rptr]=n6,8
(p34)st8 [rptr]=t[3],8
(p13)br.cond.dpnt.few .Ldone };;
{ .mmb; (p32)st8 [rptr]=n7,8
(p34)st8 [rptr]=t[2],8
(p15)br.cond.dpnt.few .Ldone };;
{ .mmb; (p32)st8 [rptr]=n8,8
(p34)st8 [rptr]=t[1],8
nop.b 0 };;
.Ldone: // epilogue
{ .mmi; ldf.fill f16=[r16],64
ldf.fill f17=[r17],64
nop.i 0 }
{ .mmi; ldf.fill f18=[r18],64
ldf.fill f19=[r19],64
mov pr=prevpr,0x1ffff };;
{ .mmi; ldf.fill f20=[r16]
ldf.fill f21=[r17]
mov ar.lc=prevlc }
{ .mmi; ldf.fill f22=[r18]
ldf.fill f23=[r19]
mov ret0=1 } // signal "handled"
{ .mib; rum 1<<5
.restore sp
mov sp=prevsp
br.ret.sptk.many b0 };;
.endp bn_mul_mont_8#
.type copyright#,\@object
copyright:
stringz "Montgomery multiplication for IA-64, CRYPTOGAMS by <appro\@openssl.org>"
___
$output=shift and open STDOUT,">$output";
print $code;
close STDOUT;

1555
crypto/bn/asm/ia64.S Normal file

File diff suppressed because it is too large

426
crypto/bn/asm/mips-mont.pl Normal file
View File

@@ -0,0 +1,426 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# This module is not of direct interest to OpenSSL, because it
# doesn't provide better performance for longer keys, at least not on
# in-order-execution cores. While 512-bit RSA sign operations can be
# 65% faster in 64-bit mode, 1024-bit ones are only 15% faster, and
# 4096-bit ones are up to 15% slower. In 32-bit mode it varies from
# 16% improvement for 512-bit RSA sign to -33% for 4096-bit RSA
# verify:-( All comparisons are against bn_mul_mont-free assembler.
# The module might be of interest to embedded system developers, as
# the code is smaller than 1KB, yet offers >3x improvement on MIPS64
# and 75-30% [less for longer keys] on MIPS32 over compiler-generated
# code.
######################################################################
# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
# widely used. Then there is a new contender: NUBI. It appears that
# if one picks the latter, it's possible to arrange code in an ABI-neutral
# manner. Therefore let's stick to the NUBI register layout:
#
($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
#
# The return value is placed in $a0. The following coding rules facilitate
# interoperability:
#
# - never ever touch $tp, "thread pointer", former $gp;
# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
# old code];
# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
#
# For reference here is register layout for N32/64 MIPS ABIs:
#
# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
#
$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64
if ($flavour =~ /64|n32/i) {
$PTR_ADD="dadd"; # incidentally works even on n32
$PTR_SUB="dsub"; # incidentally works even on n32
$REG_S="sd";
$REG_L="ld";
$SZREG=8;
} else {
$PTR_ADD="add";
$PTR_SUB="sub";
$REG_S="sw";
$REG_L="lw";
$SZREG=4;
}
$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0x00fff000 : 0x00ff0000;
#
# <appro@openssl.org>
#
######################################################################
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
if ($flavour =~ /64|n32/i) {
$LD="ld";
$ST="sd";
$MULTU="dmultu";
$ADDU="daddu";
$SUBU="dsubu";
$BNSZ=8;
} else {
$LD="lw";
$ST="sw";
$MULTU="multu";
$ADDU="addu";
$SUBU="subu";
$BNSZ=4;
}
# int bn_mul_mont(
$rp=$a0; # BN_ULONG *rp,
$ap=$a1; # const BN_ULONG *ap,
$bp=$a2; # const BN_ULONG *bp,
$np=$a3; # const BN_ULONG *np,
$n0=$a4; # const BN_ULONG *n0,
$num=$a5; # int num);
$lo0=$a6;
$hi0=$a7;
$lo1=$t1;
$hi1=$t2;
$aj=$s0;
$bi=$s1;
$nj=$s2;
$tp=$s3;
$alo=$s4;
$ahi=$s5;
$nlo=$s6;
$nhi=$s7;
$tj=$s8;
$i=$s9;
$j=$s10;
$m1=$s11;
$FRAMESIZE=14;
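# For orientation, here is a portable word-serial Montgomery multiplication
# in C (an illustrative sketch under the usual conventions, with 32-bit
# words for clarity; the function name, the caller-provided scratch tp[] of
# num+2 words, and the uint64_t intermediates are assumptions of the
# sketch, not OpenSSL's bn_asm.c). It computes rp = ap*bp*R^-1 mod np with
# R = 2^(32*num) and n0 = -np[0]^-1 mod 2^32; the assembler below realizes
# the same recurrence with the multiply and reduce passes interleaved in
# the .L1st/.Linner loops.
#
#	#include <stdint.h>
#	#include <string.h>
#
#	static void mont_mul32(uint32_t *rp, const uint32_t *ap,
#	                       const uint32_t *bp, const uint32_t *np,
#	                       uint32_t n0, int num, uint32_t *tp)
#	{
#	    memset(tp, 0, (num + 2) * sizeof(uint32_t));
#	    for (int i = 0; i < num; i++) {
#	        uint64_t c = 0;
#	        for (int j = 0; j < num; j++) {     /* tp += ap[]*bp[i] */
#	            c += (uint64_t)ap[j] * bp[i] + tp[j];
#	            tp[j] = (uint32_t)c;  c >>= 32;
#	        }
#	        c += tp[num];
#	        tp[num] = (uint32_t)c;  tp[num + 1] = (uint32_t)(c >> 32);
#
#	        uint32_t m = tp[0] * n0;            /* Montgomery factor */
#	        c = ((uint64_t)m * np[0] + tp[0]) >> 32;
#	        for (int j = 1; j < num; j++) {     /* tp = (tp + m*np)/2^32 */
#	            c += (uint64_t)m * np[j] + tp[j];
#	            tp[j - 1] = (uint32_t)c;  c >>= 32;
#	        }
#	        c += tp[num];
#	        tp[num - 1] = (uint32_t)c;
#	        tp[num] = tp[num + 1] + (uint32_t)(c >> 32);
#	    }
#	    uint64_t b = 0;                         /* rp = tp - np ... */
#	    for (int j = 0; j < num; j++) {
#	        uint64_t d = (uint64_t)tp[j] - np[j] - b;
#	        rp[j] = (uint32_t)d;  b = (d >> 32) & 1;
#	    }
#	    if (tp[num] < b)                        /* ... unless tp < np */
#	        memcpy(rp, tp, num * sizeof(uint32_t));
#	}
#
# Like the other bn_mul_mont modules, the assembler below returns 1 when it
# handles the input and 0 (for too-short vectors) to make the caller fall
# back to the generic C implementation.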
$code=<<___;
.text
.set noat
.set noreorder
.align 5
.globl bn_mul_mont
.ent bn_mul_mont
bn_mul_mont:
___
$code.=<<___ if ($flavour =~ /o32/i);
lw $n0,16($sp)
lw $num,20($sp)
___
$code.=<<___;
slt $at,$num,4
bnez $at,1f
li $t0,0
slt $at,$num,17 # on in-order CPU
bnez $at,bn_mul_mont_internal
nop
1: jr $ra
li $a0,0
.end bn_mul_mont
.align 5
.ent bn_mul_mont_internal
bn_mul_mont_internal:
.frame $fp,$FRAMESIZE*$SZREG,$ra
.mask 0x40000000|$SAVED_REGS_MASK,-$SZREG
$PTR_SUB $sp,$FRAMESIZE*$SZREG
$REG_S $fp,($FRAMESIZE-1)*$SZREG($sp)
$REG_S $s11,($FRAMESIZE-2)*$SZREG($sp)
$REG_S $s10,($FRAMESIZE-3)*$SZREG($sp)
$REG_S $s9,($FRAMESIZE-4)*$SZREG($sp)
$REG_S $s8,($FRAMESIZE-5)*$SZREG($sp)
$REG_S $s7,($FRAMESIZE-6)*$SZREG($sp)
$REG_S $s6,($FRAMESIZE-7)*$SZREG($sp)
$REG_S $s5,($FRAMESIZE-8)*$SZREG($sp)
$REG_S $s4,($FRAMESIZE-9)*$SZREG($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);
$REG_S $s3,($FRAMESIZE-10)*$SZREG($sp)
$REG_S $s2,($FRAMESIZE-11)*$SZREG($sp)
$REG_S $s1,($FRAMESIZE-12)*$SZREG($sp)
$REG_S $s0,($FRAMESIZE-13)*$SZREG($sp)
___
$code.=<<___;
move $fp,$sp
.set reorder
$LD $n0,0($n0)
$LD $bi,0($bp) # bp[0]
$LD $aj,0($ap) # ap[0]
$LD $nj,0($np) # np[0]
$PTR_SUB $sp,2*$BNSZ # place for two extra words
sll $num,`log($BNSZ)/log(2)`
li $at,-4096
$PTR_SUB $sp,$num
and $sp,$at
$MULTU $aj,$bi
$LD $alo,$BNSZ($ap)
$LD $nlo,$BNSZ($np)
mflo $lo0
mfhi $hi0
$MULTU $lo0,$n0
mflo $m1
$MULTU $alo,$bi
mflo $alo
mfhi $ahi
$MULTU $nj,$m1
mflo $lo1
mfhi $hi1
$MULTU $nlo,$m1
$ADDU $lo1,$lo0
sltu $at,$lo1,$lo0
$ADDU $hi1,$at
mflo $nlo
mfhi $nhi
move $tp,$sp
li $j,2*$BNSZ
.align 4
.L1st:
.set noreorder
$PTR_ADD $aj,$ap,$j
$PTR_ADD $nj,$np,$j
$LD $aj,($aj)
$LD $nj,($nj)
$MULTU $aj,$bi
$ADDU $lo0,$alo,$hi0
$ADDU $lo1,$nlo,$hi1
sltu $at,$lo0,$hi0
sltu $t0,$lo1,$hi1
$ADDU $hi0,$ahi,$at
$ADDU $hi1,$nhi,$t0
mflo $alo
mfhi $ahi
$ADDU $lo1,$lo0
sltu $at,$lo1,$lo0
$MULTU $nj,$m1
$ADDU $hi1,$at
addu $j,$BNSZ
$ST $lo1,($tp)
sltu $t0,$j,$num
mflo $nlo
mfhi $nhi
bnez $t0,.L1st
$PTR_ADD $tp,$BNSZ
.set reorder
$ADDU $lo0,$alo,$hi0
sltu $at,$lo0,$hi0
$ADDU $hi0,$ahi,$at
$ADDU $lo1,$nlo,$hi1
sltu $t0,$lo1,$hi1
$ADDU $hi1,$nhi,$t0
$ADDU $lo1,$lo0
sltu $at,$lo1,$lo0
$ADDU $hi1,$at
$ST $lo1,($tp)
$ADDU $hi1,$hi0
sltu $at,$hi1,$hi0
$ST $hi1,$BNSZ($tp)
$ST $at,2*$BNSZ($tp)
li $i,$BNSZ
.align 4
.Louter:
$PTR_ADD $bi,$bp,$i
$LD $bi,($bi)
$LD $aj,($ap)
$LD $alo,$BNSZ($ap)
$LD $tj,($sp)
$MULTU $aj,$bi
$LD $nj,($np)
$LD $nlo,$BNSZ($np)
mflo $lo0
mfhi $hi0
$ADDU $lo0,$tj
$MULTU $lo0,$n0
sltu $at,$lo0,$tj
$ADDU $hi0,$at
mflo $m1
$MULTU $alo,$bi
mflo $alo
mfhi $ahi
$MULTU $nj,$m1
mflo $lo1
mfhi $hi1
$MULTU $nlo,$m1
$ADDU $lo1,$lo0
sltu $at,$lo1,$lo0
$ADDU $hi1,$at
mflo $nlo
mfhi $nhi
move $tp,$sp
li $j,2*$BNSZ
$LD $tj,$BNSZ($tp)
.align 4
.Linner:
.set noreorder
$PTR_ADD $aj,$ap,$j
$PTR_ADD $nj,$np,$j
$LD $aj,($aj)
$LD $nj,($nj)
$MULTU $aj,$bi
$ADDU $lo0,$alo,$hi0
$ADDU $lo1,$nlo,$hi1
sltu $at,$lo0,$hi0
sltu $t0,$lo1,$hi1
$ADDU $hi0,$ahi,$at
$ADDU $hi1,$nhi,$t0
mflo $alo
mfhi $ahi
$ADDU $lo0,$tj
addu $j,$BNSZ
$MULTU $nj,$m1
sltu $at,$lo0,$tj
$ADDU $lo1,$lo0
$ADDU $hi0,$at
sltu $t0,$lo1,$lo0
$LD $tj,2*$BNSZ($tp)
$ADDU $hi1,$t0
sltu $at,$j,$num
mflo $nlo
mfhi $nhi
$ST $lo1,($tp)
bnez $at,.Linner
$PTR_ADD $tp,$BNSZ
.set reorder
$ADDU $lo0,$alo,$hi0
sltu $at,$lo0,$hi0
$ADDU $hi0,$ahi,$at
$ADDU $lo0,$tj
sltu $t0,$lo0,$tj
$ADDU $hi0,$t0
$LD $tj,2*$BNSZ($tp)
$ADDU $lo1,$nlo,$hi1
sltu $at,$lo1,$hi1
$ADDU $hi1,$nhi,$at
$ADDU $lo1,$lo0
sltu $t0,$lo1,$lo0
$ADDU $hi1,$t0
$ST $lo1,($tp)
$ADDU $lo1,$hi1,$hi0
sltu $hi1,$lo1,$hi0
$ADDU $lo1,$tj
sltu $at,$lo1,$tj
$ADDU $hi1,$at
$ST $lo1,$BNSZ($tp)
$ST $hi1,2*$BNSZ($tp)
addu $i,$BNSZ
sltu $t0,$i,$num
bnez $t0,.Louter
.set noreorder
$PTR_ADD $tj,$sp,$num # &tp[num]
move $tp,$sp
move $ap,$sp
li $hi0,0 # clear borrow bit
.align 4
.Lsub: $LD $lo0,($tp)
$LD $lo1,($np)
$PTR_ADD $tp,$BNSZ
$PTR_ADD $np,$BNSZ
$SUBU $lo1,$lo0,$lo1 # tp[i]-np[i]
sgtu $at,$lo1,$lo0
$SUBU $lo0,$lo1,$hi0
sgtu $hi0,$lo0,$lo1
$ST $lo0,($rp)
or $hi0,$at
sltu $at,$tp,$tj
bnez $at,.Lsub
$PTR_ADD $rp,$BNSZ
$SUBU $hi0,$hi1,$hi0 # handle upmost overflow bit
move $tp,$sp
$PTR_SUB $rp,$num # restore rp
not $hi1,$hi0
and $ap,$hi0,$sp
and $bp,$hi1,$rp
or $ap,$ap,$bp # ap=borrow?tp:rp
.align 4
.Lcopy: $LD $aj,($ap)
$PTR_ADD $ap,$BNSZ
$ST $zero,($tp)
$PTR_ADD $tp,$BNSZ
sltu $at,$tp,$tj
$ST $aj,($rp)
bnez $at,.Lcopy
$PTR_ADD $rp,$BNSZ
li $a0,1
li $t0,1
.set noreorder
move $sp,$fp
$REG_L $fp,($FRAMESIZE-1)*$SZREG($sp)
$REG_L $s11,($FRAMESIZE-2)*$SZREG($sp)
$REG_L $s10,($FRAMESIZE-3)*$SZREG($sp)
$REG_L $s9,($FRAMESIZE-4)*$SZREG($sp)
$REG_L $s8,($FRAMESIZE-5)*$SZREG($sp)
$REG_L $s7,($FRAMESIZE-6)*$SZREG($sp)
$REG_L $s6,($FRAMESIZE-7)*$SZREG($sp)
$REG_L $s5,($FRAMESIZE-8)*$SZREG($sp)
$REG_L $s4,($FRAMESIZE-9)*$SZREG($sp)
___
$code.=<<___ if ($flavour =~ /nubi/i);
$REG_L $s3,($FRAMESIZE-10)*$SZREG($sp)
$REG_L $s2,($FRAMESIZE-11)*$SZREG($sp)
$REG_L $s1,($FRAMESIZE-12)*$SZREG($sp)
$REG_L $s0,($FRAMESIZE-13)*$SZREG($sp)
___
$code.=<<___;
jr $ra
$PTR_ADD $sp,$FRAMESIZE*$SZREG
.end bn_mul_mont_internal
.rdata
.asciiz "Montgomery Multiplication for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;

2234
crypto/bn/asm/mips.pl Normal file

File diff suppressed because it is too large

327
crypto/bn/asm/mips3-mont.pl Normal file
View File

@@ -0,0 +1,327 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# This module is not of direct interest to OpenSSL, because it
# doesn't provide better performance for longer keys: while 512-bit
# RSA private-key operations are 40% faster, 1024-bit ones are hardly
# faster at all, and longer-key operations are slower by up to 20%.
# It might be of interest to embedded-system developers though, as
# it's smaller than 1KB, yet offers ~3x improvement over
# compiler-generated code.
#
# The module targets the N32 and N64 MIPS ABIs and currently is a bit
# IRIX-centric, i.e. it is likely to require adaptation for other OSes.
# int bn_mul_mont(
$rp="a0"; # BN_ULONG *rp,
$ap="a1"; # const BN_ULONG *ap,
$bp="a2"; # const BN_ULONG *bp,
$np="a3"; # const BN_ULONG *np,
$n0="a4"; # const BN_ULONG *n0,
$num="a5"; # int num);
$lo0="a6";
$hi0="a7";
$lo1="v0";
$hi1="v1";
$aj="t0";
$bi="t1";
$nj="t2";
$tp="t3";
$alo="s0";
$ahi="s1";
$nlo="s2";
$nhi="s3";
$tj="s4";
$i="s5";
$j="s6";
$fp="t8";
$m1="t9";
$FRAME=8*(2+8);
$code=<<___;
#include <asm.h>
#include <regdef.h>
.text
.set noat
.set reorder
.align 5
.globl bn_mul_mont
.ent bn_mul_mont
bn_mul_mont:
.set noreorder
PTR_SUB sp,64
move $fp,sp
.frame $fp,64,ra
slt AT,$num,4
li v0,0
beqzl AT,.Lproceed
nop
jr ra
PTR_ADD sp,$fp,64
.set reorder
.align 5
.Lproceed:
ld $n0,0($n0)
ld $bi,0($bp) # bp[0]
ld $aj,0($ap) # ap[0]
ld $nj,0($np) # np[0]
PTR_SUB sp,16 # place for two extra words
sll $num,3
li AT,-4096
PTR_SUB sp,$num
and sp,AT
sd s0,0($fp)
sd s1,8($fp)
sd s2,16($fp)
sd s3,24($fp)
sd s4,32($fp)
sd s5,40($fp)
sd s6,48($fp)
sd s7,56($fp)
dmultu $aj,$bi
ld $alo,8($ap)
ld $nlo,8($np)
mflo $lo0
mfhi $hi0
dmultu $lo0,$n0
mflo $m1
dmultu $alo,$bi
mflo $alo
mfhi $ahi
dmultu $nj,$m1
mflo $lo1
mfhi $hi1
dmultu $nlo,$m1
daddu $lo1,$lo0
sltu AT,$lo1,$lo0
daddu $hi1,AT
mflo $nlo
mfhi $nhi
move $tp,sp
li $j,16
.align 4
.L1st:
.set noreorder
PTR_ADD $aj,$ap,$j
ld $aj,($aj)
PTR_ADD $nj,$np,$j
ld $nj,($nj)
dmultu $aj,$bi
daddu $lo0,$alo,$hi0
daddu $lo1,$nlo,$hi1
sltu AT,$lo0,$hi0
sltu s7,$lo1,$hi1
daddu $hi0,$ahi,AT
daddu $hi1,$nhi,s7
mflo $alo
mfhi $ahi
daddu $lo1,$lo0
sltu AT,$lo1,$lo0
dmultu $nj,$m1
daddu $hi1,AT
addu $j,8
sd $lo1,($tp)
sltu s7,$j,$num
mflo $nlo
mfhi $nhi
bnez s7,.L1st
PTR_ADD $tp,8
.set reorder
daddu $lo0,$alo,$hi0
sltu AT,$lo0,$hi0
daddu $hi0,$ahi,AT
daddu $lo1,$nlo,$hi1
sltu s7,$lo1,$hi1
daddu $hi1,$nhi,s7
daddu $lo1,$lo0
sltu AT,$lo1,$lo0
daddu $hi1,AT
sd $lo1,($tp)
daddu $hi1,$hi0
sltu AT,$hi1,$hi0
sd $hi1,8($tp)
sd AT,16($tp)
li $i,8
.align 4
.Louter:
PTR_ADD $bi,$bp,$i
ld $bi,($bi)
ld $aj,($ap)
ld $alo,8($ap)
ld $tj,(sp)
dmultu $aj,$bi
ld $nj,($np)
ld $nlo,8($np)
mflo $lo0
mfhi $hi0
daddu $lo0,$tj
dmultu $lo0,$n0
sltu AT,$lo0,$tj
daddu $hi0,AT
mflo $m1
dmultu $alo,$bi
mflo $alo
mfhi $ahi
dmultu $nj,$m1
mflo $lo1
mfhi $hi1
dmultu $nlo,$m1
daddu $lo1,$lo0
sltu AT,$lo1,$lo0
daddu $hi1,AT
mflo $nlo
mfhi $nhi
move $tp,sp
li $j,16
ld $tj,8($tp)
.align 4
.Linner:
.set noreorder
PTR_ADD $aj,$ap,$j
ld $aj,($aj)
PTR_ADD $nj,$np,$j
ld $nj,($nj)
dmultu $aj,$bi
daddu $lo0,$alo,$hi0
daddu $lo1,$nlo,$hi1
sltu AT,$lo0,$hi0
sltu s7,$lo1,$hi1
daddu $hi0,$ahi,AT
daddu $hi1,$nhi,s7
mflo $alo
mfhi $ahi
daddu $lo0,$tj
addu $j,8
dmultu $nj,$m1
sltu AT,$lo0,$tj
daddu $lo1,$lo0
daddu $hi0,AT
sltu s7,$lo1,$lo0
ld $tj,16($tp)
daddu $hi1,s7
sltu AT,$j,$num
mflo $nlo
mfhi $nhi
sd $lo1,($tp)
bnez AT,.Linner
PTR_ADD $tp,8
.set reorder
daddu $lo0,$alo,$hi0
sltu AT,$lo0,$hi0
daddu $hi0,$ahi,AT
daddu $lo0,$tj
sltu s7,$lo0,$tj
daddu $hi0,s7
ld $tj,16($tp)
daddu $lo1,$nlo,$hi1
sltu AT,$lo1,$hi1
daddu $hi1,$nhi,AT
daddu $lo1,$lo0
sltu s7,$lo1,$lo0
daddu $hi1,s7
sd $lo1,($tp)
daddu $lo1,$hi1,$hi0
sltu $hi1,$lo1,$hi0
daddu $lo1,$tj
sltu AT,$lo1,$tj
daddu $hi1,AT
sd $lo1,8($tp)
sd $hi1,16($tp)
addu $i,8
sltu s7,$i,$num
bnez s7,.Louter
.set noreorder
PTR_ADD $tj,sp,$num # &tp[num]
move $tp,sp
move $ap,sp
li $hi0,0 # clear borrow bit
.align 4
.Lsub: ld $lo0,($tp)
ld $lo1,($np)
PTR_ADD $tp,8
PTR_ADD $np,8
dsubu $lo1,$lo0,$lo1 # tp[i]-np[i]
sgtu AT,$lo1,$lo0
dsubu $lo0,$lo1,$hi0
sgtu $hi0,$lo0,$lo1
sd $lo0,($rp)
or $hi0,AT
sltu AT,$tp,$tj
bnez AT,.Lsub
PTR_ADD $rp,8
dsubu $hi0,$hi1,$hi0 # handle upmost overflow bit
move $tp,sp
PTR_SUB $rp,$num # restore rp
not $hi1,$hi0
and $ap,$hi0,sp
and $bp,$hi1,$rp
or $ap,$ap,$bp # ap=borrow?tp:rp
.align 4
.Lcopy: ld $aj,($ap)
PTR_ADD $ap,8
PTR_ADD $tp,8
sd zero,-8($tp)
sltu AT,$tp,$tj
sd $aj,($rp)
bnez AT,.Lcopy
PTR_ADD $rp,8
ld s0,0($fp)
ld s1,8($fp)
ld s2,16($fp)
ld s3,24($fp)
ld s4,32($fp)
ld s5,40($fp)
ld s6,48($fp)
ld s7,56($fp)
li v0,1
jr ra
PTR_ADD sp,$fp,64
.set reorder
END(bn_mul_mont)
.rdata
.asciiz "Montgomery Multiplication for MIPS III/IV, CRYPTOGAMS by <appro\@openssl.org>"
___
print $code;
close STDOUT;

2201
crypto/bn/asm/mips3.s Normal file

File diff suppressed because it is too large

1618
crypto/bn/asm/pa-risc2.s Normal file

File diff suppressed because it is too large

1605
crypto/bn/asm/pa-risc2W.s Normal file

File diff suppressed because it is too large

995
crypto/bn/asm/parisc-mont.pl Normal file
View File

@@ -0,0 +1,995 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# On PA-7100LC this module performs ~90-50% better, less for longer
# keys, than code generated by gcc 3.2 for PA-RISC 1.1. The latter means
# that the compiler utilized the xmpyu instruction to perform 32x32=64-bit
# multiplication, which in turn means that "baseline" performance was
# optimal with respect to instruction set capabilities. A fair comparison
# with the vendor compiler is problematic, because OpenSSL doesn't define
# BN_LLONG [presumably] for historical reasons, which drives the compiler
# toward 4 times 16x16=32-bit multiplications [plus complementary
# shifts and additions] instead. This means that you should observe
# several times improvement over code generated by vendor compiler
# for PA-RISC 1.1, but the "baseline" is far from optimal. The actual
# improvement coefficient was never collected on PA-7100LC, or any
# other 1.1 CPU, because I don't have access to such machine with
# vendor compiler. But to give you a taste, PA-RISC 1.1 code path
# reportedly outperformed code generated by cc +DA1.1 +O3 by factor
# of ~5x on PA-8600.
#
# On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is
# reportedly ~2x faster than vendor compiler generated code [according
# to comment in pa-risc2[W].s]. Here comes a catch. The execution core of
# this implementation is actually a 32-bit one, in the sense that it
# operates on 32-bit values. But pa-risc2[W].s operates on arrays of
# 64-bit BN_ULONGs... How do they interoperate then? No problem. This
# module picks the halves of the 64-bit values in reverse order and
# pretends they were 32-bit BN_ULONGs (see the sketch after this header
# comment). But can a 32-bit core compete with "pure"
# 64-bit code such as pa-risc2[W].s then? Well, the thing is that
# 32x32=64-bit multiplication is the best even PA-RISC 2.0 can do,
# i.e. there is no "wider" multiplication like on most other 64-bit
# platforms. This means that even being effectively 32-bit, this
# implementation performs the "64-bit" computational task in the same amount
# of arithmetic operations, most notably multiplications. It requires
# more memory references, most notably to tp[num], but this doesn't
# seem to exhaust memory port capacity. And indeed, dedicated PA-RISC
# 2.0 code path provides virtually same performance as pa-risc2[W].s:
# it's ~10% better for shortest key length and ~10% worse for longest
# one.
#
# In case it wasn't clear: the module has two distinct code paths,
# PA-RISC 1.1 and PA-RISC 2.0. The latter features carry-free 64-bit
# additions and 64-bit integer loads, not to mention specific
# instruction scheduling. In a 64-bit build naturally only the 2.0 code
# path is assembled. In a 32-bit application context both code paths are
# assembled, a PA-RISC 2.0 CPU is detected at run-time and the proper path
# is taken automatically. Also, in a 32-bit build the module imposes a
# couple of limitations: vector lengths have to be even and vector
# addresses have to be 64-bit aligned. Normally neither is a problem:
# most common key lengths are even and vectors are commonly malloc-ed,
# which ensures alignment.
#
# Special thanks to polarhome.com for providing HP-UX account on
# PA-RISC 1.1 machine, and to correspondent who chose to remain
# anonymous for testing the code on PA-RISC 2.0 machine.
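# To make the word-order trick above concrete, here is a hypothetical C
# sketch (flip_words and its arguments are illustrative names, not part of
# the module) of how a big-endian array of 64-bit BN_ULONGs is viewed as
# twice as many 32-bit limbs: the two halves of every word are swapped,
# because the least-significant 32-bit limb of a big-endian 64-bit word
# sits at the higher address.
#
#	#include <stdint.h>
#
#	/* dst must hold 2*n words; src is the BN_ULONG[] as stored in memory */
#	static void flip_words(uint32_t *dst, const uint64_t *src, int n)
#	{
#	    for (int i = 0; i < n; i++) {
#	        dst[2 * i]     = (uint32_t)src[i];         /* low half first */
#	        dst[2 * i + 1] = (uint32_t)(src[i] >> 32); /* then high half */
#	    }
#	}
#
# The module performs this flip implicitly, by loading 32-bit halves at
# swapped offsets (and flipping words back in the final subtraction/copy
# pass of the 64-bit code path), so no separate copy is ever made.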
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
$flavour = shift;
$output = shift;
open STDOUT,">$output";
if ($flavour =~ /64/) {
$LEVEL ="2.0W";
$SIZE_T =8;
$FRAME_MARKER =80;
$SAVED_RP =16;
$PUSH ="std";
$PUSHMA ="std,ma";
$POP ="ldd";
$POPMB ="ldd,mb";
$BN_SZ =$SIZE_T;
} else {
$LEVEL ="1.1"; #$LEVEL.="\n\t.ALLOW\t2.0";
$SIZE_T =4;
$FRAME_MARKER =48;
$SAVED_RP =20;
$PUSH ="stw";
$PUSHMA ="stwm";
$POP ="ldw";
$POPMB ="ldwm";
$BN_SZ =$SIZE_T;
if (open CONF,"<${dir}../../opensslconf.h") {
while(<CONF>) {
if (m/#\s*define\s+SIXTY_FOUR_BIT/) {
$BN_SZ=8;
$LEVEL="2.0";
last;
}
}
close CONF;
}
}
$FRAME=8*$SIZE_T+$FRAME_MARKER; # 8 saved regs + frame marker
# [+ argument transfer]
$LOCALS=$FRAME-$FRAME_MARKER;
$FRAME+=32; # local variables
$tp="%r31";
$ti1="%r29";
$ti0="%r28";
$rp="%r26";
$ap="%r25";
$bp="%r24";
$np="%r23";
$n0="%r22"; # passed through stack in 32-bit
$num="%r21"; # passed through stack in 32-bit
$idx="%r20";
$arrsz="%r19";
$nm1="%r7";
$nm0="%r6";
$ab1="%r5";
$ab0="%r4";
$fp="%r3";
$hi1="%r2";
$hi0="%r1";
$xfer=$n0; # accommodates [-16..15] offset in fld[dw]s
$fm0="%fr4"; $fti=$fm0;
$fbi="%fr5L";
$fn0="%fr5R";
$fai="%fr6"; $fab0="%fr7"; $fab1="%fr8";
$fni="%fr9"; $fnm0="%fr10"; $fnm1="%fr11";
$code=<<___;
.LEVEL $LEVEL
.SPACE \$TEXT\$
.SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
.EXPORT bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
.ALIGN 64
bn_mul_mont
.PROC
.CALLINFO FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6
.ENTRY
$PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
$PUSHMA %r3,$FRAME(%sp)
$PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
$PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
$PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
$PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
$PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
$PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
$PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
ldo -$FRAME(%sp),$fp
___
$code.=<<___ if ($SIZE_T==4);
ldw `-$FRAME_MARKER-4`($fp),$n0
ldw `-$FRAME_MARKER-8`($fp),$num
nop
nop ; alignment
___
$code.=<<___ if ($BN_SZ==4);
comiclr,<= 6,$num,%r0 ; are vectors long enough?
b L\$abort
ldi 0,%r28 ; signal "unhandled"
add,ev %r0,$num,$num ; is $num even?
b L\$abort
nop
or $ap,$np,$ti1
extru,= $ti1,31,3,%r0 ; are ap and np 64-bit aligned?
b L\$abort
nop
nop ; alignment
nop
fldws 0($n0),${fn0}
fldws,ma 4($bp),${fbi} ; bp[0]
___
$code.=<<___ if ($BN_SZ==8);
comib,> 3,$num,L\$abort ; are vectors long enough?
ldi 0,%r28 ; signal "unhandled"
addl $num,$num,$num ; I operate on 32-bit values
fldws 4($n0),${fn0} ; only low part of n0
fldws 4($bp),${fbi} ; bp[0] in flipped word order
___
$code.=<<___;
fldds 0($ap),${fai} ; ap[0,1]
fldds 0($np),${fni} ; np[0,1]
sh2addl $num,%r0,$arrsz
ldi 31,$hi0
ldo 36($arrsz),$hi1 ; space for tp[num+1]
andcm $hi1,$hi0,$hi1 ; align
addl $hi1,%sp,%sp
$PUSH $fp,-$SIZE_T(%sp)
ldo `$LOCALS+16`($fp),$xfer
ldo `$LOCALS+32+4`($fp),$tp
xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[0]
xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[0]
xmpyu ${fn0},${fab0}R,${fm0}
addl $arrsz,$ap,$ap ; point at the end
addl $arrsz,$np,$np
subi 0,$arrsz,$idx ; j=0
ldo 8($idx),$idx ; j++++
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
fstds ${fab0},-16($xfer)
fstds ${fnm0},-8($xfer)
fstds ${fab1},0($xfer)
fstds ${fnm1},8($xfer)
flddx $idx($ap),${fai} ; ap[2,3]
flddx $idx($np),${fni} ; np[2,3]
___
$code.=<<___ if ($BN_SZ==4);
mtctl $hi0,%cr11 ; $hi0 still holds 31
extrd,u,*= $hi0,%sar,1,$hi0 ; executes on PA-RISC 1.0
b L\$parisc11
nop
___
$code.=<<___; # PA-RISC 2.0 code-path
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
ldd -16($xfer),$ab0
fstds ${fab0},-16($xfer)
extrd,u $ab0,31,32,$hi0
extrd,u $ab0,63,32,$ab0
ldd -8($xfer),$nm0
fstds ${fnm0},-8($xfer)
ldo 8($idx),$idx ; j++++
addl $ab0,$nm0,$nm0 ; low part is discarded
extrd,u $nm0,31,32,$hi1
L\$1st
xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0]
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
ldd 0($xfer),$ab1
fstds ${fab1},0($xfer)
addl $hi0,$ab1,$ab1
extrd,u $ab1,31,32,$hi0
ldd 8($xfer),$nm1
fstds ${fnm1},8($xfer)
extrd,u $ab1,63,32,$ab1
addl $hi1,$nm1,$nm1
flddx $idx($ap),${fai} ; ap[j,j+1]
flddx $idx($np),${fni} ; np[j,j+1]
addl $ab1,$nm1,$nm1
extrd,u $nm1,31,32,$hi1
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
ldd -16($xfer),$ab0
fstds ${fab0},-16($xfer)
addl $hi0,$ab0,$ab0
extrd,u $ab0,31,32,$hi0
ldd -8($xfer),$nm0
fstds ${fnm0},-8($xfer)
extrd,u $ab0,63,32,$ab0
addl $hi1,$nm0,$nm0
stw $nm1,-4($tp) ; tp[j-1]
addl $ab0,$nm0,$nm0
stw,ma $nm0,8($tp) ; tp[j-1]
addib,<> 8,$idx,L\$1st ; j++++
extrd,u $nm0,31,32,$hi1
xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0]
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
ldd 0($xfer),$ab1
fstds ${fab1},0($xfer)
addl $hi0,$ab1,$ab1
extrd,u $ab1,31,32,$hi0
ldd 8($xfer),$nm1
fstds ${fnm1},8($xfer)
extrd,u $ab1,63,32,$ab1
addl $hi1,$nm1,$nm1
ldd -16($xfer),$ab0
addl $ab1,$nm1,$nm1
ldd -8($xfer),$nm0
extrd,u $nm1,31,32,$hi1
addl $hi0,$ab0,$ab0
extrd,u $ab0,31,32,$hi0
stw $nm1,-4($tp) ; tp[j-1]
extrd,u $ab0,63,32,$ab0
addl $hi1,$nm0,$nm0
ldd 0($xfer),$ab1
addl $ab0,$nm0,$nm0
ldd,mb 8($xfer),$nm1
extrd,u $nm0,31,32,$hi1
stw,ma $nm0,8($tp) ; tp[j-1]
ldo -1($num),$num ; i--
subi 0,$arrsz,$idx ; j=0
___
$code.=<<___ if ($BN_SZ==4);
fldws,ma 4($bp),${fbi} ; bp[1]
___
$code.=<<___ if ($BN_SZ==8);
fldws 0($bp),${fbi} ; bp[1] in flipped word order
___
$code.=<<___;
flddx $idx($ap),${fai} ; ap[0,1]
flddx $idx($np),${fni} ; np[0,1]
fldws 8($xfer),${fti}R ; tp[0]
addl $hi0,$ab1,$ab1
extrd,u $ab1,31,32,$hi0
extrd,u $ab1,63,32,$ab1
ldo 8($idx),$idx ; j++++
xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1]
xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1]
addl $hi1,$nm1,$nm1
addl $ab1,$nm1,$nm1
extrd,u $nm1,31,32,$hi1
fstws,mb ${fab0}L,-8($xfer) ; save high part
stw $nm1,-4($tp) ; tp[j-1]
fcpy,sgl %fr0,${fti}L ; zero high part
fcpy,sgl %fr0,${fab0}L
addl $hi1,$hi0,$hi0
extrd,u $hi0,31,32,$hi1
fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
fcnvxf,dbl,dbl ${fab0},${fab0}
stw $hi0,0($tp)
stw $hi1,4($tp)
fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
xmpyu ${fn0},${fab0}R,${fm0}
ldo `$LOCALS+32+4`($fp),$tp
L\$outer
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
fstds ${fab0},-16($xfer) ; 33-bit value
fstds ${fnm0},-8($xfer)
flddx $idx($ap),${fai} ; ap[2]
flddx $idx($np),${fni} ; np[2]
ldo 8($idx),$idx ; j++++
ldd -16($xfer),$ab0 ; 33-bit value
ldd -8($xfer),$nm0
ldw 0($xfer),$hi0 ; high part
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
extrd,u $ab0,31,32,$ti0 ; carry bit
extrd,u $ab0,63,32,$ab0
fstds ${fab1},0($xfer)
addl $ti0,$hi0,$hi0 ; account carry bit
fstds ${fnm1},8($xfer)
addl $ab0,$nm0,$nm0 ; low part is discarded
ldw 0($tp),$ti1 ; tp[1]
extrd,u $nm0,31,32,$hi1
fstds ${fab0},-16($xfer)
fstds ${fnm0},-8($xfer)
L\$inner
xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i]
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
ldd 0($xfer),$ab1
fstds ${fab1},0($xfer)
addl $hi0,$ti1,$ti1
addl $ti1,$ab1,$ab1
ldd 8($xfer),$nm1
fstds ${fnm1},8($xfer)
extrd,u $ab1,31,32,$hi0
extrd,u $ab1,63,32,$ab1
flddx $idx($ap),${fai} ; ap[j,j+1]
flddx $idx($np),${fni} ; np[j,j+1]
addl $hi1,$nm1,$nm1
addl $ab1,$nm1,$nm1
ldw 4($tp),$ti0 ; tp[j]
stw $nm1,-4($tp) ; tp[j-1]
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
ldd -16($xfer),$ab0
fstds ${fab0},-16($xfer)
addl $hi0,$ti0,$ti0
addl $ti0,$ab0,$ab0
ldd -8($xfer),$nm0
fstds ${fnm0},-8($xfer)
extrd,u $ab0,31,32,$hi0
extrd,u $nm1,31,32,$hi1
ldw 8($tp),$ti1 ; tp[j]
extrd,u $ab0,63,32,$ab0
addl $hi1,$nm0,$nm0
addl $ab0,$nm0,$nm0
stw,ma $nm0,8($tp) ; tp[j-1]
addib,<> 8,$idx,L\$inner ; j++++
extrd,u $nm0,31,32,$hi1
xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i]
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
ldd 0($xfer),$ab1
fstds ${fab1},0($xfer)
addl $hi0,$ti1,$ti1
addl $ti1,$ab1,$ab1
ldd 8($xfer),$nm1
fstds ${fnm1},8($xfer)
extrd,u $ab1,31,32,$hi0
extrd,u $ab1,63,32,$ab1
ldw 4($tp),$ti0 ; tp[j]
addl $hi1,$nm1,$nm1
addl $ab1,$nm1,$nm1
ldd -16($xfer),$ab0
ldd -8($xfer),$nm0
extrd,u $nm1,31,32,$hi1
addl $hi0,$ab0,$ab0
addl $ti0,$ab0,$ab0
stw $nm1,-4($tp) ; tp[j-1]
extrd,u $ab0,31,32,$hi0
ldw 8($tp),$ti1 ; tp[j]
extrd,u $ab0,63,32,$ab0
addl $hi1,$nm0,$nm0
ldd 0($xfer),$ab1
addl $ab0,$nm0,$nm0
ldd,mb 8($xfer),$nm1
extrd,u $nm0,31,32,$hi1
stw,ma $nm0,8($tp) ; tp[j-1]
addib,= -1,$num,L\$outerdone ; i--
subi 0,$arrsz,$idx ; j=0
___
$code.=<<___ if ($BN_SZ==4);
fldws,ma 4($bp),${fbi} ; bp[i]
___
$code.=<<___ if ($BN_SZ==8);
ldi 12,$ti0 ; bp[i] in flipped word order
addl,ev %r0,$num,$num
ldi -4,$ti0
addl $ti0,$bp,$bp
fldws 0($bp),${fbi}
___
$code.=<<___;
flddx $idx($ap),${fai} ; ap[0]
addl $hi0,$ab1,$ab1
flddx $idx($np),${fni} ; np[0]
fldws 8($xfer),${fti}R ; tp[0]
addl $ti1,$ab1,$ab1
extrd,u $ab1,31,32,$hi0
extrd,u $ab1,63,32,$ab1
ldo 8($idx),$idx ; j++++
xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i]
xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i]
ldw 4($tp),$ti0 ; tp[j]
addl $hi1,$nm1,$nm1
fstws,mb ${fab0}L,-8($xfer) ; save high part
addl $ab1,$nm1,$nm1
extrd,u $nm1,31,32,$hi1
fcpy,sgl %fr0,${fti}L ; zero high part
fcpy,sgl %fr0,${fab0}L
stw $nm1,-4($tp) ; tp[j-1]
fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
fcnvxf,dbl,dbl ${fab0},${fab0}
addl $hi1,$hi0,$hi0
fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
addl $ti0,$hi0,$hi0
extrd,u $hi0,31,32,$hi1
fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
stw $hi0,0($tp)
stw $hi1,4($tp)
xmpyu ${fn0},${fab0}R,${fm0}
b L\$outer
ldo `$LOCALS+32+4`($fp),$tp
L\$outerdone
addl $hi0,$ab1,$ab1
addl $ti1,$ab1,$ab1
extrd,u $ab1,31,32,$hi0
extrd,u $ab1,63,32,$ab1
ldw 4($tp),$ti0 ; tp[j]
addl $hi1,$nm1,$nm1
addl $ab1,$nm1,$nm1
extrd,u $nm1,31,32,$hi1
stw $nm1,-4($tp) ; tp[j-1]
addl $hi1,$hi0,$hi0
addl $ti0,$hi0,$hi0
extrd,u $hi0,31,32,$hi1
stw $hi0,0($tp)
stw $hi1,4($tp)
ldo `$LOCALS+32`($fp),$tp
sub %r0,%r0,%r0 ; clear borrow
___
$code.=<<___ if ($BN_SZ==4);
ldws,ma 4($tp),$ti0
extru,= $rp,31,3,%r0 ; is rp 64-bit aligned?
b L\$sub_pa11
addl $tp,$arrsz,$tp
L\$sub
ldwx $idx($np),$hi0
subb $ti0,$hi0,$hi1
ldwx $idx($tp),$ti0
addib,<> 4,$idx,L\$sub
stws,ma $hi1,4($rp)
subb $ti0,%r0,$hi1
ldo -4($tp),$tp
___
$code.=<<___ if ($BN_SZ==8);
ldd,ma 8($tp),$ti0
L\$sub
ldd $idx($np),$hi0
shrpd $ti0,$ti0,32,$ti0 ; flip word order
std $ti0,-8($tp) ; save flipped value
sub,db $ti0,$hi0,$hi1
ldd,ma 8($tp),$ti0
addib,<> 8,$idx,L\$sub
std,ma $hi1,8($rp)
extrd,u $ti0,31,32,$ti0 ; carry in flipped word order
sub,db $ti0,%r0,$hi1
ldo -8($tp),$tp
___
$code.=<<___;
and $tp,$hi1,$ap
andcm $rp,$hi1,$bp
or $ap,$bp,$np
sub $rp,$arrsz,$rp ; rewind rp
subi 0,$arrsz,$idx
ldo `$LOCALS+32`($fp),$tp
L\$copy
ldd $idx($np),$hi0
std,ma %r0,8($tp)
addib,<> 8,$idx,.-8 ; L\$copy
std,ma $hi0,8($rp)
___
if ($BN_SZ==4) { # PA-RISC 1.1 code-path
$ablo=$ab0;
$abhi=$ab1;
$nmlo0=$nm0;
$nmhi0=$nm1;
$nmlo1="%r9";
$nmhi1="%r8";
$code.=<<___;
b L\$done
nop
.ALIGN 8
L\$parisc11
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
ldw -12($xfer),$ablo
ldw -16($xfer),$hi0
ldw -4($xfer),$nmlo0
ldw -8($xfer),$nmhi0
fstds ${fab0},-16($xfer)
fstds ${fnm0},-8($xfer)
ldo 8($idx),$idx ; j++++
add $ablo,$nmlo0,$nmlo0 ; discarded
addc %r0,$nmhi0,$hi1
ldw 4($xfer),$ablo
ldw 0($xfer),$abhi
nop
L\$1st_pa11
xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0]
flddx $idx($ap),${fai} ; ap[j,j+1]
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
flddx $idx($np),${fni} ; np[j,j+1]
add $hi0,$ablo,$ablo
ldw 12($xfer),$nmlo1
addc %r0,$abhi,$hi0
ldw 8($xfer),$nmhi1
add $ablo,$nmlo1,$nmlo1
fstds ${fab1},0($xfer)
addc %r0,$nmhi1,$nmhi1
fstds ${fnm1},8($xfer)
add $hi1,$nmlo1,$nmlo1
ldw -12($xfer),$ablo
addc %r0,$nmhi1,$hi1
ldw -16($xfer),$abhi
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
ldw -4($xfer),$nmlo0
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
ldw -8($xfer),$nmhi0
add $hi0,$ablo,$ablo
stw $nmlo1,-4($tp) ; tp[j-1]
addc %r0,$abhi,$hi0
fstds ${fab0},-16($xfer)
add $ablo,$nmlo0,$nmlo0
fstds ${fnm0},-8($xfer)
addc %r0,$nmhi0,$nmhi0
ldw 0($xfer),$abhi
add $hi1,$nmlo0,$nmlo0
ldw 4($xfer),$ablo
stws,ma $nmlo0,8($tp) ; tp[j-1]
addib,<> 8,$idx,L\$1st_pa11 ; j++++
addc %r0,$nmhi0,$hi1
ldw 8($xfer),$nmhi1
ldw 12($xfer),$nmlo1
xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0]
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
add $hi0,$ablo,$ablo
fstds ${fab1},0($xfer)
addc %r0,$abhi,$hi0
fstds ${fnm1},8($xfer)
add $ablo,$nmlo1,$nmlo1
ldw -16($xfer),$abhi
addc %r0,$nmhi1,$nmhi1
ldw -12($xfer),$ablo
add $hi1,$nmlo1,$nmlo1
ldw -8($xfer),$nmhi0
addc %r0,$nmhi1,$hi1
ldw -4($xfer),$nmlo0
add $hi0,$ablo,$ablo
stw $nmlo1,-4($tp) ; tp[j-1]
addc %r0,$abhi,$hi0
ldw 0($xfer),$abhi
add $ablo,$nmlo0,$nmlo0
ldw 4($xfer),$ablo
addc %r0,$nmhi0,$nmhi0
ldws,mb 8($xfer),$nmhi1
add $hi1,$nmlo0,$nmlo0
ldw 4($xfer),$nmlo1
addc %r0,$nmhi0,$hi1
stws,ma $nmlo0,8($tp) ; tp[j-1]
ldo -1($num),$num ; i--
subi 0,$arrsz,$idx ; j=0
fldws,ma 4($bp),${fbi} ; bp[1]
flddx $idx($ap),${fai} ; ap[0,1]
flddx $idx($np),${fni} ; np[0,1]
fldws 8($xfer),${fti}R ; tp[0]
add $hi0,$ablo,$ablo
addc %r0,$abhi,$hi0
ldo 8($idx),$idx ; j++++
xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1]
xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1]
add $hi1,$nmlo1,$nmlo1
addc %r0,$nmhi1,$nmhi1
add $ablo,$nmlo1,$nmlo1
addc %r0,$nmhi1,$hi1
fstws,mb ${fab0}L,-8($xfer) ; save high part
stw $nmlo1,-4($tp) ; tp[j-1]
fcpy,sgl %fr0,${fti}L ; zero high part
fcpy,sgl %fr0,${fab0}L
add $hi1,$hi0,$hi0
addc %r0,%r0,$hi1
fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
fcnvxf,dbl,dbl ${fab0},${fab0}
stw $hi0,0($tp)
stw $hi1,4($tp)
fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
xmpyu ${fn0},${fab0}R,${fm0}
ldo `$LOCALS+32+4`($fp),$tp
L\$outer_pa11
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
fstds ${fab0},-16($xfer) ; 33-bit value
fstds ${fnm0},-8($xfer)
flddx $idx($ap),${fai} ; ap[2,3]
flddx $idx($np),${fni} ; np[2,3]
ldw -16($xfer),$abhi ; carry bit actually
ldo 8($idx),$idx ; j++++
ldw -12($xfer),$ablo
ldw -8($xfer),$nmhi0
ldw -4($xfer),$nmlo0
ldw 0($xfer),$hi0 ; high part
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
fstds ${fab1},0($xfer)
addl $abhi,$hi0,$hi0 ; account carry bit
fstds ${fnm1},8($xfer)
add $ablo,$nmlo0,$nmlo0 ; discarded
ldw 0($tp),$ti1 ; tp[1]
addc %r0,$nmhi0,$hi1
fstds ${fab0},-16($xfer)
fstds ${fnm0},-8($xfer)
ldw 4($xfer),$ablo
ldw 0($xfer),$abhi
L\$inner_pa11
xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i]
flddx $idx($ap),${fai} ; ap[j,j+1]
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
flddx $idx($np),${fni} ; np[j,j+1]
add $hi0,$ablo,$ablo
ldw 4($tp),$ti0 ; tp[j]
addc %r0,$abhi,$abhi
ldw 12($xfer),$nmlo1
add $ti1,$ablo,$ablo
ldw 8($xfer),$nmhi1
addc %r0,$abhi,$hi0
fstds ${fab1},0($xfer)
add $ablo,$nmlo1,$nmlo1
fstds ${fnm1},8($xfer)
addc %r0,$nmhi1,$nmhi1
ldw -12($xfer),$ablo
add $hi1,$nmlo1,$nmlo1
ldw -16($xfer),$abhi
addc %r0,$nmhi1,$hi1
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
ldw 8($tp),$ti1 ; tp[j]
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
ldw -4($xfer),$nmlo0
add $hi0,$ablo,$ablo
ldw -8($xfer),$nmhi0
addc %r0,$abhi,$abhi
stw $nmlo1,-4($tp) ; tp[j-1]
add $ti0,$ablo,$ablo
fstds ${fab0},-16($xfer)
addc %r0,$abhi,$hi0
fstds ${fnm0},-8($xfer)
add $ablo,$nmlo0,$nmlo0
ldw 4($xfer),$ablo
addc %r0,$nmhi0,$nmhi0
ldw 0($xfer),$abhi
add $hi1,$nmlo0,$nmlo0
stws,ma $nmlo0,8($tp) ; tp[j-1]
addib,<> 8,$idx,L\$inner_pa11 ; j++++
addc %r0,$nmhi0,$hi1
xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i]
ldw 12($xfer),$nmlo1
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
ldw 8($xfer),$nmhi1
add $hi0,$ablo,$ablo
ldw 4($tp),$ti0 ; tp[j]
addc %r0,$abhi,$abhi
fstds ${fab1},0($xfer)
add $ti1,$ablo,$ablo
fstds ${fnm1},8($xfer)
addc %r0,$abhi,$hi0
ldw -16($xfer),$abhi
add $ablo,$nmlo1,$nmlo1
ldw -12($xfer),$ablo
addc %r0,$nmhi1,$nmhi1
ldw -8($xfer),$nmhi0
add $hi1,$nmlo1,$nmlo1
ldw -4($xfer),$nmlo0
addc %r0,$nmhi1,$hi1
add $hi0,$ablo,$ablo
stw $nmlo1,-4($tp) ; tp[j-1]
addc %r0,$abhi,$abhi
add $ti0,$ablo,$ablo
ldw 8($tp),$ti1 ; tp[j]
addc %r0,$abhi,$hi0
ldw 0($xfer),$abhi
add $ablo,$nmlo0,$nmlo0
ldw 4($xfer),$ablo
addc %r0,$nmhi0,$nmhi0
ldws,mb 8($xfer),$nmhi1
add $hi1,$nmlo0,$nmlo0
ldw 4($xfer),$nmlo1
addc %r0,$nmhi0,$hi1
stws,ma $nmlo0,8($tp) ; tp[j-1]
addib,= -1,$num,L\$outerdone_pa11; i--
subi 0,$arrsz,$idx ; j=0
fldws,ma 4($bp),${fbi} ; bp[i]
flddx $idx($ap),${fai} ; ap[0]
add $hi0,$ablo,$ablo
addc %r0,$abhi,$abhi
flddx $idx($np),${fni} ; np[0]
fldws 8($xfer),${fti}R ; tp[0]
add $ti1,$ablo,$ablo
addc %r0,$abhi,$hi0
ldo 8($idx),$idx ; j++++
xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i]
xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i]
ldw 4($tp),$ti0 ; tp[j]
add $hi1,$nmlo1,$nmlo1
addc %r0,$nmhi1,$nmhi1
fstws,mb ${fab0}L,-8($xfer) ; save high part
add $ablo,$nmlo1,$nmlo1
addc %r0,$nmhi1,$hi1
fcpy,sgl %fr0,${fti}L ; zero high part
fcpy,sgl %fr0,${fab0}L
stw $nmlo1,-4($tp) ; tp[j-1]
fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
fcnvxf,dbl,dbl ${fab0},${fab0}
add $hi1,$hi0,$hi0
addc %r0,%r0,$hi1
fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
add $ti0,$hi0,$hi0
addc %r0,$hi1,$hi1
fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
stw $hi0,0($tp)
stw $hi1,4($tp)
xmpyu ${fn0},${fab0}R,${fm0}
b L\$outer_pa11
ldo `$LOCALS+32+4`($fp),$tp
L\$outerdone_pa11
add $hi0,$ablo,$ablo
addc %r0,$abhi,$abhi
add $ti1,$ablo,$ablo
addc %r0,$abhi,$hi0
ldw 4($tp),$ti0 ; tp[j]
add $hi1,$nmlo1,$nmlo1
addc %r0,$nmhi1,$nmhi1
add $ablo,$nmlo1,$nmlo1
addc %r0,$nmhi1,$hi1
stw $nmlo1,-4($tp) ; tp[j-1]
add $hi1,$hi0,$hi0
addc %r0,%r0,$hi1
add $ti0,$hi0,$hi0
addc %r0,$hi1,$hi1
stw $hi0,0($tp)
stw $hi1,4($tp)
ldo `$LOCALS+32+4`($fp),$tp
sub %r0,%r0,%r0 ; clear borrow
ldw -4($tp),$ti0
addl $tp,$arrsz,$tp
L\$sub_pa11
ldwx $idx($np),$hi0
subb $ti0,$hi0,$hi1
ldwx $idx($tp),$ti0
addib,<> 4,$idx,L\$sub_pa11
stws,ma $hi1,4($rp)
subb $ti0,%r0,$hi1
ldo -4($tp),$tp
and $tp,$hi1,$ap
andcm $rp,$hi1,$bp
or $ap,$bp,$np
sub $rp,$arrsz,$rp ; rewind rp
subi 0,$arrsz,$idx
ldo `$LOCALS+32`($fp),$tp
L\$copy_pa11
ldwx $idx($np),$hi0
stws,ma %r0,4($tp)
addib,<> 4,$idx,L\$copy_pa11
stws,ma $hi0,4($rp)
nop ; alignment
L\$done
___
}
$code.=<<___;
ldi 1,%r28 ; signal "handled"
ldo $FRAME($fp),%sp ; destroy tp[num+1]
$POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
$POP `-$FRAME+1*$SIZE_T`(%sp),%r4
$POP `-$FRAME+2*$SIZE_T`(%sp),%r5
$POP `-$FRAME+3*$SIZE_T`(%sp),%r6
$POP `-$FRAME+4*$SIZE_T`(%sp),%r7
$POP `-$FRAME+5*$SIZE_T`(%sp),%r8
$POP `-$FRAME+6*$SIZE_T`(%sp),%r9
$POP `-$FRAME+7*$SIZE_T`(%sp),%r10
L\$abort
bv (%r2)
.EXIT
$POPMB -$FRAME(%sp),%r3
.PROCEND
.STRINGZ "Montgomery Multiplication for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
___
# Explicitly encode PA-RISC 2.0 instructions used in this module, so
# that it can be compiled with .LEVEL 1.0. It should be noted that I
# wouldn't have to do this if the GNU assembler understood the .ALLOW 2.0
# directive...
my $ldd = sub {
my ($mod,$args) = @_;
my $orig = "ldd$mod\t$args";
if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 4
{ my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 5
{ my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3;
$opcode|=(($1&0xF)<<17)|(($1&0x10)<<12); # encode offset
$opcode|=(1<<5) if ($mod =~ /^,m/);
$opcode|=(1<<13) if ($mod =~ /^,mb/);
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
else { "\t".$orig; }
};
my $std = sub {
my ($mod,$args) = @_;
my $orig = "std$mod\t$args";
if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 6
{ my $opcode=(0x03<<26)|($3<<21)|($1<<16)|(1<<12)|(0xB<<6);
$opcode|=(($2&0xF)<<1)|(($2&0x10)>>4); # encode offset
$opcode|=(1<<5) if ($mod =~ /^,m/);
$opcode|=(1<<13) if ($mod =~ /^,mb/);
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
else { "\t".$orig; }
};
my $extrd = sub {
my ($mod,$args) = @_;
my $orig = "extrd$mod\t$args";
# I only have ",u" completer, it's implicitly encoded...
if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15
{ my $opcode=(0x36<<26)|($1<<21)|($4<<16);
my $len=32-$3;
$opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos
$opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12
{ my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
my $len=32-$2;
$opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len
$opcode |= (1<<13) if ($mod =~ /,\**=/);
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
else { "\t".$orig; }
};
my $shrpd = sub {
my ($mod,$args) = @_;
my $orig = "shrpd$mod\t$args";
if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14
{ my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
my $cpos=63-$3;
$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
}
else { "\t".$orig; }
};
my $sub = sub {
my ($mod,$args) = @_;
my $orig = "sub$mod\t$args";
if ($mod eq ",db" && $args =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/) {
my $opcode=(0x02<<26)|($2<<21)|($1<<16)|$3;
$opcode|=(1<<10); # e1
$opcode|=(1<<8); # e2
$opcode|=(1<<5); # d
sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig
}
else { "\t".$orig; }
};
sub assemble {
my ($mnemonic,$mod,$args)=@_;
my $opcode = eval("\$$mnemonic");
ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
}
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/ge;
# flip word order in 64-bit mode...
s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e if ($BN_SZ==8);
# assemble 2.0 instructions in 32-bit mode...
s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($BN_SZ==4);
s/\bbv\b/bve/gm if ($SIZE_T==8);
print $_,"\n";
}
close STDOUT;

335
crypto/bn/asm/ppc-mont.pl Normal file
View File

@@ -0,0 +1,335 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# April 2006
# "Teaser" Montgomery multiplication module for PowerPC. It's possible
# to gain a bit more by modulo-scheduling outer loop, then dedicated
# squaring procedure should give further 20% and code can be adapted
# for 32-bit application running on 64-bit CPU. As for the latter,
# it won't be able to achieve "native" 64-bit performance, because in
# 32-bit application context every addc instruction will have to be
# expanded as addc, twice right shift by 32 and finally adde, etc.
# So far RSA *sign* performance improvement over pre-bn_mul_mont asm
# for 64-bit application running on PPC970/G5 is:
#
# 512-bit +65%
# 1024-bit +35%
# 2048-bit +18%
# 4096-bit +4%
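#
# Each iteration of the inner loops below computes, in effect (the very
# first pass, L1st, uses bp[0] and has no tp[j] term yet):
#
#	(hi0,lo0) = ap[j]*bp[i] + tp[j] + hi0
#	(hi1,lo1) = np[j]*m1    + lo0   + hi1
#	tp[j-1]   = lo1
#
# with hi0 and hi1 carried from one word to the next, so the ap[]*bp[i]
# limb and the np[]*m1 reduction limb are interleaved per word.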
$flavour = shift;
if ($flavour =~ /32/) {
$BITS= 32;
$BNSZ= $BITS/8;
$SIZE_T=4;
$RZONE= 224;
$LD= "lwz"; # load
$LDU= "lwzu"; # load and update
$LDX= "lwzx"; # load indexed
$ST= "stw"; # store
$STU= "stwu"; # store and update
$STX= "stwx"; # store indexed
$STUX= "stwux"; # store indexed and update
$UMULL= "mullw"; # unsigned multiply low
$UMULH= "mulhwu"; # unsigned multiply high
$UCMP= "cmplw"; # unsigned compare
$SHRI= "srwi"; # unsigned shift right by immediate
$PUSH= $ST;
$POP= $LD;
} elsif ($flavour =~ /64/) {
$BITS= 64;
$BNSZ= $BITS/8;
$SIZE_T=8;
$RZONE= 288;
# same as above, but 64-bit mnemonics...
$LD= "ld"; # load
$LDU= "ldu"; # load and update
$LDX= "ldx"; # load indexed
$ST= "std"; # store
$STU= "stdu"; # store and update
$STX= "stdx"; # store indexed
$STUX= "stdux"; # store indexed and update
$UMULL= "mulld"; # unsigned multiply low
$UMULH= "mulhdu"; # unsigned multiply high
$UCMP= "cmpld"; # unsigned compare
$SHRI= "srdi"; # unsigned shift right by immediate
$PUSH= $ST;
$POP= $LD;
} else { die "nonsense $flavour"; }
$FRAME=8*$SIZE_T+$RZONE;
$LOCALS=8*$SIZE_T;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";
open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
$sp="r1";
$toc="r2";
$rp="r3"; $ovf="r3";
$ap="r4";
$bp="r5";
$np="r6";
$n0="r7";
$num="r8";
$rp="r9"; # $rp is reassigned
$aj="r10";
$nj="r11";
$tj="r12";
# non-volatile registers
$i="r20";
$j="r21";
$tp="r22";
$m0="r23";
$m1="r24";
$lo0="r25";
$hi0="r26";
$lo1="r27";
$hi1="r28";
$alo="r29";
$ahi="r30";
$nlo="r31";
#
$nhi="r0";
$code=<<___;
.machine "any"
.text
.globl .bn_mul_mont_int
.align 4
.bn_mul_mont_int:
cmpwi $num,4
mr $rp,r3 ; $rp is reassigned
li r3,0
bltlr
___
$code.=<<___ if ($BNSZ==4);
cmpwi $num,32 ; longer key performance is not better
bgelr
___
$code.=<<___;
slwi $num,$num,`log($BNSZ)/log(2)`
li $tj,-4096
addi $ovf,$num,$FRAME
subf $ovf,$ovf,$sp ; $sp-$ovf
and $ovf,$ovf,$tj ; minimize TLB usage
subf $ovf,$sp,$ovf ; $ovf-$sp
mr $tj,$sp
srwi $num,$num,`log($BNSZ)/log(2)`
$STUX $sp,$sp,$ovf
$PUSH r20,`-12*$SIZE_T`($tj)
$PUSH r21,`-11*$SIZE_T`($tj)
$PUSH r22,`-10*$SIZE_T`($tj)
$PUSH r23,`-9*$SIZE_T`($tj)
$PUSH r24,`-8*$SIZE_T`($tj)
$PUSH r25,`-7*$SIZE_T`($tj)
$PUSH r26,`-6*$SIZE_T`($tj)
$PUSH r27,`-5*$SIZE_T`($tj)
$PUSH r28,`-4*$SIZE_T`($tj)
$PUSH r29,`-3*$SIZE_T`($tj)
$PUSH r30,`-2*$SIZE_T`($tj)
$PUSH r31,`-1*$SIZE_T`($tj)
$LD $n0,0($n0) ; pull n0[0] value
addi $num,$num,-2 ; adjust $num for counter register
$LD $m0,0($bp) ; m0=bp[0]
$LD $aj,0($ap) ; ap[0]
addi $tp,$sp,$LOCALS
$UMULL $lo0,$aj,$m0 ; ap[0]*bp[0]
$UMULH $hi0,$aj,$m0
$LD $aj,$BNSZ($ap) ; ap[1]
$LD $nj,0($np) ; np[0]
$UMULL $m1,$lo0,$n0 ; "tp[0]"*n0
$UMULL $alo,$aj,$m0 ; ap[1]*bp[0]
$UMULH $ahi,$aj,$m0
$UMULL $lo1,$nj,$m1 ; np[0]*m1
$UMULH $hi1,$nj,$m1
$LD $nj,$BNSZ($np) ; np[1]
addc $lo1,$lo1,$lo0
addze $hi1,$hi1
$UMULL $nlo,$nj,$m1 ; np[1]*m1
$UMULH $nhi,$nj,$m1
mtctr $num
li $j,`2*$BNSZ`
.align 4
L1st:
$LDX $aj,$ap,$j ; ap[j]
addc $lo0,$alo,$hi0
$LDX $nj,$np,$j ; np[j]
addze $hi0,$ahi
$UMULL $alo,$aj,$m0 ; ap[j]*bp[0]
addc $lo1,$nlo,$hi1
$UMULH $ahi,$aj,$m0
addze $hi1,$nhi
$UMULL $nlo,$nj,$m1 ; np[j]*m1
addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0]
$UMULH $nhi,$nj,$m1
addze $hi1,$hi1
$ST $lo1,0($tp) ; tp[j-1]
addi $j,$j,$BNSZ ; j++
addi $tp,$tp,$BNSZ ; tp++
bdnz L1st
;L1st
addc $lo0,$alo,$hi0
addze $hi0,$ahi
addc $lo1,$nlo,$hi1
addze $hi1,$nhi
addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0]
addze $hi1,$hi1
$ST $lo1,0($tp) ; tp[j-1]
li $ovf,0
addc $hi1,$hi1,$hi0
addze $ovf,$ovf ; upmost overflow bit
$ST $hi1,$BNSZ($tp)
li $i,$BNSZ
.align 4
Louter:
$LDX $m0,$bp,$i ; m0=bp[i]
$LD $aj,0($ap) ; ap[0]
addi $tp,$sp,$LOCALS
$LD $tj,$LOCALS($sp); tp[0]
$UMULL $lo0,$aj,$m0 ; ap[0]*bp[i]
$UMULH $hi0,$aj,$m0
$LD $aj,$BNSZ($ap) ; ap[1]
$LD $nj,0($np) ; np[0]
addc $lo0,$lo0,$tj ; ap[0]*bp[i]+tp[0]
$UMULL $alo,$aj,$m0 ; ap[j]*bp[i]
addze $hi0,$hi0
$UMULL $m1,$lo0,$n0 ; tp[0]*n0
$UMULH $ahi,$aj,$m0
$UMULL $lo1,$nj,$m1 ; np[0]*m1
$UMULH $hi1,$nj,$m1
$LD $nj,$BNSZ($np) ; np[1]
addc $lo1,$lo1,$lo0
$UMULL $nlo,$nj,$m1 ; np[1]*m1
addze $hi1,$hi1
$UMULH $nhi,$nj,$m1
mtctr $num
li $j,`2*$BNSZ`
.align 4
Linner:
$LDX $aj,$ap,$j ; ap[j]
addc $lo0,$alo,$hi0
$LD $tj,$BNSZ($tp) ; tp[j]
addze $hi0,$ahi
$LDX $nj,$np,$j ; np[j]
addc $lo1,$nlo,$hi1
$UMULL $alo,$aj,$m0 ; ap[j]*bp[i]
addze $hi1,$nhi
$UMULH $ahi,$aj,$m0
addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j]
$UMULL $nlo,$nj,$m1 ; np[j]*m1
addze $hi0,$hi0
$UMULH $nhi,$nj,$m1
addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j]
addi $j,$j,$BNSZ ; j++
addze $hi1,$hi1
$ST $lo1,0($tp) ; tp[j-1]
addi $tp,$tp,$BNSZ ; tp++
bdnz Linner
;Linner
$LD $tj,$BNSZ($tp) ; tp[j]
addc $lo0,$alo,$hi0
addze $hi0,$ahi
addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j]
addze $hi0,$hi0
addc $lo1,$nlo,$hi1
addze $hi1,$nhi
addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j]
addze $hi1,$hi1
$ST $lo1,0($tp) ; tp[j-1]
addic $ovf,$ovf,-1 ; move upmost overflow to XER[CA]
li $ovf,0
adde $hi1,$hi1,$hi0
addze $ovf,$ovf
$ST $hi1,$BNSZ($tp)
;
slwi $tj,$num,`log($BNSZ)/log(2)`
$UCMP $i,$tj
addi $i,$i,$BNSZ
ble Louter
addi $num,$num,2 ; restore $num
subfc $j,$j,$j ; j=0 and "clear" XER[CA]
addi $tp,$sp,$LOCALS
mtctr $num
.align 4
Lsub: $LDX $tj,$tp,$j
$LDX $nj,$np,$j
subfe $aj,$nj,$tj ; tp[j]-np[j]
$STX $aj,$rp,$j
addi $j,$j,$BNSZ
bdnz Lsub
li $j,0
mtctr $num
subfe $ovf,$j,$ovf ; handle upmost overflow bit
and $ap,$tp,$ovf
andc $np,$rp,$ovf
or $ap,$ap,$np ; ap=borrow?tp:rp
.align 4
Lcopy: ; copy or in-place refresh
$LDX $tj,$ap,$j
$STX $tj,$rp,$j
$STX $j,$tp,$j ; zap at once
addi $j,$j,$BNSZ
bdnz Lcopy
$POP $tj,0($sp)
li r3,1
$POP r20,`-12*$SIZE_T`($tj)
$POP r21,`-11*$SIZE_T`($tj)
$POP r22,`-10*$SIZE_T`($tj)
$POP r23,`-9*$SIZE_T`($tj)
$POP r24,`-8*$SIZE_T`($tj)
$POP r25,`-7*$SIZE_T`($tj)
$POP r26,`-6*$SIZE_T`($tj)
$POP r27,`-5*$SIZE_T`($tj)
$POP r28,`-4*$SIZE_T`($tj)
$POP r29,`-3*$SIZE_T`($tj)
$POP r30,`-2*$SIZE_T`($tj)
$POP r31,`-1*$SIZE_T`($tj)
mr $sp,$tj
blr
.long 0
.byte 0,12,4,0,0x80,12,6,0
.long 0
.size .bn_mul_mont_int,.-.bn_mul_mont_int
.asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@openssl.org>"
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;

2008
crypto/bn/asm/ppc.pl Normal file

File diff suppressed because it is too large

1628
crypto/bn/asm/ppc64-mont.pl Normal file

File diff suppressed because it is too large

1961
crypto/bn/asm/rsaz-avx2.pl Normal file

File diff suppressed because it is too large

2351
crypto/bn/asm/rsaz-x86_64.pl Normal file

File diff suppressed because it is too large

221
crypto/bn/asm/s390x-gf2m.pl Normal file
View File

@@ -0,0 +1,221 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# May 2011
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
# the time being... gcc 4.3 appeared to generate poor code, therefore
# the effort. And indeed, the module delivers 55%-90%(*) improvement
# on the heaviest ECDSA verify and ECDH benchmarks for 163- and 571-bit
# key lengths on z990, 30%-55%(*) - on z10, and 70%-110%(*) - on z196.
# This is for 64-bit build. In 32-bit "highgprs" case improvement is
# even higher, for example on z990 it was measured 80%-150%. ECDSA
# sign is a modest 9%-12% faster. Keep in mind that these coefficients
# are not ones for bn_GF2m_mul_2x2 itself, as not all CPU time is
# burnt in it...
#
# (*) gcc 4.1 was observed to deliver better results than gcc 4.3,
# so that improvement coefficients can vary from one specific
# setup to another.
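#
# The 64-bit code path below uses the usual Karatsuba trick over GF(2):
# only three carry-less 128-bit products are formed, a1*b1, a0*b0 and
# (a0^a1)*(b0^b1), and the middle 128 bits of the 256-bit result are
# recovered as a1*b1 ^ a0*b0 ^ (a0^a1)*(b0^b1) with plain XORs, since
# addition in GF(2) carries nothing.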
$flavour = shift;
if ($flavour =~ /3[12]/) {
$SIZE_T=4;
$g="";
} else {
$SIZE_T=8;
$g="g";
}
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$stdframe=16*$SIZE_T+4*8;
$rp="%r2";
$a1="%r3";
$a0="%r4";
$b1="%r5";
$b0="%r6";
$ra="%r14";
$sp="%r15";
@T=("%r0","%r1");
@i=("%r12","%r13");
($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(6..11));
($lo,$hi,$b)=map("%r$_",(3..5)); $a=$lo; $mask=$a8;
$code.=<<___;
.text
.type _mul_1x1,\@function
.align 16
_mul_1x1:
lgr $a1,$a
sllg $a2,$a,1
sllg $a4,$a,2
sllg $a8,$a,3
srag $lo,$a1,63 # broadcast 63rd bit
nihh $a1,0x1fff
srag @i[0],$a2,63 # broadcast 62nd bit
nihh $a2,0x3fff
srag @i[1],$a4,63 # broadcast 61st bit
nihh $a4,0x7fff
ngr $lo,$b
ngr @i[0],$b
ngr @i[1],$b
lghi @T[0],0
lgr $a12,$a1
stg @T[0],`$stdframe+0*8`($sp) # tab[0]=0
xgr $a12,$a2
stg $a1,`$stdframe+1*8`($sp) # tab[1]=a1
lgr $a48,$a4
stg $a2,`$stdframe+2*8`($sp) # tab[2]=a2
xgr $a48,$a8
stg $a12,`$stdframe+3*8`($sp) # tab[3]=a1^a2
xgr $a1,$a4
stg $a4,`$stdframe+4*8`($sp) # tab[4]=a4
xgr $a2,$a4
stg $a1,`$stdframe+5*8`($sp) # tab[5]=a1^a4
xgr $a12,$a4
stg $a2,`$stdframe+6*8`($sp) # tab[6]=a2^a4
xgr $a1,$a48
stg $a12,`$stdframe+7*8`($sp) # tab[7]=a1^a2^a4
xgr $a2,$a48
stg $a8,`$stdframe+8*8`($sp) # tab[8]=a8
xgr $a12,$a48
stg $a1,`$stdframe+9*8`($sp) # tab[9]=a1^a8
xgr $a1,$a4
stg $a2,`$stdframe+10*8`($sp) # tab[10]=a2^a8
xgr $a2,$a4
stg $a12,`$stdframe+11*8`($sp) # tab[11]=a1^a2^a8
xgr $a12,$a4
stg $a48,`$stdframe+12*8`($sp) # tab[12]=a4^a8
srlg $hi,$lo,1
stg $a1,`$stdframe+13*8`($sp) # tab[13]=a1^a4^a8
sllg $lo,$lo,63
stg $a2,`$stdframe+14*8`($sp) # tab[14]=a2^a4^a8
srlg @T[0],@i[0],2
stg $a12,`$stdframe+15*8`($sp) # tab[15]=a1^a2^a4^a8
lghi $mask,`0xf<<3`
sllg $a1,@i[0],62
sllg @i[0],$b,3
srlg @T[1],@i[1],3
ngr @i[0],$mask
sllg $a2,@i[1],61
srlg @i[1],$b,4-3
xgr $hi,@T[0]
ngr @i[1],$mask
xgr $lo,$a1
xgr $hi,@T[1]
xgr $lo,$a2
xg $lo,$stdframe(@i[0],$sp)
srlg @i[0],$b,8-3
ngr @i[0],$mask
___
for($n=1;$n<14;$n++) {
$code.=<<___;
lg @T[1],$stdframe(@i[1],$sp)
srlg @i[1],$b,`($n+2)*4`-3
sllg @T[0],@T[1],`$n*4`
ngr @i[1],$mask
srlg @T[1],@T[1],`64-$n*4`
xgr $lo,@T[0]
xgr $hi,@T[1]
___
push(@i,shift(@i)); push(@T,shift(@T));
}
$code.=<<___;
lg @T[1],$stdframe(@i[1],$sp)
sllg @T[0],@T[1],`$n*4`
srlg @T[1],@T[1],`64-$n*4`
xgr $lo,@T[0]
xgr $hi,@T[1]
lg @T[0],$stdframe(@i[0],$sp)
sllg @T[1],@T[0],`($n+1)*4`
srlg @T[0],@T[0],`64-($n+1)*4`
xgr $lo,@T[1]
xgr $hi,@T[0]
br $ra
.size _mul_1x1,.-_mul_1x1
.globl bn_GF2m_mul_2x2
.type bn_GF2m_mul_2x2,\@function
.align 16
bn_GF2m_mul_2x2:
stm${g} %r3,%r15,3*$SIZE_T($sp)
lghi %r1,-$stdframe-128
la %r0,0($sp)
la $sp,0(%r1,$sp) # alloca
st${g} %r0,0($sp) # back chain
___
if ($SIZE_T==8) {
my @r=map("%r$_",(6..9));
$code.=<<___;
bras $ra,_mul_1x1 # a1·b1
stmg $lo,$hi,16($rp)
lg $a,`$stdframe+128+4*$SIZE_T`($sp)
lg $b,`$stdframe+128+6*$SIZE_T`($sp)
bras $ra,_mul_1x1 # a0·b0
stmg $lo,$hi,0($rp)
lg $a,`$stdframe+128+3*$SIZE_T`($sp)
lg $b,`$stdframe+128+5*$SIZE_T`($sp)
xg $a,`$stdframe+128+4*$SIZE_T`($sp)
xg $b,`$stdframe+128+6*$SIZE_T`($sp)
bras $ra,_mul_1x1 # (a0+a1)·(b0+b1)
lmg @r[0],@r[3],0($rp)
xgr $lo,$hi
xgr $hi,@r[1]
xgr $lo,@r[0]
xgr $hi,@r[2]
xgr $lo,@r[3]
xgr $hi,@r[3]
xgr $lo,$hi
stg $hi,16($rp)
stg $lo,8($rp)
___
} else {
$code.=<<___;
sllg %r3,%r3,32
sllg %r5,%r5,32
or %r3,%r4
or %r5,%r6
bras $ra,_mul_1x1
rllg $lo,$lo,32
rllg $hi,$hi,32
stmg $lo,$hi,0($rp)
___
}
$code.=<<___;
lm${g} %r6,%r15,`$stdframe+128+6*$SIZE_T`($sp)
br $ra
.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
.string "GF(2^m) Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;
close STDOUT;

277
crypto/bn/asm/s390x-mont.pl Normal file
View File

@@ -0,0 +1,277 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# April 2007.
#
# Performance improvement over vanilla C code varies from 85% to 45%
# depending on key length and benchmark. Unfortunately in this context
# these are not very impressive results [for code that utilizes "wide"
# 64x64=128-bit multiplication, which is not commonly available to C
# programmers], at least hand-coded bn_asm.c replacement is known to
# provide 30-40% better results for the longest keys. Well, on second
# thought it's not very surprising, because z-CPUs are single-issue
# and _strictly_ in-order execution, while bn_mul_mont is more or less
# dependent on CPU ability to pipe-line instructions and have several
# of them "in-flight" at the same time. I mean while other methods,
# for example Karatsuba, aim to minimize the amount of multiplications at
# the cost of an increase in other operations, bn_mul_mont aims to neatly
# "overlap" multiplications and the other operations [and on most
# platforms even minimize the amount of the other operations, in
# particular references to memory]. But it's possible to improve this
# module performance by implementing dedicated squaring code-path and
# possibly by unrolling loops...
# January 2009.
#
# Reschedule to minimize/avoid Address Generation Interlock hazard,
# make inner loops counter-based.
# November 2010.
#
# Adapt for -m31 build. If kernel supports what's called "highgprs"
# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
# instructions and achieve "64-bit" performance even in 31-bit legacy
# application context. The feature is not specific to any particular
# processor, as long as it's "z-CPU". Latter implies that the code
# remains z/Architecture specific. Compatibility with 32-bit BN_ULONG
# is achieved by swapping words after 64-bit loads, follow _dswap-s.
# On z990 it was measured to perform 2.6-2.2 times better than
# compiler-generated code, less for longer keys...
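#
# In the -m31 build a 64-bit load picks up two adjacent 32-bit BN_ULONGs
# with the less significant one (lower address, big-endian) in the upper
# half of the register, so every _dswap below becomes "rllg reg,reg,32"
# to swap the halves; in the 64-bit build _dswap expands to nothing.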
$flavour = shift;
if ($flavour =~ /3[12]/) {
$SIZE_T=4;
$g="";
} else {
$SIZE_T=8;
$g="g";
}
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$stdframe=16*$SIZE_T+4*8;
$mn0="%r0";
$num="%r1";
# int bn_mul_mont(
$rp="%r2"; # BN_ULONG *rp,
$ap="%r3"; # const BN_ULONG *ap,
$bp="%r4"; # const BN_ULONG *bp,
$np="%r5"; # const BN_ULONG *np,
$n0="%r6"; # const BN_ULONG *n0,
#$num="160(%r15)" # int num);
$bi="%r2"; # zaps rp
$j="%r7";
$ahi="%r8";
$alo="%r9";
$nhi="%r10";
$nlo="%r11";
$AHI="%r12";
$NHI="%r13";
$count="%r14";
$sp="%r15";
$code.=<<___;
.text
.globl bn_mul_mont
.type bn_mul_mont,\@function
bn_mul_mont:
lgf $num,`$stdframe+$SIZE_T-4`($sp) # pull $num
sla $num,`log($SIZE_T)/log(2)` # $num to enumerate bytes
la $bp,0($num,$bp)
st${g} %r2,2*$SIZE_T($sp)
cghi $num,16 #
lghi %r2,0 #
blr %r14 # if($num<16) return 0;
___
$code.=<<___ if ($flavour =~ /3[12]/);
tmll $num,4
bnzr %r14 # if ($num&1) return 0;
___
$code.=<<___ if ($flavour !~ /3[12]/);
cghi $num,96 #
bhr %r14 # if($num>96) return 0;
___
$code.=<<___;
stm${g} %r3,%r15,3*$SIZE_T($sp)
lghi $rp,-$stdframe-8 # leave room for carry bit
lcgr $j,$num # -$num
lgr %r0,$sp
la $rp,0($rp,$sp)
la $sp,0($j,$rp) # alloca
st${g} %r0,0($sp) # back chain
sra $num,3 # restore $num
la $bp,0($j,$bp) # restore $bp
ahi $num,-1 # adjust $num for inner loop
lg $n0,0($n0) # pull n0
_dswap $n0
lg $bi,0($bp)
_dswap $bi
lg $alo,0($ap)
_dswap $alo
mlgr $ahi,$bi # ap[0]*bp[0]
lgr $AHI,$ahi
lgr $mn0,$alo # "tp[0]"*n0
msgr $mn0,$n0
lg $nlo,0($np) #
_dswap $nlo
mlgr $nhi,$mn0 # np[0]*m1
algr $nlo,$alo # +="tp[0]"
lghi $NHI,0
alcgr $NHI,$nhi
la $j,8(%r0) # j=1
lr $count,$num
.align 16
.L1st:
lg $alo,0($j,$ap)
_dswap $alo
mlgr $ahi,$bi # ap[j]*bp[0]
algr $alo,$AHI
lghi $AHI,0
alcgr $AHI,$ahi
lg $nlo,0($j,$np)
_dswap $nlo
mlgr $nhi,$mn0 # np[j]*m1
algr $nlo,$NHI
lghi $NHI,0
alcgr $nhi,$NHI # +="tp[j]"
algr $nlo,$alo
alcgr $NHI,$nhi
stg $nlo,$stdframe-8($j,$sp) # tp[j-1]=
la $j,8($j) # j++
brct $count,.L1st
algr $NHI,$AHI
lghi $AHI,0
alcgr $AHI,$AHI # upmost overflow bit
stg $NHI,$stdframe-8($j,$sp)
stg $AHI,$stdframe($j,$sp)
la $bp,8($bp) # bp++
.Louter:
lg $bi,0($bp) # bp[i]
_dswap $bi
lg $alo,0($ap)
_dswap $alo
mlgr $ahi,$bi # ap[0]*bp[i]
alg $alo,$stdframe($sp) # +=tp[0]
lghi $AHI,0
alcgr $AHI,$ahi
lgr $mn0,$alo
msgr $mn0,$n0 # tp[0]*n0
lg $nlo,0($np) # np[0]
_dswap $nlo
mlgr $nhi,$mn0 # np[0]*m1
algr $nlo,$alo # +="tp[0]"
lghi $NHI,0
alcgr $NHI,$nhi
la $j,8(%r0) # j=1
lr $count,$num
.align 16
.Linner:
lg $alo,0($j,$ap)
_dswap $alo
mlgr $ahi,$bi # ap[j]*bp[i]
algr $alo,$AHI
lghi $AHI,0
alcgr $ahi,$AHI
alg $alo,$stdframe($j,$sp)# +=tp[j]
alcgr $AHI,$ahi
lg $nlo,0($j,$np)
_dswap $nlo
mlgr $nhi,$mn0 # np[j]*m1
algr $nlo,$NHI
lghi $NHI,0
alcgr $nhi,$NHI
algr $nlo,$alo # +="tp[j]"
alcgr $NHI,$nhi
stg $nlo,$stdframe-8($j,$sp) # tp[j-1]=
la $j,8($j) # j++
brct $count,.Linner
algr $NHI,$AHI
lghi $AHI,0
alcgr $AHI,$AHI
alg $NHI,$stdframe($j,$sp)# accumulate previous upmost overflow bit
lghi $ahi,0
alcgr $AHI,$ahi # new upmost overflow bit
stg $NHI,$stdframe-8($j,$sp)
stg $AHI,$stdframe($j,$sp)
la $bp,8($bp) # bp++
cl${g} $bp,`$stdframe+8+4*$SIZE_T`($j,$sp) # compare to &bp[num]
jne .Louter
l${g} $rp,`$stdframe+8+2*$SIZE_T`($j,$sp) # reincarnate rp
la $ap,$stdframe($sp)
ahi $num,1 # restore $num, incidentally clears "borrow"
la $j,0(%r0)
lr $count,$num
.Lsub: lg $alo,0($j,$ap)
lg $nlo,0($j,$np)
_dswap $nlo
slbgr $alo,$nlo
stg $alo,0($j,$rp)
la $j,8($j)
brct $count,.Lsub
lghi $ahi,0
slbgr $AHI,$ahi # handle upmost carry
ngr $ap,$AHI
lghi $np,-1
xgr $np,$AHI
ngr $np,$rp
ogr $ap,$np # ap=borrow?tp:rp
la $j,0(%r0)
lgr $count,$num
.Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh
_dswap $alo
stg $j,$stdframe($j,$sp) # zap tp
stg $alo,0($j,$rp)
la $j,8($j)
brct $count,.Lcopy
la %r1,`$stdframe+8+6*$SIZE_T`($j,$sp)
lm${g} %r6,%r15,0(%r1)
lghi %r2,1 # signal "processed"
br %r14
.size bn_mul_mont,.-bn_mul_mont
.string "Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
___
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/ge;
s/_dswap\s+(%r[0-9]+)/sprintf("rllg\t%s,%s,32",$1,$1) if($SIZE_T==4)/e;
print $_,"\n";
}
close STDOUT;

713
crypto/bn/asm/s390x.S Normal file
View File

@@ -0,0 +1,713 @@
.ident "s390x.S, version 1.1"
// ====================================================================
// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
// project.
//
// Rights for redistribution and usage in source and binary forms are
// granted according to the OpenSSL license. Warranty of any kind is
// disclaimed.
// ====================================================================
.text
#define zero %r0
// BN_ULONG bn_mul_add_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5);
.globl bn_mul_add_words
.type bn_mul_add_words,@function
.align 4
bn_mul_add_words:
lghi zero,0 // zero = 0
la %r1,0(%r2) // put rp aside [to give way to]
lghi %r2,0 // return value
ltgfr %r4,%r4
bler %r14 // if (len<=0) return 0;
stmg %r6,%r13,48(%r15)
lghi %r2,3
lghi %r12,0 // carry = 0
slgr %r1,%r3 // rp-=ap
nr %r2,%r4 // len%4
sra %r4,2 // cnt=len/4
jz .Loop1_madd // carry is incidentally cleared if branch taken
algr zero,zero // clear carry
lg %r7,0(%r3) // ap[0]
lg %r9,8(%r3) // ap[1]
mlgr %r6,%r5 // *=w
brct %r4,.Loop4_madd
j .Loop4_madd_tail
.Loop4_madd:
mlgr %r8,%r5
lg %r11,16(%r3) // ap[i+2]
alcgr %r7,%r12 // +=carry
alcgr %r6,zero
alg %r7,0(%r3,%r1) // +=rp[i]
stg %r7,0(%r3,%r1) // rp[i]=
mlgr %r10,%r5
lg %r13,24(%r3)
alcgr %r9,%r6
alcgr %r8,zero
alg %r9,8(%r3,%r1)
stg %r9,8(%r3,%r1)
mlgr %r12,%r5
lg %r7,32(%r3)
alcgr %r11,%r8
alcgr %r10,zero
alg %r11,16(%r3,%r1)
stg %r11,16(%r3,%r1)
mlgr %r6,%r5
lg %r9,40(%r3)
alcgr %r13,%r10
alcgr %r12,zero
alg %r13,24(%r3,%r1)
stg %r13,24(%r3,%r1)
la %r3,32(%r3) // i+=4
brct %r4,.Loop4_madd
.Loop4_madd_tail:
mlgr %r8,%r5
lg %r11,16(%r3)
alcgr %r7,%r12 // +=carry
alcgr %r6,zero
alg %r7,0(%r3,%r1) // +=rp[i]
stg %r7,0(%r3,%r1) // rp[i]=
mlgr %r10,%r5
lg %r13,24(%r3)
alcgr %r9,%r6
alcgr %r8,zero
alg %r9,8(%r3,%r1)
stg %r9,8(%r3,%r1)
mlgr %r12,%r5
alcgr %r11,%r8
alcgr %r10,zero
alg %r11,16(%r3,%r1)
stg %r11,16(%r3,%r1)
alcgr %r13,%r10
alcgr %r12,zero
alg %r13,24(%r3,%r1)
stg %r13,24(%r3,%r1)
la %r3,32(%r3) // i+=4
la %r2,1(%r2) // see if len%4 is zero ...
brct %r2,.Loop1_madd // without touching condition code:-)
.Lend_madd:
lgr %r2,zero // return value
alcgr %r2,%r12 // collect even carry bit
lmg %r6,%r13,48(%r15)
br %r14
.Loop1_madd:
lg %r7,0(%r3) // ap[i]
mlgr %r6,%r5 // *=w
alcgr %r7,%r12 // +=carry
alcgr %r6,zero
alg %r7,0(%r3,%r1) // +=rp[i]
stg %r7,0(%r3,%r1) // rp[i]=
lgr %r12,%r6
la %r3,8(%r3) // i++
brct %r2,.Loop1_madd
j .Lend_madd
.size bn_mul_add_words,.-bn_mul_add_words
// BN_ULONG bn_mul_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5);
.globl bn_mul_words
.type bn_mul_words,@function
.align 4
bn_mul_words:
lghi zero,0 // zero = 0
la %r1,0(%r2) // put rp aside
lghi %r2,0 // i=0;
ltgfr %r4,%r4
bler %r14 // if (len<=0) return 0;
stmg %r6,%r10,48(%r15)
lghi %r10,3
lghi %r8,0 // carry = 0
nr %r10,%r4 // len%4
sra %r4,2 // cnt=len/4
jz .Loop1_mul // carry is incidentally cleared if branch taken
algr zero,zero // clear carry
.Loop4_mul:
lg %r7,0(%r2,%r3) // ap[i]
mlgr %r6,%r5 // *=w
alcgr %r7,%r8 // +=carry
stg %r7,0(%r2,%r1) // rp[i]=
lg %r9,8(%r2,%r3)
mlgr %r8,%r5
alcgr %r9,%r6
stg %r9,8(%r2,%r1)
lg %r7,16(%r2,%r3)
mlgr %r6,%r5
alcgr %r7,%r8
stg %r7,16(%r2,%r1)
lg %r9,24(%r2,%r3)
mlgr %r8,%r5
alcgr %r9,%r6
stg %r9,24(%r2,%r1)
la %r2,32(%r2) // i+=4
brct %r4,.Loop4_mul
la %r10,1(%r10) // see if len%4 is zero ...
brct %r10,.Loop1_mul // without touching condition code:-)
.Lend_mul:
alcgr %r8,zero // collect carry bit
lgr %r2,%r8
lmg %r6,%r10,48(%r15)
br %r14
.Loop1_mul:
lg %r7,0(%r2,%r3) // ap[i]
mlgr %r6,%r5 // *=w
alcgr %r7,%r8 // +=carry
stg %r7,0(%r2,%r1) // rp[i]=
lgr %r8,%r6
la %r2,8(%r2) // i++
brct %r10,.Loop1_mul
j .Lend_mul
.size bn_mul_words,.-bn_mul_words
// void bn_sqr_words(BN_ULONG *r2,BN_ULONG *r2,int r4)
.globl bn_sqr_words
.type bn_sqr_words,@function
.align 4
bn_sqr_words:
ltgfr %r4,%r4
bler %r14
stmg %r6,%r7,48(%r15)
srag %r1,%r4,2 // cnt=len/4
jz .Loop1_sqr
.Loop4_sqr:
lg %r7,0(%r3)
mlgr %r6,%r7
stg %r7,0(%r2)
stg %r6,8(%r2)
lg %r7,8(%r3)
mlgr %r6,%r7
stg %r7,16(%r2)
stg %r6,24(%r2)
lg %r7,16(%r3)
mlgr %r6,%r7
stg %r7,32(%r2)
stg %r6,40(%r2)
lg %r7,24(%r3)
mlgr %r6,%r7
stg %r7,48(%r2)
stg %r6,56(%r2)
la %r3,32(%r3)
la %r2,64(%r2)
brct %r1,.Loop4_sqr
lghi %r1,3
nr %r4,%r1 // cnt=len%4
jz .Lend_sqr
.Loop1_sqr:
lg %r7,0(%r3)
mlgr %r6,%r7
stg %r7,0(%r2)
stg %r6,8(%r2)
la %r3,8(%r3)
la %r2,16(%r2)
brct %r4,.Loop1_sqr
.Lend_sqr:
lmg %r6,%r7,48(%r15)
br %r14
.size bn_sqr_words,.-bn_sqr_words
// BN_ULONG bn_div_words(BN_ULONG h,BN_ULONG l,BN_ULONG d);
.globl bn_div_words
.type bn_div_words,@function
.align 4
bn_div_words:
dlgr %r2,%r4
lgr %r2,%r3
br %r14
.size bn_div_words,.-bn_div_words
// BN_ULONG bn_add_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5);
.globl bn_add_words
.type bn_add_words,@function
.align 4
bn_add_words:
la %r1,0(%r2) // put rp aside
lghi %r2,0 // i=0
ltgfr %r5,%r5
bler %r14 // if (len<=0) return 0;
stg %r6,48(%r15)
lghi %r6,3
nr %r6,%r5 // len%4
sra %r5,2 // len/4, use sra because it sets condition code
jz .Loop1_add // carry is incidentally cleared if branch taken
algr %r2,%r2 // clear carry
.Loop4_add:
lg %r0,0(%r2,%r3)
alcg %r0,0(%r2,%r4)
stg %r0,0(%r2,%r1)
lg %r0,8(%r2,%r3)
alcg %r0,8(%r2,%r4)
stg %r0,8(%r2,%r1)
lg %r0,16(%r2,%r3)
alcg %r0,16(%r2,%r4)
stg %r0,16(%r2,%r1)
lg %r0,24(%r2,%r3)
alcg %r0,24(%r2,%r4)
stg %r0,24(%r2,%r1)
la %r2,32(%r2) // i+=4
brct %r5,.Loop4_add
la %r6,1(%r6) // see if len%4 is zero ...
brct %r6,.Loop1_add // without touching condition code:-)
.Lexit_add:
lghi %r2,0
alcgr %r2,%r2
lg %r6,48(%r15)
br %r14
.Loop1_add:
lg %r0,0(%r2,%r3)
alcg %r0,0(%r2,%r4)
stg %r0,0(%r2,%r1)
la %r2,8(%r2) // i++
brct %r6,.Loop1_add
j .Lexit_add
.size bn_add_words,.-bn_add_words
// BN_ULONG bn_sub_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5);
.globl bn_sub_words
.type bn_sub_words,@function
.align 4
bn_sub_words:
la %r1,0(%r2) // put rp aside
lghi %r2,0 // i=0
ltgfr %r5,%r5
bler %r14 // if (len<=0) return 0;
stg %r6,48(%r15)
lghi %r6,3
nr %r6,%r5 // len%4
sra %r5,2 // len/4, use sra because it sets condition code
jnz .Loop4_sub // borrow is incidentally cleared if branch taken
slgr %r2,%r2 // clear borrow
.Loop1_sub:
lg %r0,0(%r2,%r3)
slbg %r0,0(%r2,%r4)
stg %r0,0(%r2,%r1)
la %r2,8(%r2) // i++
brct %r6,.Loop1_sub
j .Lexit_sub
.Loop4_sub:
lg %r0,0(%r2,%r3)
slbg %r0,0(%r2,%r4)
stg %r0,0(%r2,%r1)
lg %r0,8(%r2,%r3)
slbg %r0,8(%r2,%r4)
stg %r0,8(%r2,%r1)
lg %r0,16(%r2,%r3)
slbg %r0,16(%r2,%r4)
stg %r0,16(%r2,%r1)
lg %r0,24(%r2,%r3)
slbg %r0,24(%r2,%r4)
stg %r0,24(%r2,%r1)
la %r2,32(%r2) // i+=4
brct %r5,.Loop4_sub
la %r6,1(%r6) // see if len%4 is zero ...
brct %r6,.Loop1_sub // without touching condition code:-)
.Lexit_sub:
lghi %r2,0
slbgr %r2,%r2
lcgr %r2,%r2
lg %r6,48(%r15)
br %r14
.size bn_sub_words,.-bn_sub_words
#define c1 %r1
#define c2 %r5
#define c3 %r8
#define mul_add_c(ai,bi,c1,c2,c3) \
lg %r7,ai*8(%r3); \
mlg %r6,bi*8(%r4); \
algr c1,%r7; \
alcgr c2,%r6; \
alcgr c3,zero
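// mul_add_c(ai,bi,c1,c2,c3) accumulates a[ai]*b[bi] into the running
// column sum c3:c2:c1: mlg multiplies %r7 (a[ai]) by b[bi] leaving the
// 128-bit product in %r6:%r7, algr/alcgr fold it into c1 and c2, and the
// final alcgr propagates the carry into c3.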
// void bn_mul_comba8(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4);
.globl bn_mul_comba8
.type bn_mul_comba8,@function
.align 4
bn_mul_comba8:
stmg %r6,%r8,48(%r15)
lghi c1,0
lghi c2,0
lghi c3,0
lghi zero,0
mul_add_c(0,0,c1,c2,c3);
stg c1,0*8(%r2)
lghi c1,0
mul_add_c(0,1,c2,c3,c1);
mul_add_c(1,0,c2,c3,c1);
stg c2,1*8(%r2)
lghi c2,0
mul_add_c(2,0,c3,c1,c2);
mul_add_c(1,1,c3,c1,c2);
mul_add_c(0,2,c3,c1,c2);
stg c3,2*8(%r2)
lghi c3,0
mul_add_c(0,3,c1,c2,c3);
mul_add_c(1,2,c1,c2,c3);
mul_add_c(2,1,c1,c2,c3);
mul_add_c(3,0,c1,c2,c3);
stg c1,3*8(%r2)
lghi c1,0
mul_add_c(4,0,c2,c3,c1);
mul_add_c(3,1,c2,c3,c1);
mul_add_c(2,2,c2,c3,c1);
mul_add_c(1,3,c2,c3,c1);
mul_add_c(0,4,c2,c3,c1);
stg c2,4*8(%r2)
lghi c2,0
mul_add_c(0,5,c3,c1,c2);
mul_add_c(1,4,c3,c1,c2);
mul_add_c(2,3,c3,c1,c2);
mul_add_c(3,2,c3,c1,c2);
mul_add_c(4,1,c3,c1,c2);
mul_add_c(5,0,c3,c1,c2);
stg c3,5*8(%r2)
lghi c3,0
mul_add_c(6,0,c1,c2,c3);
mul_add_c(5,1,c1,c2,c3);
mul_add_c(4,2,c1,c2,c3);
mul_add_c(3,3,c1,c2,c3);
mul_add_c(2,4,c1,c2,c3);
mul_add_c(1,5,c1,c2,c3);
mul_add_c(0,6,c1,c2,c3);
stg c1,6*8(%r2)
lghi c1,0
mul_add_c(0,7,c2,c3,c1);
mul_add_c(1,6,c2,c3,c1);
mul_add_c(2,5,c2,c3,c1);
mul_add_c(3,4,c2,c3,c1);
mul_add_c(4,3,c2,c3,c1);
mul_add_c(5,2,c2,c3,c1);
mul_add_c(6,1,c2,c3,c1);
mul_add_c(7,0,c2,c3,c1);
stg c2,7*8(%r2)
lghi c2,0
mul_add_c(7,1,c3,c1,c2);
mul_add_c(6,2,c3,c1,c2);
mul_add_c(5,3,c3,c1,c2);
mul_add_c(4,4,c3,c1,c2);
mul_add_c(3,5,c3,c1,c2);
mul_add_c(2,6,c3,c1,c2);
mul_add_c(1,7,c3,c1,c2);
stg c3,8*8(%r2)
lghi c3,0
mul_add_c(2,7,c1,c2,c3);
mul_add_c(3,6,c1,c2,c3);
mul_add_c(4,5,c1,c2,c3);
mul_add_c(5,4,c1,c2,c3);
mul_add_c(6,3,c1,c2,c3);
mul_add_c(7,2,c1,c2,c3);
stg c1,9*8(%r2)
lghi c1,0
mul_add_c(7,3,c2,c3,c1);
mul_add_c(6,4,c2,c3,c1);
mul_add_c(5,5,c2,c3,c1);
mul_add_c(4,6,c2,c3,c1);
mul_add_c(3,7,c2,c3,c1);
stg c2,10*8(%r2)
lghi c2,0
mul_add_c(4,7,c3,c1,c2);
mul_add_c(5,6,c3,c1,c2);
mul_add_c(6,5,c3,c1,c2);
mul_add_c(7,4,c3,c1,c2);
stg c3,11*8(%r2)
lghi c3,0
mul_add_c(7,5,c1,c2,c3);
mul_add_c(6,6,c1,c2,c3);
mul_add_c(5,7,c1,c2,c3);
stg c1,12*8(%r2)
lghi c1,0
mul_add_c(6,7,c2,c3,c1);
mul_add_c(7,6,c2,c3,c1);
stg c2,13*8(%r2)
lghi c2,0
mul_add_c(7,7,c3,c1,c2);
stg c3,14*8(%r2)
stg c1,15*8(%r2)
lmg %r6,%r8,48(%r15)
br %r14
.size bn_mul_comba8,.-bn_mul_comba8
// void bn_mul_comba4(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4);
.globl bn_mul_comba4
.type bn_mul_comba4,@function
.align 4
bn_mul_comba4:
stmg %r6,%r8,48(%r15)
lghi c1,0
lghi c2,0
lghi c3,0
lghi zero,0
mul_add_c(0,0,c1,c2,c3);
stg c1,0*8(%r2)
lghi c1,0
mul_add_c(0,1,c2,c3,c1);
mul_add_c(1,0,c2,c3,c1);
stg c2,1*8(%r2)
lghi c2,0
mul_add_c(2,0,c3,c1,c2);
mul_add_c(1,1,c3,c1,c2);
mul_add_c(0,2,c3,c1,c2);
stg c3,2*8(%r2)
lghi c3,0
mul_add_c(0,3,c1,c2,c3);
mul_add_c(1,2,c1,c2,c3);
mul_add_c(2,1,c1,c2,c3);
mul_add_c(3,0,c1,c2,c3);
stg c1,3*8(%r2)
lghi c1,0
mul_add_c(3,1,c2,c3,c1);
mul_add_c(2,2,c2,c3,c1);
mul_add_c(1,3,c2,c3,c1);
stg c2,4*8(%r2)
lghi c2,0
mul_add_c(2,3,c3,c1,c2);
mul_add_c(3,2,c3,c1,c2);
stg c3,5*8(%r2)
lghi c3,0
mul_add_c(3,3,c1,c2,c3);
stg c1,6*8(%r2)
stg c2,7*8(%r2)
lmg %r6,%r8,48(%r15)
br %r14
.size bn_mul_comba4,.-bn_mul_comba4
#define sqr_add_c(ai,c1,c2,c3) \
lg %r7,ai*8(%r3); \
mlgr %r6,%r7; \
algr c1,%r7; \
alcgr c2,%r6; \
alcgr c3,zero
#define sqr_add_c2(ai,aj,c1,c2,c3) \
lg %r7,ai*8(%r3); \
mlg %r6,aj*8(%r3); \
algr c1,%r7; \
alcgr c2,%r6; \
alcgr c3,zero; \
algr c1,%r7; \
alcgr c2,%r6; \
alcgr c3,zero
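// sqr_add_c adds the square a[ai]^2 once, while sqr_add_c2 adds the cross
// product a[ai]*a[aj] twice (it occurs as both a[ai]*a[aj] and a[aj]*a[ai]
// in the full square), hence the doubled algr/alcgr sequence.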
// void bn_sqr_comba8(BN_ULONG *r2,BN_ULONG *r3);
.globl bn_sqr_comba8
.type bn_sqr_comba8,@function
.align 4
bn_sqr_comba8:
stmg %r6,%r8,48(%r15)
lghi c1,0
lghi c2,0
lghi c3,0
lghi zero,0
sqr_add_c(0,c1,c2,c3);
stg c1,0*8(%r2)
lghi c1,0
sqr_add_c2(1,0,c2,c3,c1);
stg c2,1*8(%r2)
lghi c2,0
sqr_add_c(1,c3,c1,c2);
sqr_add_c2(2,0,c3,c1,c2);
stg c3,2*8(%r2)
lghi c3,0
sqr_add_c2(3,0,c1,c2,c3);
sqr_add_c2(2,1,c1,c2,c3);
stg c1,3*8(%r2)
lghi c1,0
sqr_add_c(2,c2,c3,c1);
sqr_add_c2(3,1,c2,c3,c1);
sqr_add_c2(4,0,c2,c3,c1);
stg c2,4*8(%r2)
lghi c2,0
sqr_add_c2(5,0,c3,c1,c2);
sqr_add_c2(4,1,c3,c1,c2);
sqr_add_c2(3,2,c3,c1,c2);
stg c3,5*8(%r2)
lghi c3,0
sqr_add_c(3,c1,c2,c3);
sqr_add_c2(4,2,c1,c2,c3);
sqr_add_c2(5,1,c1,c2,c3);
sqr_add_c2(6,0,c1,c2,c3);
stg c1,6*8(%r2)
lghi c1,0
sqr_add_c2(7,0,c2,c3,c1);
sqr_add_c2(6,1,c2,c3,c1);
sqr_add_c2(5,2,c2,c3,c1);
sqr_add_c2(4,3,c2,c3,c1);
stg c2,7*8(%r2)
lghi c2,0
sqr_add_c(4,c3,c1,c2);
sqr_add_c2(5,3,c3,c1,c2);
sqr_add_c2(6,2,c3,c1,c2);
sqr_add_c2(7,1,c3,c1,c2);
stg c3,8*8(%r2)
lghi c3,0
sqr_add_c2(7,2,c1,c2,c3);
sqr_add_c2(6,3,c1,c2,c3);
sqr_add_c2(5,4,c1,c2,c3);
stg c1,9*8(%r2)
lghi c1,0
sqr_add_c(5,c2,c3,c1);
sqr_add_c2(6,4,c2,c3,c1);
sqr_add_c2(7,3,c2,c3,c1);
stg c2,10*8(%r2)
lghi c2,0
sqr_add_c2(7,4,c3,c1,c2);
sqr_add_c2(6,5,c3,c1,c2);
stg c3,11*8(%r2)
lghi c3,0
sqr_add_c(6,c1,c2,c3);
sqr_add_c2(7,5,c1,c2,c3);
stg c1,12*8(%r2)
lghi c1,0
sqr_add_c2(7,6,c2,c3,c1);
stg c2,13*8(%r2)
lghi c2,0
sqr_add_c(7,c3,c1,c2);
stg c3,14*8(%r2)
stg c1,15*8(%r2)
lmg %r6,%r8,48(%r15)
br %r14
.size bn_sqr_comba8,.-bn_sqr_comba8
// void bn_sqr_comba4(BN_ULONG *r2,BN_ULONG *r3);
.globl bn_sqr_comba4
.type bn_sqr_comba4,@function
.align 4
bn_sqr_comba4:
stmg %r6,%r8,48(%r15)
lghi c1,0
lghi c2,0
lghi c3,0
lghi zero,0
sqr_add_c(0,c1,c2,c3);
stg c1,0*8(%r2)
lghi c1,0
sqr_add_c2(1,0,c2,c3,c1);
stg c2,1*8(%r2)
lghi c2,0
sqr_add_c(1,c3,c1,c2);
sqr_add_c2(2,0,c3,c1,c2);
stg c3,2*8(%r2)
lghi c3,0
sqr_add_c2(3,0,c1,c2,c3);
sqr_add_c2(2,1,c1,c2,c3);
stg c1,3*8(%r2)
lghi c1,0
sqr_add_c(2,c2,c3,c1);
sqr_add_c2(3,1,c2,c3,c1);
stg c2,4*8(%r2)
lghi c2,0
sqr_add_c2(3,2,c3,c1,c2);
stg c3,5*8(%r2)
lghi c3,0
sqr_add_c(3,c1,c2,c3);
stg c1,6*8(%r2)
stg c2,7*8(%r2)
lmg %r6,%r8,48(%r15)
br %r14
.size bn_sqr_comba4,.-bn_sqr_comba4

File diff suppressed because it is too large

1458
crypto/bn/asm/sparcv8.S Normal file

File diff suppressed because it is too large

1558
crypto/bn/asm/sparcv8plus.S Normal file

File diff suppressed because it is too large

View File

@@ -0,0 +1,190 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# October 2012
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
# the time being... Except that it has two code paths: one suitable
# for all SPARCv9 processors and one for VIS3-capable ones. The former
# delivers ~25-45% more, more for longer keys, on the heaviest DH and DSA
# verify operations on the venerable UltraSPARC II. On T4 the VIS3 code is
# ~100-230% faster than gcc-generated code and ~35-90% faster than
# the pure SPARCv9 code path.
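#
# The software path below does the 64x64 carry-less multiplication with a
# 16-entry table: tab[i], i=0..15, holds the XOR of a<<0, a<<1, a<<2 and
# a<<3 selected by the bits of i (with the top three bits of a masked off
# so nothing is shifted out), b is then scanned four bits at a time and
# the looked-up entries are XORed in at the matching shift; the masked-off
# top bits of a are folded back in separately via the srax broadcasts.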
$locals=16*8;
$tab="%l0";
@T=("%g2","%g3");
@i=("%g4","%g5");
($a1,$a2,$a4,$a8,$a12,$a48)=map("%o$_",(0..5));
($lo,$hi,$b)=("%g1",$a8,"%o7"); $a=$lo;
$code.=<<___;
#include <sparc_arch.h>
#ifdef __arch64__
.register %g2,#scratch
.register %g3,#scratch
#endif
#ifdef __PIC__
SPARC_PIC_THUNK(%g1)
#endif
.globl bn_GF2m_mul_2x2
.align 16
bn_GF2m_mul_2x2:
SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
ld [%g1+0],%g1 ! OPENSSL_sparcv9cap_P[0]
andcc %g1, SPARCV9_VIS3, %g0
bz,pn %icc,.Lsoftware
nop
sllx %o1, 32, %o1
sllx %o3, 32, %o3
or %o2, %o1, %o1
or %o4, %o3, %o3
.word 0x95b262ab ! xmulx %o1, %o3, %o2
.word 0x99b262cb ! xmulxhi %o1, %o3, %o4
srlx %o2, 32, %o1 ! 13 cycles later
st %o2, [%o0+0]
st %o1, [%o0+4]
srlx %o4, 32, %o3
st %o4, [%o0+8]
retl
st %o3, [%o0+12]
.align 16
.Lsoftware:
save %sp,-STACK_FRAME-$locals,%sp
sllx %i1,32,$a
mov -1,$a12
sllx %i3,32,$b
or %i2,$a,$a
srlx $a12,1,$a48 ! 0x7fff...
or %i4,$b,$b
srlx $a12,2,$a12 ! 0x3fff...
add %sp,STACK_BIAS+STACK_FRAME,$tab
sllx $a,2,$a4
mov $a,$a1
sllx $a,1,$a2
srax $a4,63,@i[1] ! broadcast 61st bit
and $a48,$a4,$a4 ! (a<<2)&0x7fff...
srlx $a48,2,$a48
srax $a2,63,@i[0] ! broadcast 62nd bit
and $a12,$a2,$a2 ! (a<<1)&0x3fff...
srax $a1,63,$lo ! broadcast 63rd bit
and $a48,$a1,$a1 ! (a<<0)&0x1fff...
sllx $a1,3,$a8
and $b,$lo,$lo
and $b,@i[0],@i[0]
and $b,@i[1],@i[1]
stx %g0,[$tab+0*8] ! tab[0]=0
xor $a1,$a2,$a12
stx $a1,[$tab+1*8] ! tab[1]=a1
stx $a2,[$tab+2*8] ! tab[2]=a2
xor $a4,$a8,$a48
stx $a12,[$tab+3*8] ! tab[3]=a1^a2
xor $a4,$a1,$a1
stx $a4,[$tab+4*8] ! tab[4]=a4
xor $a4,$a2,$a2
stx $a1,[$tab+5*8] ! tab[5]=a1^a4
xor $a4,$a12,$a12
stx $a2,[$tab+6*8] ! tab[6]=a2^a4
xor $a48,$a1,$a1
stx $a12,[$tab+7*8] ! tab[7]=a1^a2^a4
xor $a48,$a2,$a2
stx $a8,[$tab+8*8] ! tab[8]=a8
xor $a48,$a12,$a12
stx $a1,[$tab+9*8] ! tab[9]=a1^a8
xor $a4,$a1,$a1
stx $a2,[$tab+10*8] ! tab[10]=a2^a8
xor $a4,$a2,$a2
stx $a12,[$tab+11*8] ! tab[11]=a1^a2^a8
xor $a4,$a12,$a12
stx $a48,[$tab+12*8] ! tab[12]=a4^a8
srlx $lo,1,$hi
stx $a1,[$tab+13*8] ! tab[13]=a1^a4^a8
sllx $lo,63,$lo
stx $a2,[$tab+14*8] ! tab[14]=a2^a4^a8
srlx @i[0],2,@T[0]
stx $a12,[$tab+15*8] ! tab[15]=a1^a2^a4^a8
sllx @i[0],62,$a1
sllx $b,3,@i[0]
srlx @i[1],3,@T[1]
and @i[0],`0xf<<3`,@i[0]
sllx @i[1],61,$a2
ldx [$tab+@i[0]],@i[0]
srlx $b,4-3,@i[1]
xor @T[0],$hi,$hi
and @i[1],`0xf<<3`,@i[1]
xor $a1,$lo,$lo
ldx [$tab+@i[1]],@i[1]
xor @T[1],$hi,$hi
xor @i[0],$lo,$lo
srlx $b,8-3,@i[0]
xor $a2,$lo,$lo
and @i[0],`0xf<<3`,@i[0]
___
for($n=1;$n<14;$n++) {
$code.=<<___;
sllx @i[1],`$n*4`,@T[0]
ldx [$tab+@i[0]],@i[0]
srlx @i[1],`64-$n*4`,@T[1]
xor @T[0],$lo,$lo
srlx $b,`($n+2)*4`-3,@i[1]
xor @T[1],$hi,$hi
and @i[1],`0xf<<3`,@i[1]
___
push(@i,shift(@i)); push(@T,shift(@T));
}
$code.=<<___;
sllx @i[1],`$n*4`,@T[0]
ldx [$tab+@i[0]],@i[0]
srlx @i[1],`64-$n*4`,@T[1]
xor @T[0],$lo,$lo
sllx @i[0],`($n+1)*4`,@T[0]
xor @T[1],$hi,$hi
srlx @i[0],`64-($n+1)*4`,@T[1]
xor @T[0],$lo,$lo
xor @T[1],$hi,$hi
srlx $lo,32,%i1
st $lo,[%i0+0]
st %i1,[%i0+4]
srlx $hi,32,%i2
st $hi,[%i0+8]
st %i2,[%i0+12]
ret
restore
.type bn_GF2m_mul_2x2,#function
.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
.asciz "GF(2^m) Multiplication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;
close STDOUT;

View File

@@ -0,0 +1,606 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# December 2005
#
# Pure SPARCv9/8+ and IALU-only bn_mul_mont implementation. The reasons
# for undertaken effort are multiple. First of all, UltraSPARC is not
# the whole SPARCv9 universe and other VIS-free implementations deserve
# optimized code as much. Secondly, newly introduced UltraSPARC T1,
# a.k.a. Niagara, has shared FPU and concurrent FPU-intensive paths,
# such as sparcv9a-mont, will simply sink it. Yes, T1 is equipped with
# several integrated RSA/DSA accelerator circuits accessible through
# kernel driver [only(*)], but having decent user-land software
# implementation is important too. Finally, reasons like desire to
# experiment with dedicated squaring procedure. Yes, this module
# implements one, because it was easiest to draft it in SPARCv9
# instructions...
# (*) Engine accessing the driver in question is on my TODO list.
# For reference, the accelerator is estimated to give 6 to 10 times
# improvement on single-threaded RSA sign. It should be noted
# that 6-10x improvement coefficient does not actually mean
# something extraordinary in terms of absolute [single-threaded]
# performance, as SPARCv9 instruction set is by all means least
# suitable for high performance crypto among other 64 bit
# platforms. 6-10x factor simply places T1 in same performance
# domain as say AMD64 and IA-64. Improvement of RSA verify doesn't
# appear impressive at all, but it's the sign operation which is
# far more critical/interesting.
# You might notice that inner loops are modulo-scheduled:-) This has
# essentially negligible impact on UltraSPARC performance, it's
# Fujitsu SPARC64 V users who should notice and hopefully appreciate
# the advantage... Currently this module surpasses sparcv9a-mont.pl
# by ~20% on UltraSPARC-III and later cores, but recall that sparcv9a
# module still has hidden potential [see the TODO list there], which is
# estimated to be larger than 20%...
# int bn_mul_mont(
$rp="%i0"; # BN_ULONG *rp,
$ap="%i1"; # const BN_ULONG *ap,
$bp="%i2"; # const BN_ULONG *bp,
$np="%i3"; # const BN_ULONG *np,
$n0="%i4"; # const BN_ULONG *n0,
$num="%i5"; # int num);
$bits=32;
for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64) { $bias=2047; $frame=192; }
else { $bias=0; $frame=128; }
$car0="%o0";
$car1="%o1";
$car2="%o2"; # 1 bit
$acc0="%o3";
$acc1="%o4";
$mask="%g1"; # 32 bits, what a waste...
$tmp0="%g4";
$tmp1="%g5";
$i="%l0";
$j="%l1";
$mul0="%l2";
$mul1="%l3";
$tp="%l4";
$apj="%l5";
$npj="%l6";
$tpj="%l7";
$fname="bn_mul_mont_int";
$code=<<___;
.section ".text",#alloc,#execinstr
.global $fname
.align 32
$fname:
cmp %o5,4 ! 128 bits minimum
bge,pt %icc,.Lenter
sethi %hi(0xffffffff),$mask
retl
clr %o0
.align 32
.Lenter:
save %sp,-$frame,%sp
sll $num,2,$num ! num*=4
or $mask,%lo(0xffffffff),$mask
ld [$n0],$n0
cmp $ap,$bp
and $num,$mask,$num
ld [$bp],$mul0 ! bp[0]
nop
add %sp,$bias,%o7 ! real top of stack
ld [$ap],$car0 ! ap[0] ! redundant in squaring context
sub %o7,$num,%o7
ld [$ap+4],$apj ! ap[1]
and %o7,-1024,%o7
ld [$np],$car1 ! np[0]
sub %o7,$bias,%sp ! alloca
ld [$np+4],$npj ! np[1]
be,pt `$bits==32?"%icc":"%xcc"`,.Lbn_sqr_mont
mov 12,$j
mulx $car0,$mul0,$car0 ! ap[0]*bp[0]
mulx $apj,$mul0,$tmp0 !prologue! ap[1]*bp[0]
and $car0,$mask,$acc0
add %sp,$bias+$frame,$tp
ld [$ap+8],$apj !prologue!
mulx $n0,$acc0,$mul1 ! "t[0]"*n0
and $mul1,$mask,$mul1
mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
mulx $npj,$mul1,$acc1 !prologue! np[1]*"t[0]"*n0
srlx $car0,32,$car0
add $acc0,$car1,$car1
ld [$np+8],$npj !prologue!
srlx $car1,32,$car1
mov $tmp0,$acc0 !prologue!
.L1st:
mulx $apj,$mul0,$tmp0
mulx $npj,$mul1,$tmp1
add $acc0,$car0,$car0
ld [$ap+$j],$apj ! ap[j]
and $car0,$mask,$acc0
add $acc1,$car1,$car1
ld [$np+$j],$npj ! np[j]
srlx $car0,32,$car0
add $acc0,$car1,$car1
add $j,4,$j ! j++
mov $tmp0,$acc0
st $car1,[$tp]
cmp $j,$num
mov $tmp1,$acc1
srlx $car1,32,$car1
bl %icc,.L1st
add $tp,4,$tp ! tp++
!.L1st
mulx $apj,$mul0,$tmp0 !epilogue!
mulx $npj,$mul1,$tmp1
add $acc0,$car0,$car0
and $car0,$mask,$acc0
add $acc1,$car1,$car1
srlx $car0,32,$car0
add $acc0,$car1,$car1
st $car1,[$tp]
srlx $car1,32,$car1
add $tmp0,$car0,$car0
and $car0,$mask,$acc0
add $tmp1,$car1,$car1
srlx $car0,32,$car0
add $acc0,$car1,$car1
st $car1,[$tp+4]
srlx $car1,32,$car1
add $car0,$car1,$car1
st $car1,[$tp+8]
srlx $car1,32,$car2
mov 4,$i ! i++
ld [$bp+4],$mul0 ! bp[1]
.Louter:
add %sp,$bias+$frame,$tp
ld [$ap],$car0 ! ap[0]
ld [$ap+4],$apj ! ap[1]
ld [$np],$car1 ! np[0]
ld [$np+4],$npj ! np[1]
ld [$tp],$tmp1 ! tp[0]
ld [$tp+4],$tpj ! tp[1]
mov 12,$j
mulx $car0,$mul0,$car0
mulx $apj,$mul0,$tmp0 !prologue!
add $tmp1,$car0,$car0
ld [$ap+8],$apj !prologue!
and $car0,$mask,$acc0
mulx $n0,$acc0,$mul1
and $mul1,$mask,$mul1
mulx $car1,$mul1,$car1
mulx $npj,$mul1,$acc1 !prologue!
srlx $car0,32,$car0
add $acc0,$car1,$car1
ld [$np+8],$npj !prologue!
srlx $car1,32,$car1
mov $tmp0,$acc0 !prologue!
.Linner:
mulx $apj,$mul0,$tmp0
mulx $npj,$mul1,$tmp1
add $tpj,$car0,$car0
ld [$ap+$j],$apj ! ap[j]
add $acc0,$car0,$car0
add $acc1,$car1,$car1
ld [$np+$j],$npj ! np[j]
and $car0,$mask,$acc0
ld [$tp+8],$tpj ! tp[j]
srlx $car0,32,$car0
add $acc0,$car1,$car1
add $j,4,$j ! j++
mov $tmp0,$acc0
st $car1,[$tp] ! tp[j-1]
srlx $car1,32,$car1
mov $tmp1,$acc1
cmp $j,$num
bl %icc,.Linner
add $tp,4,$tp ! tp++
!.Linner
mulx $apj,$mul0,$tmp0 !epilogue!
mulx $npj,$mul1,$tmp1
add $tpj,$car0,$car0
add $acc0,$car0,$car0
ld [$tp+8],$tpj ! tp[j]
and $car0,$mask,$acc0
add $acc1,$car1,$car1
srlx $car0,32,$car0
add $acc0,$car1,$car1
st $car1,[$tp] ! tp[j-1]
srlx $car1,32,$car1
add $tpj,$car0,$car0
add $tmp0,$car0,$car0
and $car0,$mask,$acc0
add $tmp1,$car1,$car1
add $acc0,$car1,$car1
st $car1,[$tp+4] ! tp[j-1]
srlx $car0,32,$car0
add $i,4,$i ! i++
srlx $car1,32,$car1
add $car0,$car1,$car1
cmp $i,$num
add $car2,$car1,$car1
st $car1,[$tp+8]
srlx $car1,32,$car2
bl,a %icc,.Louter
ld [$bp+$i],$mul0 ! bp[i]
!.Louter
add $tp,12,$tp
.Ltail:
add $np,$num,$np
add $rp,$num,$rp
mov $tp,$ap
sub %g0,$num,%o7 ! k=-num
ba .Lsub
subcc %g0,%g0,%g0 ! clear %icc.c
.align 16
.Lsub:
ld [$tp+%o7],%o0
ld [$np+%o7],%o1
subccc %o0,%o1,%o1 ! tp[j]-np[j]
add $rp,%o7,$i
add %o7,4,%o7
brnz %o7,.Lsub
st %o1,[$i]
subc $car2,0,$car2 ! handle upmost overflow bit
and $tp,$car2,$ap
andn $rp,$car2,$np
or $ap,$np,$ap
sub %g0,$num,%o7
.Lcopy:
ld [$ap+%o7],%o0 ! copy or in-place refresh
st %g0,[$tp+%o7] ! zap tp
st %o0,[$rp+%o7]
add %o7,4,%o7
brnz %o7,.Lcopy
nop
mov 1,%i0
ret
restore
___
########
######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
######## code without the dedicated squaring procedure that follows.
########
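######## Squaring needs roughly half the 32x32 multiplications: every cross
######## product ap[i]*ap[j], i!=j, occurs twice in the square, so the code
######## below computes it once and doubles it; the add/$sbit shuffle in the
######## loops carries the bit shifted out by that doubling into the next word.
########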
$sbit="%i2"; # re-use $bp!
$code.=<<___;
.align 32
.Lbn_sqr_mont:
mulx $mul0,$mul0,$car0 ! ap[0]*ap[0]
mulx $apj,$mul0,$tmp0 !prologue!
and $car0,$mask,$acc0
add %sp,$bias+$frame,$tp
ld [$ap+8],$apj !prologue!
mulx $n0,$acc0,$mul1 ! "t[0]"*n0
srlx $car0,32,$car0
and $mul1,$mask,$mul1
mulx $car1,$mul1,$car1 ! np[0]*"t[0]"*n0
mulx $npj,$mul1,$acc1 !prologue!
and $car0,1,$sbit
ld [$np+8],$npj !prologue!
srlx $car0,1,$car0
add $acc0,$car1,$car1
srlx $car1,32,$car1
mov $tmp0,$acc0 !prologue!
.Lsqr_1st:
mulx $apj,$mul0,$tmp0
mulx $npj,$mul1,$tmp1
add $acc0,$car0,$car0 ! ap[j]*a0+c0
add $acc1,$car1,$car1
ld [$ap+$j],$apj ! ap[j]
and $car0,$mask,$acc0
ld [$np+$j],$npj ! np[j]
srlx $car0,32,$car0
add $acc0,$acc0,$acc0
or $sbit,$acc0,$acc0
mov $tmp1,$acc1
srlx $acc0,32,$sbit
add $j,4,$j ! j++
and $acc0,$mask,$acc0
cmp $j,$num
add $acc0,$car1,$car1
st $car1,[$tp]
mov $tmp0,$acc0
srlx $car1,32,$car1
bl %icc,.Lsqr_1st
add $tp,4,$tp ! tp++
!.Lsqr_1st
mulx $apj,$mul0,$tmp0 ! epilogue
mulx $npj,$mul1,$tmp1
add $acc0,$car0,$car0 ! ap[j]*a0+c0
add $acc1,$car1,$car1
and $car0,$mask,$acc0
srlx $car0,32,$car0
add $acc0,$acc0,$acc0
or $sbit,$acc0,$acc0
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
add $acc0,$car1,$car1
st $car1,[$tp]
srlx $car1,32,$car1
add $tmp0,$car0,$car0 ! ap[j]*a0+c0
add $tmp1,$car1,$car1
and $car0,$mask,$acc0
srlx $car0,32,$car0
add $acc0,$acc0,$acc0
or $sbit,$acc0,$acc0
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
add $acc0,$car1,$car1
st $car1,[$tp+4]
srlx $car1,32,$car1
add $car0,$car0,$car0
or $sbit,$car0,$car0
add $car0,$car1,$car1
st $car1,[$tp+8]
srlx $car1,32,$car2
ld [%sp+$bias+$frame],$tmp0 ! tp[0]
ld [%sp+$bias+$frame+4],$tmp1 ! tp[1]
ld [%sp+$bias+$frame+8],$tpj ! tp[2]
ld [$ap+4],$mul0 ! ap[1]
ld [$ap+8],$apj ! ap[2]
ld [$np],$car1 ! np[0]
ld [$np+4],$npj ! np[1]
mulx $n0,$tmp0,$mul1
mulx $mul0,$mul0,$car0
and $mul1,$mask,$mul1
mulx $car1,$mul1,$car1
mulx $npj,$mul1,$acc1
add $tmp0,$car1,$car1
and $car0,$mask,$acc0
ld [$np+8],$npj ! np[2]
srlx $car1,32,$car1
add $tmp1,$car1,$car1
srlx $car0,32,$car0
add $acc0,$car1,$car1
and $car0,1,$sbit
add $acc1,$car1,$car1
srlx $car0,1,$car0
mov 12,$j
st $car1,[%sp+$bias+$frame] ! tp[0]=
srlx $car1,32,$car1
add %sp,$bias+$frame+4,$tp
.Lsqr_2nd:
mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1
add $acc0,$car0,$car0
add $tpj,$car1,$car1
ld [$ap+$j],$apj ! ap[j]
and $car0,$mask,$acc0
ld [$np+$j],$npj ! np[j]
srlx $car0,32,$car0
add $acc1,$car1,$car1
ld [$tp+8],$tpj ! tp[j]
add $acc0,$acc0,$acc0
add $j,4,$j ! j++
or $sbit,$acc0,$acc0
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
cmp $j,$num
add $acc0,$car1,$car1
st $car1,[$tp] ! tp[j-1]
srlx $car1,32,$car1
bl %icc,.Lsqr_2nd
add $tp,4,$tp ! tp++
!.Lsqr_2nd
mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1
add $acc0,$car0,$car0
add $tpj,$car1,$car1
and $car0,$mask,$acc0
srlx $car0,32,$car0
add $acc1,$car1,$car1
add $acc0,$acc0,$acc0
or $sbit,$acc0,$acc0
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
add $acc0,$car1,$car1
st $car1,[$tp] ! tp[j-1]
srlx $car1,32,$car1
add $car0,$car0,$car0
or $sbit,$car0,$car0
add $car0,$car1,$car1
add $car2,$car1,$car1
st $car1,[$tp+4]
srlx $car1,32,$car2
ld [%sp+$bias+$frame],$tmp1 ! tp[0]
ld [%sp+$bias+$frame+4],$tpj ! tp[1]
ld [$ap+8],$mul0 ! ap[2]
ld [$np],$car1 ! np[0]
ld [$np+4],$npj ! np[1]
mulx $n0,$tmp1,$mul1
and $mul1,$mask,$mul1
mov 8,$i
mulx $mul0,$mul0,$car0
mulx $car1,$mul1,$car1
and $car0,$mask,$acc0
add $tmp1,$car1,$car1
srlx $car0,32,$car0
add %sp,$bias+$frame,$tp
srlx $car1,32,$car1
and $car0,1,$sbit
srlx $car0,1,$car0
mov 4,$j
.Lsqr_outer:
.Lsqr_inner1:
mulx $npj,$mul1,$acc1
add $tpj,$car1,$car1
add $j,4,$j
ld [$tp+8],$tpj
cmp $j,$i
add $acc1,$car1,$car1
ld [$np+$j],$npj
st $car1,[$tp]
srlx $car1,32,$car1
bl %icc,.Lsqr_inner1
add $tp,4,$tp
!.Lsqr_inner1
add $j,4,$j
ld [$ap+$j],$apj ! ap[j]
mulx $npj,$mul1,$acc1
add $tpj,$car1,$car1
ld [$np+$j],$npj ! np[j]
add $acc0,$car1,$car1
ld [$tp+8],$tpj ! tp[j]
add $acc1,$car1,$car1
st $car1,[$tp]
srlx $car1,32,$car1
add $j,4,$j
cmp $j,$num
be,pn %icc,.Lsqr_no_inner2
add $tp,4,$tp
.Lsqr_inner2:
mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1
add $tpj,$car1,$car1
add $acc0,$car0,$car0
ld [$ap+$j],$apj ! ap[j]
and $car0,$mask,$acc0
ld [$np+$j],$npj ! np[j]
srlx $car0,32,$car0
add $acc0,$acc0,$acc0
ld [$tp+8],$tpj ! tp[j]
or $sbit,$acc0,$acc0
add $j,4,$j ! j++
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
cmp $j,$num
add $acc0,$car1,$car1
add $acc1,$car1,$car1
st $car1,[$tp] ! tp[j-1]
srlx $car1,32,$car1
bl %icc,.Lsqr_inner2
add $tp,4,$tp ! tp++
.Lsqr_no_inner2:
mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1
add $tpj,$car1,$car1
add $acc0,$car0,$car0
and $car0,$mask,$acc0
srlx $car0,32,$car0
add $acc0,$acc0,$acc0
or $sbit,$acc0,$acc0
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
add $acc0,$car1,$car1
add $acc1,$car1,$car1
st $car1,[$tp] ! tp[j-1]
srlx $car1,32,$car1
add $car0,$car0,$car0
or $sbit,$car0,$car0
add $car0,$car1,$car1
add $car2,$car1,$car1
st $car1,[$tp+4]
srlx $car1,32,$car2
add $i,4,$i ! i++
ld [%sp+$bias+$frame],$tmp1 ! tp[0]
ld [%sp+$bias+$frame+4],$tpj ! tp[1]
ld [$ap+$i],$mul0 ! ap[j]
ld [$np],$car1 ! np[0]
ld [$np+4],$npj ! np[1]
mulx $n0,$tmp1,$mul1
and $mul1,$mask,$mul1
add $i,4,$tmp0
mulx $mul0,$mul0,$car0
mulx $car1,$mul1,$car1
and $car0,$mask,$acc0
add $tmp1,$car1,$car1
srlx $car0,32,$car0
add %sp,$bias+$frame,$tp
srlx $car1,32,$car1
and $car0,1,$sbit
srlx $car0,1,$car0
cmp $tmp0,$num ! i<num-1
bl %icc,.Lsqr_outer
mov 4,$j
.Lsqr_last:
mulx $npj,$mul1,$acc1
add $tpj,$car1,$car1
add $j,4,$j
ld [$tp+8],$tpj
cmp $j,$i
add $acc1,$car1,$car1
ld [$np+$j],$npj
st $car1,[$tp]
srlx $car1,32,$car1
bl %icc,.Lsqr_last
add $tp,4,$tp
!.Lsqr_last
mulx $npj,$mul1,$acc1
add $tpj,$car1,$car1
add $acc0,$car1,$car1
add $acc1,$car1,$car1
st $car1,[$tp]
srlx $car1,32,$car1
add $car0,$car0,$car0 ! recover $car0
or $sbit,$car0,$car0
add $car0,$car1,$car1
add $car2,$car1,$car1
st $car1,[$tp+4]
srlx $car1,32,$car2
ba .Ltail
add $tp,8,$tp
.type $fname,#function
.size $fname,(.-$fname)
.asciz "Montgomery Multipltication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align 32
___
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;
close STDOUT;

View File

@@ -0,0 +1,882 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# October 2005
#
# "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU?
# Because unlike integer multiplier, which simply stalls whole CPU,
# FPU is fully pipelined and can effectively emit 48 bit partial
# product every cycle. Why not blended SPARC v9? One can argue that
# making this module dependent on UltraSPARC VIS extension limits its
# binary compatibility. Well yes, it does exclude SPARC64 prior-V(!)
# implementations from compatibility matrix. But the rest, whole Sun
# UltraSPARC family and brand new Fujitsu's SPARC64 V, all support
# VIS extension instructions used in this module. This is considered
# good enough to not care about HAL SPARC64 users [if any] who have
# integer-only pure SPARCv9 module to "fall down" to.
# USI&II cores currently exhibit uniform 2x improvement [over pre-
# bn_mul_mont codebase] for all key lengths and benchmarks. On USIII
# performance improves by a few percent for shorter keys and worsens by a
# few percent for longer keys. This is because the USIII integer multiplier
# is >3x faster than USI&II one, which is harder to match [but see
# TODO list below]. It should also be noted that SPARC64 V features
# out-of-order execution, which *might* mean that integer multiplier
# is pipelined, which in turn *might* be impossible to match... On
# additional note, SPARC64 V implements FP Multiply-Add instruction,
# which is perfectly usable in this context... In other words, as far
# as Fujitsu SPARC64 V goes, talk to the author:-)
# The implementation implies the following "non-natural" limitations on
# input arguments:
# - num may not be less than 4;
# - num has to be even;
# Failure to meet either condition has no fatal effects, simply
# doesn't give any performance gain.
# TODO:
# - modulo-schedule inner loop for better performance (on in-order
# execution core such as UltraSPARC this shall result in further
# noticeable(!) improvement);
# - dedicated squaring procedure[?];
######################################################################
# November 2006
#
# Modulo-scheduled inner loops make it possible to interleave floating-point
# and integer instructions and minimize Read-After-Write penalties. This
# results in a *further* 20-50% performance improvement [depending on
# key length, more for longer keys] on USI&II cores and 30-80% - on
# USIII&IV.
$fname="bn_mul_mont_fpu";
$bits=32;
for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64) {
$bias=2047;
$frame=192;
} else {
$bias=0;
$frame=128; # 96 rounded up to largest known cache-line
}
$locals=64;
# In order to provide for 32-/64-bit ABI duality, I keep integers wider
# than 32 bit in %g1-%g4 and %o0-%o5. %l0-%l7 and %i0-%i5 are used
# exclusively for pointers, indexes and other small values...
# int bn_mul_mont(
$rp="%i0"; # BN_ULONG *rp,
$ap="%i1"; # const BN_ULONG *ap,
$bp="%i2"; # const BN_ULONG *bp,
$np="%i3"; # const BN_ULONG *np,
$n0="%i4"; # const BN_ULONG *n0,
$num="%i5"; # int num);
$tp="%l0"; # t[num]
$ap_l="%l1"; # a[num],n[num] are smashed to 32-bit words and saved
$ap_h="%l2"; # to these four vectors as double-precision FP values.
$np_l="%l3"; # This way a bunch of fxtods are eliminated in second
$np_h="%l4"; # loop and L1-cache aliasing is minimized...
$i="%l5";
$j="%l6";
$mask="%l7"; # 16-bit mask, 0xffff
$n0="%g4"; # reassigned(!) to "64-bit" register
$carry="%i4"; # %i4 reused(!) for a carry bit
# FP register naming chart
#
# ..HILO
# dcba
# --------
# LOa
# LOb
# LOc
# LOd
# HIa
# HIb
# HIc
# HId
# ..a
# ..b
$ba="%f0"; $bb="%f2"; $bc="%f4"; $bd="%f6";
$na="%f8"; $nb="%f10"; $nc="%f12"; $nd="%f14";
$alo="%f16"; $alo_="%f17"; $ahi="%f18"; $ahi_="%f19";
$nlo="%f20"; $nlo_="%f21"; $nhi="%f22"; $nhi_="%f23";
$dota="%f24"; $dotb="%f26";
$aloa="%f32"; $alob="%f34"; $aloc="%f36"; $alod="%f38";
$ahia="%f40"; $ahib="%f42"; $ahic="%f44"; $ahid="%f46";
$nloa="%f48"; $nlob="%f50"; $nloc="%f52"; $nlod="%f54";
$nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62";
$ASI_FL16_P=0xD2; # magic ASI value to engage 16-bit FP load
$code=<<___;
.section ".text",#alloc,#execinstr
.global $fname
.align 32
$fname:
save %sp,-$frame-$locals,%sp
cmp $num,4
bl,a,pn %icc,.Lret
clr %i0
andcc $num,1,%g0 ! $num has to be even...
bnz,a,pn %icc,.Lret
clr %i0 ! signal "unsupported input value"
srl $num,1,$num
sethi %hi(0xffff),$mask
ld [%i4+0],$n0 ! $n0 reassigned, remember?
or $mask,%lo(0xffff),$mask
ld [%i4+4],%o0
sllx %o0,32,%o0
or %o0,$n0,$n0 ! $n0=n0[1].n0[0]
sll $num,3,$num ! num*=8
add %sp,$bias,%o0 ! real top of stack
sll $num,2,%o1
add %o1,$num,%o1 ! %o1=num*5
sub %o0,%o1,%o0
and %o0,-2048,%o0 ! optimize TLB utilization
sub %o0,$bias,%sp ! alloca(5*num*8)
rd %asi,%o7 ! save %asi
add %sp,$bias+$frame+$locals,$tp
add $tp,$num,$ap_l
add $ap_l,$num,$ap_l ! [an]p_[lh] point at the vectors' ends !
add $ap_l,$num,$ap_h
add $ap_h,$num,$np_l
add $np_l,$num,$np_h
wr %g0,$ASI_FL16_P,%asi ! setup %asi for 16-bit FP loads
add $rp,$num,$rp ! readjust input pointers to point
add $ap,$num,$ap ! at the ends too...
add $bp,$num,$bp
add $np,$num,$np
stx %o7,[%sp+$bias+$frame+48] ! save %asi
sub %g0,$num,$i ! i=-num
sub %g0,$num,$j ! j=-num
add $ap,$j,%o3
add $bp,$i,%o4
ld [%o3+4],%g1 ! bp[0]
ld [%o3+0],%o0
ld [%o4+4],%g5 ! ap[0]
sllx %g1,32,%g1
ld [%o4+0],%o1
sllx %g5,32,%g5
or %g1,%o0,%o0
or %g5,%o1,%o1
add $np,$j,%o5
mulx %o1,%o0,%o0 ! ap[0]*bp[0]
mulx $n0,%o0,%o0 ! ap[0]*bp[0]*n0
stx %o0,[%sp+$bias+$frame+0]
ld [%o3+0],$alo_ ! load a[j] as pair of 32-bit words
fzeros $alo
ld [%o3+4],$ahi_
fzeros $ahi
ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
fzeros $nlo
ld [%o5+4],$nhi_
fzeros $nhi
! transfer b[i] to FPU as 4x16-bit values
ldda [%o4+2]%asi,$ba
fxtod $alo,$alo
ldda [%o4+0]%asi,$bb
fxtod $ahi,$ahi
ldda [%o4+6]%asi,$bc
fxtod $nlo,$nlo
ldda [%o4+4]%asi,$bd
fxtod $nhi,$nhi
! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
ldda [%sp+$bias+$frame+6]%asi,$na
fxtod $ba,$ba
ldda [%sp+$bias+$frame+4]%asi,$nb
fxtod $bb,$bb
ldda [%sp+$bias+$frame+2]%asi,$nc
fxtod $bc,$bc
ldda [%sp+$bias+$frame+0]%asi,$nd
fxtod $bd,$bd
std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
fxtod $na,$na
std $ahi,[$ap_h+$j]
fxtod $nb,$nb
std $nlo,[$np_l+$j] ! save smashed np[j] in double format
fxtod $nc,$nc
std $nhi,[$np_h+$j]
fxtod $nd,$nd
fmuld $alo,$ba,$aloa
fmuld $nlo,$na,$nloa
fmuld $alo,$bb,$alob
fmuld $nlo,$nb,$nlob
fmuld $alo,$bc,$aloc
faddd $aloa,$nloa,$nloa
fmuld $nlo,$nc,$nloc
fmuld $alo,$bd,$alod
faddd $alob,$nlob,$nlob
fmuld $nlo,$nd,$nlod
fmuld $ahi,$ba,$ahia
faddd $aloc,$nloc,$nloc
fmuld $nhi,$na,$nhia
fmuld $ahi,$bb,$ahib
faddd $alod,$nlod,$nlod
fmuld $nhi,$nb,$nhib
fmuld $ahi,$bc,$ahic
faddd $ahia,$nhia,$nhia
fmuld $nhi,$nc,$nhic
fmuld $ahi,$bd,$ahid
faddd $ahib,$nhib,$nhib
fmuld $nhi,$nd,$nhid
faddd $ahic,$nhic,$dota ! $nhic
faddd $ahid,$nhid,$dotb ! $nhid
faddd $nloc,$nhia,$nloc
faddd $nlod,$nhib,$nlod
fdtox $nloa,$nloa
fdtox $nlob,$nlob
fdtox $nloc,$nloc
fdtox $nlod,$nlod
std $nloa,[%sp+$bias+$frame+0]
add $j,8,$j
std $nlob,[%sp+$bias+$frame+8]
add $ap,$j,%o4
std $nloc,[%sp+$bias+$frame+16]
add $np,$j,%o5
std $nlod,[%sp+$bias+$frame+24]
ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
fzeros $alo
ld [%o4+4],$ahi_
fzeros $ahi
ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
fzeros $nlo
ld [%o5+4],$nhi_
fzeros $nhi
fxtod $alo,$alo
fxtod $ahi,$ahi
fxtod $nlo,$nlo
fxtod $nhi,$nhi
ldx [%sp+$bias+$frame+0],%o0
fmuld $alo,$ba,$aloa
ldx [%sp+$bias+$frame+8],%o1
fmuld $nlo,$na,$nloa
ldx [%sp+$bias+$frame+16],%o2
fmuld $alo,$bb,$alob
ldx [%sp+$bias+$frame+24],%o3
fmuld $nlo,$nb,$nlob
srlx %o0,16,%o7
std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
fmuld $alo,$bc,$aloc
add %o7,%o1,%o1
std $ahi,[$ap_h+$j]
faddd $aloa,$nloa,$nloa
fmuld $nlo,$nc,$nloc
srlx %o1,16,%o7
std $nlo,[$np_l+$j] ! save smashed np[j] in double format
fmuld $alo,$bd,$alod
add %o7,%o2,%o2
std $nhi,[$np_h+$j]
faddd $alob,$nlob,$nlob
fmuld $nlo,$nd,$nlod
srlx %o2,16,%o7
fmuld $ahi,$ba,$ahia
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
faddd $aloc,$nloc,$nloc
fmuld $nhi,$na,$nhia
!and %o0,$mask,%o0
!and %o1,$mask,%o1
!and %o2,$mask,%o2
!sllx %o1,16,%o1
!sllx %o2,32,%o2
!sllx %o3,48,%o7
!or %o1,%o0,%o0
!or %o2,%o0,%o0
!or %o7,%o0,%o0 ! 64-bit result
srlx %o3,16,%g1 ! 34-bit carry
fmuld $ahi,$bb,$ahib
faddd $alod,$nlod,$nlod
fmuld $nhi,$nb,$nhib
fmuld $ahi,$bc,$ahic
faddd $ahia,$nhia,$nhia
fmuld $nhi,$nc,$nhic
fmuld $ahi,$bd,$ahid
faddd $ahib,$nhib,$nhib
fmuld $nhi,$nd,$nhid
faddd $dota,$nloa,$nloa
faddd $dotb,$nlob,$nlob
faddd $ahic,$nhic,$dota ! $nhic
faddd $ahid,$nhid,$dotb ! $nhid
faddd $nloc,$nhia,$nloc
faddd $nlod,$nhib,$nlod
fdtox $nloa,$nloa
fdtox $nlob,$nlob
fdtox $nloc,$nloc
fdtox $nlod,$nlod
std $nloa,[%sp+$bias+$frame+0]
std $nlob,[%sp+$bias+$frame+8]
addcc $j,8,$j
std $nloc,[%sp+$bias+$frame+16]
bz,pn %icc,.L1stskip
std $nlod,[%sp+$bias+$frame+24]
.align 32 ! incidentally already aligned !
.L1st:
add $ap,$j,%o4
add $np,$j,%o5
ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
fzeros $alo
ld [%o4+4],$ahi_
fzeros $ahi
ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
fzeros $nlo
ld [%o5+4],$nhi_
fzeros $nhi
fxtod $alo,$alo
fxtod $ahi,$ahi
fxtod $nlo,$nlo
fxtod $nhi,$nhi
ldx [%sp+$bias+$frame+0],%o0
fmuld $alo,$ba,$aloa
ldx [%sp+$bias+$frame+8],%o1
fmuld $nlo,$na,$nloa
ldx [%sp+$bias+$frame+16],%o2
fmuld $alo,$bb,$alob
ldx [%sp+$bias+$frame+24],%o3
fmuld $nlo,$nb,$nlob
srlx %o0,16,%o7
std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
fmuld $alo,$bc,$aloc
add %o7,%o1,%o1
std $ahi,[$ap_h+$j]
faddd $aloa,$nloa,$nloa
fmuld $nlo,$nc,$nloc
srlx %o1,16,%o7
std $nlo,[$np_l+$j] ! save smashed np[j] in double format
fmuld $alo,$bd,$alod
add %o7,%o2,%o2
std $nhi,[$np_h+$j]
faddd $alob,$nlob,$nlob
fmuld $nlo,$nd,$nlod
srlx %o2,16,%o7
fmuld $ahi,$ba,$ahia
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
and %o0,$mask,%o0
faddd $aloc,$nloc,$nloc
fmuld $nhi,$na,$nhia
and %o1,$mask,%o1
and %o2,$mask,%o2
fmuld $ahi,$bb,$ahib
sllx %o1,16,%o1
faddd $alod,$nlod,$nlod
fmuld $nhi,$nb,$nhib
sllx %o2,32,%o2
fmuld $ahi,$bc,$ahic
sllx %o3,48,%o7
or %o1,%o0,%o0
faddd $ahia,$nhia,$nhia
fmuld $nhi,$nc,$nhic
or %o2,%o0,%o0
fmuld $ahi,$bd,$ahid
or %o7,%o0,%o0 ! 64-bit result
faddd $ahib,$nhib,$nhib
fmuld $nhi,$nd,$nhid
addcc %g1,%o0,%o0
faddd $dota,$nloa,$nloa
srlx %o3,16,%g1 ! 34-bit carry
faddd $dotb,$nlob,$nlob
bcs,a %xcc,.+8
add %g1,1,%g1
stx %o0,[$tp] ! tp[j-1]=
faddd $ahic,$nhic,$dota ! $nhic
faddd $ahid,$nhid,$dotb ! $nhid
faddd $nloc,$nhia,$nloc
faddd $nlod,$nhib,$nlod
fdtox $nloa,$nloa
fdtox $nlob,$nlob
fdtox $nloc,$nloc
fdtox $nlod,$nlod
std $nloa,[%sp+$bias+$frame+0]
std $nlob,[%sp+$bias+$frame+8]
std $nloc,[%sp+$bias+$frame+16]
std $nlod,[%sp+$bias+$frame+24]
addcc $j,8,$j
bnz,pt %icc,.L1st
add $tp,8,$tp
.L1stskip:
fdtox $dota,$dota
fdtox $dotb,$dotb
ldx [%sp+$bias+$frame+0],%o0
ldx [%sp+$bias+$frame+8],%o1
ldx [%sp+$bias+$frame+16],%o2
ldx [%sp+$bias+$frame+24],%o3
srlx %o0,16,%o7
std $dota,[%sp+$bias+$frame+32]
add %o7,%o1,%o1
std $dotb,[%sp+$bias+$frame+40]
srlx %o1,16,%o7
add %o7,%o2,%o2
srlx %o2,16,%o7
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
and %o0,$mask,%o0
and %o1,$mask,%o1
and %o2,$mask,%o2
sllx %o1,16,%o1
sllx %o2,32,%o2
sllx %o3,48,%o7
or %o1,%o0,%o0
or %o2,%o0,%o0
or %o7,%o0,%o0 ! 64-bit result
ldx [%sp+$bias+$frame+32],%o4
addcc %g1,%o0,%o0
ldx [%sp+$bias+$frame+40],%o5
srlx %o3,16,%g1 ! 34-bit carry
bcs,a %xcc,.+8
add %g1,1,%g1
stx %o0,[$tp] ! tp[j-1]=
add $tp,8,$tp
srlx %o4,16,%o7
add %o7,%o5,%o5
and %o4,$mask,%o4
sllx %o5,16,%o7
or %o7,%o4,%o4
addcc %g1,%o4,%o4
srlx %o5,48,%g1
bcs,a %xcc,.+8
add %g1,1,%g1
mov %g1,$carry
stx %o4,[$tp] ! tp[num-1]=
ba .Louter
add $i,8,$i
.align 32
.Louter:
sub %g0,$num,$j ! j=-num
add %sp,$bias+$frame+$locals,$tp
add $ap,$j,%o3
add $bp,$i,%o4
ld [%o3+4],%g1 ! bp[i]
ld [%o3+0],%o0
ld [%o4+4],%g5 ! ap[0]
sllx %g1,32,%g1
ld [%o4+0],%o1
sllx %g5,32,%g5
or %g1,%o0,%o0
or %g5,%o1,%o1
ldx [$tp],%o2 ! tp[0]
mulx %o1,%o0,%o0
addcc %o2,%o0,%o0
mulx $n0,%o0,%o0 ! (ap[0]*bp[i]+t[0])*n0
stx %o0,[%sp+$bias+$frame+0]
! transfer b[i] to FPU as 4x16-bit values
ldda [%o4+2]%asi,$ba
ldda [%o4+0]%asi,$bb
ldda [%o4+6]%asi,$bc
ldda [%o4+4]%asi,$bd
! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
ldda [%sp+$bias+$frame+6]%asi,$na
fxtod $ba,$ba
ldda [%sp+$bias+$frame+4]%asi,$nb
fxtod $bb,$bb
ldda [%sp+$bias+$frame+2]%asi,$nc
fxtod $bc,$bc
ldda [%sp+$bias+$frame+0]%asi,$nd
fxtod $bd,$bd
ldd [$ap_l+$j],$alo ! load a[j] in double format
fxtod $na,$na
ldd [$ap_h+$j],$ahi
fxtod $nb,$nb
ldd [$np_l+$j],$nlo ! load n[j] in double format
fxtod $nc,$nc
ldd [$np_h+$j],$nhi
fxtod $nd,$nd
fmuld $alo,$ba,$aloa
fmuld $nlo,$na,$nloa
fmuld $alo,$bb,$alob
fmuld $nlo,$nb,$nlob
fmuld $alo,$bc,$aloc
faddd $aloa,$nloa,$nloa
fmuld $nlo,$nc,$nloc
fmuld $alo,$bd,$alod
faddd $alob,$nlob,$nlob
fmuld $nlo,$nd,$nlod
fmuld $ahi,$ba,$ahia
faddd $aloc,$nloc,$nloc
fmuld $nhi,$na,$nhia
fmuld $ahi,$bb,$ahib
faddd $alod,$nlod,$nlod
fmuld $nhi,$nb,$nhib
fmuld $ahi,$bc,$ahic
faddd $ahia,$nhia,$nhia
fmuld $nhi,$nc,$nhic
fmuld $ahi,$bd,$ahid
faddd $ahib,$nhib,$nhib
fmuld $nhi,$nd,$nhid
faddd $ahic,$nhic,$dota ! $nhic
faddd $ahid,$nhid,$dotb ! $nhid
faddd $nloc,$nhia,$nloc
faddd $nlod,$nhib,$nlod
fdtox $nloa,$nloa
fdtox $nlob,$nlob
fdtox $nloc,$nloc
fdtox $nlod,$nlod
std $nloa,[%sp+$bias+$frame+0]
std $nlob,[%sp+$bias+$frame+8]
std $nloc,[%sp+$bias+$frame+16]
add $j,8,$j
std $nlod,[%sp+$bias+$frame+24]
ldd [$ap_l+$j],$alo ! load a[j] in double format
ldd [$ap_h+$j],$ahi
ldd [$np_l+$j],$nlo ! load n[j] in double format
ldd [$np_h+$j],$nhi
fmuld $alo,$ba,$aloa
fmuld $nlo,$na,$nloa
fmuld $alo,$bb,$alob
fmuld $nlo,$nb,$nlob
fmuld $alo,$bc,$aloc
ldx [%sp+$bias+$frame+0],%o0
faddd $aloa,$nloa,$nloa
fmuld $nlo,$nc,$nloc
ldx [%sp+$bias+$frame+8],%o1
fmuld $alo,$bd,$alod
ldx [%sp+$bias+$frame+16],%o2
faddd $alob,$nlob,$nlob
fmuld $nlo,$nd,$nlod
ldx [%sp+$bias+$frame+24],%o3
fmuld $ahi,$ba,$ahia
srlx %o0,16,%o7
faddd $aloc,$nloc,$nloc
fmuld $nhi,$na,$nhia
add %o7,%o1,%o1
fmuld $ahi,$bb,$ahib
srlx %o1,16,%o7
faddd $alod,$nlod,$nlod
fmuld $nhi,$nb,$nhib
add %o7,%o2,%o2
fmuld $ahi,$bc,$ahic
srlx %o2,16,%o7
faddd $ahia,$nhia,$nhia
fmuld $nhi,$nc,$nhic
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
! why?
and %o0,$mask,%o0
fmuld $ahi,$bd,$ahid
and %o1,$mask,%o1
and %o2,$mask,%o2
faddd $ahib,$nhib,$nhib
fmuld $nhi,$nd,$nhid
sllx %o1,16,%o1
faddd $dota,$nloa,$nloa
sllx %o2,32,%o2
faddd $dotb,$nlob,$nlob
sllx %o3,48,%o7
or %o1,%o0,%o0
faddd $ahic,$nhic,$dota ! $nhic
or %o2,%o0,%o0
faddd $ahid,$nhid,$dotb ! $nhid
or %o7,%o0,%o0 ! 64-bit result
ldx [$tp],%o7
faddd $nloc,$nhia,$nloc
addcc %o7,%o0,%o0
! end-of-why?
faddd $nlod,$nhib,$nlod
srlx %o3,16,%g1 ! 34-bit carry
fdtox $nloa,$nloa
bcs,a %xcc,.+8
add %g1,1,%g1
fdtox $nlob,$nlob
fdtox $nloc,$nloc
fdtox $nlod,$nlod
std $nloa,[%sp+$bias+$frame+0]
std $nlob,[%sp+$bias+$frame+8]
addcc $j,8,$j
std $nloc,[%sp+$bias+$frame+16]
bz,pn %icc,.Linnerskip
std $nlod,[%sp+$bias+$frame+24]
ba .Linner
nop
.align 32
.Linner:
ldd [$ap_l+$j],$alo ! load a[j] in double format
ldd [$ap_h+$j],$ahi
ldd [$np_l+$j],$nlo ! load n[j] in double format
ldd [$np_h+$j],$nhi
fmuld $alo,$ba,$aloa
fmuld $nlo,$na,$nloa
fmuld $alo,$bb,$alob
fmuld $nlo,$nb,$nlob
fmuld $alo,$bc,$aloc
ldx [%sp+$bias+$frame+0],%o0
faddd $aloa,$nloa,$nloa
fmuld $nlo,$nc,$nloc
ldx [%sp+$bias+$frame+8],%o1
fmuld $alo,$bd,$alod
ldx [%sp+$bias+$frame+16],%o2
faddd $alob,$nlob,$nlob
fmuld $nlo,$nd,$nlod
ldx [%sp+$bias+$frame+24],%o3
fmuld $ahi,$ba,$ahia
srlx %o0,16,%o7
faddd $aloc,$nloc,$nloc
fmuld $nhi,$na,$nhia
add %o7,%o1,%o1
fmuld $ahi,$bb,$ahib
srlx %o1,16,%o7
faddd $alod,$nlod,$nlod
fmuld $nhi,$nb,$nhib
add %o7,%o2,%o2
fmuld $ahi,$bc,$ahic
srlx %o2,16,%o7
faddd $ahia,$nhia,$nhia
fmuld $nhi,$nc,$nhic
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
and %o0,$mask,%o0
fmuld $ahi,$bd,$ahid
and %o1,$mask,%o1
and %o2,$mask,%o2
faddd $ahib,$nhib,$nhib
fmuld $nhi,$nd,$nhid
sllx %o1,16,%o1
faddd $dota,$nloa,$nloa
sllx %o2,32,%o2
faddd $dotb,$nlob,$nlob
sllx %o3,48,%o7
or %o1,%o0,%o0
faddd $ahic,$nhic,$dota ! $nhic
or %o2,%o0,%o0
faddd $ahid,$nhid,$dotb ! $nhid
or %o7,%o0,%o0 ! 64-bit result
faddd $nloc,$nhia,$nloc
addcc %g1,%o0,%o0
ldx [$tp+8],%o7 ! tp[j]
faddd $nlod,$nhib,$nlod
srlx %o3,16,%g1 ! 34-bit carry
fdtox $nloa,$nloa
bcs,a %xcc,.+8
add %g1,1,%g1
fdtox $nlob,$nlob
addcc %o7,%o0,%o0
fdtox $nloc,$nloc
bcs,a %xcc,.+8
add %g1,1,%g1
stx %o0,[$tp] ! tp[j-1]
fdtox $nlod,$nlod
std $nloa,[%sp+$bias+$frame+0]
std $nlob,[%sp+$bias+$frame+8]
std $nloc,[%sp+$bias+$frame+16]
addcc $j,8,$j
std $nlod,[%sp+$bias+$frame+24]
bnz,pt %icc,.Linner
add $tp,8,$tp
.Linnerskip:
fdtox $dota,$dota
fdtox $dotb,$dotb
ldx [%sp+$bias+$frame+0],%o0
ldx [%sp+$bias+$frame+8],%o1
ldx [%sp+$bias+$frame+16],%o2
ldx [%sp+$bias+$frame+24],%o3
srlx %o0,16,%o7
std $dota,[%sp+$bias+$frame+32]
add %o7,%o1,%o1
std $dotb,[%sp+$bias+$frame+40]
srlx %o1,16,%o7
add %o7,%o2,%o2
srlx %o2,16,%o7
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
and %o0,$mask,%o0
and %o1,$mask,%o1
and %o2,$mask,%o2
sllx %o1,16,%o1
sllx %o2,32,%o2
sllx %o3,48,%o7
or %o1,%o0,%o0
or %o2,%o0,%o0
ldx [%sp+$bias+$frame+32],%o4
or %o7,%o0,%o0 ! 64-bit result
ldx [%sp+$bias+$frame+40],%o5
addcc %g1,%o0,%o0
ldx [$tp+8],%o7 ! tp[j]
srlx %o3,16,%g1 ! 34-bit carry
bcs,a %xcc,.+8
add %g1,1,%g1
addcc %o7,%o0,%o0
bcs,a %xcc,.+8
add %g1,1,%g1
stx %o0,[$tp] ! tp[j-1]
add $tp,8,$tp
srlx %o4,16,%o7
add %o7,%o5,%o5
and %o4,$mask,%o4
sllx %o5,16,%o7
or %o7,%o4,%o4
addcc %g1,%o4,%o4
srlx %o5,48,%g1
bcs,a %xcc,.+8
add %g1,1,%g1
addcc $carry,%o4,%o4
stx %o4,[$tp] ! tp[num-1]
mov %g1,$carry
bcs,a %xcc,.+8
add $carry,1,$carry
addcc $i,8,$i
bnz %icc,.Louter
nop
add $tp,8,$tp ! adjust tp to point at the end
orn %g0,%g0,%g4
sub %g0,$num,%o7 ! n=-num
ba .Lsub
subcc %g0,%g0,%g0 ! clear %icc.c
.align 32
.Lsub:
ldx [$tp+%o7],%o0
add $np,%o7,%g1
ld [%g1+0],%o2
ld [%g1+4],%o3
srlx %o0,32,%o1
subccc %o0,%o2,%o2
add $rp,%o7,%g1
subccc %o1,%o3,%o3
st %o2,[%g1+0]
add %o7,8,%o7
brnz,pt %o7,.Lsub
st %o3,[%g1+4]
subc $carry,0,%g4
sub %g0,$num,%o7 ! n=-num
ba .Lcopy
nop
.align 32
.Lcopy:
ldx [$tp+%o7],%o0
add $rp,%o7,%g1
ld [%g1+0],%o2
ld [%g1+4],%o3
stx %g0,[$tp+%o7]
and %o0,%g4,%o0
srlx %o0,32,%o1
andn %o2,%g4,%o2
andn %o3,%g4,%o3
or %o2,%o0,%o0
or %o3,%o1,%o1
st %o0,[%g1+0]
add %o7,8,%o7
brnz,pt %o7,.Lcopy
st %o1,[%g1+4]
sub %g0,$num,%o7 ! n=-num
.Lzap:
stx %g0,[$ap_l+%o7]
stx %g0,[$ap_h+%o7]
stx %g0,[$np_l+%o7]
stx %g0,[$np_h+%o7]
add %o7,8,%o7
brnz,pt %o7,.Lzap
nop
ldx [%sp+$bias+$frame+48],%o7
wr %g0,%o7,%asi ! restore %asi
mov 1,%i0
.Lret:
ret
restore
.type $fname,#function
.size $fname,(.-$fname)
.asciz	"Montgomery Multiplication for UltraSPARC, CRYPTOGAMS by <appro\@openssl.org>"
.align 32
___
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
# The substitution below makes it possible to compile without demanding
# VIS extensions on the command line, e.g. -xarch=v9 vs. -xarch=v9a. I
# dare to do this because VIS capability is detected at run-time now
# and this routine is not called on CPUs not capable of executing it. Do
# note that fzeros is not the only VIS dependency! Another dependency
# is implicit and is just _a_ numerical value loaded into the %asi register,
# which the assembler can't recognize as VIS-specific...
$code =~ s/fzeros\s+%f([0-9]+)/
sprintf(".word\t0x%x\t! fzeros %%f%d",0x81b00c20|($1<<25),$1)
/gem;
print $code;
# flush
close STDOUT;

242
crypto/bn/asm/via-mont.pl Normal file
View File

@@ -0,0 +1,242 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Wrapper around 'rep montmul', a VIA-specific instruction accessing the
# PadLock Montgomery Multiplier. The wrapper is designed as a drop-in
# replacement for OpenSSL bn_mul_mont [first implemented in 0.9.9].
#
# Below are interleaved outputs from 'openssl speed rsa dsa' for 4
# different software configurations on a 1.5GHz VIA Esther processor.
# Lines marked with "software integer" denote the performance of the hand-
# coded integer-only assembler found in OpenSSL 0.9.7. "Software SSE2"
# refers to the hand-coded SSE2 Montgomery multiplication procedure found
# in OpenSSL 0.9.9. "Hardware VIA SDK" refers to the padlock_pmm routine
# from Padlock SDK 2.0.1, available for download from VIA, which naturally
# utilizes the magic 'repz montmul' instruction. And finally "hardware
# this" refers to *this* implementation, which also uses 'repz montmul'.
#
# sign verify sign/s verify/s
# rsa 512 bits 0.001720s 0.000140s 581.4 7149.7 software integer
# rsa 512 bits 0.000690s 0.000086s 1450.3 11606.0 software SSE2
# rsa 512 bits 0.006136s 0.000201s 163.0 4974.5 hardware VIA SDK
# rsa 512 bits 0.000712s 0.000050s 1404.9 19858.5 hardware this
#
# rsa 1024 bits 0.008518s 0.000413s 117.4 2420.8 software integer
# rsa 1024 bits 0.004275s 0.000277s 233.9 3609.7 software SSE2
# rsa 1024 bits 0.012136s 0.000260s 82.4 3844.5 hardware VIA SDK
# rsa 1024 bits 0.002522s 0.000116s 396.5 8650.9 hardware this
#
# rsa 2048 bits 0.050101s 0.001371s 20.0 729.6 software integer
# rsa 2048 bits 0.030273s 0.001008s 33.0 991.9 software SSE2
# rsa 2048 bits 0.030833s 0.000976s 32.4 1025.1 hardware VIA SDK
# rsa 2048 bits 0.011879s 0.000342s 84.2 2921.7 hardware this
#
# rsa 4096 bits 0.327097s 0.004859s 3.1 205.8 software integer
# rsa 4096 bits 0.229318s 0.003859s 4.4 259.2 software SSE2
# rsa 4096 bits 0.233953s 0.003274s 4.3 305.4 hardware VIA SDK
# rsa 4096 bits 0.070493s 0.001166s 14.2 857.6 hardware this
#
# dsa 512 bits 0.001342s 0.001651s 745.2 605.7 software integer
# dsa 512 bits 0.000844s 0.000987s 1185.3 1013.1 software SSE2
# dsa 512 bits 0.001902s 0.002247s 525.6 444.9 hardware VIA SDK
# dsa 512 bits 0.000458s 0.000524s 2182.2 1909.1 hardware this
#
# dsa 1024 bits 0.003964s 0.004926s 252.3 203.0 software integer
# dsa 1024 bits 0.002686s 0.003166s 372.3 315.8 software SSE2
# dsa 1024 bits 0.002397s 0.002823s 417.1 354.3 hardware VIA SDK
# dsa 1024 bits 0.000978s 0.001170s 1022.2 855.0 hardware this
#
# dsa 2048 bits 0.013280s 0.016518s 75.3 60.5 software integer
# dsa 2048 bits 0.009911s 0.011522s 100.9 86.8 software SSE2
# dsa 2048 bits 0.009542s 0.011763s 104.8 85.0 hardware VIA SDK
# dsa 2048 bits 0.002884s 0.003352s 346.8 298.3 hardware this
#
# To give you some other reference point here is output for 2.4GHz P4
# running hand-coded SSE2 bn_mul_mont found in 0.9.9, i.e. "software
# SSE2" in above terms.
#
# rsa 512 bits 0.000407s 0.000047s 2454.2 21137.0
# rsa 1024 bits 0.002426s 0.000141s 412.1 7100.0
# rsa 2048 bits 0.015046s 0.000491s 66.5 2034.9
# rsa 4096 bits 0.109770s 0.002379s 9.1 420.3
# dsa 512 bits 0.000438s 0.000525s 2281.1 1904.1
# dsa 1024 bits 0.001346s 0.001595s 742.7 627.0
# dsa 2048 bits 0.004745s 0.005582s 210.7 179.1
#
# Conclusions:
# - VIA SDK leaves a *lot* of room for improvement (which this
# implementation successfully fills:-);
# - 'rep montmul' gives up to >3x performance improvement depending on
# key length;
# - in terms of absolute performance it delivers approximately as much
# as modern out-of-order 32-bit cores [again, for longer keys].
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
&asm_init($ARGV[0],"via-mont.pl");
# int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num);
$func="bn_mul_mont_padlock";
$pad=16*1; # amount of reserved bytes on top of every vector
# stack layout
$mZeroPrime=&DWP(0,"esp"); # these are specified by VIA
$A=&DWP(4,"esp");
$B=&DWP(8,"esp");
$T=&DWP(12,"esp");
$M=&DWP(16,"esp");
$scratch=&DWP(20,"esp");
$rp=&DWP(24,"esp"); # these are mine
$sp=&DWP(28,"esp");
# &DWP(32,"esp") # 32 byte scratch area
# &DWP(64+(4*$num+$pad)*0,"esp") # padded tp[num]
# &DWP(64+(4*$num+$pad)*1,"esp") # padded copy of ap[num]
# &DWP(64+(4*$num+$pad)*2,"esp") # padded copy of bp[num]
# &DWP(64+(4*$num+$pad)*3,"esp") # padded copy of np[num]
# Note that the SDK suggests unconditionally allocating 2K per vector. This
# has quite an impact on performance. It naturally depends on key length,
# but to give an example, 1024-bit private RSA key operations suffer a >30%
# penalty. I allocate only as much as is actually required...
&function_begin($func);
&xor ("eax","eax");
&mov ("ecx",&wparam(5)); # num
# meet VIA's limitations for num [note that the specification
# expresses them in bits, while we work with the number of 32-bit words]
&test ("ecx",3);
&jnz (&label("leave")); # num % 4 != 0
&cmp ("ecx",8);
&jb (&label("leave")); # num < 8
&cmp ("ecx",1024);
&ja (&label("leave")); # num > 1024
&pushf ();
&cld ();
&mov ("edi",&wparam(0)); # rp
&mov ("eax",&wparam(1)); # ap
&mov ("ebx",&wparam(2)); # bp
&mov ("edx",&wparam(3)); # np
&mov ("esi",&wparam(4)); # n0
&mov ("esi",&DWP(0,"esi")); # *n0
&lea ("ecx",&DWP($pad,"","ecx",4)); # ecx becomes vector size in bytes
&lea ("ebp",&DWP(64,"","ecx",4)); # allocate 4 vectors + 64 bytes
&neg ("ebp");
&add ("ebp","esp");
&and ("ebp",-64); # align to cache-line
&xchg ("ebp","esp"); # alloca
&mov ($rp,"edi"); # save rp
&mov ($sp,"ebp"); # save esp
&mov ($mZeroPrime,"esi");
&lea ("esi",&DWP(64,"esp")); # tp
&mov ($T,"esi");
&lea ("edi",&DWP(32,"esp")); # scratch area
&mov ($scratch,"edi");
&mov ("esi","eax");
&lea ("ebp",&DWP(-$pad,"ecx"));
&shr ("ebp",2); # restore original num value in ebp
&xor ("eax","eax");
&mov ("ecx","ebp");
&lea ("ecx",&DWP((32+$pad)/4,"ecx"));# padded tp + scratch
&data_byte(0xf3,0xab); # rep stosl, bzero
&mov ("ecx","ebp");
&lea ("edi",&DWP(64+$pad,"esp","ecx",4));# pointer to ap copy
&mov ($A,"edi");
&data_byte(0xf3,0xa5); # rep movsl, memcpy
&mov ("ecx",$pad/4);
&data_byte(0xf3,0xab); # rep stosl, bzero pad
# edi points at the end of padded ap copy...
&mov ("ecx","ebp");
&mov ("esi","ebx");
&mov ($B,"edi");
&data_byte(0xf3,0xa5); # rep movsl, memcpy
&mov ("ecx",$pad/4);
&data_byte(0xf3,0xab); # rep stosl, bzero pad
# edi points at the end of padded bp copy...
&mov ("ecx","ebp");
&mov ("esi","edx");
&mov ($M,"edi");
&data_byte(0xf3,0xa5); # rep movsl, memcpy
&mov ("ecx",$pad/4);
&data_byte(0xf3,0xab); # rep stosl, bzero pad
# edi points at the end of padded np copy...
# let magic happen...
&mov ("ecx","ebp");
&mov ("esi","esp");
&shl ("ecx",5); # convert word counter to bit counter
&align (4);
&data_byte(0xf3,0x0f,0xa6,0xc0);# rep montmul
&mov ("ecx","ebp");
&lea ("esi",&DWP(64,"esp")); # tp
# edi still points at the end of padded np copy...
&neg ("ebp");
&lea ("ebp",&DWP(-$pad,"edi","ebp",4)); # so just "rewind"
&mov ("edi",$rp); # restore rp
&xor ("edx","edx"); # i=0 and clear CF
&set_label("sub",8);
&mov ("eax",&DWP(0,"esi","edx",4));
&sbb ("eax",&DWP(0,"ebp","edx",4));
&mov (&DWP(0,"edi","edx",4),"eax"); # rp[i]=tp[i]-np[i]
&lea ("edx",&DWP(1,"edx")); # i++
&loop (&label("sub")); # doesn't affect CF!
&mov ("eax",&DWP(0,"esi","edx",4)); # upmost overflow bit
&sbb ("eax",0);
&and ("esi","eax");
&not ("eax");
&mov ("ebp","edi");
&and ("ebp","eax");
&or ("esi","ebp"); # tp=carry?tp:rp
&mov ("ecx","edx"); # num
&xor ("edx","edx"); # i=0
&set_label("copy",8);
&mov ("eax",&DWP(0,"esi","edx",4));
&mov (&DWP(64,"esp","edx",4),"ecx"); # zap tp
&mov (&DWP(0,"edi","edx",4),"eax");
&lea ("edx",&DWP(1,"edx")); # i++
&loop (&label("copy"));
&mov ("ebp",$sp);
&xor ("eax","eax");
&mov ("ecx",64/4);
&mov ("edi","esp"); # zap frame including scratch area
&data_byte(0xf3,0xab); # rep stosl, bzero
# zap copies of ap, bp and np
&lea ("edi",&DWP(64+$pad,"esp","edx",4));# pointer to ap
&lea ("ecx",&DWP(3*$pad/4,"edx","edx",2));
&data_byte(0xf3,0xab); # rep stosl, bzero
&mov ("esp","ebp");
&inc ("eax"); # signal "done"
&popf ();
&set_label("leave");
&function_end($func);
&asciz("Padlock Montgomery Multiplication, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();

373
crypto/bn/asm/vis3-mont.pl Normal file
View File

@@ -0,0 +1,373 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# October 2012.
#
# SPARCv9 VIS3 Montgomery multiplication procedure suitable for T3 and
# onward. There are three new instructions used here: umulxhi,
# addxc[cc] and initializing store. On T3 RSA private key operations
# are 1.54/1.87/2.11/2.26 times faster for 512/1024/2048/4096-bit key
# lengths. This is without dedicated squaring procedure. On T4
# corresponding coefficients are 1.47/2.10/2.80/2.90x, which is mostly
# for reference purposes, because T4 has dedicated Montgomery
# multiplication and squaring *instructions* that deliver even more.
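# Editor's note (illustrative sketch, not part of the module): 'mulx'
# yields the low and 'umulxhi' the high 64 bits of a 64x64-bit product,
# while 'addxc[cc]' adds with the carry bit held in %xcc. A hypothetical
# big-integer model of the multiply primitives (names are editorial):
#
#	use Math::BigInt;
#	sub mulx_ref    { Math::BigInt->new($_[0])->bmul($_[1])->bmod(Math::BigInt->new(1)->blsft(64)) }
#	sub umulxhi_ref { Math::BigInt->new($_[0])->bmul($_[1])->brsft(64) }
#	# operands passed as decimal strings so that full 64-bit precision survives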
$bits=32;
for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64) { $bias=2047; $frame=192; }
else { $bias=0; $frame=112; }
$code.=<<___ if ($bits==64);
.register %g2,#scratch
.register %g3,#scratch
___
$code.=<<___;
.section ".text",#alloc,#execinstr
___
($n0,$m0,$m1,$lo0,$hi0, $lo1,$hi1,$aj,$alo,$nj,$nlo,$tj)=
(map("%g$_",(1..5)),map("%o$_",(0..5,7)));
# int bn_mul_mont(
$rp="%o0"; # BN_ULONG *rp,
$ap="%o1"; # const BN_ULONG *ap,
$bp="%o2"; # const BN_ULONG *bp,
$np="%o3"; # const BN_ULONG *np,
$n0p="%o4"; # const BN_ULONG *n0,
$num="%o5"; # int num); # caller ensures that num is even
# and >=6
$code.=<<___;
.globl bn_mul_mont_vis3
.align 32
bn_mul_mont_vis3:
add %sp, $bias, %g4 ! real top of stack
sll $num, 2, $num ! size in bytes
add $num, 63, %g5
andn %g5, 63, %g5 ! buffer size rounded up to 64 bytes
add %g5, %g5, %g1
add %g5, %g1, %g1 ! 3*buffer size
sub %g4, %g1, %g1
andn %g1, 63, %g1 ! align at 64 byte
sub %g1, $frame, %g1 ! new top of stack
sub %g1, %g4, %g1
save %sp, %g1, %sp
___
# +-------------------------------+<----- %sp
# . .
# +-------------------------------+<----- aligned at 64 bytes
# | __int64 tmp[0] |
# +-------------------------------+
# . .
# . .
# +-------------------------------+<----- aligned at 64 bytes
# | __int64 ap[1..0] | converted ap[]
# +-------------------------------+
# | __int64 np[1..0] | converted np[]
# +-------------------------------+
# | __int64 ap[3..2] |
# . .
# . .
# +-------------------------------+
($rp,$ap,$bp,$np,$n0p,$num)=map("%i$_",(0..5));
($t0,$t1,$t2,$t3,$cnt,$tp,$bufsz,$anp)=map("%l$_",(0..7));
($ovf,$i)=($t0,$t1);
$code.=<<___;
ld [$n0p+0], $t0 ! pull n0[0..1] value
add %sp, $bias+$frame, $tp
ld [$n0p+4], $t1
add $tp, %g5, $anp
ld [$bp+0], $t2 ! m0=bp[0]
sllx $t1, 32, $n0
ld [$bp+4], $t3
or $t0, $n0, $n0
add $bp, 8, $bp
ld [$ap+0], $t0 ! ap[0]
sllx $t3, 32, $m0
ld [$ap+4], $t1
or $t2, $m0, $m0
ld [$ap+8], $t2 ! ap[1]
sllx $t1, 32, $aj
ld [$ap+12], $t3
or $t0, $aj, $aj
add $ap, 16, $ap
stx $aj, [$anp] ! converted ap[0]
mulx $aj, $m0, $lo0 ! ap[0]*bp[0]
umulxhi $aj, $m0, $hi0
ld [$np+0], $t0 ! np[0]
sllx $t3, 32, $aj
ld [$np+4], $t1
or $t2, $aj, $aj
ld [$np+8], $t2 ! np[1]
sllx $t1, 32, $nj
ld [$np+12], $t3
or $t0, $nj, $nj
add $np, 16, $np
stx $nj, [$anp+8] ! converted np[0]
mulx $lo0, $n0, $m1 ! "tp[0]"*n0
stx $aj, [$anp+16] ! converted ap[1]
mulx $aj, $m0, $alo ! ap[1]*bp[0]
umulxhi $aj, $m0, $aj ! ahi=aj
mulx $nj, $m1, $lo1 ! np[0]*m1
umulxhi $nj, $m1, $hi1
sllx $t3, 32, $nj
or $t2, $nj, $nj
stx $nj, [$anp+24] ! converted np[1]
add $anp, 32, $anp
addcc $lo0, $lo1, $lo1
addxc %g0, $hi1, $hi1
mulx $nj, $m1, $nlo ! np[1]*m1
umulxhi $nj, $m1, $nj ! nhi=nj
ba .L1st
sub $num, 24, $cnt ! cnt=num-3
.align 16
.L1st:
ld [$ap+0], $t0 ! ap[j]
addcc $alo, $hi0, $lo0
ld [$ap+4], $t1
addxc $aj, %g0, $hi0
sllx $t1, 32, $aj
add $ap, 8, $ap
or $t0, $aj, $aj
stx $aj, [$anp] ! converted ap[j]
ld [$np+0], $t2 ! np[j]
addcc $nlo, $hi1, $lo1
ld [$np+4], $t3
addxc $nj, %g0, $hi1 ! nhi=nj
sllx $t3, 32, $nj
add $np, 8, $np
mulx $aj, $m0, $alo ! ap[j]*bp[0]
or $t2, $nj, $nj
umulxhi $aj, $m0, $aj ! ahi=aj
stx $nj, [$anp+8] ! converted np[j]
add $anp, 16, $anp ! anp++
mulx $nj, $m1, $nlo ! np[j]*m1
addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
umulxhi $nj, $m1, $nj ! nhi=nj
addxc %g0, $hi1, $hi1
stx $lo1, [$tp] ! tp[j-1]
add $tp, 8, $tp ! tp++
brnz,pt $cnt, .L1st
sub $cnt, 8, $cnt ! j--
!.L1st
addcc $alo, $hi0, $lo0
addxc $aj, %g0, $hi0 ! ahi=aj
addcc $nlo, $hi1, $lo1
addxc $nj, %g0, $hi1
addcc $lo0, $lo1, $lo1 ! np[j]*m1+ap[j]*bp[0]
addxc %g0, $hi1, $hi1
stx $lo1, [$tp] ! tp[j-1]
add $tp, 8, $tp
addcc $hi0, $hi1, $hi1
addxc %g0, %g0, $ovf ! upmost overflow bit
stx $hi1, [$tp]
add $tp, 8, $tp
ba .Louter
sub $num, 16, $i ! i=num-2
.align 16
.Louter:
ld [$bp+0], $t2 ! m0=bp[i]
ld [$bp+4], $t3
sub $anp, $num, $anp ! rewind
sub $tp, $num, $tp
sub $anp, $num, $anp
add $bp, 8, $bp
sllx $t3, 32, $m0
ldx [$anp+0], $aj ! ap[0]
or $t2, $m0, $m0
ldx [$anp+8], $nj ! np[0]
mulx $aj, $m0, $lo0 ! ap[0]*bp[i]
ldx [$tp], $tj ! tp[0]
umulxhi $aj, $m0, $hi0
ldx [$anp+16], $aj ! ap[1]
addcc $lo0, $tj, $lo0 ! ap[0]*bp[i]+tp[0]
mulx $aj, $m0, $alo ! ap[1]*bp[i]
addxc %g0, $hi0, $hi0
mulx $lo0, $n0, $m1 ! tp[0]*n0
umulxhi $aj, $m0, $aj ! ahi=aj
mulx $nj, $m1, $lo1 ! np[0]*m1
umulxhi $nj, $m1, $hi1
ldx [$anp+24], $nj ! np[1]
add $anp, 32, $anp
addcc $lo1, $lo0, $lo1
mulx $nj, $m1, $nlo ! np[1]*m1
addxc %g0, $hi1, $hi1
umulxhi $nj, $m1, $nj ! nhi=nj
ba .Linner
sub $num, 24, $cnt ! cnt=num-3
.align 16
.Linner:
addcc $alo, $hi0, $lo0
ldx [$tp+8], $tj ! tp[j]
addxc $aj, %g0, $hi0 ! ahi=aj
ldx [$anp+0], $aj ! ap[j]
addcc $nlo, $hi1, $lo1
mulx $aj, $m0, $alo ! ap[j]*bp[i]
addxc $nj, %g0, $hi1 ! nhi=nj
ldx [$anp+8], $nj ! np[j]
add $anp, 16, $anp
umulxhi $aj, $m0, $aj ! ahi=aj
addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
mulx $nj, $m1, $nlo ! np[j]*m1
addxc %g0, $hi0, $hi0
umulxhi $nj, $m1, $nj ! nhi=nj
addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
addxc %g0, $hi1, $hi1
stx $lo1, [$tp] ! tp[j-1]
add $tp, 8, $tp
brnz,pt $cnt, .Linner
sub $cnt, 8, $cnt
!.Linner
ldx [$tp+8], $tj ! tp[j]
addcc $alo, $hi0, $lo0
addxc $aj, %g0, $hi0 ! ahi=aj
addcc $lo0, $tj, $lo0 ! ap[j]*bp[i]+tp[j]
addxc %g0, $hi0, $hi0
addcc $nlo, $hi1, $lo1
addxc $nj, %g0, $hi1 ! nhi=nj
addcc $lo1, $lo0, $lo1 ! np[j]*m1+ap[j]*bp[i]+tp[j]
addxc %g0, $hi1, $hi1
stx $lo1, [$tp] ! tp[j-1]
subcc %g0, $ovf, %g0 ! move upmost overflow to CCR.xcc
addxccc $hi1, $hi0, $hi1
addxc %g0, %g0, $ovf
stx $hi1, [$tp+8]
add $tp, 16, $tp
brnz,pt $i, .Louter
sub $i, 8, $i
sub $anp, $num, $anp ! rewind
sub $tp, $num, $tp
sub $anp, $num, $anp
ba .Lsub
subcc $num, 8, $cnt ! cnt=num-1 and clear CCR.xcc
.align 16
.Lsub:
ldx [$tp], $tj
add $tp, 8, $tp
ldx [$anp+8], $nj
add $anp, 16, $anp
subccc $tj, $nj, $t2 ! tp[j]-np[j]
srlx $tj, 32, $tj
srlx $nj, 32, $nj
subccc $tj, $nj, $t3
add $rp, 8, $rp
st $t2, [$rp-4] ! reverse order
st $t3, [$rp-8]
brnz,pt $cnt, .Lsub
sub $cnt, 8, $cnt
sub $anp, $num, $anp ! rewind
sub $tp, $num, $tp
sub $anp, $num, $anp
sub $rp, $num, $rp
subc $ovf, %g0, $ovf ! handle upmost overflow bit
and $tp, $ovf, $ap
andn $rp, $ovf, $np
or $np, $ap, $ap ! ap=borrow?tp:rp
ba .Lcopy
sub $num, 8, $cnt
.align 16
.Lcopy: ! copy or in-place refresh
ld [$ap+0], $t2
ld [$ap+4], $t3
add $ap, 8, $ap
stx %g0, [$tp] ! zap
add $tp, 8, $tp
stx %g0, [$anp] ! zap
stx %g0, [$anp+8]
add $anp, 16, $anp
st $t3, [$rp+0] ! flip order
st $t2, [$rp+4]
add $rp, 8, $rp
brnz $cnt, .Lcopy
sub $cnt, 8, $cnt
mov 1, %o0
ret
restore
.type bn_mul_mont_vis3, #function
.size bn_mul_mont_vis3, .-bn_mul_mont_vis3
.asciz "Montgomery Multiplication for SPARCv9 VIS3, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___
# The purpose of these subroutines is to explicitly encode VIS instructions,
# so that one can compile the module without having to specify VIS
# extensions on the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# The idea is to reserve the option to produce a "universal" binary and let
# the programmer detect whether the current CPU is VIS-capable at run-time.
sub unvis3 {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my ($ref,$opf);
my %visopf = ( "addxc" => 0x011,
"addxccc" => 0x013,
"umulxhi" => 0x016 );
$ref = "$mnemonic\t$rs1,$rs2,$rd";
if ($opf=$visopf{$mnemonic}) {
foreach ($rs1,$rs2,$rd) {
return $ref if (!/%([goli])([0-9])/);
$_=$bias{$1}+$2;
}
return sprintf ".word\t0x%08x !%s",
0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
$ref;
} else {
return $ref;
}
}
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/ge;
s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
&unvis3($1,$2,$3,$4)
/ge;
print $_,"\n";
}
close STDOUT;

6440
crypto/bn/asm/vms.mar Normal file

File diff suppressed because it is too large

313
crypto/bn/asm/x86-gf2m.pl Normal file
View File

@@ -0,0 +1,313 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# May 2011
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
# in bn_gf2m.c. It's kind of a low-hanging mechanical port from C for
# the time being... Except that it has three code paths: pure integer
# code suitable for any x86 CPU, MMX code suitable for PIII and later,
# and PCLMULQDQ code suitable for Westmere and later. Improvement varies
# from one benchmark and µ-arch to another. Below are interval values
# for 163- and 571-bit ECDH benchmarks relative to compiler-generated
# code:
#
# PIII 16%-30%
# P4 12%-12%
# Opteron 18%-40%
# Core2 19%-44%
# Atom 38%-64%
# Westmere 53%-121%(PCLMULQDQ)/20%-32%(MMX)
# Sandy Bridge 72%-127%(PCLMULQDQ)/27%-23%(MMX)
#
# Note that the above improvement coefficients are not coefficients for
# bn_GF2m_mul_2x2 itself. For example, the 120% ECDH improvement is the
# result of bn_GF2m_mul_2x2 being >4x faster. As it gets faster, the
# benchmark is more and more dominated by other subroutines, most notably
# by BN_GF2m_mod[_mul]_arr...
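# Editor's sketch (hypothetical reference model, not used by the build):
# what bn_GF2m_mul_2x2 computes. The integer and MMX paths below form the
# 128-bit carry-less product of (a1:a0) and (b1:b0) from three 1x1
# (32x32->64-bit) carry-less multiplications, Karatsuba-style, as modelled
# here; the PCLMULQDQ path computes it directly. The _ref names are
# editorial; a 64-bit perl is assumed.
sub _clmul_1x1_ref {
my ($a,$b)=@_; my $r=0;			# carry-less 32x32->64-bit multiply
for my $i (0..31) { $r ^= $a<<$i if (($b>>$i)&1); }
return $r;
}
sub _gf2m_mul_2x2_ref {
my ($a1,$a0,$b1,$b0)=@_;		# returns (r0,r1,r2,r3), least significant word first
my $hi  = _clmul_1x1_ref($a1,$b1);			# a1·b1
my $lo  = _clmul_1x1_ref($a0,$b0);			# a0·b0
my $mid = _clmul_1x1_ref($a1^$a0,$b1^$b0)^$hi^$lo;	# (a0+a1)·(b0+b1)-a1·b1-a0·b0
return ($lo&0xffffffff, (($lo>>32)^$mid)&0xffffffff,
	($hi^($mid>>32))&0xffffffff, ($hi>>32)&0xffffffff);
}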
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
&asm_init($ARGV[0],$0,$x86only = $ARGV[$#ARGV] eq "386");
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
&external_label("OPENSSL_ia32cap_P") if ($sse2);
$a="eax";
$b="ebx";
($a1,$a2,$a4)=("ecx","edx","ebp");
$R="mm0";
@T=("mm1","mm2");
($A,$B,$B30,$B31)=("mm2","mm3","mm4","mm5");
@i=("esi","edi");
if (!$x86only) {
&function_begin_B("_mul_1x1_mmx");
&sub ("esp",32+4);
&mov ($a1,$a);
&lea ($a2,&DWP(0,$a,$a));
&and ($a1,0x3fffffff);
&lea ($a4,&DWP(0,$a2,$a2));
&mov (&DWP(0*4,"esp"),0);
&and ($a2,0x7fffffff);
&movd ($A,$a);
&movd ($B,$b);
&mov (&DWP(1*4,"esp"),$a1); # a1
&xor ($a1,$a2); # a1^a2
&pxor ($B31,$B31);
&pxor ($B30,$B30);
&mov (&DWP(2*4,"esp"),$a2); # a2
&xor ($a2,$a4); # a2^a4
&mov (&DWP(3*4,"esp"),$a1); # a1^a2
&pcmpgtd($B31,$A); # broadcast 31st bit
&paddd ($A,$A); # $A<<=1
&xor ($a1,$a2); # a1^a4=a1^a2^a2^a4
&mov (&DWP(4*4,"esp"),$a4); # a4
&xor ($a4,$a2); # a2=a4^a2^a4
&pand ($B31,$B);
&pcmpgtd($B30,$A); # broadcast 30th bit
&mov (&DWP(5*4,"esp"),$a1); # a1^a4
&xor ($a4,$a1); # a1^a2^a4
&psllq ($B31,31);
&pand ($B30,$B);
&mov (&DWP(6*4,"esp"),$a2); # a2^a4
&mov (@i[0],0x7);
&mov (&DWP(7*4,"esp"),$a4); # a1^a2^a4
&mov ($a4,@i[0]);
&and (@i[0],$b);
&shr ($b,3);
&mov (@i[1],$a4);
&psllq ($B30,30);
&and (@i[1],$b);
&shr ($b,3);
&movd ($R,&DWP(0,"esp",@i[0],4));
&mov (@i[0],$a4);
&and (@i[0],$b);
&shr ($b,3);
for($n=1;$n<9;$n++) {
&movd (@T[1],&DWP(0,"esp",@i[1],4));
&mov (@i[1],$a4);
&psllq (@T[1],3*$n);
&and (@i[1],$b);
&shr ($b,3);
&pxor ($R,@T[1]);
push(@i,shift(@i)); push(@T,shift(@T));
}
&movd (@T[1],&DWP(0,"esp",@i[1],4));
&pxor ($R,$B30);
&psllq (@T[1],3*$n++);
&pxor ($R,@T[1]);
&movd (@T[0],&DWP(0,"esp",@i[0],4));
&pxor ($R,$B31);
&psllq (@T[0],3*$n);
&add ("esp",32+4);
&pxor ($R,@T[0]);
&ret ();
&function_end_B("_mul_1x1_mmx");
}
($lo,$hi)=("eax","edx");
@T=("ecx","ebp");
&function_begin_B("_mul_1x1_ialu");
&sub ("esp",32+4);
&mov ($a1,$a);
&lea ($a2,&DWP(0,$a,$a));
&lea ($a4,&DWP(0,"",$a,4));
&and ($a1,0x3fffffff);
&lea (@i[1],&DWP(0,$lo,$lo));
&sar ($lo,31); # broadcast 31st bit
&mov (&DWP(0*4,"esp"),0);
&and ($a2,0x7fffffff);
&mov (&DWP(1*4,"esp"),$a1); # a1
&xor ($a1,$a2); # a1^a2
&mov (&DWP(2*4,"esp"),$a2); # a2
&xor ($a2,$a4); # a2^a4
&mov (&DWP(3*4,"esp"),$a1); # a1^a2
&xor ($a1,$a2); # a1^a4=a1^a2^a2^a4
&mov (&DWP(4*4,"esp"),$a4); # a4
&xor ($a4,$a2); # a2=a4^a2^a4
&mov (&DWP(5*4,"esp"),$a1); # a1^a4
&xor ($a4,$a1); # a1^a2^a4
	&sar	(@i[1],31);		# broadcast 30th bit
&and ($lo,$b);
&mov (&DWP(6*4,"esp"),$a2); # a2^a4
&and (@i[1],$b);
&mov (&DWP(7*4,"esp"),$a4); # a1^a2^a4
&mov ($hi,$lo);
&shl ($lo,31);
&mov (@T[0],@i[1]);
&shr ($hi,1);
&mov (@i[0],0x7);
&shl (@i[1],30);
&and (@i[0],$b);
&shr (@T[0],2);
&xor ($lo,@i[1]);
&shr ($b,3);
&mov (@i[1],0x7); # 5-byte instruction!?
&and (@i[1],$b);
&shr ($b,3);
&xor ($hi,@T[0]);
&xor ($lo,&DWP(0,"esp",@i[0],4));
&mov (@i[0],0x7);
&and (@i[0],$b);
&shr ($b,3);
for($n=1;$n<9;$n++) {
&mov (@T[1],&DWP(0,"esp",@i[1],4));
&mov (@i[1],0x7);
&mov (@T[0],@T[1]);
&shl (@T[1],3*$n);
&and (@i[1],$b);
&shr (@T[0],32-3*$n);
&xor ($lo,@T[1]);
&shr ($b,3);
&xor ($hi,@T[0]);
push(@i,shift(@i)); push(@T,shift(@T));
}
&mov (@T[1],&DWP(0,"esp",@i[1],4));
&mov (@T[0],@T[1]);
&shl (@T[1],3*$n);
&mov (@i[1],&DWP(0,"esp",@i[0],4));
&shr (@T[0],32-3*$n); $n++;
&mov (@i[0],@i[1]);
&xor ($lo,@T[1]);
&shl (@i[1],3*$n);
&xor ($hi,@T[0]);
&shr (@i[0],32-3*$n);
&xor ($lo,@i[1]);
&xor ($hi,@i[0]);
&add ("esp",32+4);
&ret ();
&function_end_B("_mul_1x1_ialu");
# void bn_GF2m_mul_2x2(BN_ULONG *r, BN_ULONG a1, BN_ULONG a0, BN_ULONG b1, BN_ULONG b0);
&function_begin_B("bn_GF2m_mul_2x2");
if (!$x86only) {
&picmeup("edx","OPENSSL_ia32cap_P");
&mov ("eax",&DWP(0,"edx"));
&mov ("edx",&DWP(4,"edx"));
&test ("eax",1<<23); # check MMX bit
&jz (&label("ialu"));
if ($sse2) {
&test ("eax",1<<24); # check FXSR bit
&jz (&label("mmx"));
&test ("edx",1<<1); # check PCLMULQDQ bit
&jz (&label("mmx"));
&movups ("xmm0",&QWP(8,"esp"));
&shufps ("xmm0","xmm0",0b10110001);
&pclmulqdq ("xmm0","xmm0",1);
&mov ("eax",&DWP(4,"esp"));
&movups (&QWP(0,"eax"),"xmm0");
&ret ();
&set_label("mmx",16);
}
&push ("ebp");
&push ("ebx");
&push ("esi");
&push ("edi");
&mov ($a,&wparam(1));
&mov ($b,&wparam(3));
&call ("_mul_1x1_mmx"); # a1·b1
&movq ("mm7",$R);
&mov ($a,&wparam(2));
&mov ($b,&wparam(4));
&call ("_mul_1x1_mmx"); # a0·b0
&movq ("mm6",$R);
&mov ($a,&wparam(1));
&mov ($b,&wparam(3));
&xor ($a,&wparam(2));
&xor ($b,&wparam(4));
&call ("_mul_1x1_mmx"); # (a0+a1)·(b0+b1)
&pxor ($R,"mm7");
&mov ($a,&wparam(0));
&pxor ($R,"mm6"); # (a0+a1)·(b0+b1)-a1·b1-a0·b0
&movq ($A,$R);
&psllq ($R,32);
&pop ("edi");
&psrlq ($A,32);
&pop ("esi");
&pxor ($R,"mm6");
&pop ("ebx");
&pxor ($A,"mm7");
&movq (&QWP(0,$a),$R);
&pop ("ebp");
&movq (&QWP(8,$a),$A);
&emms ();
&ret ();
&set_label("ialu",16);
}
&push ("ebp");
&push ("ebx");
&push ("esi");
&push ("edi");
&stack_push(4+1);
&mov ($a,&wparam(1));
&mov ($b,&wparam(3));
&call ("_mul_1x1_ialu"); # a1·b1
&mov (&DWP(8,"esp"),$lo);
&mov (&DWP(12,"esp"),$hi);
&mov ($a,&wparam(2));
&mov ($b,&wparam(4));
&call ("_mul_1x1_ialu"); # a0·b0
&mov (&DWP(0,"esp"),$lo);
&mov (&DWP(4,"esp"),$hi);
&mov ($a,&wparam(1));
&mov ($b,&wparam(3));
&xor ($a,&wparam(2));
&xor ($b,&wparam(4));
&call ("_mul_1x1_ialu"); # (a0+a1)·(b0+b1)
&mov ("ebp",&wparam(0));
@r=("ebx","ecx","edi","esi");
&mov (@r[0],&DWP(0,"esp"));
&mov (@r[1],&DWP(4,"esp"));
&mov (@r[2],&DWP(8,"esp"));
&mov (@r[3],&DWP(12,"esp"));
&xor ($lo,$hi);
&xor ($hi,@r[1]);
&xor ($lo,@r[0]);
&mov (&DWP(0,"ebp"),@r[0]);
&xor ($hi,@r[2]);
&mov (&DWP(12,"ebp"),@r[3]);
&xor ($lo,@r[3]);
&stack_pop(4+1);
&xor ($hi,@r[3]);
&pop ("edi");
&xor ($lo,$hi);
&pop ("esi");
&mov (&DWP(8,"ebp"),$hi);
&pop ("ebx");
&mov (&DWP(4,"ebp"),$lo);
&pop ("ebp");
&ret ();
&function_end_B("bn_GF2m_mul_2x2");
&asciz ("GF(2^m) Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();

615
crypto/bn/asm/x86-mont.pl Normal file
View File

@@ -0,0 +1,615 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# October 2005
#
# This is a "teaser" code, as it can be improved in several ways...
# First of all non-SSE2 path should be implemented (yes, for now it
# performs Montgomery multiplication/convolution only on SSE2-capable
# CPUs such as P4, others fall down to original code). Then inner loop
# can be unrolled and modulo-scheduled to improve ILP and possibly
# moved to 128-bit XMM register bank (though it would require input
# rearrangement and/or increase bus bandwidth utilization). Dedicated
# squaring procedure should give further performance improvement...
# Yet, for being draft, the code improves rsa512 *sign* benchmark by
# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)
# December 2006
#
# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
# Integer-only code [being equipped with dedicated squaring procedure]
# gives ~40% on rsa512 sign benchmark...
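# Editor's sketch (hypothetical reference model, not part of the module):
# the value every code path below computes, namely the Montgomery product
# rp = ap*bp*R^-1 mod np with R = 2^(32*num), valid for ap,bp < np. The
# n0 argument of the real routine is the word-level -np^-1 mod 2^32; this
# big-number REDC folds it into a full -np^-1 mod R instead. Math::BigInt
# is used purely for illustration; the _ref name is editorial.
use Math::BigInt;
sub _mont_mul_ref {
my ($ap,$bp,$np,$num)=@_;		# Math::BigInt operands, $num in 32-bit words
my $R = Math::BigInt->new(1)->blsft(32*$num);
my $t = $ap->copy->bmul($bp);
my $m = $t->copy->bmul($np->copy->bmodinv($R)->bneg->bmod($R))->bmod($R);
$t->badd($m->bmul($np))->brsft(32*$num);	# (t + m*np)/R, exact by construction
$t->bsub($np) if $t->bcmp($np) >= 0;		# final conditional subtraction
return $t;				# == ap*bp*R^-1 mod np
}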
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
&asm_init($ARGV[0],$0);
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
&external_label("OPENSSL_ia32cap_P") if ($sse2);
&function_begin("bn_mul_mont");
$i="edx";
$j="ecx";
$ap="esi"; $tp="esi"; # overlapping variables!!!
$rp="edi"; $bp="edi"; # overlapping variables!!!
$np="ebp";
$num="ebx";
$_num=&DWP(4*0,"esp"); # stack top layout
$_rp=&DWP(4*1,"esp");
$_ap=&DWP(4*2,"esp");
$_bp=&DWP(4*3,"esp");
$_np=&DWP(4*4,"esp");
$_n0=&DWP(4*5,"esp"); $_n0q=&QWP(4*5,"esp");
$_sp=&DWP(4*6,"esp");
$_bpend=&DWP(4*7,"esp");
$frame=32; # size of above frame rounded up to 16n
&xor ("eax","eax");
&mov ("edi",&wparam(5)); # int num
&cmp ("edi",4);
&jl (&label("just_leave"));
&lea ("esi",&wparam(0)); # put aside pointer to argument block
&lea ("edx",&wparam(1)); # load ap
&add ("edi",2); # extra two words on top of tp
&neg ("edi");
&lea ("ebp",&DWP(-$frame,"esp","edi",4)); # future alloca($frame+4*(num+2))
&neg ("edi");
	# minimize cache contention by arranging a 2K window between the stack
	# pointer and the ap argument [np is also a position-sensitive vector,
	# but it's assumed to be near ap, as it's allocated at ~the same
	# time].
&mov ("eax","ebp");
&sub ("eax","edx");
&and ("eax",2047);
&sub ("ebp","eax"); # this aligns sp and ap modulo 2048
&xor ("edx","ebp");
&and ("edx",2048);
&xor ("edx",2048);
&sub ("ebp","edx"); # this splits them apart modulo 4096
&and ("ebp",-64); # align to cache line
	# Some OSes, *cough*-dows, insist on the stack being "wired" to
	# physical memory in a strictly sequential manner, i.e. if a stack
	# allocation spans two pages, then a reference to the farthest one can
	# be punished with SEGV. But page walking can do good even on
	# other OSes, because it guarantees that a villain thread hits
	# the guard page before it can do damage to an innocent one...
&mov ("eax","esp");
&sub ("eax","ebp");
&and ("eax",-4096);
&mov ("edx","esp"); # saved stack pointer!
&lea ("esp",&DWP(0,"ebp","eax"));
&mov ("eax",&DWP(0,"esp"));
&cmp ("esp","ebp");
&ja (&label("page_walk"));
&jmp (&label("page_walk_done"));
&set_label("page_walk",16);
&lea ("esp",&DWP(-4096,"esp"));
&mov ("eax",&DWP(0,"esp"));
&cmp ("esp","ebp");
&ja (&label("page_walk"));
&set_label("page_walk_done");
################################# load argument block...
&mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
&mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
&mov ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
&mov ("ebp",&DWP(3*4,"esi"));# const BN_ULONG *np
&mov ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
#&mov ("edi",&DWP(5*4,"esi"));# int num
&mov ("esi",&DWP(0,"esi")); # pull n0[0]
&mov ($_rp,"eax"); # ... save a copy of argument block
&mov ($_ap,"ebx");
&mov ($_bp,"ecx");
&mov ($_np,"ebp");
&mov ($_n0,"esi");
&lea ($num,&DWP(-3,"edi")); # num=num-1 to assist modulo-scheduling
#&mov ($_num,$num); # redundant as $num is not reused
&mov ($_sp,"edx"); # saved stack pointer!
if($sse2) {
$acc0="mm0"; # mmx register bank layout
$acc1="mm1";
$car0="mm2";
$car1="mm3";
$mul0="mm4";
$mul1="mm5";
$temp="mm6";
$mask="mm7";
&picmeup("eax","OPENSSL_ia32cap_P");
&bt (&DWP(0,"eax"),26);
&jnc (&label("non_sse2"));
&mov ("eax",-1);
&movd ($mask,"eax"); # mask 32 lower bits
&mov ($ap,$_ap); # load input pointers
&mov ($bp,$_bp);
&mov ($np,$_np);
&xor ($i,$i); # i=0
&xor ($j,$j); # j=0
&movd ($mul0,&DWP(0,$bp)); # bp[0]
&movd ($mul1,&DWP(0,$ap)); # ap[0]
&movd ($car1,&DWP(0,$np)); # np[0]
&pmuludq($mul1,$mul0); # ap[0]*bp[0]
&movq ($car0,$mul1);
&movq ($acc0,$mul1); # I wish movd worked for
&pand ($acc0,$mask); # inter-register transfers
&pmuludq($mul1,$_n0q); # *=n0
&pmuludq($car1,$mul1); # "t[0]"*np[0]*n0
&paddq ($car1,$acc0);
&movd ($acc1,&DWP(4,$np)); # np[1]
&movd ($acc0,&DWP(4,$ap)); # ap[1]
&psrlq ($car0,32);
&psrlq ($car1,32);
&inc ($j); # j++
&set_label("1st",16);
&pmuludq($acc0,$mul0); # ap[j]*bp[0]
&pmuludq($acc1,$mul1); # np[j]*m1
&paddq ($car0,$acc0); # +=c0
&paddq ($car1,$acc1); # +=c1
&movq ($acc0,$car0);
&pand ($acc0,$mask);
&movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1]
&paddq ($car1,$acc0); # +=ap[j]*bp[0];
&movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1]
&psrlq ($car0,32);
&movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[j-1]=
&psrlq ($car1,32);
&lea ($j,&DWP(1,$j));
&cmp ($j,$num);
&jl (&label("1st"));
&pmuludq($acc0,$mul0); # ap[num-1]*bp[0]
&pmuludq($acc1,$mul1); # np[num-1]*m1
&paddq ($car0,$acc0); # +=c0
&paddq ($car1,$acc1); # +=c1
&movq ($acc0,$car0);
&pand ($acc0,$mask);
&paddq ($car1,$acc0); # +=ap[num-1]*bp[0];
&movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]=
&psrlq ($car0,32);
&psrlq ($car1,32);
&paddq ($car1,$car0);
&movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
&inc ($i); # i++
&set_label("outer");
&xor ($j,$j); # j=0
&movd ($mul0,&DWP(0,$bp,$i,4)); # bp[i]
&movd ($mul1,&DWP(0,$ap)); # ap[0]
&movd ($temp,&DWP($frame,"esp")); # tp[0]
&movd ($car1,&DWP(0,$np)); # np[0]
&pmuludq($mul1,$mul0); # ap[0]*bp[i]
&paddq ($mul1,$temp); # +=tp[0]
&movq ($acc0,$mul1);
&movq ($car0,$mul1);
&pand ($acc0,$mask);
&pmuludq($mul1,$_n0q); # *=n0
&pmuludq($car1,$mul1);
&paddq ($car1,$acc0);
&movd ($temp,&DWP($frame+4,"esp")); # tp[1]
&movd ($acc1,&DWP(4,$np)); # np[1]
&movd ($acc0,&DWP(4,$ap)); # ap[1]
&psrlq ($car0,32);
&psrlq ($car1,32);
&paddq ($car0,$temp); # +=tp[1]
&inc ($j); # j++
&dec ($num);
&set_label("inner");
&pmuludq($acc0,$mul0); # ap[j]*bp[i]
&pmuludq($acc1,$mul1); # np[j]*m1
&paddq ($car0,$acc0); # +=c0
&paddq ($car1,$acc1); # +=c1
&movq ($acc0,$car0);
&movd ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
&pand ($acc0,$mask);
&movd ($acc1,&DWP(4,$np,$j,4)); # np[j+1]
&paddq ($car1,$acc0); # +=ap[j]*bp[i]+tp[j]
&movd ($acc0,&DWP(4,$ap,$j,4)); # ap[j+1]
&psrlq ($car0,32);
&movd (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
&psrlq ($car1,32);
&paddq ($car0,$temp); # +=tp[j+1]
&dec ($num);
&lea ($j,&DWP(1,$j)); # j++
&jnz (&label("inner"));
&mov ($num,$j);
&pmuludq($acc0,$mul0); # ap[num-1]*bp[i]
&pmuludq($acc1,$mul1); # np[num-1]*m1
&paddq ($car0,$acc0); # +=c0
&paddq ($car1,$acc1); # +=c1
&movq ($acc0,$car0);
&pand ($acc0,$mask);
&paddq ($car1,$acc0); # +=ap[num-1]*bp[i]+tp[num-1]
&movd (&DWP($frame-4,"esp",$j,4),$car1); # tp[num-2]=
&psrlq ($car0,32);
&psrlq ($car1,32);
&movd ($temp,&DWP($frame+4,"esp",$num,4)); # += tp[num]
&paddq ($car1,$car0);
&paddq ($car1,$temp);
&movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
&lea ($i,&DWP(1,$i)); # i++
&cmp ($i,$num);
&jle (&label("outer"));
&emms (); # done with mmx bank
&jmp (&label("common_tail"));
&set_label("non_sse2",16);
}
if (0) {
&mov ("esp",$_sp);
&xor ("eax","eax"); # signal "not fast enough [yet]"
&jmp (&label("just_leave"));
	# While the code below provides competitive performance for
	# all key lengths on modern Intel cores, it's still more
	# than 10% slower for 4096-bit keys elsewhere:-( "Competitive"
	# means compared to the original integer-only assembler.
	# 512-bit RSA sign is better by ~40%, but that's about all
	# one can say about all CPUs...
} else {
$inp="esi"; # integer path uses these registers differently
$word="edi";
$carry="ebp";
&mov ($inp,$_ap);
&lea ($carry,&DWP(1,$num));
&mov ($word,$_bp);
&xor ($j,$j); # j=0
&mov ("edx",$inp);
&and ($carry,1); # see if num is even
&sub ("edx",$word); # see if ap==bp
&lea ("eax",&DWP(4,$word,$num,4)); # &bp[num]
&or ($carry,"edx");
&mov ($word,&DWP(0,$word)); # bp[0]
&jz (&label("bn_sqr_mont"));
&mov ($_bpend,"eax");
&mov ("eax",&DWP(0,$inp));
&xor ("edx","edx");
&set_label("mull",16);
&mov ($carry,"edx");
&mul ($word); # ap[j]*bp[0]
&add ($carry,"eax");
&lea ($j,&DWP(1,$j));
&adc ("edx",0);
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1]
&cmp ($j,$num);
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
&jl (&label("mull"));
&mov ($carry,"edx");
&mul ($word); # ap[num-1]*bp[0]
&mov ($word,$_n0);
&add ("eax",$carry);
&mov ($inp,$_np);
&adc ("edx",0);
&imul ($word,&DWP($frame,"esp")); # n0*tp[0]
&mov (&DWP($frame,"esp",$num,4),"eax"); # tp[num-1]=
&xor ($j,$j);
&mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]=
&mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]=
&mov ("eax",&DWP(0,$inp)); # np[0]
&mul ($word); # np[0]*m
&add ("eax",&DWP($frame,"esp")); # +=tp[0]
&mov ("eax",&DWP(4,$inp)); # np[1]
&adc ("edx",0);
&inc ($j);
&jmp (&label("2ndmadd"));
&set_label("1stmadd",16);
&mov ($carry,"edx");
&mul ($word); # ap[j]*bp[i]
&add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
&lea ($j,&DWP(1,$j));
&adc ("edx",0);
&add ($carry,"eax");
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[j+1]
&adc ("edx",0);
&cmp ($j,$num);
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
&jl (&label("1stmadd"));
&mov ($carry,"edx");
&mul ($word); # ap[num-1]*bp[i]
&add ("eax",&DWP($frame,"esp",$num,4)); # +=tp[num-1]
&mov ($word,$_n0);
&adc ("edx",0);
&mov ($inp,$_np);
&add ($carry,"eax");
&adc ("edx",0);
&imul ($word,&DWP($frame,"esp")); # n0*tp[0]
&xor ($j,$j);
&add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
&mov (&DWP($frame,"esp",$num,4),$carry); # tp[num-1]=
&adc ($j,0);
&mov ("eax",&DWP(0,$inp)); # np[0]
&mov (&DWP($frame+4,"esp",$num,4),"edx"); # tp[num]=
&mov (&DWP($frame+8,"esp",$num,4),$j); # tp[num+1]=
&mul ($word); # np[0]*m
&add ("eax",&DWP($frame,"esp")); # +=tp[0]
&mov ("eax",&DWP(4,$inp)); # np[1]
&adc ("edx",0);
&mov ($j,1);
&set_label("2ndmadd",16);
&mov ($carry,"edx");
&mul ($word); # np[j]*m
&add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
&lea ($j,&DWP(1,$j));
&adc ("edx",0);
&add ($carry,"eax");
&mov ("eax",&DWP(0,$inp,$j,4)); # np[j+1]
&adc ("edx",0);
&cmp ($j,$num);
&mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j-1]=
&jl (&label("2ndmadd"));
&mov ($carry,"edx");
&mul ($word); # np[j]*m
&add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
&adc ("edx",0);
&add ($carry,"eax");
&adc ("edx",0);
&mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]=
&xor ("eax","eax");
&mov ($j,$_bp); # &bp[i]
&add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
&adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1]
&lea ($j,&DWP(4,$j));
&mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]=
&cmp ($j,$_bpend);
&mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]=
&je (&label("common_tail"));
&mov ($word,&DWP(0,$j)); # bp[i+1]
&mov ($inp,$_ap);
&mov ($_bp,$j); # &bp[++i]
&xor ($j,$j);
&xor ("edx","edx");
&mov ("eax",&DWP(0,$inp));
&jmp (&label("1stmadd"));
&set_label("bn_sqr_mont",16);
$sbit=$num;
&mov ($_num,$num);
&mov ($_bp,$j); # i=0
&mov ("eax",$word); # ap[0]
&mul ($word); # ap[0]*ap[0]
&mov (&DWP($frame,"esp"),"eax"); # tp[0]=
&mov ($sbit,"edx");
&shr ("edx",1);
&and ($sbit,1);
&inc ($j);
&set_label("sqr",16);
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[j]
&mov ($carry,"edx");
&mul ($word); # ap[j]*ap[0]
&add ("eax",$carry);
&lea ($j,&DWP(1,$j));
&adc ("edx",0);
&lea ($carry,&DWP(0,$sbit,"eax",2));
&shr ("eax",31);
&cmp ($j,$_num);
&mov ($sbit,"eax");
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
&jl (&label("sqr"));
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[num-1]
&mov ($carry,"edx");
&mul ($word); # ap[num-1]*ap[0]
&add ("eax",$carry);
&mov ($word,$_n0);
&adc ("edx",0);
&mov ($inp,$_np);
&lea ($carry,&DWP(0,$sbit,"eax",2));
&imul ($word,&DWP($frame,"esp")); # n0*tp[0]
&shr ("eax",31);
&mov (&DWP($frame,"esp",$j,4),$carry); # tp[num-1]=
&lea ($carry,&DWP(0,"eax","edx",2));
&mov ("eax",&DWP(0,$inp)); # np[0]
&shr ("edx",31);
&mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num]=
&mov (&DWP($frame+8,"esp",$j,4),"edx"); # tp[num+1]=
&mul ($word); # np[0]*m
&add ("eax",&DWP($frame,"esp")); # +=tp[0]
&mov ($num,$j);
&adc ("edx",0);
&mov ("eax",&DWP(4,$inp)); # np[1]
&mov ($j,1);
&set_label("3rdmadd",16);
&mov ($carry,"edx");
&mul ($word); # np[j]*m
&add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
&adc ("edx",0);
&add ($carry,"eax");
&mov ("eax",&DWP(4,$inp,$j,4)); # np[j+1]
&adc ("edx",0);
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j-1]=
&mov ($carry,"edx");
&mul ($word); # np[j+1]*m
&add ($carry,&DWP($frame+4,"esp",$j,4)); # +=tp[j+1]
&lea ($j,&DWP(2,$j));
&adc ("edx",0);
&add ($carry,"eax");
&mov ("eax",&DWP(0,$inp,$j,4)); # np[j+2]
&adc ("edx",0);
&cmp ($j,$num);
&mov (&DWP($frame-8,"esp",$j,4),$carry); # tp[j]=
&jl (&label("3rdmadd"));
&mov ($carry,"edx");
&mul ($word); # np[j]*m
&add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
&adc ("edx",0);
&add ($carry,"eax");
&adc ("edx",0);
&mov (&DWP($frame-4,"esp",$num,4),$carry); # tp[num-2]=
&mov ($j,$_bp); # i
&xor ("eax","eax");
&mov ($inp,$_ap);
&add ("edx",&DWP($frame+4,"esp",$num,4)); # carry+=tp[num]
&adc ("eax",&DWP($frame+8,"esp",$num,4)); # +=tp[num+1]
&mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]=
&cmp ($j,$num);
&mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]=
&je (&label("common_tail"));
&mov ($word,&DWP(4,$inp,$j,4)); # ap[i]
&lea ($j,&DWP(1,$j));
&mov ("eax",$word);
&mov ($_bp,$j); # ++i
&mul ($word); # ap[i]*ap[i]
&add ("eax",&DWP($frame,"esp",$j,4)); # +=tp[i]
&adc ("edx",0);
&mov (&DWP($frame,"esp",$j,4),"eax"); # tp[i]=
&xor ($carry,$carry);
&cmp ($j,$num);
&lea ($j,&DWP(1,$j));
&je (&label("sqrlast"));
&mov ($sbit,"edx"); # zaps $num
&shr ("edx",1);
&and ($sbit,1);
&set_label("sqradd",16);
&mov ("eax",&DWP(0,$inp,$j,4)); # ap[j]
&mov ($carry,"edx");
&mul ($word); # ap[j]*ap[i]
&add ("eax",$carry);
&lea ($carry,&DWP(0,"eax","eax"));
&adc ("edx",0);
&shr ("eax",31);
&add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
&lea ($j,&DWP(1,$j));
&adc ("eax",0);
&add ($carry,$sbit);
&adc ("eax",0);
&cmp ($j,$_num);
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
&mov ($sbit,"eax");
&jle (&label("sqradd"));
&mov ($carry,"edx");
&add ("edx","edx");
&shr ($carry,31);
&add ("edx",$sbit);
&adc ($carry,0);
&set_label("sqrlast");
&mov ($word,$_n0);
&mov ($inp,$_np);
&imul ($word,&DWP($frame,"esp")); # n0*tp[0]
&add ("edx",&DWP($frame,"esp",$j,4)); # +=tp[num]
&mov ("eax",&DWP(0,$inp)); # np[0]
&adc ($carry,0);
&mov (&DWP($frame,"esp",$j,4),"edx"); # tp[num]=
&mov (&DWP($frame+4,"esp",$j,4),$carry); # tp[num+1]=
&mul ($word); # np[0]*m
&add ("eax",&DWP($frame,"esp")); # +=tp[0]
&lea ($num,&DWP(-1,$j));
&adc ("edx",0);
&mov ($j,1);
&mov ("eax",&DWP(4,$inp)); # np[1]
&jmp (&label("3rdmadd"));
}
&set_label("common_tail",16);
&mov ($np,$_np); # load modulus pointer
&mov ($rp,$_rp); # load result pointer
&lea ($tp,&DWP($frame,"esp")); # [$ap and $bp are zapped]
&mov ("eax",&DWP(0,$tp)); # tp[0]
&mov ($j,$num); # j=num-1
&xor ($i,$i); # i=0 and clear CF!
&set_label("sub",16);
&sbb ("eax",&DWP(0,$np,$i,4));
&mov (&DWP(0,$rp,$i,4),"eax"); # rp[i]=tp[i]-np[i]
&dec ($j); # doesn't affect CF!
&mov ("eax",&DWP(4,$tp,$i,4)); # tp[i+1]
&lea ($i,&DWP(1,$i)); # i++
&jge (&label("sub"));
&sbb ("eax",0); # handle upmost overflow bit
&and ($tp,"eax");
&not ("eax");
&mov ($np,$rp);
&and ($np,"eax");
&or ($tp,$np); # tp=carry?tp:rp
&set_label("copy",16); # copy or in-place refresh
&mov ("eax",&DWP(0,$tp,$num,4));
&mov (&DWP(0,$rp,$num,4),"eax"); # rp[i]=tp[i]
&mov (&DWP($frame,"esp",$num,4),$j); # zap temporary vector
&dec ($num);
&jge (&label("copy"));
&mov ("esp",$_sp); # pull saved stack pointer
&mov ("eax",1);
&set_label("just_leave");
&function_end("bn_mul_mont");
&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();

28
crypto/bn/asm/x86.pl Normal file
View File

@@ -0,0 +1,28 @@
#!/usr/local/bin/perl
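# Glue script: pulls in the per-primitive generators under x86/ and emits
# the complete set of BIGNUM word routines (mul_add_words, mul_words,
# sqr_words, div_words, add_words, sub_words and the 4-/8-word Comba
# multiply/square) as a single assembler module.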
push(@INC,"perlasm","../../perlasm");
require "x86asm.pl";
require("x86/mul_add.pl");
require("x86/mul.pl");
require("x86/sqr.pl");
require("x86/div.pl");
require("x86/add.pl");
require("x86/sub.pl");
require("x86/comba.pl");
&asm_init($ARGV[0],$0);
&bn_mul_add_words("bn_mul_add_words");
&bn_mul_words("bn_mul_words");
&bn_sqr_words("bn_sqr_words");
&bn_div_words("bn_div_words");
&bn_add_words("bn_add_words");
&bn_sub_words("bn_sub_words");
&bn_mul_comba("bn_mul_comba8",8);
&bn_mul_comba("bn_mul_comba4",4);
&bn_sqr_comba("bn_sqr_comba8",8);
&bn_sqr_comba("bn_sqr_comba4",4);
&asm_finish();

76
crypto/bn/asm/x86/add.pl Normal file
View File

@@ -0,0 +1,76 @@
#!/usr/local/bin/perl
# x86 assembler
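# bn_add_words(r,a,b,num): r[i] = a[i] + b[i] with carry propagation and
# the final carry returned in eax. The main loop below is unrolled eight
# words per iteration; a tail loop handles the remaining num%8 words.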
sub bn_add_words
{
local($name)=@_;
&function_begin($name,"");
&comment("");
$a="esi";
$b="edi";
$c="eax";
$r="ebx";
$tmp1="ecx";
$tmp2="edx";
$num="ebp";
&mov($r,&wparam(0)); # get r
&mov($a,&wparam(1)); # get a
&mov($b,&wparam(2)); # get b
&mov($num,&wparam(3)); # get num
&xor($c,$c); # clear carry
	&and($num,0xfffffff8);	# round num down to a multiple of 8
&jz(&label("aw_finish"));
&set_label("aw_loop",0);
for ($i=0; $i<8; $i++)
{
&comment("Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0)); # *b
&add($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&add($tmp1,$tmp2);
&adc($c,0);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
}
&comment("");
&add($a,32);
&add($b,32);
&add($r,32);
&sub($num,8);
&jnz(&label("aw_loop"));
&set_label("aw_finish",0);
&mov($num,&wparam(3)); # get num
&and($num,7);
&jz(&label("aw_end"));
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0));# *b
&add($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&add($tmp1,$tmp2);
&adc($c,0);
&dec($num) if ($i != 6);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&jz(&label("aw_end")) if ($i != 6);
}
&set_label("aw_end",0);
# &mov("eax",$c); # $c is "eax"
&function_end($name);
}
1;

277
crypto/bn/asm/x86/comba.pl Normal file
View File

@@ -0,0 +1,277 @@
#!/usr/local/bin/perl
# x86 assembler
sub mul_add_c
{
local($a,$ai,$b,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
# pos == -1 if eax and edx are pre-loaded, 0 to load from next
# words, and 1 to load the return value r[]
&comment("mul a[$ai]*b[$bi]");
# "eax" and "edx" will always be pre-loaded.
# &mov("eax",&DWP($ai*4,$a,"",0)) ;
# &mov("edx",&DWP($bi*4,$b,"",0));
&mul("edx");
&add($c0,"eax");
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # laod next a
&mov("eax",&wparam(0)) if $pos > 0; # load r[]
###
&adc($c1,"edx");
&mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 0; # laod next b
&mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 1; # laod next b
###
&adc($c2,0);
# is pos > 1, it means it is the last loop
&mov(&DWP($i*4,"eax","",0),$c0) if $pos > 0; # save r[];
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # laod next a
}
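#
# Put differently (a reading of the code above, not anything that affects
# the generated assembler): ignoring operand scheduling, mul_add_c keeps a
# triple-word accumulator (c2,c1,c0) and performs the C-like step
#
#	(hi,lo) = a[ai]*b[bi];
#	c0 += lo; c1 += hi + carry; c2 += carry;
#
# bn_mul_comba below calls it once per partial product and rotates
# ($c0,$c1,$c2) after each completed output word is stored.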
sub sqr_add_c
{
local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
# pos == -1 if eax and edx are pre-loaded, 0 to load from next
# words, and 1 to load the return value r[]
&comment("sqr a[$ai]*a[$bi]");
# "eax" and "edx" will always be pre-loaded.
# &mov("eax",&DWP($ai*4,$a,"",0)) ;
# &mov("edx",&DWP($bi*4,$b,"",0));
if ($ai == $bi)
{ &mul("eax");}
else
{ &mul("edx");}
&add($c0,"eax");
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
###
&adc($c1,"edx");
&mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos == 1) && ($na != $nb);
###
&adc($c2,0);
# if pos > 1, it means it is the last loop
&mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
}
sub sqr_add_c2
{
local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
# pos == -1 if eax and edx are pre-loaded, 0 to load from next
# words, and 1 to load the return value r[]
&comment("sqr a[$ai]*a[$bi]");
# "eax" and "edx" will always be pre-loaded.
# &mov("eax",&DWP($ai*4,$a,"",0)) ;
# &mov("edx",&DWP($bi*4,$a,"",0));
if ($ai == $bi)
{ &mul("eax");}
else
{ &mul("edx");}
&add("eax","eax");
###
&adc("edx","edx");
###
&adc($c2,0);
&add($c0,"eax");
&adc($c1,"edx");
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
&adc($c2,0);
&mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
&mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos <= 1) && ($na != $nb);
###
}
sub bn_mul_comba
{
local($name,$num)=@_;
local($a,$b,$c0,$c1,$c2);
local($i,$as,$ae,$bs,$be,$ai,$bi);
local($tot,$end);
&function_begin_B($name,"");
$c0="ebx";
$c1="ecx";
$c2="ebp";
$a="esi";
$b="edi";
$as=0;
$ae=0;
$bs=0;
$be=0;
$tot=$num+$num-1;
&push("esi");
&mov($a,&wparam(1));
&push("edi");
&mov($b,&wparam(2));
&push("ebp");
&push("ebx");
&xor($c0,$c0);
&mov("eax",&DWP(0,$a,"",0)); # load the first word
&xor($c1,$c1);
&mov("edx",&DWP(0,$b,"",0)); # load the first second
for ($i=0; $i<$tot; $i++)
{
$ai=$as;
$bi=$bs;
$end=$be+1;
&comment("################## Calculate word $i");
for ($j=$bs; $j<$end; $j++)
{
&xor($c2,$c2) if ($j == $bs);
if (($j+1) == $end)
{
$v=1;
$v=2 if (($i+1) == $tot);
}
else
{ $v=0; }
if (($j+1) != $end)
{
$na=($ai-1);
$nb=($bi+1);
}
else
{
$na=$as+($i < ($num-1));
$nb=$bs+($i >= ($num-1));
}
#printf STDERR "[$ai,$bi] -> [$na,$nb]\n";
&mul_add_c($a,$ai,$b,$bi,$c0,$c1,$c2,$v,$i,$na,$nb);
if ($v)
{
&comment("saved r[$i]");
# &mov("eax",&wparam(0));
# &mov(&DWP($i*4,"eax","",0),$c0);
($c0,$c1,$c2)=($c1,$c2,$c0);
}
$ai--;
$bi++;
}
$as++ if ($i < ($num-1));
$ae++ if ($i >= ($num-1));
$bs++ if ($i >= ($num-1));
$be++ if ($i < ($num-1));
}
&comment("save r[$i]");
# &mov("eax",&wparam(0));
&mov(&DWP($i*4,"eax","",0),$c0);
&pop("ebx");
&pop("ebp");
&pop("edi");
&pop("esi");
&ret();
&function_end_B($name);
}
sub bn_sqr_comba
{
local($name,$num)=@_;
local($r,$a,$c0,$c1,$c2)=@_;
local($i,$as,$ae,$bs,$be,$ai,$bi);
local($b,$tot,$end,$half);
&function_begin_B($name,"");
$c0="ebx";
$c1="ecx";
$c2="ebp";
$a="esi";
$r="edi";
&push("esi");
&push("edi");
&push("ebp");
&push("ebx");
&mov($r,&wparam(0));
&mov($a,&wparam(1));
&xor($c0,$c0);
&xor($c1,$c1);
&mov("eax",&DWP(0,$a,"",0)); # load the first word
$as=0;
$ae=0;
$bs=0;
$be=0;
$tot=$num+$num-1;
for ($i=0; $i<$tot; $i++)
{
$ai=$as;
$bi=$bs;
$end=$be+1;
&comment("############### Calculate word $i");
for ($j=$bs; $j<$end; $j++)
{
&xor($c2,$c2) if ($j == $bs);
if (($ai-1) < ($bi+1))
{
$v=1;
$v=2 if ($i+1) == $tot;
}
else
{ $v=0; }
if (!$v)
{
$na=$ai-1;
$nb=$bi+1;
}
else
{
$na=$as+($i < ($num-1));
$nb=$bs+($i >= ($num-1));
}
if ($ai == $bi)
{
&sqr_add_c($r,$a,$ai,$bi,
$c0,$c1,$c2,$v,$i,$na,$nb);
}
else
{
&sqr_add_c2($r,$a,$ai,$bi,
$c0,$c1,$c2,$v,$i,$na,$nb);
}
if ($v)
{
&comment("saved r[$i]");
#&mov(&DWP($i*4,$r,"",0),$c0);
($c0,$c1,$c2)=($c1,$c2,$c0);
last;
}
$ai--;
$bi++;
}
$as++ if ($i < ($num-1));
$ae++ if ($i >= ($num-1));
$bs++ if ($i >= ($num-1));
$be++ if ($i < ($num-1));
}
&mov(&DWP($i*4,$r,"",0),$c0);
&pop("ebx");
&pop("ebp");
&pop("edi");
&pop("esi");
&ret();
&function_end_B($name);
}
1;

15
crypto/bn/asm/x86/div.pl Normal file
View File

@@ -0,0 +1,15 @@
#!/usr/local/bin/perl
# x86 assembler
sub bn_div_words
{
local($name)=@_;
&function_begin($name,"");
&mov("edx",&wparam(0)); #
&mov("eax",&wparam(1)); #
&mov("ebx",&wparam(2)); #
&div("ebx");
&function_end($name);
}
1;

3
crypto/bn/asm/x86/f Normal file
View File

@@ -0,0 +1,3 @@
#!/usr/local/bin/perl
# x86 assembler

77
crypto/bn/asm/x86/mul.pl Normal file
View File

@@ -0,0 +1,77 @@
#!/usr/local/bin/perl
# x86 assembler
sub bn_mul_words
{
local($name)=@_;
&function_begin($name,"");
&comment("");
$Low="eax";
$High="edx";
$a="ebx";
$w="ecx";
$r="edi";
$c="esi";
$num="ebp";
&xor($c,$c); # clear carry
&mov($r,&wparam(0)); # get r
&mov($a,&wparam(1)); # get a
&mov($num,&wparam(2)); # get num
&mov($w,&wparam(3)); # get w
&and($num,0xfffffff8); # round num down to a multiple of 8
&jz(&label("mw_finish"));
&set_label("mw_loop",0);
for ($i=0; $i<32; $i+=4)
{
&comment("Round $i");
&mov("eax",&DWP($i,$a,"",0)); # *a
&mul($w); # *a * w
&add("eax",$c); # L(t)+=c
# XXX
&adc("edx",0); # H(t)+=carry
&mov(&DWP($i,$r,"",0),"eax"); # *r= L(t);
&mov($c,"edx"); # c= H(t);
}
&comment("");
&add($a,32);
&add($r,32);
&sub($num,8);
&jz(&label("mw_finish"));
&jmp(&label("mw_loop"));
&set_label("mw_finish",0);
&mov($num,&wparam(2)); # get num
&and($num,7);
&jnz(&label("mw_finish2"));
&jmp(&label("mw_end"));
&set_label("mw_finish2",1);
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov("eax",&DWP($i*4,$a,"",0));# *a
&mul($w); # *a * w
&add("eax",$c); # L(t)+=c
# XXX
&adc("edx",0); # H(t)+=carry
&mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t);
&mov($c,"edx"); # c= H(t);
&dec($num) if ($i != 7-1);
&jz(&label("mw_end")) if ($i != 7-1);
}
&set_label("mw_end",0);
&mov("eax",$c);
&function_end($name);
}
1;

87
crypto/bn/asm/x86/mul_add.pl Normal file
View File

@@ -0,0 +1,87 @@
#!/usr/local/bin/perl
# x86 assembler
sub bn_mul_add_words
{
local($name)=@_;
&function_begin($name,"");
&comment("");
$Low="eax";
$High="edx";
$a="ebx";
$w="ebp";
$r="edi";
$c="esi";
&xor($c,$c); # clear carry
&mov($r,&wparam(0)); # get r
&mov("ecx",&wparam(2)); # get num
&mov($a,&wparam(1)); # get a
&and("ecx",0xfffffff8); # round num down to a multiple of 8
&mov($w,&wparam(3)); # get w
&push("ecx"); # make room on the stack for a tmp variable
&jz(&label("maw_finish"));
&set_label("maw_loop",0);
&mov(&swtmp(0),"ecx"); #
for ($i=0; $i<32; $i+=4)
{
&comment("Round $i");
&mov("eax",&DWP($i,$a,"",0)); # *a
&mul($w); # *a * w
&add("eax",$c); # L(t)+= *r
&mov($c,&DWP($i,$r,"",0)); # L(t)+= *r
&adc("edx",0); # H(t)+=carry
&add("eax",$c); # L(t)+=c
&adc("edx",0); # H(t)+=carry
&mov(&DWP($i,$r,"",0),"eax"); # *r= L(t);
&mov($c,"edx"); # c= H(t);
}
&comment("");
&mov("ecx",&swtmp(0)); #
&add($a,32);
&add($r,32);
&sub("ecx",8);
&jnz(&label("maw_loop"));
&set_label("maw_finish",0);
&mov("ecx",&wparam(2)); # get num
&and("ecx",7);
&jnz(&label("maw_finish2")); # helps branch prediction
&jmp(&label("maw_end"));
&set_label("maw_finish2",1);
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov("eax",&DWP($i*4,$a,"",0));# *a
&mul($w); # *a * w
&add("eax",$c); # L(t)+=c
&mov($c,&DWP($i*4,$r,"",0)); # L(t)+= *r
&adc("edx",0); # H(t)+=carry
&add("eax",$c);
&adc("edx",0); # H(t)+=carry
&dec("ecx") if ($i != 7-1);
&mov(&DWP($i*4,$r,"",0),"eax"); # *r= L(t);
&mov($c,"edx"); # c= H(t);
&jz(&label("maw_end")) if ($i != 7-1);
}
&set_label("maw_end",0);
&mov("eax",$c);
&pop("ecx"); # clear variable from
&function_end($name);
}
1;

60
crypto/bn/asm/x86/sqr.pl Normal file
View File

@@ -0,0 +1,60 @@
#!/usr/local/bin/perl
# x86 assembler
sub bn_sqr_words
{
local($name)=@_;
&function_begin($name,"");
&comment("");
$r="esi";
$a="edi";
$num="ebx";
&mov($r,&wparam(0)); # get r
&mov($a,&wparam(1)); # get a
&mov($num,&wparam(2)); # get num
&and($num,0xfffffff8); # round num down to a multiple of 8
&jz(&label("sw_finish"));
&set_label("sw_loop",0);
for ($i=0; $i<32; $i+=4)
{
&comment("Round $i");
&mov("eax",&DWP($i,$a,"",0)); # *a
# XXX
&mul("eax"); # *a * *a
&mov(&DWP($i*2,$r,"",0),"eax"); #
&mov(&DWP($i*2+4,$r,"",0),"edx");#
}
&comment("");
&add($a,32);
&add($r,64);
&sub($num,8);
&jnz(&label("sw_loop"));
&set_label("sw_finish",0);
&mov($num,&wparam(2)); # get num
&and($num,7);
&jz(&label("sw_end"));
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov("eax",&DWP($i*4,$a,"",0)); # *a
# XXX
&mul("eax"); # *a * *a
&mov(&DWP($i*8,$r,"",0),"eax"); #
&dec($num) if ($i != 7-1);
&mov(&DWP($i*8+4,$r,"",0),"edx");
&jz(&label("sw_end")) if ($i != 7-1);
}
&set_label("sw_end",0);
&function_end($name);
}
1;

76
crypto/bn/asm/x86/sub.pl Normal file
View File

@@ -0,0 +1,76 @@
#!/usr/local/bin/perl
# x86 assembler
sub bn_sub_words
{
local($name)=@_;
&function_begin($name,"");
&comment("");
$a="esi";
$b="edi";
$c="eax";
$r="ebx";
$tmp1="ecx";
$tmp2="edx";
$num="ebp";
&mov($r,&wparam(0)); # get r
&mov($a,&wparam(1)); # get a
&mov($b,&wparam(2)); # get b
&mov($num,&wparam(3)); # get num
&xor($c,$c); # clear carry
&and($num,0xfffffff8); # round num down to a multiple of 8
&jz(&label("aw_finish"));
&set_label("aw_loop",0);
for ($i=0; $i<8; $i++)
{
&comment("Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0)); # *b
&sub($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&sub($tmp1,$tmp2);
&adc($c,0);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
}
&comment("");
&add($a,32);
&add($b,32);
&add($r,32);
&sub($num,8);
&jnz(&label("aw_loop"));
&set_label("aw_finish",0);
&mov($num,&wparam(3)); # get num
&and($num,7);
&jz(&label("aw_end"));
for ($i=0; $i<7; $i++)
{
&comment("Tail Round $i");
&mov($tmp1,&DWP($i*4,$a,"",0)); # *a
&mov($tmp2,&DWP($i*4,$b,"",0));# *b
&sub($tmp1,$c);
&mov($c,0);
&adc($c,$c);
&sub($tmp1,$tmp2);
&adc($c,0);
&dec($num) if ($i != 6);
&mov(&DWP($i*4,$r,"",0),$tmp1); # *r
&jz(&label("aw_end")) if ($i != 6);
}
&set_label("aw_end",0);
# &mov("eax",$c); # $c is "eax"
&function_end($name);
}
1;

638
crypto/bn/asm/x86_64-gcc.c Normal file
View File

@@ -0,0 +1,638 @@
#include "../bn_lcl.h"
#if !(defined(__GNUC__) && __GNUC__>=2)
# include "../bn_asm.c" /* kind of dirty hack for Sun Studio */
#else
/*-
* x86_64 BIGNUM accelerator version 0.1, December 2002.
*
* Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
* project.
*
* Rights for redistribution and usage in source and binary forms are
* granted according to the OpenSSL license. Warranty of any kind is
* disclaimed.
*
* Q. Version 0.1? It doesn't sound like Andy, he used to assign real
* versions, like 1.0...
* A. Well, that's because this code is basically a quick-n-dirty
* proof-of-concept hack. As you can see it's implemented with
* inline assembler, which means that you're bound to GCC and that
* there might be enough room for further improvement.
*
* Q. Why inline assembler?
* A. x86_64 features its own ABI, which I'm not familiar with. This is
* why I decided to let the compiler take care of subroutine
* prologue/epilogue as well as register allocation. For reference,
* Win64 implements a different ABI for AMD64 than Linux does.
*
* Q. How much faster does it get?
* A. 'apps/openssl speed rsa dsa' output with no-asm:
*
* sign verify sign/s verify/s
* rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2
* rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0
* rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8
* rsa 4096 bits 0.1155s 0.0018s 8.7 555.6
* sign verify sign/s verify/s
* dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3
* dsa 1024 bits 0.0014s 0.0018s 692.3 559.2
* dsa 2048 bits 0.0049s 0.0061s 204.7 165.0
*
* 'apps/openssl speed rsa dsa' output with this module:
*
* sign verify sign/s verify/s
* rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9
* rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7
* rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0
* rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8
* sign verify sign/s verify/s
* dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3
* dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4
* dsa 2048 bits 0.0016s 0.0020s 620.4 504.6
*
* For reference, the IA-32 assembler implementation performs
* very much like 64-bit code compiled with no-asm on the same
* machine.
*/
# if defined(_WIN64) || !defined(__LP64__)
# define BN_ULONG unsigned long long
# else
# define BN_ULONG unsigned long
# endif
# undef mul
# undef mul_add
/*-
* "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
* "g"(0) let the compiler to decide where does it
* want to keep the value of zero;
*/
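/*-
 * In plain C terms, the contracts read from the macros below are:
 * mul_add(r,a,word,carry) adds a*word+carry to r, storing the low word
 * back into r and leaving the propagated high word in carry;
 * mul(r,a,word,carry) stores the low word of a*word+carry into r and
 * the high word into carry.
 */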
# define mul_add(r,a,word,carry) do { \
register BN_ULONG high,low; \
asm ("mulq %3" \
: "=a"(low),"=d"(high) \
: "a"(word),"m"(a) \
: "cc"); \
asm ("addq %2,%0; adcq %3,%1" \
: "+r"(carry),"+d"(high)\
: "a"(low),"g"(0) \
: "cc"); \
asm ("addq %2,%0; adcq %3,%1" \
: "+m"(r),"+d"(high) \
: "r"(carry),"g"(0) \
: "cc"); \
carry=high; \
} while (0)
# define mul(r,a,word,carry) do { \
register BN_ULONG high,low; \
asm ("mulq %3" \
: "=a"(low),"=d"(high) \
: "a"(word),"g"(a) \
: "cc"); \
asm ("addq %2,%0; adcq %3,%1" \
: "+r"(carry),"+d"(high)\
: "a"(low),"g"(0) \
: "cc"); \
(r)=carry, carry=high; \
} while (0)
# undef sqr
# define sqr(r0,r1,a) \
asm ("mulq %2" \
: "=a"(r0),"=d"(r1) \
: "a"(a) \
: "cc");
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
BN_ULONG w)
{
BN_ULONG c1 = 0;
if (num <= 0)
return (c1);
while (num & ~3) {
mul_add(rp[0], ap[0], w, c1);
mul_add(rp[1], ap[1], w, c1);
mul_add(rp[2], ap[2], w, c1);
mul_add(rp[3], ap[3], w, c1);
ap += 4;
rp += 4;
num -= 4;
}
if (num) {
mul_add(rp[0], ap[0], w, c1);
if (--num == 0)
return c1;
mul_add(rp[1], ap[1], w, c1);
if (--num == 0)
return c1;
mul_add(rp[2], ap[2], w, c1);
return c1;
}
return (c1);
}
BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
{
BN_ULONG c1 = 0;
if (num <= 0)
return (c1);
while (num & ~3) {
mul(rp[0], ap[0], w, c1);
mul(rp[1], ap[1], w, c1);
mul(rp[2], ap[2], w, c1);
mul(rp[3], ap[3], w, c1);
ap += 4;
rp += 4;
num -= 4;
}
if (num) {
mul(rp[0], ap[0], w, c1);
if (--num == 0)
return c1;
mul(rp[1], ap[1], w, c1);
if (--num == 0)
return c1;
mul(rp[2], ap[2], w, c1);
}
return (c1);
}
void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
{
if (n <= 0)
return;
while (n & ~3) {
sqr(r[0], r[1], a[0]);
sqr(r[2], r[3], a[1]);
sqr(r[4], r[5], a[2]);
sqr(r[6], r[7], a[3]);
a += 4;
r += 8;
n -= 4;
}
if (n) {
sqr(r[0], r[1], a[0]);
if (--n == 0)
return;
sqr(r[2], r[3], a[1]);
if (--n == 0)
return;
sqr(r[4], r[5], a[2]);
}
}
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
{
BN_ULONG ret, waste;
asm("divq %4":"=a"(ret), "=d"(waste)
: "a"(l), "d"(h), "r"(d)
: "cc");
return ret;
}
BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
int n)
{
BN_ULONG ret;
size_t i = 0;
if (n <= 0)
return 0;
asm volatile (" subq %0,%0 \n" /* clear carry */
" jmp 1f \n"
".p2align 4 \n"
"1: movq (%4,%2,8),%0 \n"
" adcq (%5,%2,8),%0 \n"
" movq %0,(%3,%2,8) \n"
" lea 1(%2),%2 \n"
" loop 1b \n"
" sbbq %0,%0 \n":"=&r" (ret), "+c"(n),
"+r"(i)
:"r"(rp), "r"(ap), "r"(bp)
:"cc", "memory");
return ret & 1;
}
# ifndef SIMICS
BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
int n)
{
BN_ULONG ret;
size_t i = 0;
if (n <= 0)
return 0;
asm volatile (" subq %0,%0 \n" /* clear borrow */
" jmp 1f \n"
".p2align 4 \n"
"1: movq (%4,%2,8),%0 \n"
" sbbq (%5,%2,8),%0 \n"
" movq %0,(%3,%2,8) \n"
" lea 1(%2),%2 \n"
" loop 1b \n"
" sbbq %0,%0 \n":"=&r" (ret), "+c"(n),
"+r"(i)
:"r"(rp), "r"(ap), "r"(bp)
:"cc", "memory");
return ret & 1;
}
# else
/* Simics 1.4<7 has buggy sbbq:-( */
# define BN_MASK2 0xffffffffffffffffL
BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
{
BN_ULONG t1, t2;
int c = 0;
if (n <= 0)
return ((BN_ULONG)0);
for (;;) {
t1 = a[0];
t2 = b[0];
r[0] = (t1 - t2 - c) & BN_MASK2;
if (t1 != t2)
c = (t1 < t2);
if (--n <= 0)
break;
t1 = a[1];
t2 = b[1];
r[1] = (t1 - t2 - c) & BN_MASK2;
if (t1 != t2)
c = (t1 < t2);
if (--n <= 0)
break;
t1 = a[2];
t2 = b[2];
r[2] = (t1 - t2 - c) & BN_MASK2;
if (t1 != t2)
c = (t1 < t2);
if (--n <= 0)
break;
t1 = a[3];
t2 = b[3];
r[3] = (t1 - t2 - c) & BN_MASK2;
if (t1 != t2)
c = (t1 < t2);
if (--n <= 0)
break;
a += 4;
b += 4;
r += 4;
}
return (c);
}
# endif
/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/*
* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number
* c=(c2,c1,c0)
*/
/*
* Keep in mind that carrying into high part of multiplication result
* can not overflow, because it cannot be all-ones.
*/
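/*
 * A worked instance of that claim for 64-bit words: the largest possible
 * product is (2^64-1)^2 = 2^128 - 2^65 + 1, whose high word is 2^64-2,
 * i.e. one short of all-ones, so adding the single carry propagated out
 * of the low word can never wrap the high word.
 */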
# if 0
/* original macros are kept for reference purposes */
# define mul_add_c(a,b,c0,c1,c2) do { \
BN_ULONG ta = (a), tb = (b); \
BN_ULONG lo, hi; \
BN_UMULT_LOHI(lo,hi,ta,tb); \
c0 += lo; hi += (c0<lo)?1:0; \
c1 += hi; c2 += (c1<hi)?1:0; \
} while(0)
# define mul_add_c2(a,b,c0,c1,c2) do { \
BN_ULONG ta = (a), tb = (b); \
BN_ULONG lo, hi, tt; \
BN_UMULT_LOHI(lo,hi,ta,tb); \
c0 += lo; tt = hi+((c0<lo)?1:0); \
c1 += tt; c2 += (c1<tt)?1:0; \
c0 += lo; hi += (c0<lo)?1:0; \
c1 += hi; c2 += (c1<hi)?1:0; \
} while(0)
# define sqr_add_c(a,i,c0,c1,c2) do { \
BN_ULONG ta = (a)[i]; \
BN_ULONG lo, hi; \
BN_UMULT_LOHI(lo,hi,ta,ta); \
c0 += lo; hi += (c0<lo)?1:0; \
c1 += hi; c2 += (c1<hi)?1:0; \
} while(0)
# else
# define mul_add_c(a,b,c0,c1,c2) do { \
BN_ULONG t1,t2; \
asm ("mulq %3" \
: "=a"(t1),"=d"(t2) \
: "a"(a),"m"(b) \
: "cc"); \
asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
: "+r"(c0),"+r"(c1),"+r"(c2) \
: "r"(t1),"r"(t2),"g"(0) \
: "cc"); \
} while (0)
# define sqr_add_c(a,i,c0,c1,c2) do { \
BN_ULONG t1,t2; \
asm ("mulq %2" \
: "=a"(t1),"=d"(t2) \
: "a"(a[i]) \
: "cc"); \
asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
: "+r"(c0),"+r"(c1),"+r"(c2) \
: "r"(t1),"r"(t2),"g"(0) \
: "cc"); \
} while (0)
# define mul_add_c2(a,b,c0,c1,c2) do { \
BN_ULONG t1,t2; \
asm ("mulq %3" \
: "=a"(t1),"=d"(t2) \
: "a"(a),"m"(b) \
: "cc"); \
asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
: "+r"(c0),"+r"(c1),"+r"(c2) \
: "r"(t1),"r"(t2),"g"(0) \
: "cc"); \
asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
: "+r"(c0),"+r"(c1),"+r"(c2) \
: "r"(t1),"r"(t2),"g"(0) \
: "cc"); \
} while (0)
# endif
# define sqr_add_c2(a,i,j,c0,c1,c2) \
mul_add_c2((a)[i],(a)[j],c0,c1,c2)
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
BN_ULONG c1, c2, c3;
c1 = 0;
c2 = 0;
c3 = 0;
mul_add_c(a[0], b[0], c1, c2, c3);
r[0] = c1;
c1 = 0;
mul_add_c(a[0], b[1], c2, c3, c1);
mul_add_c(a[1], b[0], c2, c3, c1);
r[1] = c2;
c2 = 0;
mul_add_c(a[2], b[0], c3, c1, c2);
mul_add_c(a[1], b[1], c3, c1, c2);
mul_add_c(a[0], b[2], c3, c1, c2);
r[2] = c3;
c3 = 0;
mul_add_c(a[0], b[3], c1, c2, c3);
mul_add_c(a[1], b[2], c1, c2, c3);
mul_add_c(a[2], b[1], c1, c2, c3);
mul_add_c(a[3], b[0], c1, c2, c3);
r[3] = c1;
c1 = 0;
mul_add_c(a[4], b[0], c2, c3, c1);
mul_add_c(a[3], b[1], c2, c3, c1);
mul_add_c(a[2], b[2], c2, c3, c1);
mul_add_c(a[1], b[3], c2, c3, c1);
mul_add_c(a[0], b[4], c2, c3, c1);
r[4] = c2;
c2 = 0;
mul_add_c(a[0], b[5], c3, c1, c2);
mul_add_c(a[1], b[4], c3, c1, c2);
mul_add_c(a[2], b[3], c3, c1, c2);
mul_add_c(a[3], b[2], c3, c1, c2);
mul_add_c(a[4], b[1], c3, c1, c2);
mul_add_c(a[5], b[0], c3, c1, c2);
r[5] = c3;
c3 = 0;
mul_add_c(a[6], b[0], c1, c2, c3);
mul_add_c(a[5], b[1], c1, c2, c3);
mul_add_c(a[4], b[2], c1, c2, c3);
mul_add_c(a[3], b[3], c1, c2, c3);
mul_add_c(a[2], b[4], c1, c2, c3);
mul_add_c(a[1], b[5], c1, c2, c3);
mul_add_c(a[0], b[6], c1, c2, c3);
r[6] = c1;
c1 = 0;
mul_add_c(a[0], b[7], c2, c3, c1);
mul_add_c(a[1], b[6], c2, c3, c1);
mul_add_c(a[2], b[5], c2, c3, c1);
mul_add_c(a[3], b[4], c2, c3, c1);
mul_add_c(a[4], b[3], c2, c3, c1);
mul_add_c(a[5], b[2], c2, c3, c1);
mul_add_c(a[6], b[1], c2, c3, c1);
mul_add_c(a[7], b[0], c2, c3, c1);
r[7] = c2;
c2 = 0;
mul_add_c(a[7], b[1], c3, c1, c2);
mul_add_c(a[6], b[2], c3, c1, c2);
mul_add_c(a[5], b[3], c3, c1, c2);
mul_add_c(a[4], b[4], c3, c1, c2);
mul_add_c(a[3], b[5], c3, c1, c2);
mul_add_c(a[2], b[6], c3, c1, c2);
mul_add_c(a[1], b[7], c3, c1, c2);
r[8] = c3;
c3 = 0;
mul_add_c(a[2], b[7], c1, c2, c3);
mul_add_c(a[3], b[6], c1, c2, c3);
mul_add_c(a[4], b[5], c1, c2, c3);
mul_add_c(a[5], b[4], c1, c2, c3);
mul_add_c(a[6], b[3], c1, c2, c3);
mul_add_c(a[7], b[2], c1, c2, c3);
r[9] = c1;
c1 = 0;
mul_add_c(a[7], b[3], c2, c3, c1);
mul_add_c(a[6], b[4], c2, c3, c1);
mul_add_c(a[5], b[5], c2, c3, c1);
mul_add_c(a[4], b[6], c2, c3, c1);
mul_add_c(a[3], b[7], c2, c3, c1);
r[10] = c2;
c2 = 0;
mul_add_c(a[4], b[7], c3, c1, c2);
mul_add_c(a[5], b[6], c3, c1, c2);
mul_add_c(a[6], b[5], c3, c1, c2);
mul_add_c(a[7], b[4], c3, c1, c2);
r[11] = c3;
c3 = 0;
mul_add_c(a[7], b[5], c1, c2, c3);
mul_add_c(a[6], b[6], c1, c2, c3);
mul_add_c(a[5], b[7], c1, c2, c3);
r[12] = c1;
c1 = 0;
mul_add_c(a[6], b[7], c2, c3, c1);
mul_add_c(a[7], b[6], c2, c3, c1);
r[13] = c2;
c2 = 0;
mul_add_c(a[7], b[7], c3, c1, c2);
r[14] = c3;
r[15] = c1;
}
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
BN_ULONG c1, c2, c3;
c1 = 0;
c2 = 0;
c3 = 0;
mul_add_c(a[0], b[0], c1, c2, c3);
r[0] = c1;
c1 = 0;
mul_add_c(a[0], b[1], c2, c3, c1);
mul_add_c(a[1], b[0], c2, c3, c1);
r[1] = c2;
c2 = 0;
mul_add_c(a[2], b[0], c3, c1, c2);
mul_add_c(a[1], b[1], c3, c1, c2);
mul_add_c(a[0], b[2], c3, c1, c2);
r[2] = c3;
c3 = 0;
mul_add_c(a[0], b[3], c1, c2, c3);
mul_add_c(a[1], b[2], c1, c2, c3);
mul_add_c(a[2], b[1], c1, c2, c3);
mul_add_c(a[3], b[0], c1, c2, c3);
r[3] = c1;
c1 = 0;
mul_add_c(a[3], b[1], c2, c3, c1);
mul_add_c(a[2], b[2], c2, c3, c1);
mul_add_c(a[1], b[3], c2, c3, c1);
r[4] = c2;
c2 = 0;
mul_add_c(a[2], b[3], c3, c1, c2);
mul_add_c(a[3], b[2], c3, c1, c2);
r[5] = c3;
c3 = 0;
mul_add_c(a[3], b[3], c1, c2, c3);
r[6] = c1;
r[7] = c2;
}
void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
{
BN_ULONG c1, c2, c3;
c1 = 0;
c2 = 0;
c3 = 0;
sqr_add_c(a, 0, c1, c2, c3);
r[0] = c1;
c1 = 0;
sqr_add_c2(a, 1, 0, c2, c3, c1);
r[1] = c2;
c2 = 0;
sqr_add_c(a, 1, c3, c1, c2);
sqr_add_c2(a, 2, 0, c3, c1, c2);
r[2] = c3;
c3 = 0;
sqr_add_c2(a, 3, 0, c1, c2, c3);
sqr_add_c2(a, 2, 1, c1, c2, c3);
r[3] = c1;
c1 = 0;
sqr_add_c(a, 2, c2, c3, c1);
sqr_add_c2(a, 3, 1, c2, c3, c1);
sqr_add_c2(a, 4, 0, c2, c3, c1);
r[4] = c2;
c2 = 0;
sqr_add_c2(a, 5, 0, c3, c1, c2);
sqr_add_c2(a, 4, 1, c3, c1, c2);
sqr_add_c2(a, 3, 2, c3, c1, c2);
r[5] = c3;
c3 = 0;
sqr_add_c(a, 3, c1, c2, c3);
sqr_add_c2(a, 4, 2, c1, c2, c3);
sqr_add_c2(a, 5, 1, c1, c2, c3);
sqr_add_c2(a, 6, 0, c1, c2, c3);
r[6] = c1;
c1 = 0;
sqr_add_c2(a, 7, 0, c2, c3, c1);
sqr_add_c2(a, 6, 1, c2, c3, c1);
sqr_add_c2(a, 5, 2, c2, c3, c1);
sqr_add_c2(a, 4, 3, c2, c3, c1);
r[7] = c2;
c2 = 0;
sqr_add_c(a, 4, c3, c1, c2);
sqr_add_c2(a, 5, 3, c3, c1, c2);
sqr_add_c2(a, 6, 2, c3, c1, c2);
sqr_add_c2(a, 7, 1, c3, c1, c2);
r[8] = c3;
c3 = 0;
sqr_add_c2(a, 7, 2, c1, c2, c3);
sqr_add_c2(a, 6, 3, c1, c2, c3);
sqr_add_c2(a, 5, 4, c1, c2, c3);
r[9] = c1;
c1 = 0;
sqr_add_c(a, 5, c2, c3, c1);
sqr_add_c2(a, 6, 4, c2, c3, c1);
sqr_add_c2(a, 7, 3, c2, c3, c1);
r[10] = c2;
c2 = 0;
sqr_add_c2(a, 7, 4, c3, c1, c2);
sqr_add_c2(a, 6, 5, c3, c1, c2);
r[11] = c3;
c3 = 0;
sqr_add_c(a, 6, c1, c2, c3);
sqr_add_c2(a, 7, 5, c1, c2, c3);
r[12] = c1;
c1 = 0;
sqr_add_c2(a, 7, 6, c2, c3, c1);
r[13] = c2;
c2 = 0;
sqr_add_c(a, 7, c3, c1, c2);
r[14] = c3;
r[15] = c1;
}
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
{
BN_ULONG c1, c2, c3;
c1 = 0;
c2 = 0;
c3 = 0;
sqr_add_c(a, 0, c1, c2, c3);
r[0] = c1;
c1 = 0;
sqr_add_c2(a, 1, 0, c2, c3, c1);
r[1] = c2;
c2 = 0;
sqr_add_c(a, 1, c3, c1, c2);
sqr_add_c2(a, 2, 0, c3, c1, c2);
r[2] = c3;
c3 = 0;
sqr_add_c2(a, 3, 0, c1, c2, c3);
sqr_add_c2(a, 2, 1, c1, c2, c3);
r[3] = c1;
c1 = 0;
sqr_add_c(a, 2, c2, c3, c1);
sqr_add_c2(a, 3, 1, c2, c3, c1);
r[4] = c2;
c2 = 0;
sqr_add_c2(a, 3, 2, c3, c1, c2);
r[5] = c3;
c3 = 0;
sqr_add_c(a, 3, c1, c2, c3);
r[6] = c1;
r[7] = c2;
}
#endif

390
crypto/bn/asm/x86_64-gf2m.pl Normal file
View File

@@ -0,0 +1,390 @@
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# May 2011
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
# in bn_gf2m.c. It's kind of a low-hanging mechanical port from C for
# the time being... Except that it has two code paths: one suitable
# for any x86_64 CPU and a PCLMULQDQ one suitable for Westmere and
# later. Improvement varies from one benchmark and µ-arch to another.
# The vanilla code path is at most 20% faster than compiler-generated
# code [not very impressive], while the PCLMULQDQ one is a whole
# 85%-160% better on 163- and 571-bit ECDH benchmarks on Intel CPUs.
# Keep in mind that these coefficients are not the ones for
# bn_GF2m_mul_2x2 itself, as not all CPU time is burnt in it...
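#
# For orientation (a sketch of the operation, not used by the code below):
# bn_GF2m_mul_2x2 computes the 256-bit carry-less product of two 128-bit
# polynomials over GF(2), i.e. multiplication in which addition is XOR and
# no carries propagate. Both code paths take one Karatsuba step on the
# 64-bit halves: three 64x64 carry-less products a1·b1, a0·b0 and
# (a0^a1)·(b0^b1), recombined with XORs and shifts. One 64x64 carry-less
# product, in C-like pseudocode:
#
#	hi = lo = 0;
#	for (i = 0; i < 64; i++)
#		if ((b >> i) & 1) {
#			lo ^= a << i;
#			if (i) hi ^= a >> (64 - i);
#		}
#
# The vanilla path evaluates this with a 16-entry (4-bit window) lookup
# table in _mul_1x1; the other path uses the PCLMULQDQ instruction.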
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
($lo,$hi)=("%rax","%rdx"); $a=$lo;
($i0,$i1)=("%rsi","%rdi");
($t0,$t1)=("%rbx","%rcx");
($b,$mask)=("%rbp","%r8");
($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(9..15));
($R,$Tx)=("%xmm0","%xmm1");
$code.=<<___;
.text
.type _mul_1x1,\@abi-omnipotent
.align 16
_mul_1x1:
sub \$128+8,%rsp
mov \$-1,$a1
lea ($a,$a),$i0
shr \$3,$a1
lea (,$a,4),$i1
and $a,$a1 # a1=a&0x1fffffffffffffff
lea (,$a,8),$a8
sar \$63,$a # broadcast 63rd bit
lea ($a1,$a1),$a2
sar \$63,$i0 # broadcast 62nd bit
lea (,$a1,4),$a4
and $b,$a
sar \$63,$i1 # broadcast 61st bit
mov $a,$hi # $a is $lo
shl \$63,$lo
and $b,$i0
shr \$1,$hi
mov $i0,$t1
shl \$62,$i0
and $b,$i1
shr \$2,$t1
xor $i0,$lo
mov $i1,$t0
shl \$61,$i1
xor $t1,$hi
shr \$3,$t0
xor $i1,$lo
xor $t0,$hi
mov $a1,$a12
movq \$0,0(%rsp) # tab[0]=0
xor $a2,$a12 # a1^a2
mov $a1,8(%rsp) # tab[1]=a1
mov $a4,$a48
mov $a2,16(%rsp) # tab[2]=a2
xor $a8,$a48 # a4^a8
mov $a12,24(%rsp) # tab[3]=a1^a2
xor $a4,$a1
mov $a4,32(%rsp) # tab[4]=a4
xor $a4,$a2
mov $a1,40(%rsp) # tab[5]=a1^a4
xor $a4,$a12
mov $a2,48(%rsp) # tab[6]=a2^a4
xor $a48,$a1 # a1^a4^a4^a8=a1^a8
mov $a12,56(%rsp) # tab[7]=a1^a2^a4
xor $a48,$a2 # a2^a4^a4^a8=a2^a8
mov $a8,64(%rsp) # tab[8]=a8
xor $a48,$a12 # a1^a2^a4^a4^a8=a1^a2^a8
mov $a1,72(%rsp) # tab[9]=a1^a8
xor $a4,$a1 # a1^a8^a4
mov $a2,80(%rsp) # tab[10]=a2^a8
xor $a4,$a2 # a2^a8^a4
mov $a12,88(%rsp) # tab[11]=a1^a2^a8
xor $a4,$a12 # a1^a2^a8^a4
mov $a48,96(%rsp) # tab[12]=a4^a8
mov $mask,$i0
mov $a1,104(%rsp) # tab[13]=a1^a4^a8
and $b,$i0
mov $a2,112(%rsp) # tab[14]=a2^a4^a8
shr \$4,$b
mov $a12,120(%rsp) # tab[15]=a1^a2^a4^a8
mov $mask,$i1
and $b,$i1
shr \$4,$b
movq (%rsp,$i0,8),$R # half of calculations is done in SSE2
mov $mask,$i0
and $b,$i0
shr \$4,$b
___
for ($n=1;$n<8;$n++) {
$code.=<<___;
mov (%rsp,$i1,8),$t1
mov $mask,$i1
mov $t1,$t0
shl \$`8*$n-4`,$t1
and $b,$i1
movq (%rsp,$i0,8),$Tx
shr \$`64-(8*$n-4)`,$t0
xor $t1,$lo
pslldq \$$n,$Tx
mov $mask,$i0
shr \$4,$b
xor $t0,$hi
and $b,$i0
shr \$4,$b
pxor $Tx,$R
___
}
$code.=<<___;
mov (%rsp,$i1,8),$t1
mov $t1,$t0
shl \$`8*$n-4`,$t1
movq $R,$i0
shr \$`64-(8*$n-4)`,$t0
xor $t1,$lo
psrldq \$8,$R
xor $t0,$hi
movq $R,$i1
xor $i0,$lo
xor $i1,$hi
add \$128+8,%rsp
ret
.Lend_mul_1x1:
.size _mul_1x1,.-_mul_1x1
___
($rp,$a1,$a0,$b1,$b0) = $win64? ("%rcx","%rdx","%r8", "%r9","%r10") : # Win64 order
("%rdi","%rsi","%rdx","%rcx","%r8"); # Unix order
$code.=<<___;
.extern OPENSSL_ia32cap_P
.globl bn_GF2m_mul_2x2
.type bn_GF2m_mul_2x2,\@abi-omnipotent
.align 16
bn_GF2m_mul_2x2:
mov OPENSSL_ia32cap_P(%rip),%rax
bt \$33,%rax
jnc .Lvanilla_mul_2x2
movq $a1,%xmm0
movq $b1,%xmm1
movq $a0,%xmm2
___
$code.=<<___ if ($win64);
movq 40(%rsp),%xmm3
___
$code.=<<___ if (!$win64);
movq $b0,%xmm3
___
$code.=<<___;
movdqa %xmm0,%xmm4
movdqa %xmm1,%xmm5
pclmulqdq \$0,%xmm1,%xmm0 # a1·b1
pxor %xmm2,%xmm4
pxor %xmm3,%xmm5
pclmulqdq \$0,%xmm3,%xmm2 # a0·b0
pclmulqdq \$0,%xmm5,%xmm4 # (a0+a1)·(b0+b1)
xorps %xmm0,%xmm4
xorps %xmm2,%xmm4 # (a0+a1)·(b0+b1)-a0·b0-a1·b1
movdqa %xmm4,%xmm5
pslldq \$8,%xmm4
psrldq \$8,%xmm5
pxor %xmm4,%xmm2
pxor %xmm5,%xmm0
movdqu %xmm2,0($rp)
movdqu %xmm0,16($rp)
ret
.align 16
.Lvanilla_mul_2x2:
lea -8*17(%rsp),%rsp
___
$code.=<<___ if ($win64);
mov `8*17+40`(%rsp),$b0
mov %rdi,8*15(%rsp)
mov %rsi,8*16(%rsp)
___
$code.=<<___;
mov %r14,8*10(%rsp)
mov %r13,8*11(%rsp)
mov %r12,8*12(%rsp)
mov %rbp,8*13(%rsp)
mov %rbx,8*14(%rsp)
.Lbody_mul_2x2:
mov $rp,32(%rsp) # save the arguments
mov $a1,40(%rsp)
mov $a0,48(%rsp)
mov $b1,56(%rsp)
mov $b0,64(%rsp)
mov \$0xf,$mask
mov $a1,$a
mov $b1,$b
call _mul_1x1 # a1·b1
mov $lo,16(%rsp)
mov $hi,24(%rsp)
mov 48(%rsp),$a
mov 64(%rsp),$b
call _mul_1x1 # a0·b0
mov $lo,0(%rsp)
mov $hi,8(%rsp)
mov 40(%rsp),$a
mov 56(%rsp),$b
xor 48(%rsp),$a
xor 64(%rsp),$b
call _mul_1x1 # (a0+a1)·(b0+b1)
___
@r=("%rbx","%rcx","%rdi","%rsi");
$code.=<<___;
mov 0(%rsp),@r[0]
mov 8(%rsp),@r[1]
mov 16(%rsp),@r[2]
mov 24(%rsp),@r[3]
mov 32(%rsp),%rbp
xor $hi,$lo
xor @r[1],$hi
xor @r[0],$lo
mov @r[0],0(%rbp)
xor @r[2],$hi
mov @r[3],24(%rbp)
xor @r[3],$lo
xor @r[3],$hi
xor $hi,$lo
mov $hi,16(%rbp)
mov $lo,8(%rbp)
mov 8*10(%rsp),%r14
mov 8*11(%rsp),%r13
mov 8*12(%rsp),%r12
mov 8*13(%rsp),%rbp
mov 8*14(%rsp),%rbx
___
$code.=<<___ if ($win64);
mov 8*15(%rsp),%rdi
mov 8*16(%rsp),%rsi
___
$code.=<<___;
lea 8*17(%rsp),%rsp
ret
.Lend_mul_2x2:
.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
.asciz "GF(2^m) Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align 16
___
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";
$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type se_handler,\@abi-omnipotent
.align 16
se_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 152($context),%rax # pull context->Rsp
mov 248($context),%rbx # pull context->Rip
lea .Lbody_mul_2x2(%rip),%r10
cmp %r10,%rbx # context->Rip<"prologue" label
jb .Lin_prologue
mov 8*10(%rax),%r14 # mimic epilogue
mov 8*11(%rax),%r13
mov 8*12(%rax),%r12
mov 8*13(%rax),%rbp
mov 8*14(%rax),%rbx
mov 8*15(%rax),%rdi
mov 8*16(%rax),%rsi
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi
mov %r12,216($context) # restore context->R12
mov %r13,224($context) # restore context->R13
mov %r14,232($context) # restore context->R14
.Lin_prologue:
lea 8*17(%rax),%rax
mov %rax,152($context) # restore context->Rsp
mov 40($disp),%rdi # disp->ContextRecord
mov $context,%rsi # context
mov \$154,%ecx # sizeof(CONTEXT) in quadwords
.long 0xa548f3fc # cld; rep movsq
mov $disp,%rsi
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
mov 8(%rsi),%rdx # arg2, disp->ImageBase
mov 0(%rsi),%r8 # arg3, disp->ControlPc
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
mov 40(%rsi),%r10 # disp->ContextRecord
lea 56(%rsi),%r11 # &disp->HandlerData
lea 24(%rsi),%r12 # &disp->EstablisherFrame
mov %r10,32(%rsp) # arg5
mov %r11,40(%rsp) # arg6
mov %r12,48(%rsp) # arg7
mov %rcx,56(%rsp) # arg8, (NULL)
call *__imp_RtlVirtualUnwind(%rip)
mov \$1,%eax # ExceptionContinueSearch
add \$64,%rsp
popfq
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx
pop %rdi
pop %rsi
ret
.size se_handler,.-se_handler
.section .pdata
.align 4
.rva _mul_1x1
.rva .Lend_mul_1x1
.rva .LSEH_info_1x1
.rva .Lvanilla_mul_2x2
.rva .Lend_mul_2x2
.rva .LSEH_info_2x2
.section .xdata
.align 8
.LSEH_info_1x1:
.byte 0x01,0x07,0x02,0x00
.byte 0x07,0x01,0x11,0x00 # sub rsp,128+8
.LSEH_info_2x2:
.byte 9,0,0,0
.rva se_handler
___
}
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;
close STDOUT;

1512
crypto/bn/asm/x86_64-mont.pl Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff