Update to OpenSSL 1.0.2o

Steve Dower
2018-04-13 17:29:45 +00:00
parent ccd3ab4aff
commit 4933cd8231
386 changed files with 5623 additions and 2984 deletions

@@ -239,7 +239,7 @@ $code.=<<___;
vmovdqu 32*8-128($ap), $ACC8
lea 192(%rsp), $tp0 # 64+128=192
- vpbroadcastq .Land_mask(%rip), $AND_MASK
+ vmovdqu .Land_mask(%rip), $AND_MASK
jmp .LOOP_GRANDE_SQR_1024
.align 32
@@ -1070,10 +1070,10 @@ $code.=<<___;
vpmuludq 32*6-128($np),$Yi,$TEMP1
vpaddq $TEMP1,$ACC6,$ACC6
vpmuludq 32*7-128($np),$Yi,$TEMP2
- vpblendd \$3, $ZERO, $ACC9, $ACC9 # correct $ACC3
+ vpblendd \$3, $ZERO, $ACC9, $TEMP1 # correct $ACC3
vpaddq $TEMP2,$ACC7,$ACC7
vpmuludq 32*8-128($np),$Yi,$TEMP0
- vpaddq $ACC9, $ACC3, $ACC3 # correct $ACC3
+ vpaddq $TEMP1, $ACC3, $ACC3 # correct $ACC3
vpaddq $TEMP0,$ACC8,$ACC8
mov %rbx, %rax
@@ -1086,7 +1086,9 @@ $code.=<<___;
vmovdqu -8+32*2-128($ap),$TEMP2
mov $r1, %rax
+ vpblendd \$0xfc, $ZERO, $ACC9, $ACC9 # correct $ACC3
imull $n0, %eax
+ vpaddq $ACC9,$ACC4,$ACC4 # correct $ACC3
and \$0x1fffffff, %eax
imulq 16-128($ap),%rbx
@@ -1322,15 +1324,12 @@ ___
# But as we underutilize resources, it's possible to correct in
# each iteration with marginal performance loss. But then, as
# we do it in each iteration, we can correct less digits, and
- # avoid performance penalties completely. Also note that we
- # correct only three digits out of four. This works because
- # most significant digit is subjected to less additions.
+ # avoid performance penalties completely.
$TEMP0 = $ACC9;
$TEMP3 = $Bi;
$TEMP4 = $Yi;
$code.=<<___;
- vpermq \$0, $AND_MASK, $AND_MASK
vpaddq (%rsp), $TEMP1, $ACC0
vpsrlq \$29, $ACC0, $TEMP1
@@ -1763,7 +1762,7 @@ $code.=<<___;
.align 64
.Land_mask:
- .quad 0x1fffffff,0x1fffffff,0x1fffffff,-1
+ .quad 0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff
.Lscatter_permd:
.long 0,2,4,6,7,7,7,7
.Lgather_permd:
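
The hunks above are in the AVX2 1024-bit modular squaring/multiplication code: .Land_mask now holds 0x1fffffff in all four lanes instead of -1 in the last one, it is loaded with a plain vmovdqu rather than vpbroadcastq, the vpermq broadcast is dropped, and the comment about correcting only three digits out of four goes away, since every 29-bit digit is now masked during the correction passes. Below is a minimal C sketch of that normalization idea, not of the vectorized assembly; the digit count and values are made up.

#include <stdint.h>
#include <stdio.h>

/* Illustrative sketch only: the 1024-bit values in this code are kept as
 * redundant base-2^29 digits inside 64-bit words.  A correction pass folds
 * everything above bit 28 of each digit into the next digit.  With the old
 * mask ending in -1, the last digit of a group was left unmasked; masking
 * all of them keeps every digit within 29 bits. */
#define DIGITS 4
#define MASK   0x1fffffffULL            /* same 29-bit constant as .Land_mask */

static void correct_digits(uint64_t d[DIGITS])
{
    uint64_t carry = 0;
    for (int i = 0; i < DIGITS; i++) {
        uint64_t v = d[i] + carry;
        carry = v >> 29;                /* excess bits move to the next digit */
        d[i]  = v & MASK;               /* every digit is masked to 29 bits */
    }
    /* the carry out of the last digit is dropped in this toy example */
}

int main(void)
{
    uint64_t d[DIGITS] = { 0x3fffffffULL, 0x2aaaaaaaULL, 1, 0x30000000ULL };
    correct_digits(d);
    for (int i = 0; i < DIGITS; i++)
        printf("digit %d = 0x%llx\n", i, (unsigned long long)d[i]);
    return 0;
}

The hunks that follow are from a different file, the SPARC Montgomery multiplication module.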

@@ -290,7 +290,7 @@ ___
######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
######## code without following dedicated squaring procedure.
########
$sbit="%i2"; # re-use $bp!
$sbit="%o5";
$code.=<<___;
.align 32
@@ -403,7 +403,7 @@ $code.=<<___;
mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1
add $acc0,$car0,$car0
- add $tpj,$car1,$car1
+ add $tpj,$sbit,$sbit
ld [$ap+$j],$apj ! ap[j]
and $car0,$mask,$acc0
ld [$np+$j],$npj ! np[j]
@@ -412,7 +412,7 @@ $code.=<<___;
ld [$tp+8],$tpj ! tp[j]
add $acc0,$acc0,$acc0
add $j,4,$j ! j++
- or $sbit,$acc0,$acc0
+ add $sbit,$acc0,$acc0
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
cmp $j,$num
@@ -426,12 +426,12 @@ $code.=<<___;
mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1
add $acc0,$car0,$car0
- add $tpj,$car1,$car1
+ add $tpj,$sbit,$sbit
and $car0,$mask,$acc0
srlx $car0,32,$car0
add $acc1,$car1,$car1
add $acc0,$acc0,$acc0
- or $sbit,$acc0,$acc0
+ add $sbit,$acc0,$acc0
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
add $acc0,$car1,$car1
@@ -439,7 +439,7 @@ $code.=<<___;
srlx $car1,32,$car1
add $car0,$car0,$car0
- or $sbit,$car0,$car0
+ add $sbit,$car0,$car0
add $car0,$car1,$car1
add $car2,$car1,$car1
st $car1,[$tp+4]
@@ -499,7 +499,7 @@ $code.=<<___;
.Lsqr_inner2:
mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1
- add $tpj,$car1,$car1
+ add $tpj,$sbit,$sbit
add $acc0,$car0,$car0
ld [$ap+$j],$apj ! ap[j]
and $car0,$mask,$acc0
@@ -507,7 +507,7 @@ $code.=<<___;
srlx $car0,32,$car0
add $acc0,$acc0,$acc0
ld [$tp+8],$tpj ! tp[j]
- or $sbit,$acc0,$acc0
+ add $sbit,$acc0,$acc0
add $j,4,$j ! j++
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
@@ -522,12 +522,12 @@ $code.=<<___;
.Lsqr_no_inner2:
mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1
- add $tpj,$car1,$car1
+ add $tpj,$sbit,$sbit
add $acc0,$car0,$car0
and $car0,$mask,$acc0
srlx $car0,32,$car0
add $acc0,$acc0,$acc0
- or $sbit,$acc0,$acc0
+ add $sbit,$acc0,$acc0
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
add $acc0,$car1,$car1
@@ -536,7 +536,7 @@ $code.=<<___;
srlx $car1,32,$car1
add $car0,$car0,$car0
- or $sbit,$car0,$car0
+ add $sbit,$car0,$car0
add $car0,$car1,$car1
add $car2,$car1,$car1
st $car1,[$tp+4]
@@ -581,14 +581,17 @@ $code.=<<___;
!.Lsqr_last
mulx $npj,$mul1,$acc1
- add $tpj,$car1,$car1
+ add $tpj,$acc0,$acc0
+ srlx $acc0,32,$tmp0
+ and $acc0,$mask,$acc0
+ add $tmp0,$sbit,$sbit
add $acc0,$car1,$car1
add $acc1,$car1,$car1
st $car1,[$tp]
srlx $car1,32,$car1
add $car0,$car0,$car0 ! recover $car0
- or $sbit,$car0,$car0
+ add $sbit,$car0,$car0
add $car0,$car1,$car1
add $car2,$car1,$car1
st $car1,[$tp+4]
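
The SPARC hunks above change the dedicated squaring path: the shifted-out bit $sbit gets its own register (%o5 instead of re-using $bp), tp[j] is now accumulated into $sbit (or into $acc0 in the last round) rather than directly into $car1, and the former or instructions that merged $sbit back in become add, apparently because $sbit can now hold more than a single bit. A small C illustration of why or is only safe for a one-bit carry landing in a known-zero bit position; the values are made up and this is not the SPARC code itself.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint32_t doubled = 0x2468acf0u;   /* a doubled limb: its low bit is 0 */
    uint32_t one_bit = 1;             /* a single shifted-out bit */
    uint32_t wide    = 0x12345u;      /* a carry that is wider than one bit */

    /* One-bit carry into a known-zero bit: or and add agree. */
    printf("%d\n", (doubled | one_bit) == (doubled + one_bit));   /* prints 1 */

    /* Wider carry: bits collide, so or silently loses part of the value. */
    printf("%d\n", (doubled | wide) == (doubled + wide));         /* prints 0 */
    return 0;
}

The final hunk below is from the x86_64 Montgomery squaring code.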

@@ -3090,11 +3090,19 @@ $code.=<<___;
.align 32
.Lsqrx8x_break:
- sub 16+8(%rsp),%r8 # consume last carry
+ xor $zero,$zero
+ sub 16+8(%rsp),%rbx # mov 16(%rsp),%cf
+ adcx $zero,%r8
mov 24+8(%rsp),$carry # initial $tptr, borrow $carry
+ adcx $zero,%r9
mov 0*8($aptr),%rdx # a[8], modulo-scheduled
- xor %ebp,%ebp # xor $zero,$zero
+ adc \$0,%r10
mov %r8,0*8($tptr)
+ adc \$0,%r11
+ adc \$0,%r12
+ adc \$0,%r13
+ adc \$0,%r14
+ adc \$0,%r15
cmp $carry,$tptr # cf=0, of=0
je .Lsqrx8x_outer_loop
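
This last hunk is in the .Lsqrx8x_break path: instead of folding the carry saved at 16(%rsp) into %r8 alone, the new code regenerates the carry flag and lets it ripple through all eight accumulators %r8..%r15 with adcx/adc. Below is a minimal C sketch of carry propagation across a limb array, with made-up limb values, not a rendering of the assembly.

#include <stdint.h>
#include <stdio.h>

#define LIMBS 8

/* A carry entering an accumulator row must ripple through every limb:
 * if a limb is 2^64-1 it wraps to 0 and the next limb has to absorb the
 * carry, and so on, which is what the added adc chain does. */
static void add_carry(uint64_t acc[LIMBS], unsigned carry)
{
    for (int i = 0; i < LIMBS && carry; i++) {
        acc[i] += carry;
        carry = (acc[i] == 0);        /* wrapped around => carry out */
    }
}

int main(void)
{
    uint64_t acc[LIMBS] = { UINT64_MAX, UINT64_MAX, 5, 0, 0, 0, 0, 0 };
    add_carry(acc, 1);                /* limbs 0 and 1 wrap, limb 2 becomes 6 */
    for (int i = 0; i < LIMBS; i++)
        printf("limb %d = %llu\n", i, (unsigned long long)acc[i]);
    return 0;
}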