Update to OpenSSL 1.0.2o
@@ -239,7 +239,7 @@ $code.=<<___;
vmovdqu 32*8-128($ap), $ACC8

lea 192(%rsp), $tp0 # 64+128=192
-vpbroadcastq .Land_mask(%rip), $AND_MASK
+vmovdqu .Land_mask(%rip), $AND_MASK
jmp .LOOP_GRANDE_SQR_1024

.align 32
@@ -1070,10 +1070,10 @@ $code.=<<___;
vpmuludq 32*6-128($np),$Yi,$TEMP1
vpaddq $TEMP1,$ACC6,$ACC6
vpmuludq 32*7-128($np),$Yi,$TEMP2
-vpblendd \$3, $ZERO, $ACC9, $ACC9 # correct $ACC3
+vpblendd \$3, $ZERO, $ACC9, $TEMP1 # correct $ACC3
vpaddq $TEMP2,$ACC7,$ACC7
vpmuludq 32*8-128($np),$Yi,$TEMP0
-vpaddq $ACC9, $ACC3, $ACC3 # correct $ACC3
+vpaddq $TEMP1, $ACC3, $ACC3 # correct $ACC3
vpaddq $TEMP0,$ACC8,$ACC8

mov %rbx, %rax
@@ -1086,7 +1086,9 @@ $code.=<<___;
vmovdqu -8+32*2-128($ap),$TEMP2

mov $r1, %rax
+vpblendd \$0xfc, $ZERO, $ACC9, $ACC9 # correct $ACC3
imull $n0, %eax
+vpaddq $ACC9,$ACC4,$ACC4 # correct $ACC3
and \$0x1fffffff, %eax

imulq 16-128($ap),%rbx
@@ -1322,15 +1324,12 @@ ___
# But as we underutilize resources, it's possible to correct in
# each iteration with marginal performance loss. But then, as
# we do it in each iteration, we can correct less digits, and
-# avoid performance penalties completely. Also note that we
-# correct only three digits out of four. This works because
-# most significant digit is subjected to less additions.
+# avoid performance penalties completely.

$TEMP0 = $ACC9;
$TEMP3 = $Bi;
$TEMP4 = $Yi;
$code.=<<___;
-vpermq \$0, $AND_MASK, $AND_MASK
vpaddq (%rsp), $TEMP1, $ACC0

vpsrlq \$29, $ACC0, $TEMP1
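The comment block in the hunk above refers to the redundant representation used by this AVX2 path: the 1024-bit operands are kept as 29-bit digits stored in 64-bit vector lanes, so intermediate sums can temporarily exceed 29 bits and are periodically renormalized by shifting the excess of each digit into the next one and masking with .Land_mask (0x1fffffff). The following is a minimal scalar sketch of one such correction pass, for orientation only; the limb count, values, and function name are made up and this is not the vectorized OpenSSL code.

#include <stdio.h>
#include <stdint.h>

#define MASK29 0x1fffffffULL   /* same 29-bit mask as .Land_mask */
#define NLIMBS 4               /* illustrative limb count only */

/* Shift the bits above bit 28 of each limb into the next limb and mask
 * every limb back down to 29 bits (roughly what the vpsrlq $29 /
 * mask / vpaddq sequence does per vector lane). */
static void normalize(uint64_t a[NLIMBS + 1])
{
    for (int i = 0; i < NLIMBS; i++) {
        uint64_t carry = a[i] >> 29;   /* excess above 29 bits          */
        a[i] &= MASK29;                /* keep the low 29 bits          */
        a[i + 1] += carry;             /* fold excess into next digit   */
    }
}

int main(void)
{
    /* limbs deliberately wider than 29 bits, as after accumulation */
    uint64_t a[NLIMBS + 1] = { 0x7fffffffULL, 0x3ffffffeULL,
                               0x20000001ULL, 0x5fffffffULL, 0 };
    normalize(a);
    for (int i = 0; i <= NLIMBS; i++)
        printf("limb[%d] = 0x%llx\n", i, (unsigned long long)a[i]);
    return 0;
}

With the old mask (top quadword -1) the most significant lane in each register was left unmasked, matching the removed "correct only three digits out of four" comment; the hunk below changes .Land_mask so that all four lanes are masked to 29 bits.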
@@ -1763,7 +1762,7 @@ $code.=<<___;

.align 64
.Land_mask:
-.quad 0x1fffffff,0x1fffffff,0x1fffffff,-1
+.quad 0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff
.Lscatter_permd:
.long 0,2,4,6,7,7,7,7
.Lgather_permd:
@@ -290,7 +290,7 @@ ___
######## .Lbn_sqr_mont gives up to 20% *overall* improvement over
######## code without following dedicated squaring procedure.
########
-$sbit="%i2"; # re-use $bp!
+$sbit="%o5";

$code.=<<___;
.align 32
@@ -403,7 +403,7 @@ $code.=<<___;
mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1
add $acc0,$car0,$car0
-add $tpj,$car1,$car1
+add $tpj,$sbit,$sbit
ld [$ap+$j],$apj ! ap[j]
and $car0,$mask,$acc0
ld [$np+$j],$npj ! np[j]
@@ -412,7 +412,7 @@ $code.=<<___;
ld [$tp+8],$tpj ! tp[j]
add $acc0,$acc0,$acc0
add $j,4,$j ! j++
-or $sbit,$acc0,$acc0
+add $sbit,$acc0,$acc0
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
cmp $j,$num
@@ -426,12 +426,12 @@ $code.=<<___;
mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1
add $acc0,$car0,$car0
-add $tpj,$car1,$car1
+add $tpj,$sbit,$sbit
and $car0,$mask,$acc0
srlx $car0,32,$car0
add $acc1,$car1,$car1
add $acc0,$acc0,$acc0
-or $sbit,$acc0,$acc0
+add $sbit,$acc0,$acc0
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
add $acc0,$car1,$car1
@@ -439,7 +439,7 @@ $code.=<<___;
srlx $car1,32,$car1

add $car0,$car0,$car0
-or $sbit,$car0,$car0
+add $sbit,$car0,$car0
add $car0,$car1,$car1
add $car2,$car1,$car1
st $car1,[$tp+4]
@@ -499,7 +499,7 @@ $code.=<<___;
.Lsqr_inner2:
mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1
-add $tpj,$car1,$car1
+add $tpj,$sbit,$sbit
add $acc0,$car0,$car0
ld [$ap+$j],$apj ! ap[j]
and $car0,$mask,$acc0
@@ -507,7 +507,7 @@ $code.=<<___;
srlx $car0,32,$car0
add $acc0,$acc0,$acc0
ld [$tp+8],$tpj ! tp[j]
-or $sbit,$acc0,$acc0
+add $sbit,$acc0,$acc0
add $j,4,$j ! j++
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
@@ -522,12 +522,12 @@ $code.=<<___;
.Lsqr_no_inner2:
mulx $apj,$mul0,$acc0
mulx $npj,$mul1,$acc1
-add $tpj,$car1,$car1
+add $tpj,$sbit,$sbit
add $acc0,$car0,$car0
and $car0,$mask,$acc0
srlx $car0,32,$car0
add $acc0,$acc0,$acc0
-or $sbit,$acc0,$acc0
+add $sbit,$acc0,$acc0
srlx $acc0,32,$sbit
and $acc0,$mask,$acc0
add $acc0,$car1,$car1
@@ -536,7 +536,7 @@ $code.=<<___;
srlx $car1,32,$car1

add $car0,$car0,$car0
-or $sbit,$car0,$car0
+add $sbit,$car0,$car0
add $car0,$car1,$car1
add $car2,$car1,$car1
st $car1,[$tp+4]
@@ -581,14 +581,17 @@ $code.=<<___;
!.Lsqr_last

mulx $npj,$mul1,$acc1
-add $tpj,$car1,$car1
+add $tpj,$acc0,$acc0
+srlx $acc0,32,$tmp0
+and $acc0,$mask,$acc0
+add $tmp0,$sbit,$sbit
+add $acc0,$car1,$car1
add $acc1,$car1,$car1
st $car1,[$tp]
srlx $car1,32,$car1

add $car0,$car0,$car0 ! recover $car0
-or $sbit,$car0,$car0
+add $sbit,$car0,$car0
add $car0,$car1,$car1
add $car2,$car1,$car1
st $car1,[$tp+4]
@@ -3090,11 +3090,19 @@ $code.=<<___;

.align 32
.Lsqrx8x_break:
-sub 16+8(%rsp),%r8 # consume last carry
+xor $zero,$zero
+sub 16+8(%rsp),%rbx # mov 16(%rsp),%cf
+adcx $zero,%r8
mov 24+8(%rsp),$carry # initial $tptr, borrow $carry
+adcx $zero,%r9
mov 0*8($aptr),%rdx # a[8], modulo-scheduled
-xor %ebp,%ebp # xor $zero,$zero
+adc \$0,%r10
mov %r8,0*8($tptr)
+adc \$0,%r11
+adc \$0,%r12
+adc \$0,%r13
+adc \$0,%r14
+adc \$0,%r15
cmp $carry,$tptr # cf=0, of=0
je .Lsqrx8x_outer_loop