Import OpenSSL 1.0.2p
This commit is contained in:
@@ -287,15 +287,12 @@ bn_mul_mont:
|
||||
mov sp,$tp
|
||||
mov $bp,$rp # restore rp
|
||||
|
||||
and sp,$hi0,$ap
|
||||
bic $bp,$hi0,$bp
|
||||
bis $bp,$ap,$ap # ap=borrow?tp:rp
|
||||
|
||||
.align 4
|
||||
.Lcopy: ldq $aj,0($ap) # copy or in-place refresh
|
||||
.Lcopy: ldq $aj,0($tp) # conditional copy
|
||||
ldq $nj,0($rp)
|
||||
lda $tp,8($tp)
|
||||
lda $rp,8($rp)
|
||||
lda $ap,8($ap)
|
||||
cmoveq $hi0,$nj,$aj
|
||||
stq zero,-8($tp) # zap tp
|
||||
cmpult $tp,$tj,AT
|
||||
stq $aj,-8($rp)
|
||||
|
||||
@@ -216,14 +216,15 @@ bn_mul_mont:
|
||||
mov $tp,sp @ "rewind" $tp
|
||||
sub $rp,$rp,$aj @ "rewind" $rp
|
||||
|
||||
and $ap,$tp,$nhi
|
||||
bic $np,$rp,$nhi
|
||||
orr $ap,$ap,$np @ ap=borrow?tp:rp
|
||||
|
||||
.Lcopy: ldr $tj,[$ap],#4 @ copy or in-place refresh
|
||||
.Lcopy: ldr $tj,[$tp] @ conditional copy
|
||||
ldr $aj,[$rp]
|
||||
str sp,[$tp],#4 @ zap tp
|
||||
str $tj,[$rp],#4
|
||||
cmp $tp,$num
|
||||
#ifdef __thumb2__
|
||||
it cc
|
||||
#endif
|
||||
movcc $aj,$tj
|
||||
str $aj,[$rp],#4
|
||||
teq $tp,$num @ preserve carry
|
||||
bne .Lcopy
|
||||
|
||||
add sp,$num,#4 @ skip over tp[num+1]
|
||||
|
||||
@@ -332,19 +332,19 @@ bn_mul_mont_general:
|
||||
{ .mmb; sub rptr=rptr,len // rewind
|
||||
sub tptr=tptr,len
|
||||
clrrrb.pr };;
|
||||
{ .mmi; and aptr=tptr,topbit
|
||||
andcm bptr=rptr,topbit
|
||||
{ .mmi; mov aptr=rptr
|
||||
mov bptr=tptr
|
||||
mov pr.rot=1<<16 };;
|
||||
{ .mii; or nptr=aptr,bptr
|
||||
{ .mii; cmp.eq p0,p6=topbit,r0
|
||||
mov ar.lc=lc
|
||||
mov ar.ec=3 };;
|
||||
mov ar.ec=2 };;
|
||||
|
||||
.Lcopy_ctop:
|
||||
{ .mmb; (p16) ld8 n[0]=[nptr],8
|
||||
(p18) st8 [tptr]=r0,8
|
||||
(p16) nop.b 0 }
|
||||
{ .mmb; (p16) nop.m 0
|
||||
(p18) st8 [rptr]=n[2],8
|
||||
{ .mmi; (p16) ld8 a[0]=[aptr],8
|
||||
(p16) ld8 t[0]=[bptr],8
|
||||
(p6) mov a[1]=t[1] };; // (p17)
|
||||
{ .mmb; (p17) st8 [rptr]=a[1],8
|
||||
(p17) st8 [tptr]=r0,8
|
||||
br.ctop.sptk .Lcopy_ctop };;
|
||||
.Lcopy_cend:
|
||||
|
||||
|
||||
@@ -377,15 +377,13 @@ $code.=<<___;
|
||||
$PTR_SUB $rp,$num # restore rp
|
||||
not $hi1,$hi0
|
||||
|
||||
and $ap,$hi0,$sp
|
||||
and $bp,$hi1,$rp
|
||||
or $ap,$ap,$bp # ap=borrow?tp:rp
|
||||
|
||||
.align 4
|
||||
.Lcopy: $LD $aj,($ap)
|
||||
$PTR_ADD $ap,$BNSZ
|
||||
.Lcopy: $LD $nj,($tp) # conditional move
|
||||
$LD $aj,($rp)
|
||||
$ST $zero,($tp)
|
||||
$PTR_ADD $tp,$BNSZ
|
||||
and $nj,$hi0
|
||||
and $aj,$hi1
|
||||
or $aj,$nj
|
||||
sltu $at,$tp,$tj
|
||||
$ST $aj,($rp)
|
||||
bnez $at,.Lcopy
|
||||
|
||||
@@ -510,7 +510,6 @@ L\$sub
|
||||
stws,ma $hi1,4($rp)
|
||||
|
||||
subb $ti0,%r0,$hi1
|
||||
ldo -4($tp),$tp
|
||||
___
|
||||
$code.=<<___ if ($BN_SZ==8);
|
||||
ldd,ma 8($tp),$ti0
|
||||
@@ -525,21 +524,19 @@ L\$sub
|
||||
|
||||
extrd,u $ti0,31,32,$ti0 ; carry in flipped word order
|
||||
sub,db $ti0,%r0,$hi1
|
||||
ldo -8($tp),$tp
|
||||
___
|
||||
$code.=<<___;
|
||||
and $tp,$hi1,$ap
|
||||
andcm $rp,$hi1,$bp
|
||||
or $ap,$bp,$np
|
||||
|
||||
ldo `$LOCALS+32`($fp),$tp
|
||||
sub $rp,$arrsz,$rp ; rewind rp
|
||||
subi 0,$arrsz,$idx
|
||||
ldo `$LOCALS+32`($fp),$tp
|
||||
L\$copy
|
||||
ldd $idx($np),$hi0
|
||||
ldd 0($tp),$ti0
|
||||
ldd 0($rp),$hi0
|
||||
std,ma %r0,8($tp)
|
||||
addib,<> 8,$idx,.-8 ; L\$copy
|
||||
std,ma $hi0,8($rp)
|
||||
comiclr,= 0,$hi1,%r0
|
||||
copy $ti0,$hi0
|
||||
addib,<> 8,$idx,L\$copy
|
||||
std,ma $hi0,8($rp)
|
||||
___
|
||||
|
||||
if ($BN_SZ==4) { # PA-RISC 1.1 code-path
|
||||
@@ -849,17 +846,16 @@ L\$sub_pa11
|
||||
stws,ma $hi1,4($rp)
|
||||
|
||||
subb $ti0,%r0,$hi1
|
||||
ldo -4($tp),$tp
|
||||
and $tp,$hi1,$ap
|
||||
andcm $rp,$hi1,$bp
|
||||
or $ap,$bp,$np
|
||||
|
||||
ldo `$LOCALS+32`($fp),$tp
|
||||
sub $rp,$arrsz,$rp ; rewind rp
|
||||
subi 0,$arrsz,$idx
|
||||
ldo `$LOCALS+32`($fp),$tp
|
||||
L\$copy_pa11
|
||||
ldwx $idx($np),$hi0
|
||||
ldw 0($tp),$ti0
|
||||
ldw 0($rp),$hi0
|
||||
stws,ma %r0,4($tp)
|
||||
comiclr,= 0,$hi1,%r0
|
||||
copy $ti0,$hi0
|
||||
addib,<> 4,$idx,L\$copy_pa11
|
||||
stws,ma $hi0,4($rp)
|
||||
|
||||
|
||||
@@ -294,15 +294,16 @@ Lsub: $LDX $tj,$tp,$j
|
||||
li $j,0
|
||||
mtctr $num
|
||||
subfe $ovf,$j,$ovf ; handle upmost overflow bit
|
||||
and $ap,$tp,$ovf
|
||||
andc $np,$rp,$ovf
|
||||
or $ap,$ap,$np ; ap=borrow?tp:rp
|
||||
|
||||
.align 4
|
||||
Lcopy: ; copy or in-place refresh
|
||||
$LDX $tj,$ap,$j
|
||||
$STX $tj,$rp,$j
|
||||
Lcopy: ; conditional copy
|
||||
$LDX $tj,$tp,$j
|
||||
$LDX $aj,$rp,$j
|
||||
and $tj,$tj,$ovf
|
||||
andc $aj,$aj,$ovf
|
||||
$STX $j,$tp,$j ; zap at once
|
||||
or $aj,$aj,$tj
|
||||
$STX $aj,$rp,$j
|
||||
addi $j,$j,$BNSZ
|
||||
bdnz Lcopy
|
||||
|
||||
|
||||
@@ -1494,16 +1494,14 @@ Lsub: ldx $t0,$tp,$i
|
||||
|
||||
li $i,0
|
||||
subfe $ovf,$i,$ovf ; handle upmost overflow bit
|
||||
and $ap,$tp,$ovf
|
||||
andc $np,$rp,$ovf
|
||||
or $ap,$ap,$np ; ap=borrow?tp:rp
|
||||
addi $t7,$ap,8
|
||||
mtctr $j
|
||||
|
||||
.align 4
|
||||
Lcopy: ; copy or in-place refresh
|
||||
ldx $t0,$ap,$i
|
||||
ldx $t1,$t7,$i
|
||||
Lcopy: ; conditional copy
|
||||
ldx $t0,$tp,$i
|
||||
ldx $t1,$t4,$i
|
||||
ldx $t2,$rp,$i
|
||||
ldx $t3,$t6,$i
|
||||
std $i,8($nap_d) ; zap nap_d
|
||||
std $i,16($nap_d)
|
||||
std $i,24($nap_d)
|
||||
@@ -1512,6 +1510,12 @@ Lcopy: ; copy or in-place refresh
|
||||
std $i,48($nap_d)
|
||||
std $i,56($nap_d)
|
||||
stdu $i,64($nap_d)
|
||||
and $t0,$t0,$ovf
|
||||
and $t1,$t1,$ovf
|
||||
andc $t2,$t2,$ovf
|
||||
andc $t3,$t3,$ovf
|
||||
or $t0,$t0,$t2
|
||||
or $t1,$t1,$t3
|
||||
stdx $t0,$rp,$i
|
||||
stdx $t1,$t6,$i
|
||||
stdx $i,$tp,$i ; zap tp at once
|
||||
@@ -1554,20 +1558,21 @@ Lsub: lwz $t0,12($tp) ; load tp[j..j+3] in 64-bit word order
|
||||
|
||||
li $i,0
|
||||
subfe $ovf,$i,$ovf ; handle upmost overflow bit
|
||||
addi $tp,$sp,`$FRAME+$TRANSFER+4`
|
||||
addi $ap,$sp,`$FRAME+$TRANSFER+4`
|
||||
subf $rp,$num,$rp ; rewind rp
|
||||
and $ap,$tp,$ovf
|
||||
andc $np,$rp,$ovf
|
||||
or $ap,$ap,$np ; ap=borrow?tp:rp
|
||||
addi $tp,$sp,`$FRAME+$TRANSFER`
|
||||
mtctr $j
|
||||
|
||||
.align 4
|
||||
Lcopy: ; copy or in-place refresh
|
||||
Lcopy: ; conditional copy
|
||||
lwz $t0,4($ap)
|
||||
lwz $t1,8($ap)
|
||||
lwz $t2,12($ap)
|
||||
lwzu $t3,16($ap)
|
||||
lwz $t4,4($rp)
|
||||
lwz $t5,8($rp)
|
||||
lwz $t6,12($rp)
|
||||
lwz $t7,16($rp)
|
||||
std $i,8($nap_d) ; zap nap_d
|
||||
std $i,16($nap_d)
|
||||
std $i,24($nap_d)
|
||||
@@ -1576,6 +1581,18 @@ Lcopy: ; copy or in-place refresh
|
||||
std $i,48($nap_d)
|
||||
std $i,56($nap_d)
|
||||
stdu $i,64($nap_d)
|
||||
and $t0,$t0,$ovf
|
||||
and $t1,$t1,$ovf
|
||||
and $t2,$t2,$ovf
|
||||
and $t3,$t3,$ovf
|
||||
andc $t4,$t4,$ovf
|
||||
andc $t5,$t5,$ovf
|
||||
andc $t6,$t6,$ovf
|
||||
andc $t7,$t7,$ovf
|
||||
or $t0,$t0,$t4
|
||||
or $t1,$t1,$t5
|
||||
or $t2,$t2,$t6
|
||||
or $t3,$t3,$t7
|
||||
stw $t0,4($rp)
|
||||
stw $t1,8($rp)
|
||||
stw $t2,12($rp)
|
||||
|
||||
@@ -97,7 +97,7 @@ if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
|
||||
$addx = ($1>=11);
|
||||
}
|
||||
|
||||
if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) {
|
||||
if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9])\.([0-9]+)/) {
|
||||
my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
|
||||
$avx = ($ver>=3.0) + ($ver>=3.01);
|
||||
$addx = ($ver>=3.03);
|
||||
|
||||
@@ -245,16 +245,16 @@ $code.=<<___;
|
||||
brct $count,.Lsub
|
||||
lghi $ahi,0
|
||||
slbgr $AHI,$ahi # handle upmost carry
|
||||
|
||||
ngr $ap,$AHI
|
||||
lghi $np,-1
|
||||
xgr $np,$AHI
|
||||
ngr $np,$rp
|
||||
ogr $ap,$np # ap=borrow?tp:rp
|
||||
lghi $NHI,-1
|
||||
xgr $NHI,$AHI
|
||||
|
||||
la $j,0(%r0)
|
||||
lgr $count,$num
|
||||
.Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh
|
||||
.Lcopy: lg $ahi,$stdframe($j,$sp) # conditional copy
|
||||
lg $alo,0($j,$rp)
|
||||
ngr $ahi,$AHI
|
||||
ngr $alo,$NHI
|
||||
ogr $alo,$ahi
|
||||
_dswap $alo
|
||||
stg $j,$stdframe($j,$sp) # zap tp
|
||||
stg $alo,0($j,$rp)
|
||||
|
||||
@@ -878,19 +878,17 @@ $code.=<<___;
|
||||
sub $tp, $num, $tp
|
||||
sub $rp, $num, $rp
|
||||
|
||||
subc $ovf, %g0, $ovf ! handle upmost overflow bit
|
||||
and $tp, $ovf, $ap
|
||||
andn $rp, $ovf, $np
|
||||
or $np, $ap, $ap ! ap=borrow?tp:rp
|
||||
subccc $ovf, %g0, $ovf ! handle upmost overflow bit
|
||||
ba .Lcopy
|
||||
sub $num, 8, $cnt
|
||||
|
||||
.align 16
|
||||
.Lcopy: ! copy or in-place refresh
|
||||
ldx [$ap+0], $t2
|
||||
add $ap, 8, $ap
|
||||
.Lcopy: ! conditional copy
|
||||
ldx [$tp], $tj
|
||||
ldx [$rp+0], $t2
|
||||
stx %g0, [$tp] ! zap
|
||||
add $tp, 8, $tp
|
||||
movcs %icc, $tj, $t2
|
||||
stx $t2, [$rp+0]
|
||||
add $rp, 8, $rp
|
||||
brnz $cnt, .Lcopy
|
||||
@@ -1126,19 +1124,17 @@ $code.=<<___;
|
||||
sub $tp, $num, $tp
|
||||
sub $rp, $num, $rp
|
||||
|
||||
subc $ovf, %g0, $ovf ! handle upmost overflow bit
|
||||
and $tp, $ovf, $ap
|
||||
andn $rp, $ovf, $np
|
||||
or $np, $ap, $ap ! ap=borrow?tp:rp
|
||||
subccc $ovf, %g0, $ovf ! handle upmost overflow bit
|
||||
ba .Lcopy_g5
|
||||
sub $num, 8, $cnt
|
||||
|
||||
.align 16
|
||||
.Lcopy_g5: ! copy or in-place refresh
|
||||
ldx [$ap+0], $t2
|
||||
add $ap, 8, $ap
|
||||
.Lcopy_g5: ! conditional copy
|
||||
ldx [$tp], $tj
|
||||
ldx [$rp+0], $t2
|
||||
stx %g0, [$tp] ! zap
|
||||
add $tp, 8, $tp
|
||||
movcs %icc, $tj, $t2
|
||||
stx $t2, [$rp+0]
|
||||
add $rp, 8, $rp
|
||||
brnz $cnt, .Lcopy_g5
|
||||
|
||||
@@ -255,7 +255,6 @@ $fname:
|
||||
.Ltail:
|
||||
add $np,$num,$np
|
||||
add $rp,$num,$rp
|
||||
mov $tp,$ap
|
||||
sub %g0,$num,%o7 ! k=-num
|
||||
ba .Lsub
|
||||
subcc %g0,%g0,%g0 ! clear %icc.c
|
||||
@@ -268,15 +267,14 @@ $fname:
|
||||
add %o7,4,%o7
|
||||
brnz %o7,.Lsub
|
||||
st %o1,[$i]
|
||||
subc $car2,0,$car2 ! handle upmost overflow bit
|
||||
and $tp,$car2,$ap
|
||||
andn $rp,$car2,$np
|
||||
or $ap,$np,$ap
|
||||
subccc $car2,0,$car2 ! handle upmost overflow bit
|
||||
sub %g0,$num,%o7
|
||||
|
||||
.Lcopy:
|
||||
ld [$ap+%o7],%o0 ! copy or in-place refresh
|
||||
ld [$tp+%o7],%o1 ! conditional copy
|
||||
ld [$rp+%o7],%o0
|
||||
st %g0,[$tp+%o7] ! zap tp
|
||||
movcs %icc,%o1,%o0
|
||||
st %o0,[$rp+%o7]
|
||||
add %o7,4,%o7
|
||||
brnz %o7,.Lcopy
|
||||
@@ -485,6 +483,9 @@ $code.=<<___;
|
||||
mulx $npj,$mul1,$acc1
|
||||
add $tpj,$car1,$car1
|
||||
ld [$np+$j],$npj ! np[j]
|
||||
srlx $car1,32,$tmp0
|
||||
and $car1,$mask,$car1
|
||||
add $tmp0,$sbit,$sbit
|
||||
add $acc0,$car1,$car1
|
||||
ld [$tp+8],$tpj ! tp[j]
|
||||
add $acc1,$car1,$car1
|
||||
|
||||
@@ -203,18 +203,15 @@ $sp=&DWP(28,"esp");
|
||||
|
||||
&mov ("eax",&DWP(0,"esi","edx",4)); # upmost overflow bit
|
||||
&sbb ("eax",0);
|
||||
&and ("esi","eax");
|
||||
&not ("eax");
|
||||
&mov ("ebp","edi");
|
||||
&and ("ebp","eax");
|
||||
&or ("esi","ebp"); # tp=carry?tp:rp
|
||||
|
||||
&mov ("ecx","edx"); # num
|
||||
&xor ("edx","edx"); # i=0
|
||||
&mov ("edx",0); # i=0
|
||||
|
||||
&set_label("copy",8);
|
||||
&mov ("eax",&DWP(0,"esi","edx",4));
|
||||
&mov (&DWP(64,"esp","edx",4),"ecx"); # zap tp
|
||||
&mov ("ebx",&DWP(0,"esi","edx",4));
|
||||
&mov ("eax",&DWP(0,"edi","edx",4));
|
||||
&mov (&DWP(0,"esi","edx",4),"ecx"); # zap tp
|
||||
&cmovc ("eax","ebx");
|
||||
&mov (&DWP(0,"edi","edx",4),"eax");
|
||||
&lea ("edx",&DWP(1,"edx")); # i++
|
||||
&loop (&label("copy"));
|
||||
|
||||
@@ -299,23 +299,23 @@ $code.=<<___;
|
||||
sub $anp, $num, $anp
|
||||
sub $rp, $num, $rp
|
||||
|
||||
subc $ovf, %g0, $ovf ! handle upmost overflow bit
|
||||
and $tp, $ovf, $ap
|
||||
andn $rp, $ovf, $np
|
||||
or $np, $ap, $ap ! ap=borrow?tp:rp
|
||||
subccc $ovf, %g0, $ovf ! handle upmost overflow bit
|
||||
ba .Lcopy
|
||||
sub $num, 8, $cnt
|
||||
|
||||
.align 16
|
||||
.Lcopy: ! copy or in-place refresh
|
||||
ld [$ap+0], $t2
|
||||
ld [$ap+4], $t3
|
||||
add $ap, 8, $ap
|
||||
.Lcopy: ! conditional copy
|
||||
ld [$tp+0], $t0
|
||||
ld [$tp+4], $t1
|
||||
ld [$rp+0], $t2
|
||||
ld [$rp+4], $t3
|
||||
stx %g0, [$tp] ! zap
|
||||
add $tp, 8, $tp
|
||||
stx %g0, [$anp] ! zap
|
||||
stx %g0, [$anp+8]
|
||||
add $anp, 16, $anp
|
||||
movcs %icc, $t0, $t2
|
||||
movcs %icc, $t1, $t3
|
||||
st $t3, [$rp+0] ! flip order
|
||||
st $t2, [$rp+4]
|
||||
add $rp, 8, $rp
|
||||
|
||||
@@ -592,16 +592,18 @@ $sbit=$num;
|
||||
&jge (&label("sub"));
|
||||
|
||||
&sbb ("eax",0); # handle upmost overflow bit
|
||||
&and ($tp,"eax");
|
||||
&not ("eax");
|
||||
&mov ($np,$rp);
|
||||
&and ($np,"eax");
|
||||
&or ($tp,$np); # tp=carry?tp:rp
|
||||
&mov ("edx",-1);
|
||||
&xor ("edx","eax");
|
||||
&jmp (&label("copy"));
|
||||
|
||||
&set_label("copy",16); # copy or in-place refresh
|
||||
&mov ("eax",&DWP(0,$tp,$num,4));
|
||||
&mov (&DWP(0,$rp,$num,4),"eax"); # rp[i]=tp[i]
|
||||
&set_label("copy",16); # conditional copy
|
||||
&mov ($tp,&DWP($frame,"esp",$num,4));
|
||||
&mov ($np,&DWP(0,$rp,$num,4));
|
||||
&mov (&DWP($frame,"esp",$num,4),$j); # zap temporary vector
|
||||
&and ($tp,"eax");
|
||||
&and ($np,"edx");
|
||||
&or ($np,$tp);
|
||||
&mov (&DWP(0,$rp,$num,4),$np);
|
||||
&dec ($num);
|
||||
&jge (&label("copy"));
|
||||
|
||||
|
||||
@@ -293,30 +293,30 @@ $code.=<<___;
|
||||
|
||||
xor $i,$i # i=0 and clear CF!
|
||||
mov (%rsp),%rax # tp[0]
|
||||
lea (%rsp),$ap # borrow ap for tp
|
||||
mov $num,$j # j=num
|
||||
jmp .Lsub
|
||||
|
||||
.align 16
|
||||
.Lsub: sbb ($np,$i,8),%rax
|
||||
mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
|
||||
mov 8($ap,$i,8),%rax # tp[i+1]
|
||||
mov 8(%rsp,$i,8),%rax # tp[i+1]
|
||||
lea 1($i),$i # i++
|
||||
dec $j # doesn't affect CF!
|
||||
jnz .Lsub
|
||||
|
||||
sbb \$0,%rax # handle upmost overflow bit
|
||||
mov \$-1,%rbx
|
||||
xor %rax,%rbx # not %rax
|
||||
xor $i,$i
|
||||
and %rax,$ap
|
||||
not %rax
|
||||
mov $rp,$np
|
||||
and %rax,$np
|
||||
mov $num,$j # j=num
|
||||
or $np,$ap # ap=borrow?tp:rp
|
||||
.align 16
|
||||
.Lcopy: # copy or in-place refresh
|
||||
mov ($ap,$i,8),%rax
|
||||
mov $i,(%rsp,$i,8) # zap temporary vector
|
||||
mov %rax,($rp,$i,8) # rp[i]=tp[i]
|
||||
|
||||
.Lcopy: # conditional copy
|
||||
mov ($rp,$i,8),%rcx
|
||||
mov (%rsp,$i,8),%rdx
|
||||
and %rbx,%rcx
|
||||
and %rax,%rdx
|
||||
mov $num,(%rsp,$i,8) # zap temporary vector
|
||||
or %rcx,%rdx
|
||||
mov %rdx,($rp,$i,8) # rp[i]=tp[i]
|
||||
lea 1($i),$i
|
||||
sub \$1,$j
|
||||
jnz .Lcopy
|
||||
@@ -686,10 +686,10 @@ ___
|
||||
my @ri=("%rax","%rdx",$m0,$m1);
|
||||
$code.=<<___;
|
||||
mov 16(%rsp,$num,8),$rp # restore $rp
|
||||
lea -4($num),$j
|
||||
mov 0(%rsp),@ri[0] # tp[0]
|
||||
pxor %xmm0,%xmm0
|
||||
mov 8(%rsp),@ri[1] # tp[1]
|
||||
shr \$2,$num # num/=4
|
||||
shr \$2,$j # j=num/4-1
|
||||
lea (%rsp),$ap # borrow ap for tp
|
||||
xor $i,$i # i=0 and clear CF!
|
||||
|
||||
@@ -697,9 +697,7 @@ $code.=<<___;
|
||||
mov 16($ap),@ri[2] # tp[2]
|
||||
mov 24($ap),@ri[3] # tp[3]
|
||||
sbb 8($np),@ri[1]
|
||||
lea -1($num),$j # j=num/4-1
|
||||
jmp .Lsub4x
|
||||
.align 16
|
||||
|
||||
.Lsub4x:
|
||||
mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
|
||||
mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
|
||||
@@ -726,34 +724,35 @@ $code.=<<___;
|
||||
|
||||
sbb \$0,@ri[0] # handle upmost overflow bit
|
||||
mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
|
||||
xor $i,$i # i=0
|
||||
and @ri[0],$ap
|
||||
not @ri[0]
|
||||
mov $rp,$np
|
||||
and @ri[0],$np
|
||||
lea -1($num),$j
|
||||
or $np,$ap # ap=borrow?tp:rp
|
||||
pxor %xmm0,%xmm0
|
||||
movq @ri[0],%xmm4
|
||||
pcmpeqd %xmm5,%xmm5
|
||||
pshufd \$0,%xmm4,%xmm4
|
||||
mov $num,$j
|
||||
pxor %xmm4,%xmm5
|
||||
shr \$2,$j # j=num/4
|
||||
xor %eax,%eax # i=0
|
||||
|
||||
movdqu ($ap),%xmm1
|
||||
movdqa %xmm0,(%rsp)
|
||||
movdqu %xmm1,($rp)
|
||||
jmp .Lcopy4x
|
||||
.align 16
|
||||
.Lcopy4x: # copy or in-place refresh
|
||||
movdqu 16($ap,$i),%xmm2
|
||||
movdqu 32($ap,$i),%xmm1
|
||||
movdqa %xmm0,16(%rsp,$i)
|
||||
movdqu %xmm2,16($rp,$i)
|
||||
movdqa %xmm0,32(%rsp,$i)
|
||||
movdqu %xmm1,32($rp,$i)
|
||||
lea 32($i),$i
|
||||
.Lcopy4x: # conditional copy
|
||||
movdqa (%rsp,%rax),%xmm1
|
||||
movdqu ($rp,%rax),%xmm2
|
||||
pand %xmm4,%xmm1
|
||||
pand %xmm5,%xmm2
|
||||
movdqa 16(%rsp,%rax),%xmm3
|
||||
movdqa %xmm0,(%rsp,%rax)
|
||||
por %xmm2,%xmm1
|
||||
movdqu 16($rp,%rax),%xmm2
|
||||
movdqu %xmm1,($rp,%rax)
|
||||
pand %xmm4,%xmm3
|
||||
pand %xmm5,%xmm2
|
||||
movdqa %xmm0,16(%rsp,%rax)
|
||||
por %xmm2,%xmm3
|
||||
movdqu %xmm3,16($rp,%rax)
|
||||
lea 32(%rax),%rax
|
||||
dec $j
|
||||
jnz .Lcopy4x
|
||||
|
||||
shl \$2,$num
|
||||
movdqu 16($ap,$i),%xmm2
|
||||
movdqa %xmm0,16(%rsp,$i)
|
||||
movdqu %xmm2,16($rp,$i)
|
||||
___
|
||||
}
|
||||
$code.=<<___;
|
||||
|
||||
@@ -405,18 +405,19 @@ $code.=<<___;
|
||||
jnz .Lsub
|
||||
|
||||
sbb \$0,%rax # handle upmost overflow bit
|
||||
mov \$-1,%rbx
|
||||
xor %rax,%rbx
|
||||
xor $i,$i
|
||||
and %rax,$ap
|
||||
not %rax
|
||||
mov $rp,$np
|
||||
and %rax,$np
|
||||
mov $num,$j # j=num
|
||||
or $np,$ap # ap=borrow?tp:rp
|
||||
.align 16
|
||||
.Lcopy: # copy or in-place refresh
|
||||
mov ($ap,$i,8),%rax
|
||||
|
||||
.Lcopy: # conditional copy
|
||||
mov ($rp,$i,8),%rcx
|
||||
mov (%rsp,$i,8),%rdx
|
||||
and %rbx,%rcx
|
||||
and %rax,%rdx
|
||||
mov $i,(%rsp,$i,8) # zap temporary vector
|
||||
mov %rax,($rp,$i,8) # rp[i]=tp[i]
|
||||
or %rcx,%rdx
|
||||
mov %rdx,($rp,$i,8) # rp[i]=tp[i]
|
||||
lea 1($i),$i
|
||||
sub \$1,$j
|
||||
jnz .Lcopy
|
||||
|
||||
Reference in New Issue
Block a user