Import OpenSSL 1.1.0f

This commit is contained in:
Steve Dower
2017-09-07 16:27:43 -07:00
committed by Steve Dower
parent ccd3ab4aff
commit f4b81cb7c9
3340 changed files with 325158 additions and 557542 deletions

View File

@@ -1,4 +1,11 @@
#!/usr/bin/env perl
#! /usr/bin/env perl
# Copyright 2004-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
@@ -191,6 +198,10 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
$output = pop;
open OUT,">$output";
*STDOUT=*OUT;
&asm_init($ARGV[0],"aes-586.pl",$x86only = $ARGV[$#ARGV] eq "386");
&static_label("AES_Te");
&static_label("AES_Td");
@@ -2861,12 +2872,12 @@ sub enckey()
&set_label("exit");
&function_end("_x86_AES_set_encrypt_key");
# int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
# int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
# AES_KEY *key)
&function_begin_B("private_AES_set_encrypt_key");
&function_begin_B("AES_set_encrypt_key");
&call ("_x86_AES_set_encrypt_key");
&ret ();
&function_end_B("private_AES_set_encrypt_key");
&function_end_B("AES_set_encrypt_key");
sub deckey()
{ my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_;
@@ -2923,9 +2934,9 @@ sub deckey()
&mov (&DWP(4*$i,$key),$tp1);
}
# int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
# int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
# AES_KEY *key)
&function_begin_B("private_AES_set_decrypt_key");
&function_begin_B("AES_set_decrypt_key");
&call ("_x86_AES_set_encrypt_key");
&cmp ("eax",0);
&je (&label("proceed"));
@@ -2981,7 +2992,9 @@ sub deckey()
&jb (&label("permute"));
&xor ("eax","eax"); # return success
&function_end("private_AES_set_decrypt_key");
&function_end("AES_set_decrypt_key");
&asciz("AES for x86, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();
close STDOUT;

View File

@@ -1,4 +1,11 @@
#!/usr/bin/env perl
#! /usr/bin/env perl
# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -32,8 +39,20 @@
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~21.5 cycles per byte.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
open STDOUT,">$output";
}
$s0="r0";
$s1="r1";
@@ -58,15 +77,12 @@ $code=<<___;
#endif
.text
#if __ARM_ARCH__<7
.code 32
#else
#if defined(__thumb2__) && !defined(__APPLE__)
.syntax unified
# ifdef __thumb2__
.thumb
# else
#else
.code 32
# endif
#undef __thumb2__
#endif
.type AES_Te,%object
@@ -181,15 +197,19 @@ AES_Te:
.type AES_encrypt,%function
.align 5
AES_encrypt:
#if __ARM_ARCH__<7
#ifndef __thumb2__
sub r3,pc,#8 @ AES_encrypt
#else
adr r3,AES_encrypt
#endif
stmdb sp!,{r1,r4-r12,lr}
#ifdef __APPLE__
adr $tbl,AES_Te
#else
sub $tbl,r3,#AES_encrypt-AES_Te @ Te
#endif
mov $rounds,r0 @ inp
mov $key,r2
sub $tbl,r3,#AES_encrypt-AES_Te @ Te
#if __ARM_ARCH__<7
ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
ldrb $t1,[$rounds,#2] @ manner...
@@ -422,24 +442,24 @@ _armv4_AES_encrypt:
ldr pc,[sp],#4 @ pop and return
.size _armv4_AES_encrypt,.-_armv4_AES_encrypt
.global private_AES_set_encrypt_key
.type private_AES_set_encrypt_key,%function
.global AES_set_encrypt_key
.type AES_set_encrypt_key,%function
.align 5
private_AES_set_encrypt_key:
AES_set_encrypt_key:
_armv4_AES_set_encrypt_key:
#if __ARM_ARCH__<7
#ifndef __thumb2__
sub r3,pc,#8 @ AES_set_encrypt_key
#else
adr r3,private_AES_set_encrypt_key
adr r3,AES_set_encrypt_key
#endif
teq r0,#0
#if __ARM_ARCH__>=7
#ifdef __thumb2__
itt eq @ Thumb2 thing, sanity check in ARM
#endif
moveq r0,#-1
beq .Labrt
teq r2,#0
#if __ARM_ARCH__>=7
#ifdef __thumb2__
itt eq @ Thumb2 thing, sanity check in ARM
#endif
moveq r0,#-1
@@ -450,19 +470,23 @@ _armv4_AES_set_encrypt_key:
teq r1,#192
beq .Lok
teq r1,#256
#if __ARM_ARCH__>=7
#ifdef __thumb2__
itt ne @ Thumb2 thing, sanity check in ARM
#endif
movne r0,#-1
bne .Labrt
.Lok: stmdb sp!,{r4-r12,lr}
sub $tbl,r3,#_armv4_AES_set_encrypt_key-AES_Te-1024 @ Te4
mov $rounds,r0 @ inp
mov lr,r1 @ bits
mov $key,r2 @ key
#ifdef __APPLE__
adr $tbl,AES_Te+1024 @ Te4
#else
sub $tbl,r3,#_armv4_AES_set_encrypt_key-AES_Te-1024 @ Te4
#endif
#if __ARM_ARCH__<7
ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
ldrb $t1,[$rounds,#2] @ manner...
@@ -607,7 +631,7 @@ _armv4_AES_set_encrypt_key:
str $s2,[$key,#-16]
subs $rounds,$rounds,#1
str $s3,[$key,#-12]
#if __ARM_ARCH__>=7
#ifdef __thumb2__
itt eq @ Thumb2 thing, sanity check in ARM
#endif
subeq r2,$key,#216
@@ -679,7 +703,7 @@ _armv4_AES_set_encrypt_key:
str $s2,[$key,#-24]
subs $rounds,$rounds,#1
str $s3,[$key,#-20]
#if __ARM_ARCH__>=7
#ifdef __thumb2__
itt eq @ Thumb2 thing, sanity check in ARM
#endif
subeq r2,$key,#256
@@ -722,12 +746,12 @@ _armv4_AES_set_encrypt_key:
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
.size AES_set_encrypt_key,.-AES_set_encrypt_key
.global private_AES_set_decrypt_key
.type private_AES_set_decrypt_key,%function
.global AES_set_decrypt_key
.type AES_set_decrypt_key,%function
.align 5
private_AES_set_decrypt_key:
AES_set_decrypt_key:
str lr,[sp,#-4]! @ push lr
bl _armv4_AES_set_encrypt_key
teq r0,#0
@@ -737,7 +761,7 @@ private_AES_set_decrypt_key:
mov r0,r2 @ AES_set_encrypt_key preserves r2,
mov r1,r2 @ which is AES_KEY *key
b _armv4_AES_set_enc2dec_key
.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
.size AES_set_decrypt_key,.-AES_set_decrypt_key
@ void AES_set_enc2dec_key(const AES_KEY *inp,AES_KEY *out)
.global AES_set_enc2dec_key
@@ -750,7 +774,7 @@ _armv4_AES_set_enc2dec_key:
ldr $rounds,[r0,#240]
mov $i1,r0 @ input
add $i2,r0,$rounds,lsl#4
mov $key,r1 @ ouput
mov $key,r1 @ output
add $tbl,r1,$rounds,lsl#4
str $rounds,[r1,#240]
@@ -949,15 +973,19 @@ AES_Td:
.type AES_decrypt,%function
.align 5
AES_decrypt:
#if __ARM_ARCH__<7
#ifndef __thumb2__
sub r3,pc,#8 @ AES_decrypt
#else
adr r3,AES_decrypt
#endif
stmdb sp!,{r1,r4-r12,lr}
#ifdef __APPLE__
adr $tbl,AES_Td
#else
sub $tbl,r3,#AES_decrypt-AES_Td @ Td
#endif
mov $rounds,r0 @ inp
mov $key,r2
sub $tbl,r3,#AES_decrypt-AES_Td @ Td
#if __ARM_ARCH__<7
ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
ldrb $t1,[$rounds,#2] @ manner...

File diff suppressed because it is too large Load Diff

View File

@@ -1,3 +1,10 @@
// Copyright 2004-2016 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the OpenSSL license (the "License"). You may not use
// this file except in compliance with the License. You can obtain a copy
// in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html
//
// ====================================================================
// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
// project. Rights for redistribution and usage in source and binary
@@ -10,7 +17,7 @@
// 'and' which in turn can be assigned to M-port [there're double as
// much M-ports as there're I-ports on Itanium 2]. By sacrificing few
// registers for small constants (255, 24 and 16) to be used with
// 'shr' and 'and' instructions I can achieve better ILP, Intruction
// 'shr' and 'and' instructions I can achieve better ILP, Instruction
// Level Parallelism, and performance. This code outperforms GCC 3.3
// generated code by over factor of 2 (two), GCC 3.4 - by 70% and
// HP C - by 40%. Measured best-case scenario, i.e. aligned

View File

@@ -1,4 +1,11 @@
#!/usr/bin/env perl
#! /usr/bin/env perl
# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -57,6 +64,7 @@
$flavour = shift || "o32"; # supported flavours are o32,n32,64,nubi32,nubi64
if ($flavour =~ /64|n32/i) {
$PTR_LA="dla";
$PTR_ADD="dadd"; # incidentally works even on n32
$PTR_SUB="dsub"; # incidentally works even on n32
$PTR_INS="dins";
@@ -65,6 +73,7 @@ if ($flavour =~ /64|n32/i) {
$PTR_SLL="dsll"; # incidentally works even on n32
$SZREG=8;
} else {
$PTR_LA="la";
$PTR_ADD="add";
$PTR_SUB="sub";
$PTR_INS="ins";
@@ -81,13 +90,13 @@ $pf = ($flavour =~ /nubi/i) ? $t0 : $t2;
$big_endian=(`echo MIPSEL | $ENV{CC} -E -`=~/MIPSEL/)?1:0 if ($ENV{CC});
for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); }
for (@ARGV) { $output=$_ if (/\w[\w\-]*\.\w+$/); }
open STDOUT,">$output";
if (!defined($big_endian))
{ $big_endian=(unpack('L',pack('N',1))==1); }
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
my ($MSB,$LSB)=(0,3); # automatically converted to little-endian
@@ -110,7 +119,7 @@ ___
{{{
my $FRAMESIZE=16*$SZREG;
my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000;
my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0xc0fff008" : "0xc0ff0000";
my ($inp,$out,$key,$Tbl,$s0,$s1,$s2,$s3)=($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7);
my ($i0,$i1,$i2,$i3)=($at,$t0,$t1,$t2);
@@ -646,7 +655,7 @@ $code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification
___
$code.=<<___;
.set reorder
la $Tbl,AES_Te # PIC-ified 'load address'
$PTR_LA $Tbl,AES_Te # PIC-ified 'load address'
lwl $s0,0+$MSB($inp)
lwl $s1,4+$MSB($inp)
@@ -1217,7 +1226,7 @@ $code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification
___
$code.=<<___;
.set reorder
la $Tbl,AES_Td # PIC-ified 'load address'
$PTR_LA $Tbl,AES_Td # PIC-ified 'load address'
lwl $s0,0+$MSB($inp)
lwl $s1,4+$MSB($inp)
@@ -1267,7 +1276,7 @@ ___
{{{
my $FRAMESIZE=8*$SZREG;
my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc000f008 : 0xc0000000;
my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0xc000f008" : "0xc0000000";
my ($inp,$bits,$key,$Tbl)=($a0,$a1,$a2,$a3);
my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7)=($a4,$a5,$a6,$a7,$s0,$s1,$s2,$s3);
@@ -1528,9 +1537,9 @@ _mips_AES_set_encrypt_key:
nop
.end _mips_AES_set_encrypt_key
.globl private_AES_set_encrypt_key
.ent private_AES_set_encrypt_key
private_AES_set_encrypt_key:
.globl AES_set_encrypt_key
.ent AES_set_encrypt_key
AES_set_encrypt_key:
.frame $sp,$FRAMESIZE,$ra
.mask $SAVED_REGS_MASK,-$SZREG
.set noreorder
@@ -1552,11 +1561,11 @@ $code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
___
$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification
.cplocal $Tbl
.cpsetup $pf,$zero,private_AES_set_encrypt_key
.cpsetup $pf,$zero,AES_set_encrypt_key
___
$code.=<<___;
.set reorder
la $Tbl,AES_Te4 # PIC-ified 'load address'
$PTR_LA $Tbl,AES_Te4 # PIC-ified 'load address'
bal _mips_AES_set_encrypt_key
@@ -1575,7 +1584,7 @@ ___
$code.=<<___;
jr $ra
$PTR_ADD $sp,$FRAMESIZE
.end private_AES_set_encrypt_key
.end AES_set_encrypt_key
___
my ($head,$tail)=($inp,$bits);
@@ -1583,9 +1592,9 @@ my ($tp1,$tp2,$tp4,$tp8,$tp9,$tpb,$tpd,$tpe)=($a4,$a5,$a6,$a7,$s0,$s1,$s2,$s3);
my ($m,$x80808080,$x7f7f7f7f,$x1b1b1b1b)=($at,$t0,$t1,$t2);
$code.=<<___;
.align 5
.globl private_AES_set_decrypt_key
.ent private_AES_set_decrypt_key
private_AES_set_decrypt_key:
.globl AES_set_decrypt_key
.ent AES_set_decrypt_key
AES_set_decrypt_key:
.frame $sp,$FRAMESIZE,$ra
.mask $SAVED_REGS_MASK,-$SZREG
.set noreorder
@@ -1607,11 +1616,11 @@ $code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue
___
$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification
.cplocal $Tbl
.cpsetup $pf,$zero,private_AES_set_decrypt_key
.cpsetup $pf,$zero,AES_set_decrypt_key
___
$code.=<<___;
.set reorder
la $Tbl,AES_Te4 # PIC-ified 'load address'
$PTR_LA $Tbl,AES_Te4 # PIC-ified 'load address'
bal _mips_AES_set_encrypt_key
@@ -1729,7 +1738,7 @@ ___
$code.=<<___;
jr $ra
$PTR_ADD $sp,$FRAMESIZE
.end private_AES_set_decrypt_key
.end AES_set_decrypt_key
___
}}}

View File

@@ -1,4 +1,11 @@
#!/usr/bin/env perl
#! /usr/bin/env perl
# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL

View File

@@ -1,4 +1,11 @@
#!/usr/bin/env perl
#! /usr/bin/env perl
# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
@@ -19,7 +26,7 @@
# February 2010
#
# Rescheduling instructions to favour Power6 pipeline gave 10%
# performance improvement on the platfrom in question (and marginal
# performance improvement on the platform in question (and marginal
# improvement even on others). It should be noted that Power6 fails
# to process byte in 18 cycles, only in 23, because it fails to issue
# 4 load instructions in two cycles, only in 3. As result non-compact

View File

@@ -1,4 +1,11 @@
#!/usr/bin/env perl
#! /usr/bin/env perl
# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
@@ -92,7 +99,7 @@ if ($flavour =~ /3[12]/) {
$g="g";
}
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$softonly=0; # allow hardware support
@@ -779,10 +786,10 @@ ___
$code.=<<___;
# void AES_set_encrypt_key(const unsigned char *in, int bits,
# AES_KEY *key) {
.globl private_AES_set_encrypt_key
.type private_AES_set_encrypt_key,\@function
.globl AES_set_encrypt_key
.type AES_set_encrypt_key,\@function
.align 16
private_AES_set_encrypt_key:
AES_set_encrypt_key:
_s390x_AES_set_encrypt_key:
lghi $t0,0
cl${g}r $inp,$t0
@@ -1059,14 +1066,14 @@ $code.=<<___;
.Lminus1:
lghi %r2,-1
br $ra
.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
.size AES_set_encrypt_key,.-AES_set_encrypt_key
# void AES_set_decrypt_key(const unsigned char *in, int bits,
# AES_KEY *key) {
.globl private_AES_set_decrypt_key
.type private_AES_set_decrypt_key,\@function
.globl AES_set_decrypt_key
.type AES_set_decrypt_key,\@function
.align 16
private_AES_set_decrypt_key:
AES_set_decrypt_key:
#st${g} $key,4*$SIZE_T($sp) # I rely on AES_set_encrypt_key to
st${g} $ra,14*$SIZE_T($sp) # save non-volatile registers and $key!
bras $ra,_s390x_AES_set_encrypt_key
@@ -1166,7 +1173,7 @@ $code.=<<___;
lm${g} %r6,%r13,6*$SIZE_T($sp)# as was saved by AES_set_encrypt_key!
lghi %r2,0
br $ra
.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
.size AES_set_decrypt_key,.-AES_set_decrypt_key
___
########################################################################
@@ -1568,8 +1575,8 @@ ___
}
########################################################################
# void AES_xts_encrypt(const unsigned char *inp, unsigned char *out,
# size_t len, const AES_KEY *key1, const AES_KEY *key2,
# void AES_xts_encrypt(const char *inp,char *out,size_t len,
# const AES_KEY *key1, const AES_KEY *key2,
# const unsigned char iv[16]);
#
{
@@ -1937,8 +1944,8 @@ $code.=<<___;
br $ra
.size AES_xts_encrypt,.-AES_xts_encrypt
___
# void AES_xts_decrypt(const unsigned char *inp, unsigned char *out,
# size_t len, const AES_KEY *key1, const AES_KEY *key2,
# void AES_xts_decrypt(const char *inp,char *out,size_t len,
# const AES_KEY *key1, const AES_KEY *key2,
# const unsigned char iv[16]);
#
$code.=<<___;

View File

@@ -1,4 +1,11 @@
#!/usr/bin/env perl
#! /usr/bin/env perl
# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
@@ -30,10 +37,11 @@
# optimal decrypt procedure]. Compared to GNU C generated code both
# procedures are more than 60% faster:-)
$bits=32;
for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64) { $bias=2047; $frame=192; }
else { $bias=0; $frame=112; }
$output = pop;
open STDOUT,">$output";
$frame="STACK_FRAME";
$bias="STACK_BIAS";
$locals=16;
$acc0="%l0";
@@ -74,11 +82,13 @@ sub _data_word()
while(defined($i=shift)) { $code.=sprintf"\t.long\t0x%08x,0x%08x\n",$i,$i; }
}
$code.=<<___ if ($bits==64);
$code.=<<___;
#include "sparc_arch.h"
#ifdef __arch64__
.register %g2,#scratch
.register %g3,#scratch
___
$code.=<<___;
#endif
.section ".text",#alloc,#execinstr
.align 256

View File

@@ -1,4 +1,11 @@
#!/usr/bin/env perl
#! /usr/bin/env perl
# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
@@ -37,7 +44,7 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
open OUT,"| \"$^X\" $xlate $flavour $output";
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
$verticalspin=1; # unlike 32-bit version $verticalspin performs
@@ -1282,13 +1289,13 @@ $code.=<<___;
___
}
# int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
# int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
# AES_KEY *key)
$code.=<<___;
.globl private_AES_set_encrypt_key
.type private_AES_set_encrypt_key,\@function,3
.globl AES_set_encrypt_key
.type AES_set_encrypt_key,\@function,3
.align 16
private_AES_set_encrypt_key:
AES_set_encrypt_key:
push %rbx
push %rbp
push %r12 # redundant, but allows to share
@@ -1305,7 +1312,7 @@ private_AES_set_encrypt_key:
add \$56,%rsp
.Lenc_key_epilogue:
ret
.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key
.size AES_set_encrypt_key,.-AES_set_encrypt_key
.type _x86_64_AES_set_encrypt_key,\@abi-omnipotent
.align 16
@@ -1548,13 +1555,13 @@ $code.=<<___;
___
}
# int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
# int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
# AES_KEY *key)
$code.=<<___;
.globl private_AES_set_decrypt_key
.type private_AES_set_decrypt_key,\@function,3
.globl AES_set_decrypt_key
.type AES_set_decrypt_key,\@function,3
.align 16
private_AES_set_decrypt_key:
AES_set_decrypt_key:
push %rbx
push %rbp
push %r12
@@ -1623,7 +1630,7 @@ $code.=<<___;
add \$56,%rsp
.Ldec_key_epilogue:
ret
.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key
.size AES_set_decrypt_key,.-AES_set_decrypt_key
___
# void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
@@ -2770,13 +2777,13 @@ cbc_se_handler:
.rva .LSEH_end_AES_decrypt
.rva .LSEH_info_AES_decrypt
.rva .LSEH_begin_private_AES_set_encrypt_key
.rva .LSEH_end_private_AES_set_encrypt_key
.rva .LSEH_info_private_AES_set_encrypt_key
.rva .LSEH_begin_AES_set_encrypt_key
.rva .LSEH_end_AES_set_encrypt_key
.rva .LSEH_info_AES_set_encrypt_key
.rva .LSEH_begin_private_AES_set_decrypt_key
.rva .LSEH_end_private_AES_set_decrypt_key
.rva .LSEH_info_private_AES_set_decrypt_key
.rva .LSEH_begin_AES_set_decrypt_key
.rva .LSEH_end_AES_set_decrypt_key
.rva .LSEH_info_AES_set_decrypt_key
.rva .LSEH_begin_AES_cbc_encrypt
.rva .LSEH_end_AES_cbc_encrypt
@@ -2792,11 +2799,11 @@ cbc_se_handler:
.byte 9,0,0,0
.rva block_se_handler
.rva .Ldec_prologue,.Ldec_epilogue # HandlerData[]
.LSEH_info_private_AES_set_encrypt_key:
.LSEH_info_AES_set_encrypt_key:
.byte 9,0,0,0
.rva key_se_handler
.rva .Lenc_key_prologue,.Lenc_key_epilogue # HandlerData[]
.LSEH_info_private_AES_set_decrypt_key:
.LSEH_info_AES_set_decrypt_key:
.byte 9,0,0,0
.rva key_se_handler
.rva .Ldec_key_prologue,.Ldec_key_epilogue # HandlerData[]

File diff suppressed because it is too large Load Diff

View File

@@ -1,4 +1,11 @@
#!/usr/bin/env perl
#! /usr/bin/env perl
# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -67,7 +74,7 @@ if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([
$avx = ($2>=3.0) + ($2>3.0);
}
open OUT,"| \"$^X\" $xlate $flavour $output";
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
# void aesni_multi_cbc_encrypt (

View File

@@ -1,4 +1,11 @@
#!/usr/bin/env perl
#! /usr/bin/env perl
# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -25,6 +32,7 @@
# Sandy Bridge 5.05[+5.0(6.1)] 10.06(11.15) 5.98(7.05) +68%(+58%)
# Ivy Bridge 5.05[+4.6] 9.65 5.54 +74%
# Haswell 4.43[+3.6(4.2)] 8.00(8.58) 4.55(5.21) +75%(+65%)
# Skylake 2.63[+3.5(4.1)] 6.17(6.69) 4.23(4.44) +46%(+51%)
# Bulldozer 5.77[+6.0] 11.72 6.37 +84%
#
# AES-192-CBC
@@ -39,6 +47,7 @@
# Sandy Bridge 7.05 12.06(13.15) 7.12(7.72) +69%(+70%)
# Ivy Bridge 7.05 11.65 7.12 +64%
# Haswell 6.19 9.76(10.34) 6.21(6.25) +57%(+65%)
# Skylake 3.62 7.16(7.68) 4.56(4.76) +57%(+61%)
# Bulldozer 8.00 13.95 8.25 +69%
#
# (*) There are two code paths: SSSE3 and AVX. See sha1-586.pl for
@@ -100,7 +109,7 @@ $shaext=1; ### set to zero if compiling for 1.0.1
$stitched_decrypt=0;
open OUT,"| \"$^X\" $xlate $flavour $output";
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
# void aesni_cbc_sha1_enc(const void *inp,
@@ -298,7 +307,7 @@ ___
$r++; unshift(@rndkey,pop(@rndkey));
};
sub Xupdate_ssse3_16_31() # recall that $Xi starts wtih 4
sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
@@ -1137,7 +1146,7 @@ ___
$r++; unshift(@rndkey,pop(@rndkey));
};
sub Xupdate_avx_16_31() # recall that $Xi starts wtih 4
sub Xupdate_avx_16_31() # recall that $Xi starts with 4
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
@@ -1702,6 +1711,7 @@ $code.=<<___;
mov 240($key),$rounds
sub $in0,$out
movups ($key),$rndkey0 # $key[0]
movups ($ivp),$iv # load IV
movups 16($key),$rndkey[0] # forward reference
lea 112($key),$key # size optimization

View File

@@ -1,4 +1,11 @@
#!/usr/bin/env perl
#! /usr/bin/env perl
# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -25,9 +32,10 @@
# Sandy Bridge 5.05/6.05/7.05+11.6 13.0 +28%/36%/43%
# Ivy Bridge 5.05/6.05/7.05+10.3 11.6 +32%/41%/50%
# Haswell 4.43/5.29/6.19+7.80 8.79 +39%/49%/59%
# Skylake 2.62/3.14/3.62+7.70 8.10 +27%/34%/40%
# Bulldozer 5.77/6.89/8.00+13.7 13.7 +42%/50%/58%
#
# (*) there are XOP, AVX1 and AVX2 code pathes, meaning that
# (*) there are XOP, AVX1 and AVX2 code paths, meaning that
# Westmere is omitted from loop, this is because gain was not
# estimated high enough to justify the effort;
# (**) these are EVP-free results, results obtained with 'speed
@@ -66,7 +74,7 @@ if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([
$shaext=$avx; ### set to zero if compiling for 1.0.1
$avx=1 if (!$shaext && $avx);
open OUT,"| \"$^X\" $xlate $flavour $output";
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
$func="aesni_cbc_sha256_enc";
@@ -1299,6 +1307,7 @@ $code.=<<___;
mov 240($key),$rounds
sub $in0,$out
movups ($key),$rndkey0 # $key[0]
movups ($ivp),$iv # load IV
movups 16($key),$rndkey[0] # forward reference
lea 112($key),$key # size optimization

View File

@@ -1,4 +1,11 @@
#!/usr/bin/env perl
#! /usr/bin/env perl
# Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -43,16 +50,20 @@
# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
# November 2015
#
# Add aesni_ocb_[en|de]crypt.
######################################################################
# Current large-block performance in cycles per byte processed with
# 128-bit key (less is better).
#
# CBC en-/decrypt CTR XTS ECB
# CBC en-/decrypt CTR XTS ECB OCB
# Westmere 3.77/1.37 1.37 1.52 1.27
# * Bridge 5.07/0.98 0.99 1.09 0.91
# Haswell 4.44/0.80 0.97 1.03 0.72
# Silvermont 5.77/3.56 3.67 4.03 3.46
# Bulldozer 5.80/0.98 1.05 1.24 0.93
# * Bridge 5.07/0.98 0.99 1.09 0.91 1.10
# Haswell 4.44/0.80 0.97 1.03 0.72 0.76
# Silvermont 5.77/3.56 3.67 4.03 3.46 4.03
# Bulldozer 5.80/0.98 1.05 1.24 0.93 1.23
$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
# generates drop-in replacement for
@@ -63,6 +74,10 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
$output = pop;
open OUT,">$output";
*STDOUT=*OUT;
&asm_init($ARGV[0],$0);
&external_label("OPENSSL_ia32cap_P");
@@ -1831,6 +1846,877 @@ if ($PREFIX eq "aesni") {
&mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
&function_end("aesni_xts_decrypt");
}
######################################################################
# void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks,
# const AES_KEY *key, unsigned int start_block_num,
# unsigned char offset_i[16], const unsigned char L_[][16],
# unsigned char checksum[16]);
#
{
# offsets within stack frame
my $checksum = 16*6;
my ($key_off,$rounds_off,$out_off,$end_off,$esp_off)=map(16*7+4*$_,(0..4));
# reassigned registers
my ($l_,$block,$i1,$i3,$i5) = ($rounds_,$key_,$rounds,$len,$out);
# $l_, $block, $inp and $key are permanently allocated in registers;
# the remaining non-volatile values are offloaded to the stack, where
# they stay invariant once written.
# Prologue of aesni_ocb_encrypt: load the arguments, build a 16-byte
# aligned stack frame and, when start_block_num is even, encrypt one
# block up front so that $block is odd when the bulk/tail code is entered.
&function_begin("aesni_ocb_encrypt");
&mov ($rounds,&wparam(5)); # &offset_i
&mov ($rounds_,&wparam(7)); # &checksum
&mov ($inp,&wparam(0));
&mov ($out,&wparam(1));
&mov ($len,&wparam(2));
&mov ($key,&wparam(3));
&movdqu ($rndkey0,&QWP(0,$rounds)); # load offset_i
&mov ($block,&wparam(4)); # start_block_num
&movdqu ($rndkey1,&QWP(0,$rounds_)); # load checksum
&mov ($l_,&wparam(6)); # L_
# $rounds temporarily carries the caller's %esp; it is saved in the frame
# at $esp_off below and restored at the "done" label.
&mov ($rounds,"esp");
&sub ("esp",$esp_off+4); # alloca
&and ("esp",-16); # align stack
# From here on $out holds (out - inp), so output is addressed as
# ($out,$inp) and only $inp has to be advanced.
&sub ($out,$inp);
# Convert the block count to a byte count.
&shl ($len,4);
&lea ($len,&DWP(-16*6,$inp,$len)); # end of input - 16*6
&mov (&DWP($out_off,"esp"),$out);
&mov (&DWP($end_off,"esp"),$len);
&mov (&DWP($esp_off,"esp"),$rounds);
&mov ($rounds,&DWP(240,$key));
&test ($block,1);
&jnz (&label("odd"));
# Even start_block_num: handle a single block here.  bsf yields the index
# of the lowest set bit of $block; shifted left by 4 it selects the
# 16-byte entry of L_ that is folded into the offset.
&bsf ($i3,$block);
&add ($block,1);
&shl ($i3,4);
&movdqu ($inout5,&QWP(0,$l_,$i3));
&mov ($i3,$key); # put aside key
&movdqu ($inout0,&QWP(16*0,$inp)); # load input
&lea ($inp,&DWP(16,$inp));
&pxor ($inout5,$rndkey0); # ^ last offset_i
&pxor ($rndkey1,$inout0); # checksum
&pxor ($inout0,$inout5); # ^ offset_i
# Park the checksum in $inout4 while the single-block cipher runs.
&movdqa ($inout4,$rndkey1);
if ($inline)
{ &aesni_inline_generate1("enc"); }
else
{ &call ("_aesni_encrypt1"); }
&xorps ($inout0,$inout5); # ^ offset_i
&movdqa ($rndkey0,$inout5); # pass last offset_i
&movdqa ($rndkey1,$inout4); # pass the checksum
&movups (&QWP(-16,$out,$inp),$inout0); # store output
&mov ($rounds,&DWP(240,$i3));
&mov ($key,$i3); # restore key
# $i3 shares a register with $len, so the end pointer is reloaded.
&mov ($len,&DWP($end_off,"esp"));
&set_label("odd");
# Pre-compute the values used to index round keys in the bulk code:
# $key is moved to key+32+16*rounds and 16-16*rounds is kept on the stack.
&shl ($rounds,4);
&mov ($out,16);
&sub ($out,$rounds); # twisted rounds
&mov (&DWP($key_off,"esp"),$key);
&lea ($key,&DWP(32,$key,$rounds)); # end of key schedule
&mov (&DWP($rounds_off,"esp"),$out);
# Fewer than six blocks left (inp > end-16*6)?  Then skip the bulk loop.
&cmp ($inp,$len);
&ja (&label("short"));
&jmp (&label("grandloop"));
# Bulk loop of aesni_ocb_encrypt: six blocks per iteration.
#
# $block is odd on entry.  The six offsets are chained from the last
# offset ($rndkey0): lanes 0, 2 and 4 fold in L_[0], lanes 1, 3 and 5
# fold in the L_ entry selected by bsf of $block+1, +3 and +5.
&set_label("grandloop",32);
&lea ($i1,&DWP(1,$block));
&lea ($i3,&DWP(3,$block));
&lea ($i5,&DWP(5,$block));
&add ($block,6);
&bsf ($i1,$i1);
&bsf ($i3,$i3);
&bsf ($i5,$i5);
&shl ($i1,4);
&shl ($i3,4);
&shl ($i5,4);
&movdqu ($inout0,&QWP(0,$l_));
&movdqu ($inout1,&QWP(0,$l_,$i1));
# $i1 shares a register with $rounds; reload the twisted round counter
# now that the L_ index in it has been consumed.
&mov ($rounds,&DWP($rounds_off,"esp"));
&movdqa ($inout2,$inout0);
&movdqu ($inout3,&QWP(0,$l_,$i3));
&movdqa ($inout4,$inout0);
&movdqu ($inout5,&QWP(0,$l_,$i5));
&pxor ($inout0,$rndkey0); # ^ last offset_i
&pxor ($inout1,$inout0);
# The six offsets are kept in the stack frame; they are needed again
# after the cipher call.
&movdqa (&QWP(16*0,"esp"),$inout0);
&pxor ($inout2,$inout1);
&movdqa (&QWP(16*1,"esp"),$inout1);
&pxor ($inout3,$inout2);
&movdqa (&QWP(16*2,"esp"),$inout2);
&pxor ($inout4,$inout3);
&movdqa (&QWP(16*3,"esp"),$inout3);
&pxor ($inout5,$inout4);
&movdqa (&QWP(16*4,"esp"),$inout4);
&movdqa (&QWP(16*5,"esp"),$inout5);
# With $key = key+32+16*rounds and $rounds = 16-16*rounds, the
# displacements -48, -32 and -16 address round keys 0, 1 and 2.
&$movekey ($rndkey0,&QWP(-48,$key,$rounds));
&movdqu ($inout0,&QWP(16*0,$inp)); # load input
&movdqu ($inout1,&QWP(16*1,$inp));
&movdqu ($inout2,&QWP(16*2,$inp));
&movdqu ($inout3,&QWP(16*3,$inp));
&movdqu ($inout4,&QWP(16*4,$inp));
&movdqu ($inout5,&QWP(16*5,$inp));
&lea ($inp,&DWP(16*6,$inp));
&pxor ($rndkey1,$inout0); # checksum
&pxor ($inout0,$rndkey0); # ^ roundkey[0]
&pxor ($rndkey1,$inout1);
&pxor ($inout1,$rndkey0);
&pxor ($rndkey1,$inout2);
&pxor ($inout2,$rndkey0);
&pxor ($rndkey1,$inout3);
&pxor ($inout3,$rndkey0);
&pxor ($rndkey1,$inout4);
&pxor ($inout4,$rndkey0);
&pxor ($rndkey1,$inout5);
&pxor ($inout5,$rndkey0);
# The checksum lives in the stack frame while $rndkey1 is used for
# round keys.
&movdqa (&QWP($checksum,"esp"),$rndkey1);
&$movekey ($rndkey1,&QWP(-32,$key,$rounds));
&pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
&pxor ($inout1,&QWP(16*1,"esp"));
&pxor ($inout2,&QWP(16*2,"esp"));
&pxor ($inout3,&QWP(16*3,"esp"));
&pxor ($inout4,&QWP(16*4,"esp"));
&pxor ($inout5,&QWP(16*5,"esp"));
&$movekey ($rndkey0,&QWP(-16,$key,$rounds));
# The first round is issued here; the remaining rounds are performed by
# _aesni_encrypt6_enter, which is defined outside this section.
&aesenc ($inout0,$rndkey1);
&aesenc ($inout1,$rndkey1);
&aesenc ($inout2,$rndkey1);
&aesenc ($inout3,$rndkey1);
&aesenc ($inout4,$rndkey1);
&aesenc ($inout5,$rndkey1);
# $out and $len share registers with $i5 and $i3; reload both.
&mov ($out,&DWP($out_off,"esp"));
&mov ($len,&DWP($end_off,"esp"));
&call ("_aesni_encrypt6_enter");
&movdqa ($rndkey0,&QWP(16*5,"esp")); # pass last offset_i
&pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
&pxor ($inout1,&QWP(16*1,"esp"));
&pxor ($inout2,&QWP(16*2,"esp"));
&pxor ($inout3,&QWP(16*3,"esp"));
&pxor ($inout4,&QWP(16*4,"esp"));
&pxor ($inout5,$rndkey0);
&movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
&movdqu (&QWP(-16*6,$out,$inp),$inout0);# store output
&movdqu (&QWP(-16*5,$out,$inp),$inout1);
&movdqu (&QWP(-16*4,$out,$inp),$inout2);
&movdqu (&QWP(-16*3,$out,$inp),$inout3);
&movdqu (&QWP(-16*2,$out,$inp),$inout4);
&movdqu (&QWP(-16*1,$out,$inp),$inout5);
# Loop while at least six blocks remain (inp <= end-16*6).  This has to
# mirror the entry test ("ja short"): with a strict "jb", a remainder of
# exactly six blocks dropped into the tail code, which handles at most
# five blocks, and the final block was left unprocessed.
&cmp ($inp,$len); # done yet?
&jbe (&label("grandloop"));
# Tail dispatch of aesni_ocb_encrypt.  $len is turned into the number of
# bytes still to be processed; zero goes straight to "done", one to four
# blocks branch to dedicated paths, and the fall-through path below
# processes five blocks with the six-lane code.
&set_label("short");
&add ($len,16*6);
&sub ($len,$inp);
&jz (&label("done"));
&cmp ($len,16*2);
&jb (&label("one"));
&je (&label("two"));
&cmp ($len,16*4);
&jb (&label("three"));
&je (&label("four"));
# Five blocks: offsets are chained as in the bulk loop, lanes 0, 2 and 4
# using L_[0] and lanes 1 and 3 using the entries selected by bsf of
# $block+1 and $block+3.
&lea ($i1,&DWP(1,$block));
&lea ($i3,&DWP(3,$block));
&bsf ($i1,$i1);
&bsf ($i3,$i3);
&shl ($i1,4);
&shl ($i3,4);
&movdqu ($inout0,&QWP(0,$l_));
&movdqu ($inout1,&QWP(0,$l_,$i1));
&mov ($rounds,&DWP($rounds_off,"esp"));
&movdqa ($inout2,$inout0);
&movdqu ($inout3,&QWP(0,$l_,$i3));
&movdqa ($inout4,$inout0);
&pxor ($inout0,$rndkey0); # ^ last offset_i
&pxor ($inout1,$inout0);
&movdqa (&QWP(16*0,"esp"),$inout0);
&pxor ($inout2,$inout1);
&movdqa (&QWP(16*1,"esp"),$inout1);
&pxor ($inout3,$inout2);
&movdqa (&QWP(16*2,"esp"),$inout2);
&pxor ($inout4,$inout3);
&movdqa (&QWP(16*3,"esp"),$inout3);
# The result of the next instruction is not used: $inout5 is zeroed
# before the cipher call and no sixth offset is stored on this path.
&pxor ($inout5,$inout4);
&movdqa (&QWP(16*4,"esp"),$inout4);
&$movekey ($rndkey0,&QWP(-48,$key,$rounds));
&movdqu ($inout0,&QWP(16*0,$inp)); # load input
&movdqu ($inout1,&QWP(16*1,$inp));
&movdqu ($inout2,&QWP(16*2,$inp));
&movdqu ($inout3,&QWP(16*3,$inp));
&movdqu ($inout4,&QWP(16*4,$inp));
# Sixth lane carries no data on this path.
&pxor ($inout5,$inout5);
&pxor ($rndkey1,$inout0); # checksum
&pxor ($inout0,$rndkey0); # ^ roundkey[0]
&pxor ($rndkey1,$inout1);
&pxor ($inout1,$rndkey0);
&pxor ($rndkey1,$inout2);
&pxor ($inout2,$rndkey0);
&pxor ($rndkey1,$inout3);
&pxor ($inout3,$rndkey0);
&pxor ($rndkey1,$inout4);
&pxor ($inout4,$rndkey0);
&movdqa (&QWP($checksum,"esp"),$rndkey1);
&$movekey ($rndkey1,&QWP(-32,$key,$rounds));
&pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
&pxor ($inout1,&QWP(16*1,"esp"));
&pxor ($inout2,&QWP(16*2,"esp"));
&pxor ($inout3,&QWP(16*3,"esp"));
&pxor ($inout4,&QWP(16*4,"esp"));
&$movekey ($rndkey0,&QWP(-16,$key,$rounds));
&aesenc ($inout0,$rndkey1);
&aesenc ($inout1,$rndkey1);
&aesenc ($inout2,$rndkey1);
&aesenc ($inout3,$rndkey1);
&aesenc ($inout4,$rndkey1);
&aesenc ($inout5,$rndkey1);
&mov ($out,&DWP($out_off,"esp"));
&call ("_aesni_encrypt6_enter");
&movdqa ($rndkey0,&QWP(16*4,"esp")); # pass last offset_i
&pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
&pxor ($inout1,&QWP(16*1,"esp"));
&pxor ($inout2,&QWP(16*2,"esp"));
&pxor ($inout3,&QWP(16*3,"esp"));
&pxor ($inout4,$rndkey0);
&movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
# $inp was not advanced on this path, hence the positive displacements.
&movdqu (&QWP(16*0,$out,$inp),$inout0); # store output
&movdqu (&QWP(16*1,$out,$inp),$inout1);
&movdqu (&QWP(16*2,$out,$inp),$inout2);
&movdqu (&QWP(16*3,$out,$inp),$inout3);
&movdqu (&QWP(16*4,$out,$inp),$inout4);
&jmp (&label("done"));
# Tail of aesni_ocb_encrypt: exactly one block left.  Its offset is the
# last offset XOR L_[0].
&set_label("one",16);
&movdqu ($inout5,&QWP(0,$l_));
&mov ($key,&DWP($key_off,"esp")); # restore key
&movdqu ($inout0,&QWP(16*0,$inp)); # load input
&mov ($rounds,&DWP(240,$key));
&pxor ($inout5,$rndkey0); # ^ last offset_i
&pxor ($rndkey1,$inout0); # checksum
&pxor ($inout0,$inout5); # ^ offset_i
# Park the checksum in $inout4 while the single-block cipher runs.
&movdqa ($inout4,$rndkey1);
&mov ($out,&DWP($out_off,"esp"));
if ($inline)
{ &aesni_inline_generate1("enc"); }
else
{ &call ("_aesni_encrypt1"); }
&xorps ($inout0,$inout5); # ^ offset_i
&movdqa ($rndkey0,$inout5); # pass last offset_i
&movdqa ($rndkey1,$inout4); # pass the checksum
&movups (&QWP(0,$out,$inp),$inout0);
&jmp (&label("done"));
# Tail of aesni_ocb_encrypt: exactly two blocks left.  The first offset
# is the last offset XOR L_[0]; the second additionally folds in the L_
# entry selected by bsf of $block+1.
&set_label("two",16);
&lea ($i1,&DWP(1,$block));
&mov ($key,&DWP($key_off,"esp")); # restore key
&bsf ($i1,$i1);
&shl ($i1,4);
&movdqu ($inout4,&QWP(0,$l_));
&movdqu ($inout5,&QWP(0,$l_,$i1));
&movdqu ($inout0,&QWP(16*0,$inp)); # load input
&movdqu ($inout1,&QWP(16*1,$inp));
# $i1 shares a register with $rounds; load the round count only after
# the L_ index has been consumed.
&mov ($rounds,&DWP(240,$key));
&pxor ($inout4,$rndkey0); # ^ last offset_i
&pxor ($inout5,$inout4);
&pxor ($rndkey1,$inout0); # checksum
&pxor ($inout0,$inout4); # ^ offset_i
&pxor ($rndkey1,$inout1);
&pxor ($inout1,$inout5);
# Park the checksum in $inout3 for the duration of the cipher call.
&movdqa ($inout3,$rndkey1);
&mov ($out,&DWP($out_off,"esp"));
&call ("_aesni_encrypt2");
&xorps ($inout0,$inout4); # ^ offset_i
&xorps ($inout1,$inout5);
&movdqa ($rndkey0,$inout5); # pass last offset_i
&movdqa ($rndkey1,$inout3); # pass the checksum
&movups (&QWP(16*0,$out,$inp),$inout0); # store output
&movups (&QWP(16*1,$out,$inp),$inout1);
&jmp (&label("done"));
# Tail of aesni_ocb_encrypt: exactly three blocks left.  Offsets are
# chained from the last offset: L_[0], then the L_ entry selected by bsf
# of $block+1, then L_[0] again.
&set_label("three",16);
&lea ($i1,&DWP(1,$block));
&mov ($key,&DWP($key_off,"esp")); # restore key
&bsf ($i1,$i1);
&shl ($i1,4);
&movdqu ($inout3,&QWP(0,$l_));
&movdqu ($inout4,&QWP(0,$l_,$i1));
&movdqa ($inout5,$inout3);
&movdqu ($inout0,&QWP(16*0,$inp)); # load input
&movdqu ($inout1,&QWP(16*1,$inp));
&movdqu ($inout2,&QWP(16*2,$inp));
&mov ($rounds,&DWP(240,$key));
&pxor ($inout3,$rndkey0); # ^ last offset_i
&pxor ($inout4,$inout3);
&pxor ($inout5,$inout4);
&pxor ($rndkey1,$inout0); # checksum
&pxor ($inout0,$inout3); # ^ offset_i
&pxor ($rndkey1,$inout1);
&pxor ($inout1,$inout4);
&pxor ($rndkey1,$inout2);
&pxor ($inout2,$inout5);
# The checksum is kept in the stack frame across the cipher call.
&movdqa (&QWP($checksum,"esp"),$rndkey1);
&mov ($out,&DWP($out_off,"esp"));
&call ("_aesni_encrypt3");
&xorps ($inout0,$inout3); # ^ offset_i
&xorps ($inout1,$inout4);
&xorps ($inout2,$inout5);
&movdqa ($rndkey0,$inout5); # pass last offset_i
&movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
&movups (&QWP(16*0,$out,$inp),$inout0); # store output
&movups (&QWP(16*1,$out,$inp),$inout1);
&movups (&QWP(16*2,$out,$inp),$inout2);
&jmp (&label("done"));
# Tail of aesni_ocb_encrypt: exactly four blocks left.  Offsets are
# chained from the last offset: L_[0], the L_ entry selected by bsf of
# $block+1, L_[0], and the L_ entry selected by bsf of $block+3.  The
# first two offsets are spilled to the stack frame because $inout2 and
# $inout3 are needed for input data; the last two stay in $inout4 and
# $inout5.  Execution falls through to "done".
&set_label("four",16);
&lea ($i1,&DWP(1,$block));
&lea ($i3,&DWP(3,$block));
&bsf ($i1,$i1);
&bsf ($i3,$i3);
&mov ($key,&DWP($key_off,"esp")); # restore key
&shl ($i1,4);
&shl ($i3,4);
&movdqu ($inout2,&QWP(0,$l_));
&movdqu ($inout3,&QWP(0,$l_,$i1));
&movdqa ($inout4,$inout2);
&movdqu ($inout5,&QWP(0,$l_,$i3));
&pxor ($inout2,$rndkey0); # ^ last offset_i
&movdqu ($inout0,&QWP(16*0,$inp)); # load input
&pxor ($inout3,$inout2);
&movdqu ($inout1,&QWP(16*1,$inp));
&pxor ($inout4,$inout3);
&movdqa (&QWP(16*0,"esp"),$inout2);
&pxor ($inout5,$inout4);
&movdqa (&QWP(16*1,"esp"),$inout3);
&movdqu ($inout2,&QWP(16*2,$inp));
&movdqu ($inout3,&QWP(16*3,$inp));
&mov ($rounds,&DWP(240,$key));
&pxor ($rndkey1,$inout0); # checksum
&pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
&pxor ($rndkey1,$inout1);
&pxor ($inout1,&QWP(16*1,"esp"));
&pxor ($rndkey1,$inout2);
&pxor ($inout2,$inout4);
&pxor ($rndkey1,$inout3);
&pxor ($inout3,$inout5);
# The checksum is kept in the stack frame across the cipher call.
&movdqa (&QWP($checksum,"esp"),$rndkey1);
&mov ($out,&DWP($out_off,"esp"));
&call ("_aesni_encrypt4");
&xorps ($inout0,&QWP(16*0,"esp")); # ^ offset_i
&xorps ($inout1,&QWP(16*1,"esp"));
&xorps ($inout2,$inout4);
&movups (&QWP(16*0,$out,$inp),$inout0); # store output
&xorps ($inout3,$inout5);
&movups (&QWP(16*1,$out,$inp),$inout1);
&movdqa ($rndkey0,$inout5); # pass last offset_i
&movups (&QWP(16*2,$out,$inp),$inout2);
&movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
&movups (&QWP(16*3,$out,$inp),$inout3);
# Epilogue of aesni_ocb_encrypt: wipe the data registers and the stack
# scratch area, restore the caller's %esp, and write the updated offset
# ($rndkey0) and checksum ($rndkey1) back through the pointer arguments.
&set_label("done");
# Fetch the saved %esp before the frame is overwritten with zeros.
&mov ($key,&DWP($esp_off,"esp"));
&pxor ($inout0,$inout0); # clear register bank
&pxor ($inout1,$inout1);
&movdqa (&QWP(16*0,"esp"),$inout0); # clear stack
&pxor ($inout2,$inout2);
&movdqa (&QWP(16*1,"esp"),$inout0);
&pxor ($inout3,$inout3);
&movdqa (&QWP(16*2,"esp"),$inout0);
&pxor ($inout4,$inout4);
&movdqa (&QWP(16*3,"esp"),$inout0);
&pxor ($inout5,$inout5);
&movdqa (&QWP(16*4,"esp"),$inout0);
&movdqa (&QWP(16*5,"esp"),$inout0);
&movdqa (&QWP(16*6,"esp"),$inout0);
&lea ("esp",&DWP(0,$key));
&mov ($rounds,&wparam(5)); # &offset_i
&mov ($rounds_,&wparam(7)); # &checksum
&movdqu (&QWP(0,$rounds),$rndkey0);
&pxor ($rndkey0,$rndkey0);
&movdqu (&QWP(0,$rounds_),$rndkey1);
&pxor ($rndkey1,$rndkey1);
&function_end("aesni_ocb_encrypt");
# Prologue of aesni_ocb_decrypt: load the arguments, build a 16-byte
# aligned stack frame and, when start_block_num is even, decrypt one
# block up front so that $block is odd when the bulk/tail code is entered.
# Unlike the encrypt side, the checksum is updated with the cipher output
# (after the call) rather than with the input.
&function_begin("aesni_ocb_decrypt");
&mov ($rounds,&wparam(5)); # &offset_i
&mov ($rounds_,&wparam(7)); # &checksum
&mov ($inp,&wparam(0));
&mov ($out,&wparam(1));
&mov ($len,&wparam(2));
&mov ($key,&wparam(3));
&movdqu ($rndkey0,&QWP(0,$rounds)); # load offset_i
&mov ($block,&wparam(4)); # start_block_num
&movdqu ($rndkey1,&QWP(0,$rounds_)); # load checksum
&mov ($l_,&wparam(6)); # L_
# $rounds temporarily carries the caller's %esp; it is saved in the frame
# at $esp_off below and restored at the "done" label.
&mov ($rounds,"esp");
&sub ("esp",$esp_off+4); # alloca
&and ("esp",-16); # align stack
# From here on $out holds (out - inp), so output is addressed as
# ($out,$inp) and only $inp has to be advanced.
&sub ($out,$inp);
# Convert the block count to a byte count.
&shl ($len,4);
&lea ($len,&DWP(-16*6,$inp,$len)); # end of input - 16*6
&mov (&DWP($out_off,"esp"),$out);
&mov (&DWP($end_off,"esp"),$len);
&mov (&DWP($esp_off,"esp"),$rounds);
&mov ($rounds,&DWP(240,$key));
&test ($block,1);
&jnz (&label("odd"));
# Even start_block_num: handle a single block here, using the L_ entry
# selected by bsf of $block.
&bsf ($i3,$block);
&add ($block,1);
&shl ($i3,4);
&movdqu ($inout5,&QWP(0,$l_,$i3));
&mov ($i3,$key); # put aside key
&movdqu ($inout0,&QWP(16*0,$inp)); # load input
&lea ($inp,&DWP(16,$inp));
&pxor ($inout5,$rndkey0); # ^ last offset_i
&pxor ($inout0,$inout5); # ^ offset_i
# Park the checksum in $inout4 while the single-block cipher runs.
&movdqa ($inout4,$rndkey1);
if ($inline)
{ &aesni_inline_generate1("dec"); }
else
{ &call ("_aesni_decrypt1"); }
&xorps ($inout0,$inout5); # ^ offset_i
&movaps ($rndkey1,$inout4); # pass the checksum
&movdqa ($rndkey0,$inout5); # pass last offset_i
&xorps ($rndkey1,$inout0); # checksum
&movups (&QWP(-16,$out,$inp),$inout0); # store output
&mov ($rounds,&DWP(240,$i3));
&mov ($key,$i3); # restore key
# $i3 shares a register with $len, so the end pointer is reloaded.
&mov ($len,&DWP($end_off,"esp"));
&set_label("odd");
# Pre-compute the values used to index round keys in the bulk code:
# $key is moved to key+32+16*rounds and 16-16*rounds is kept on the stack.
&shl ($rounds,4);
&mov ($out,16);
&sub ($out,$rounds); # twisted rounds
&mov (&DWP($key_off,"esp"),$key);
&lea ($key,&DWP(32,$key,$rounds)); # end of key schedule
&mov (&DWP($rounds_off,"esp"),$out);
# Fewer than six blocks left (inp > end-16*6)?  Then skip the bulk loop.
&cmp ($inp,$len);
&ja (&label("short"));
&jmp (&label("grandloop"));
# Bulk loop of aesni_ocb_decrypt: six blocks per iteration.
#
# $block is odd on entry.  The six offsets are chained from the last
# offset ($rndkey0): lanes 0, 2 and 4 fold in L_[0], lanes 1, 3 and 5
# fold in the L_ entry selected by bsf of $block+1, +3 and +5.  The
# checksum is accumulated over the decrypted output.
&set_label("grandloop",32);
&lea ($i1,&DWP(1,$block));
&lea ($i3,&DWP(3,$block));
&lea ($i5,&DWP(5,$block));
&add ($block,6);
&bsf ($i1,$i1);
&bsf ($i3,$i3);
&bsf ($i5,$i5);
&shl ($i1,4);
&shl ($i3,4);
&shl ($i5,4);
&movdqu ($inout0,&QWP(0,$l_));
&movdqu ($inout1,&QWP(0,$l_,$i1));
# $i1 shares a register with $rounds; reload the twisted round counter
# now that the L_ index in it has been consumed.
&mov ($rounds,&DWP($rounds_off,"esp"));
&movdqa ($inout2,$inout0);
&movdqu ($inout3,&QWP(0,$l_,$i3));
&movdqa ($inout4,$inout0);
&movdqu ($inout5,&QWP(0,$l_,$i5));
&pxor ($inout0,$rndkey0); # ^ last offset_i
&pxor ($inout1,$inout0);
# The six offsets are kept in the stack frame; they are needed again
# after the cipher call.
&movdqa (&QWP(16*0,"esp"),$inout0);
&pxor ($inout2,$inout1);
&movdqa (&QWP(16*1,"esp"),$inout1);
&pxor ($inout3,$inout2);
&movdqa (&QWP(16*2,"esp"),$inout2);
&pxor ($inout4,$inout3);
&movdqa (&QWP(16*3,"esp"),$inout3);
&pxor ($inout5,$inout4);
&movdqa (&QWP(16*4,"esp"),$inout4);
&movdqa (&QWP(16*5,"esp"),$inout5);
# With $key = key+32+16*rounds and $rounds = 16-16*rounds, the
# displacements -48, -32 and -16 address round keys 0, 1 and 2.
&$movekey ($rndkey0,&QWP(-48,$key,$rounds));
&movdqu ($inout0,&QWP(16*0,$inp)); # load input
&movdqu ($inout1,&QWP(16*1,$inp));
&movdqu ($inout2,&QWP(16*2,$inp));
&movdqu ($inout3,&QWP(16*3,$inp));
&movdqu ($inout4,&QWP(16*4,$inp));
&movdqu ($inout5,&QWP(16*5,$inp));
&lea ($inp,&DWP(16*6,$inp));
# The checksum lives in the stack frame while $rndkey1 is used for
# round keys.
&movdqa (&QWP($checksum,"esp"),$rndkey1);
&pxor ($inout0,$rndkey0); # ^ roundkey[0]
&pxor ($inout1,$rndkey0);
&pxor ($inout2,$rndkey0);
&pxor ($inout3,$rndkey0);
&pxor ($inout4,$rndkey0);
&pxor ($inout5,$rndkey0);
&$movekey ($rndkey1,&QWP(-32,$key,$rounds));
&pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
&pxor ($inout1,&QWP(16*1,"esp"));
&pxor ($inout2,&QWP(16*2,"esp"));
&pxor ($inout3,&QWP(16*3,"esp"));
&pxor ($inout4,&QWP(16*4,"esp"));
&pxor ($inout5,&QWP(16*5,"esp"));
&$movekey ($rndkey0,&QWP(-16,$key,$rounds));
# The first round is issued here; the remaining rounds are performed by
# _aesni_decrypt6_enter, which is defined outside this section.
&aesdec ($inout0,$rndkey1);
&aesdec ($inout1,$rndkey1);
&aesdec ($inout2,$rndkey1);
&aesdec ($inout3,$rndkey1);
&aesdec ($inout4,$rndkey1);
&aesdec ($inout5,$rndkey1);
# $out and $len share registers with $i5 and $i3; reload both.
&mov ($out,&DWP($out_off,"esp"));
&mov ($len,&DWP($end_off,"esp"));
&call ("_aesni_decrypt6_enter");
&movdqa ($rndkey0,&QWP(16*5,"esp")); # pass last offset_i
&pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
&movdqa ($rndkey1,&QWP($checksum,"esp"));
&pxor ($inout1,&QWP(16*1,"esp"));
&pxor ($inout2,&QWP(16*2,"esp"));
&pxor ($inout3,&QWP(16*3,"esp"));
&pxor ($inout4,&QWP(16*4,"esp"));
&pxor ($inout5,$rndkey0);
&pxor ($rndkey1,$inout0); # checksum
&movdqu (&QWP(-16*6,$out,$inp),$inout0);# store output
&pxor ($rndkey1,$inout1);
&movdqu (&QWP(-16*5,$out,$inp),$inout1);
&pxor ($rndkey1,$inout2);
&movdqu (&QWP(-16*4,$out,$inp),$inout2);
&pxor ($rndkey1,$inout3);
&movdqu (&QWP(-16*3,$out,$inp),$inout3);
&pxor ($rndkey1,$inout4);
&movdqu (&QWP(-16*2,$out,$inp),$inout4);
&pxor ($rndkey1,$inout5);
&movdqu (&QWP(-16*1,$out,$inp),$inout5);
# Loop while at least six blocks remain (inp <= end-16*6).  This has to
# mirror the entry test ("ja short"): with a strict "jb", a remainder of
# exactly six blocks dropped into the tail code, which handles at most
# five blocks, and the final block was left unprocessed.
&cmp ($inp,$len); # done yet?
&jbe (&label("grandloop"));
# Tail dispatch of aesni_ocb_decrypt.  $len is turned into the number of
# bytes still to be processed; zero goes straight to "done", one to four
# blocks branch to dedicated paths, and the fall-through path below
# processes five blocks with the six-lane code.
&set_label("short");
&add ($len,16*6);
&sub ($len,$inp);
&jz (&label("done"));
&cmp ($len,16*2);
&jb (&label("one"));
&je (&label("two"));
&cmp ($len,16*4);
&jb (&label("three"));
&je (&label("four"));
# Five blocks: offsets are chained as in the bulk loop, lanes 0, 2 and 4
# using L_[0] and lanes 1 and 3 using the entries selected by bsf of
# $block+1 and $block+3.
&lea ($i1,&DWP(1,$block));
&lea ($i3,&DWP(3,$block));
&bsf ($i1,$i1);
&bsf ($i3,$i3);
&shl ($i1,4);
&shl ($i3,4);
&movdqu ($inout0,&QWP(0,$l_));
&movdqu ($inout1,&QWP(0,$l_,$i1));
&mov ($rounds,&DWP($rounds_off,"esp"));
&movdqa ($inout2,$inout0);
&movdqu ($inout3,&QWP(0,$l_,$i3));
&movdqa ($inout4,$inout0);
&pxor ($inout0,$rndkey0); # ^ last offset_i
&pxor ($inout1,$inout0);
&movdqa (&QWP(16*0,"esp"),$inout0);
&pxor ($inout2,$inout1);
&movdqa (&QWP(16*1,"esp"),$inout1);
&pxor ($inout3,$inout2);
&movdqa (&QWP(16*2,"esp"),$inout2);
&pxor ($inout4,$inout3);
&movdqa (&QWP(16*3,"esp"),$inout3);
# The result of the next instruction is not used: $inout5 is zeroed
# before the cipher call and no sixth offset is stored on this path.
&pxor ($inout5,$inout4);
&movdqa (&QWP(16*4,"esp"),$inout4);
&$movekey ($rndkey0,&QWP(-48,$key,$rounds));
&movdqu ($inout0,&QWP(16*0,$inp)); # load input
&movdqu ($inout1,&QWP(16*1,$inp));
&movdqu ($inout2,&QWP(16*2,$inp));
&movdqu ($inout3,&QWP(16*3,$inp));
&movdqu ($inout4,&QWP(16*4,$inp));
# Sixth lane carries no data on this path.
&pxor ($inout5,$inout5);
&movdqa (&QWP($checksum,"esp"),$rndkey1);
&pxor ($inout0,$rndkey0); # ^ roundkey[0]
&pxor ($inout1,$rndkey0);
&pxor ($inout2,$rndkey0);
&pxor ($inout3,$rndkey0);
&pxor ($inout4,$rndkey0);
&$movekey ($rndkey1,&QWP(-32,$key,$rounds));
&pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
&pxor ($inout1,&QWP(16*1,"esp"));
&pxor ($inout2,&QWP(16*2,"esp"));
&pxor ($inout3,&QWP(16*3,"esp"));
&pxor ($inout4,&QWP(16*4,"esp"));
&$movekey ($rndkey0,&QWP(-16,$key,$rounds));
&aesdec ($inout0,$rndkey1);
&aesdec ($inout1,$rndkey1);
&aesdec ($inout2,$rndkey1);
&aesdec ($inout3,$rndkey1);
&aesdec ($inout4,$rndkey1);
&aesdec ($inout5,$rndkey1);
&mov ($out,&DWP($out_off,"esp"));
&call ("_aesni_decrypt6_enter");
&movdqa ($rndkey0,&QWP(16*4,"esp")); # pass last offset_i
&pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
&movdqa ($rndkey1,&QWP($checksum,"esp"));
&pxor ($inout1,&QWP(16*1,"esp"));
&pxor ($inout2,&QWP(16*2,"esp"));
&pxor ($inout3,&QWP(16*3,"esp"));
&pxor ($inout4,$rndkey0);
# $inp was not advanced on this path, hence the positive displacements.
&pxor ($rndkey1,$inout0); # checksum
&movdqu (&QWP(16*0,$out,$inp),$inout0); # store output
&pxor ($rndkey1,$inout1);
&movdqu (&QWP(16*1,$out,$inp),$inout1);
&pxor ($rndkey1,$inout2);
&movdqu (&QWP(16*2,$out,$inp),$inout2);
&pxor ($rndkey1,$inout3);
&movdqu (&QWP(16*3,$out,$inp),$inout3);
&pxor ($rndkey1,$inout4);
&movdqu (&QWP(16*4,$out,$inp),$inout4);
&jmp (&label("done"));
# Tail of aesni_ocb_decrypt: exactly one block left.  Its offset is the
# last offset XOR L_[0]; the checksum is updated with the decrypted block.
&set_label("one",16);
&movdqu ($inout5,&QWP(0,$l_));
&mov ($key,&DWP($key_off,"esp")); # restore key
&movdqu ($inout0,&QWP(16*0,$inp)); # load input
&mov ($rounds,&DWP(240,$key));
&pxor ($inout5,$rndkey0); # ^ last offset_i
&pxor ($inout0,$inout5); # ^ offset_i
# Park the checksum in $inout4 while the single-block cipher runs.
&movdqa ($inout4,$rndkey1);
&mov ($out,&DWP($out_off,"esp"));
if ($inline)
{ &aesni_inline_generate1("dec"); }
else
{ &call ("_aesni_decrypt1"); }
&xorps ($inout0,$inout5); # ^ offset_i
&movaps ($rndkey1,$inout4); # pass the checksum
&movdqa ($rndkey0,$inout5); # pass last offset_i
&xorps ($rndkey1,$inout0); # checksum
&movups (&QWP(0,$out,$inp),$inout0);
&jmp (&label("done"));
# Tail of aesni_ocb_decrypt: exactly two blocks left.  The first offset
# is the last offset XOR L_[0]; the second additionally folds in the L_
# entry selected by bsf of $block+1.
&set_label("two",16);
&lea ($i1,&DWP(1,$block));
&mov ($key,&DWP($key_off,"esp")); # restore key
&bsf ($i1,$i1);
&shl ($i1,4);
&movdqu ($inout4,&QWP(0,$l_));
&movdqu ($inout5,&QWP(0,$l_,$i1));
&movdqu ($inout0,&QWP(16*0,$inp)); # load input
&movdqu ($inout1,&QWP(16*1,$inp));
&mov ($rounds,&DWP(240,$key));
# Park the checksum in $inout3; it is updated there after the call and
# moved back to $rndkey1 at the end of this path.
&movdqa ($inout3,$rndkey1);
&pxor ($inout4,$rndkey0); # ^ last offset_i
&pxor ($inout5,$inout4);
&pxor ($inout0,$inout4); # ^ offset_i
&pxor ($inout1,$inout5);
&mov ($out,&DWP($out_off,"esp"));
&call ("_aesni_decrypt2");
&xorps ($inout0,$inout4); # ^ offset_i
&xorps ($inout1,$inout5);
&movdqa ($rndkey0,$inout5); # pass last offset_i
&xorps ($inout3,$inout0); # checksum
&movups (&QWP(16*0,$out,$inp),$inout0); # store output
&xorps ($inout3,$inout1);
&movups (&QWP(16*1,$out,$inp),$inout1);
&movaps ($rndkey1,$inout3); # pass the checksum
&jmp (&label("done"));
# Tail of aesni_ocb_decrypt: exactly three blocks left.  Offsets are
# chained from the last offset: L_[0], then the L_ entry selected by bsf
# of $block+1, then L_[0] again.
&set_label("three",16);
&lea ($i1,&DWP(1,$block));
&mov ($key,&DWP($key_off,"esp")); # restore key
&bsf ($i1,$i1);
&shl ($i1,4);
&movdqu ($inout3,&QWP(0,$l_));
&movdqu ($inout4,&QWP(0,$l_,$i1));
&movdqa ($inout5,$inout3);
&movdqu ($inout0,&QWP(16*0,$inp)); # load input
&movdqu ($inout1,&QWP(16*1,$inp));
&movdqu ($inout2,&QWP(16*2,$inp));
&mov ($rounds,&DWP(240,$key));
# The checksum is kept in the stack frame across the cipher call.
&movdqa (&QWP($checksum,"esp"),$rndkey1);
&pxor ($inout3,$rndkey0); # ^ last offset_i
&pxor ($inout4,$inout3);
&pxor ($inout5,$inout4);
&pxor ($inout0,$inout3); # ^ offset_i
&pxor ($inout1,$inout4);
&pxor ($inout2,$inout5);
&mov ($out,&DWP($out_off,"esp"));
&call ("_aesni_decrypt3");
&movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
&xorps ($inout0,$inout3); # ^ offset_i
&xorps ($inout1,$inout4);
&xorps ($inout2,$inout5);
&movups (&QWP(16*0,$out,$inp),$inout0); # store output
&pxor ($rndkey1,$inout0); # checksum
&movdqa ($rndkey0,$inout5); # pass last offset_i
&movups (&QWP(16*1,$out,$inp),$inout1);
&pxor ($rndkey1,$inout1);
&movups (&QWP(16*2,$out,$inp),$inout2);
&pxor ($rndkey1,$inout2);
&jmp (&label("done"));
# Tail of aesni_ocb_decrypt: exactly four blocks left.  Offsets are
# chained from the last offset: L_[0], the L_ entry selected by bsf of
# $block+1, L_[0], and the L_ entry selected by bsf of $block+3.  The
# first two offsets are spilled to the stack frame because $inout2 and
# $inout3 are needed for input data; the last two stay in $inout4 and
# $inout5.  Execution falls through to "done".
&set_label("four",16);
&lea ($i1,&DWP(1,$block));
&lea ($i3,&DWP(3,$block));
&bsf ($i1,$i1);
&bsf ($i3,$i3);
&mov ($key,&DWP($key_off,"esp")); # restore key
&shl ($i1,4);
&shl ($i3,4);
&movdqu ($inout2,&QWP(0,$l_));
&movdqu ($inout3,&QWP(0,$l_,$i1));
&movdqa ($inout4,$inout2);
&movdqu ($inout5,&QWP(0,$l_,$i3));
&pxor ($inout2,$rndkey0); # ^ last offset_i
&movdqu ($inout0,&QWP(16*0,$inp)); # load input
&pxor ($inout3,$inout2);
&movdqu ($inout1,&QWP(16*1,$inp));
&pxor ($inout4,$inout3);
&movdqa (&QWP(16*0,"esp"),$inout2);
&pxor ($inout5,$inout4);
&movdqa (&QWP(16*1,"esp"),$inout3);
&movdqu ($inout2,&QWP(16*2,$inp));
&movdqu ($inout3,&QWP(16*3,$inp));
&mov ($rounds,&DWP(240,$key));
# The checksum is kept in the stack frame across the cipher call.
&movdqa (&QWP($checksum,"esp"),$rndkey1);
&pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
&pxor ($inout1,&QWP(16*1,"esp"));
&pxor ($inout2,$inout4);
&pxor ($inout3,$inout5);
&mov ($out,&DWP($out_off,"esp"));
&call ("_aesni_decrypt4");
&movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
&xorps ($inout0,&QWP(16*0,"esp")); # ^ offset_i
&xorps ($inout1,&QWP(16*1,"esp"));
&xorps ($inout2,$inout4);
&movups (&QWP(16*0,$out,$inp),$inout0); # store output
&pxor ($rndkey1,$inout0); # checksum
&xorps ($inout3,$inout5);
&movups (&QWP(16*1,$out,$inp),$inout1);
&pxor ($rndkey1,$inout1);
&movdqa ($rndkey0,$inout5); # pass last offset_i
&movups (&QWP(16*2,$out,$inp),$inout2);
&pxor ($rndkey1,$inout2);
&movups (&QWP(16*3,$out,$inp),$inout3);
&pxor ($rndkey1,$inout3);
# Epilogue of aesni_ocb_decrypt: wipe the data registers and the stack
# scratch area, restore the caller's %esp, and write the updated offset
# ($rndkey0) and checksum ($rndkey1) back through the pointer arguments.
&set_label("done");
# Fetch the saved %esp before the frame is overwritten with zeros.
&mov ($key,&DWP($esp_off,"esp"));
&pxor ($inout0,$inout0); # clear register bank
&pxor ($inout1,$inout1);
&movdqa (&QWP(16*0,"esp"),$inout0); # clear stack
&pxor ($inout2,$inout2);
&movdqa (&QWP(16*1,"esp"),$inout0);
&pxor ($inout3,$inout3);
&movdqa (&QWP(16*2,"esp"),$inout0);
&pxor ($inout4,$inout4);
&movdqa (&QWP(16*3,"esp"),$inout0);
&pxor ($inout5,$inout5);
&movdqa (&QWP(16*4,"esp"),$inout0);
&movdqa (&QWP(16*5,"esp"),$inout0);
&movdqa (&QWP(16*6,"esp"),$inout0);
&lea ("esp",&DWP(0,$key));
&mov ($rounds,&wparam(5)); # &offset_i
&mov ($rounds_,&wparam(7)); # &checksum
&movdqu (&QWP(0,$rounds),$rndkey0);
&pxor ($rndkey0,$rndkey0);
&movdqu (&QWP(0,$rounds_),$rndkey1);
&pxor ($rndkey1,$rndkey1);
&function_end("aesni_ocb_decrypt");
}
}
######################################################################
@@ -2419,7 +3305,7 @@ if ($PREFIX eq "aesni") {
&pxor ("xmm3","xmm3");
&aesenclast ("xmm2","xmm3");
&movdqa ("xmm3","xmm1")
&movdqa ("xmm3","xmm1");
&pslldq ("xmm1",4);
&pxor ("xmm3","xmm1");
&pslldq ("xmm1",4);
@@ -2523,3 +3409,5 @@ if ($PREFIX eq "aesni") {
&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();
close STDOUT;

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,4 +1,11 @@
#!/usr/bin/env perl
#! /usr/bin/env perl
# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by David S. Miller <davem@davemloft.net> and Andy Polyakov
@@ -68,7 +75,8 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "sparcv9_modes.pl";
&asm_init(@ARGV);
$output = pop;
open STDOUT,">$output";
$::evp=1; # if $evp is set to 0, script generates module with
# AES_[en|de]crypt, AES_set_[en|de]crypt_key and AES_cbc_encrypt entry
@@ -83,12 +91,14 @@ $::evp=1; # if $evp is set to 0, script generates module with
{
my ($inp,$out,$key,$rounds,$tmp,$mask)=map("%o$_",(0..5));
$code.=<<___ if ($::abibits==64);
$code.=<<___;
#include "sparc_arch.h"
#ifdef __arch64__
.register %g2,#scratch
.register %g3,#scratch
#endif
___
$code.=<<___;
.text
.globl aes_t4_encrypt

View File

@@ -1,4 +1,11 @@
#!/usr/bin/env perl
#! /usr/bin/env perl
# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -27,12 +34,21 @@
# Cortex-A53 1.32 1.29 1.46
# Cortex-A57(*) 1.95 0.85 0.93
# Denver 1.96 0.86 0.80
# Mongoose 1.33 1.20 1.20
#
# (*) original 3.64/1.34/1.32 results were for r0p0 revision
# and are still same even for updated module;
$flavour = shift;
open STDOUT,">".shift;
$output = shift;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
$prefix="aes_v8";
@@ -43,9 +59,12 @@ $code=<<___;
.text
___
$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
$code.=".arch armv7-a\n.fpu neon\n.code 32\n" if ($flavour !~ /64/);
#^^^^^^ this is done to simplify adoption by not depending
# on latest binutils.
$code.=<<___ if ($flavour !~ /64/);
.arch armv7-a // don't confuse not-so-latest binutils with argv8 :-)
.fpu neon
.code 32
#undef __thumb2__
___
# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
# NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
@@ -60,7 +79,7 @@ my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
$code.=<<___;
.align 5
rcon:
.Lrcon:
.long 0x01,0x01,0x01,0x01
.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
.long 0x1b,0x1b,0x1b,0x1b
@@ -89,7 +108,7 @@ $code.=<<___;
tst $bits,#0x3f
b.ne .Lenc_key_abort
adr $ptr,rcon
adr $ptr,.Lrcon
cmp $bits,#192
veor $zero,$zero,$zero

View File

@@ -1,4 +1,11 @@
#!/usr/bin/env perl
#! /usr/bin/env perl
# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -47,8 +54,20 @@
#
# <ard.biesheuvel@linaro.org>
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
if ($flavour && $flavour ne "void") {
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
open STDOUT,">$output";
}
my ($inp,$out,$len,$key)=("r0","r1","r2","r3");
my @XMM=map("q$_",(0..15));
@@ -702,7 +721,7 @@ $code.=<<___;
# define BSAES_ASM_EXTENDED_KEY
# define XTS_CHAIN_TWEAK
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif
#ifdef __thumb__
@@ -715,10 +734,11 @@ $code.=<<___;
.text
.syntax unified @ ARMv7-capable assembler is expected to handle this
#ifdef __thumb2__
#if defined(__thumb2__) && !defined(__APPLE__)
.thumb
#else
.code 32
# undef __thumb2__
#endif
.type _bsaes_decrypt8,%function
@@ -726,7 +746,11 @@ $code.=<<___;
_bsaes_decrypt8:
adr $const,_bsaes_decrypt8
vldmia $key!, {@XMM[9]} @ round 0 key
#ifdef __APPLE__
adr $const,.LM0ISR
#else
add $const,$const,#.LM0ISR-_bsaes_decrypt8
#endif
vldmia $const!, {@XMM[8]} @ .LM0ISR
veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key
@@ -821,7 +845,11 @@ _bsaes_const:
_bsaes_encrypt8:
adr $const,_bsaes_encrypt8
vldmia $key!, {@XMM[9]} @ round 0 key
#ifdef __APPLE__
adr $const,.LM0SR
#else
sub $const,$const,#_bsaes_encrypt8-.LM0SR
#endif
vldmia $const!, {@XMM[8]} @ .LM0SR
_bsaes_encrypt8_alt:
@@ -925,7 +953,11 @@ $code.=<<___;
_bsaes_key_convert:
adr $const,_bsaes_key_convert
vld1.8 {@XMM[7]}, [$inp]! @ load round 0 key
#ifdef __APPLE__
adr $const,.LM0
#else
sub $const,$const,#_bsaes_key_convert-.LM0
#endif
vld1.8 {@XMM[15]}, [$inp]! @ load round 1 key
vmov.i8 @XMM[8], #0x01 @ bit masks
@@ -1333,7 +1365,7 @@ bsaes_cbc_encrypt:
vmov @XMM[4],@XMM[15] @ just in case ensure that IV
vmov @XMM[5],@XMM[0] @ and input are preserved
bl AES_decrypt
vld1.8 {@XMM[0]}, [$fp,:64] @ load result
vld1.8 {@XMM[0]}, [$fp] @ load result
veor @XMM[0], @XMM[0], @XMM[4] @ ^= IV
vmov @XMM[15], @XMM[5] @ @XMM[5] holds input
vst1.8 {@XMM[0]}, [$rounds] @ write output
@@ -1392,7 +1424,12 @@ bsaes_ctr32_encrypt_blocks:
vstmia r12, {@XMM[7]} @ save last round key
vld1.8 {@XMM[0]}, [$ctr] @ load counter
#ifdef __APPLE__
mov $ctr, #:lower16:(.LREVM0SR-.LM0)
add $ctr, $const, $ctr
#else
add $ctr, $const, #.LREVM0SR-.LM0 @ borrow $ctr
#endif
vldmia $keysched, {@XMM[4]} @ load round0 key
#else
ldr r12, [$key, #244]
@@ -1449,7 +1486,12 @@ bsaes_ctr32_encrypt_blocks:
vldmia $ctr, {@XMM[8]} @ .LREVM0SR
mov r5, $rounds @ pass rounds
vstmia $fp, {@XMM[10]} @ save next counter
#ifdef __APPLE__
mov $const, #:lower16:(.LREVM0SR-.LSR)
sub $const, $ctr, $const
#else
sub $const, $ctr, #.LREVM0SR-.LSR @ pass constants
#endif
bl _bsaes_encrypt8_alt
@@ -1550,7 +1592,7 @@ bsaes_ctr32_encrypt_blocks:
rev r8, r8
#endif
sub sp, sp, #0x10
vst1.8 {@XMM[1]}, [sp,:64] @ copy counter value
vst1.8 {@XMM[1]}, [sp] @ copy counter value
sub sp, sp, #0x10
.Lctr_enc_short_loop:
@@ -1561,7 +1603,7 @@ bsaes_ctr32_encrypt_blocks:
bl AES_encrypt
vld1.8 {@XMM[0]}, [r4]! @ load input
vld1.8 {@XMM[1]}, [sp,:64] @ load encrypted counter
vld1.8 {@XMM[1]}, [sp] @ load encrypted counter
add r8, r8, #1
#ifdef __ARMEL__
rev r0, r8
@@ -2068,9 +2110,11 @@ bsaes_xts_decrypt:
vld1.8 {@XMM[8]}, [r0] @ initial tweak
adr $magic, .Lxts_magic
#ifndef XTS_CHAIN_TWEAK
tst $len, #0xf @ if not multiple of 16
it ne @ Thumb2 thing, sanity check in ARM
subne $len, #0x10 @ subtract another 16 bytes
#endif
subs $len, #0x80
blo .Lxts_dec_short

View File

@@ -1,4 +1,11 @@
#!/usr/bin/env perl
#! /usr/bin/env perl
# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
###################################################################
### AES-128 [originally in CTR mode] ###
@@ -41,6 +48,7 @@
# Nehalem(**) 7.63 6.88 +11%
# Atom 17.1 16.4 +4%
# Silvermont - 12.9
# Goldmont - 8.85
#
# (*) Comparison is not completely fair, because "this" is ECB,
# i.e. no extra processing such as counter values calculation
@@ -80,6 +88,7 @@
# Nehalem 7.80
# Atom 17.9
# Silvermont 14.0
# Goldmont 10.2
#
# November 2011.
#
@@ -99,7 +108,7 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
open OUT,"| \"$^X\" $xlate $flavour $output";
# Route everything this script prints through the perlasm translator:
# OUT is a write pipe to "$^X $xlate $flavour $output" ($^X is the running
# perl binary), and STDOUT is then aliased to that pipe.
# Die if the pipe cannot be created; otherwise every later print would go
# to an unopened handle and the script would finish without any output.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");

File diff suppressed because it is too large Load Diff

View File

@@ -1,4 +1,11 @@
#!/usr/bin/env perl
#! /usr/bin/env perl
# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
######################################################################
## Constant-time SSSE3 AES core implementation.
@@ -14,7 +21,8 @@
# 128-bit key.
#
# aes-ppc.pl this
# G4e 35.5/52.1/(23.8) 11.9(*)/15.4
# PPC74x0/G4e 35.5/52.1/(23.8) 11.9(*)/15.4
# PPC970/G5 37.9/55.0/(28.5) 22.2/28.5
# POWER6 42.7/54.3/(28.2) 63.0/92.8(**)
# POWER7 32.3/42.9/(18.4) 18.5/23.3
#

View File

@@ -1,4 +1,11 @@
#!/usr/bin/env perl
#! /usr/bin/env perl
# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
######################################################################
## Constant-time SSSE3 AES core implementation.
@@ -51,6 +58,10 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
# The last command-line argument names the output file (pop at file scope
# takes it off @ARGV). STDOUT is aliased to that file so that everything
# printed from here on lands in it.
# The 2-arg open form is kept as-is; the only change is that a failed open
# now aborts instead of silently discarding the generated assembly.
$output = pop;
open OUT,">$output" or die "can't open $output: $!";
*STDOUT=*OUT;
&asm_init($ARGV[0],"vpaes-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");
$PREFIX="vpaes";
@@ -901,3 +912,5 @@ $k_dsbo=0x2c0; # decryption sbox final output
&function_end("${PREFIX}_cbc_encrypt");
&asm_finish();

# Buffered write errors (for example a full disk) may only surface when the
# handle is closed, so a failed close must fail the generator run rather
# than leave a truncated output file behind.
close STDOUT or die "error closing STDOUT: $!";

View File

@@ -1,4 +1,11 @@
#!/usr/bin/env perl
#! /usr/bin/env perl
# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
######################################################################
## Constant-time SSSE3 AES core implementation.
@@ -31,6 +38,7 @@
# Nehalem 29.6/40.3/14.6 10.0/11.8
# Atom 57.3/74.2/32.1 60.9/77.2(***)
# Silvermont 52.7/64.0/19.5 48.8/60.8(***)
# Goldmont 38.9/49.0/17.8 10.6/12.6
#
# (*) "Hyper-threading" in the context refers rather to cache shared
# among multiple cores, than to specifically Intel HTT. As vast
@@ -57,7 +65,7 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
open OUT,"| \"$^X\" $xlate $flavour $output";
# Route everything this script prints through the perlasm translator:
# OUT is a write pipe to "$^X $xlate $flavour $output" ($^X is the running
# perl binary), and STDOUT is then aliased to that pipe.
# Die if the pipe cannot be created; otherwise every later print would go
# to an unopened handle and the script would finish without any output.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
$PREFIX="vpaes";