Import Tcl 8.6.10

This commit is contained in:
Steve Dower
2020-09-24 22:53:56 +01:00
parent 0343d03b22
commit 3bb8e3e086
1005 changed files with 593700 additions and 41637 deletions

View File

@@ -1,2 +1,2 @@
ml64.exe /Flinffasx64 /c /Zi inffasx64.asm
ml64.exe /Flgvmat64 /c /Zi gvmat64.asm
ml64.exe /Flinffasx64 /c /Zi inffasx64.asm
ml64.exe /Flgvmat64 /c /Zi gvmat64.asm

File diff suppressed because it is too large Load Diff

View File

@@ -1,186 +1,186 @@
/* inffas8664.c is a hand tuned assembler version of inffast.c - fast decoding
* version for AMD64 on Windows using Microsoft C compiler
*
* Copyright (C) 1995-2003 Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*
* Copyright (C) 2003 Chris Anderson <christop@charm.net>
* Please use the copyright conditions above.
*
* 2005 - Adaptation to Microsoft C Compiler for AMD64 by Gilles Vollant
*
* inffas8664.c call function inffas8664fnc in inffasx64.asm
* inffasx64.asm is automatically convert from AMD64 portion of inffas86.c
*
* Dec-29-2003 -- I added AMD64 inflate asm support. This version is also
* slightly quicker on x86 systems because, instead of using rep movsb to copy
* data, it uses rep movsw, which moves data in 2-byte chunks instead of single
* bytes. I've tested the AMD64 code on a Fedora Core 1 + the x86_64 updates
* from http://fedora.linux.duke.edu/fc1_x86_64
* which is running on an Athlon 64 3000+ / Gigabyte GA-K8VT800M system with
* 1GB ram. The 64-bit version is about 4% faster than the 32-bit version,
* when decompressing mozilla-source-1.3.tar.gz.
*
* Mar-13-2003 -- Most of this is derived from inffast.S which is derived from
* the gcc -S output of zlib-1.2.0/inffast.c. Zlib-1.2.0 is in beta release at
* the moment. I have successfully compiled and tested this code with gcc2.96,
* gcc3.2, icc5.0, msvc6.0. It is very close to the speed of inffast.S
* compiled with gcc -DNO_MMX, but inffast.S is still faster on the P3 with MMX
* enabled. I will attempt to merge the MMX code into this version. Newer
* versions of this and inffast.S can be found at
* http://www.eetbeetee.com/zlib/ and http://www.charm.net/~christop/zlib/
*
*/
#include <stdio.h>
#include "zutil.h"
#include "inftrees.h"
#include "inflate.h"
#include "inffast.h"
/* Mark Adler's comments from inffast.c: */
/*
Decode literal, length, and distance codes and write out the resulting
literal and match bytes until either not enough input or output is
available, an end-of-block is encountered, or a data error is encountered.
When large enough input and output buffers are supplied to inflate(), for
example, a 16K input buffer and a 64K output buffer, more than 95% of the
inflate execution time is spent in this routine.
Entry assumptions:
state->mode == LEN
strm->avail_in >= 6
strm->avail_out >= 258
start >= strm->avail_out
state->bits < 8
On return, state->mode is one of:
LEN -- ran out of enough output space or enough available input
TYPE -- reached end of block code, inflate() to interpret next block
BAD -- error in block data
Notes:
- The maximum input bits used by a length/distance pair is 15 bits for the
length code, 5 bits for the length extra, 15 bits for the distance code,
and 13 bits for the distance extra. This totals 48 bits, or six bytes.
Therefore if strm->avail_in >= 6, then there is enough input to avoid
checking for available input while decoding.
- The maximum bytes that a single length/distance pair can output is 258
bytes, which is the maximum length that can be coded. inflate_fast()
requires strm->avail_out >= 258 for each loop to avoid checking for
output space.
*/
typedef struct inffast_ar {
/* 64 32 x86 x86_64 */
/* ar offset register */
/* 0 0 */ void *esp; /* esp save */
/* 8 4 */ void *ebp; /* ebp save */
/* 16 8 */ unsigned char FAR *in; /* esi rsi local strm->next_in */
/* 24 12 */ unsigned char FAR *last; /* r9 while in < last */
/* 32 16 */ unsigned char FAR *out; /* edi rdi local strm->next_out */
/* 40 20 */ unsigned char FAR *beg; /* inflate()'s init next_out */
/* 48 24 */ unsigned char FAR *end; /* r10 while out < end */
/* 56 28 */ unsigned char FAR *window;/* size of window, wsize!=0 */
/* 64 32 */ code const FAR *lcode; /* ebp rbp local strm->lencode */
/* 72 36 */ code const FAR *dcode; /* r11 local strm->distcode */
/* 80 40 */ size_t /*unsigned long */hold; /* edx rdx local strm->hold */
/* 88 44 */ unsigned bits; /* ebx rbx local strm->bits */
/* 92 48 */ unsigned wsize; /* window size */
/* 96 52 */ unsigned write; /* window write index */
/*100 56 */ unsigned lmask; /* r12 mask for lcode */
/*104 60 */ unsigned dmask; /* r13 mask for dcode */
/*108 64 */ unsigned len; /* r14 match length */
/*112 68 */ unsigned dist; /* r15 match distance */
/*116 72 */ unsigned status; /* set when state chng*/
} type_ar;
#ifdef ASMINF
void inflate_fast(strm, start)
z_streamp strm;
unsigned start; /* inflate()'s starting value for strm->avail_out */
{
struct inflate_state FAR *state;
type_ar ar;
void inffas8664fnc(struct inffast_ar * par);
#if (defined( __GNUC__ ) && defined( __amd64__ ) && ! defined( __i386 )) || (defined(_MSC_VER) && defined(_M_AMD64))
#define PAD_AVAIL_IN 6
#define PAD_AVAIL_OUT 258
#else
#define PAD_AVAIL_IN 5
#define PAD_AVAIL_OUT 257
#endif
/* copy state to local variables */
state = (struct inflate_state FAR *)strm->state;
ar.in = strm->next_in;
ar.last = ar.in + (strm->avail_in - PAD_AVAIL_IN);
ar.out = strm->next_out;
ar.beg = ar.out - (start - strm->avail_out);
ar.end = ar.out + (strm->avail_out - PAD_AVAIL_OUT);
ar.wsize = state->wsize;
ar.write = state->wnext;
ar.window = state->window;
ar.hold = state->hold;
ar.bits = state->bits;
ar.lcode = state->lencode;
ar.dcode = state->distcode;
ar.lmask = (1U << state->lenbits) - 1;
ar.dmask = (1U << state->distbits) - 1;
/* decode literals and length/distances until end-of-block or not enough
input data or output space */
/* align in on 1/2 hold size boundary */
while (((size_t)(void *)ar.in & (sizeof(ar.hold) / 2 - 1)) != 0) {
ar.hold += (unsigned long)*ar.in++ << ar.bits;
ar.bits += 8;
}
inffas8664fnc(&ar);
if (ar.status > 1) {
if (ar.status == 2)
strm->msg = "invalid literal/length code";
else if (ar.status == 3)
strm->msg = "invalid distance code";
else
strm->msg = "invalid distance too far back";
state->mode = BAD;
}
else if ( ar.status == 1 ) {
state->mode = TYPE;
}
/* return unused bytes (on entry, bits < 8, so in won't go too far back) */
ar.len = ar.bits >> 3;
ar.in -= ar.len;
ar.bits -= ar.len << 3;
ar.hold &= (1U << ar.bits) - 1;
/* update state and return */
strm->next_in = ar.in;
strm->next_out = ar.out;
strm->avail_in = (unsigned)(ar.in < ar.last ?
PAD_AVAIL_IN + (ar.last - ar.in) :
PAD_AVAIL_IN - (ar.in - ar.last));
strm->avail_out = (unsigned)(ar.out < ar.end ?
PAD_AVAIL_OUT + (ar.end - ar.out) :
PAD_AVAIL_OUT - (ar.out - ar.end));
state->hold = (unsigned long)ar.hold;
state->bits = ar.bits;
return;
}
#endif
/* inffas8664.c is a hand tuned assembler version of inffast.c - fast decoding
* version for AMD64 on Windows using Microsoft C compiler
*
* Copyright (C) 1995-2003 Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*
* Copyright (C) 2003 Chris Anderson <christop@charm.net>
* Please use the copyright conditions above.
*
* 2005 - Adaptation to Microsoft C Compiler for AMD64 by Gilles Vollant
*
* inffas8664.c call function inffas8664fnc in inffasx64.asm
* inffasx64.asm is automatically convert from AMD64 portion of inffas86.c
*
* Dec-29-2003 -- I added AMD64 inflate asm support. This version is also
* slightly quicker on x86 systems because, instead of using rep movsb to copy
* data, it uses rep movsw, which moves data in 2-byte chunks instead of single
* bytes. I've tested the AMD64 code on a Fedora Core 1 + the x86_64 updates
* from http://fedora.linux.duke.edu/fc1_x86_64
* which is running on an Athlon 64 3000+ / Gigabyte GA-K8VT800M system with
* 1GB ram. The 64-bit version is about 4% faster than the 32-bit version,
* when decompressing mozilla-source-1.3.tar.gz.
*
* Mar-13-2003 -- Most of this is derived from inffast.S which is derived from
* the gcc -S output of zlib-1.2.0/inffast.c. Zlib-1.2.0 is in beta release at
* the moment. I have successfully compiled and tested this code with gcc2.96,
* gcc3.2, icc5.0, msvc6.0. It is very close to the speed of inffast.S
* compiled with gcc -DNO_MMX, but inffast.S is still faster on the P3 with MMX
* enabled. I will attempt to merge the MMX code into this version. Newer
* versions of this and inffast.S can be found at
* http://www.eetbeetee.com/zlib/ and http://www.charm.net/~christop/zlib/
*
*/
#include <stdio.h>
#include "zutil.h"
#include "inftrees.h"
#include "inflate.h"
#include "inffast.h"
/* Mark Adler's comments from inffast.c: */
/*
Decode literal, length, and distance codes and write out the resulting
literal and match bytes until either not enough input or output is
available, an end-of-block is encountered, or a data error is encountered.
When large enough input and output buffers are supplied to inflate(), for
example, a 16K input buffer and a 64K output buffer, more than 95% of the
inflate execution time is spent in this routine.
Entry assumptions:
state->mode == LEN
strm->avail_in >= 6
strm->avail_out >= 258
start >= strm->avail_out
state->bits < 8
On return, state->mode is one of:
LEN -- ran out of enough output space or enough available input
TYPE -- reached end of block code, inflate() to interpret next block
BAD -- error in block data
Notes:
- The maximum input bits used by a length/distance pair is 15 bits for the
length code, 5 bits for the length extra, 15 bits for the distance code,
and 13 bits for the distance extra. This totals 48 bits, or six bytes.
Therefore if strm->avail_in >= 6, then there is enough input to avoid
checking for available input while decoding.
- The maximum bytes that a single length/distance pair can output is 258
bytes, which is the maximum length that can be coded. inflate_fast()
requires strm->avail_out >= 258 for each loop to avoid checking for
output space.
*/
typedef struct inffast_ar {
/* 64 32 x86 x86_64 */
/* ar offset register */
/* 0 0 */ void *esp; /* esp save */
/* 8 4 */ void *ebp; /* ebp save */
/* 16 8 */ unsigned char FAR *in; /* esi rsi local strm->next_in */
/* 24 12 */ unsigned char FAR *last; /* r9 while in < last */
/* 32 16 */ unsigned char FAR *out; /* edi rdi local strm->next_out */
/* 40 20 */ unsigned char FAR *beg; /* inflate()'s init next_out */
/* 48 24 */ unsigned char FAR *end; /* r10 while out < end */
/* 56 28 */ unsigned char FAR *window;/* size of window, wsize!=0 */
/* 64 32 */ code const FAR *lcode; /* ebp rbp local strm->lencode */
/* 72 36 */ code const FAR *dcode; /* r11 local strm->distcode */
/* 80 40 */ size_t /*unsigned long */hold; /* edx rdx local strm->hold */
/* 88 44 */ unsigned bits; /* ebx rbx local strm->bits */
/* 92 48 */ unsigned wsize; /* window size */
/* 96 52 */ unsigned write; /* window write index */
/*100 56 */ unsigned lmask; /* r12 mask for lcode */
/*104 60 */ unsigned dmask; /* r13 mask for dcode */
/*108 64 */ unsigned len; /* r14 match length */
/*112 68 */ unsigned dist; /* r15 match distance */
/*116 72 */ unsigned status; /* set when state chng*/
} type_ar;
#ifdef ASMINF
void inflate_fast(strm, start)
z_streamp strm;
unsigned start; /* inflate()'s starting value for strm->avail_out */
{
struct inflate_state FAR *state;
type_ar ar;
void inffas8664fnc(struct inffast_ar * par);
#if (defined( __GNUC__ ) && defined( __amd64__ ) && ! defined( __i386 )) || (defined(_MSC_VER) && defined(_M_AMD64))
#define PAD_AVAIL_IN 6
#define PAD_AVAIL_OUT 258
#else
#define PAD_AVAIL_IN 5
#define PAD_AVAIL_OUT 257
#endif
/* copy state to local variables */
state = (struct inflate_state FAR *)strm->state;
ar.in = strm->next_in;
ar.last = ar.in + (strm->avail_in - PAD_AVAIL_IN);
ar.out = strm->next_out;
ar.beg = ar.out - (start - strm->avail_out);
ar.end = ar.out + (strm->avail_out - PAD_AVAIL_OUT);
ar.wsize = state->wsize;
ar.write = state->wnext;
ar.window = state->window;
ar.hold = state->hold;
ar.bits = state->bits;
ar.lcode = state->lencode;
ar.dcode = state->distcode;
ar.lmask = (1U << state->lenbits) - 1;
ar.dmask = (1U << state->distbits) - 1;
/* decode literals and length/distances until end-of-block or not enough
input data or output space */
/* align in on 1/2 hold size boundary */
while (((size_t)(void *)ar.in & (sizeof(ar.hold) / 2 - 1)) != 0) {
ar.hold += (unsigned long)*ar.in++ << ar.bits;
ar.bits += 8;
}
inffas8664fnc(&ar);
if (ar.status > 1) {
if (ar.status == 2)
strm->msg = "invalid literal/length code";
else if (ar.status == 3)
strm->msg = "invalid distance code";
else
strm->msg = "invalid distance too far back";
state->mode = BAD;
}
else if ( ar.status == 1 ) {
state->mode = TYPE;
}
/* return unused bytes (on entry, bits < 8, so in won't go too far back) */
ar.len = ar.bits >> 3;
ar.in -= ar.len;
ar.bits -= ar.len << 3;
ar.hold &= (1U << ar.bits) - 1;
/* update state and return */
strm->next_in = ar.in;
strm->next_out = ar.out;
strm->avail_in = (unsigned)(ar.in < ar.last ?
PAD_AVAIL_IN + (ar.last - ar.in) :
PAD_AVAIL_IN - (ar.in - ar.last));
strm->avail_out = (unsigned)(ar.out < ar.end ?
PAD_AVAIL_OUT + (ar.end - ar.out) :
PAD_AVAIL_OUT - (ar.out - ar.end));
state->hold = (unsigned long)ar.hold;
state->bits = ar.bits;
return;
}
#endif

View File

@@ -1,396 +1,396 @@
; inffasx64.asm is a hand tuned assembler version of inffast.c - fast decoding
; version for AMD64 on Windows using Microsoft C compiler
;
; inffasx64.asm is automatically convert from AMD64 portion of inffas86.c
; inffasx64.asm is called by inffas8664.c, which contain more info.
; to compile this file, I use option
; ml64.exe /Flinffasx64 /c /Zi inffasx64.asm
; with Microsoft Macro Assembler (x64) for AMD64
;
; This file compile with Microsoft Macro Assembler (x64) for AMD64
;
; ml64.exe is given with Visual Studio 2005/2008/2010 and Windows WDK
;
; (you can get Windows WDK with ml64 for AMD64 from
; http://www.microsoft.com/whdc/Devtools/wdk/default.mspx for low price)
;
.code
inffas8664fnc PROC
; see http://weblogs.asp.net/oldnewthing/archive/2004/01/14/58579.aspx and
; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp
;
; All registers must be preserved across the call, except for
; rax, rcx, rdx, r8, r-9, r10, and r11, which are scratch.
mov [rsp-8],rsi
mov [rsp-16],rdi
mov [rsp-24],r12
mov [rsp-32],r13
mov [rsp-40],r14
mov [rsp-48],r15
mov [rsp-56],rbx
mov rax,rcx
mov [rax+8], rbp ; /* save regs rbp and rsp */
mov [rax], rsp
mov rsp, rax ; /* make rsp point to &ar */
mov rsi, [rsp+16] ; /* rsi = in */
mov rdi, [rsp+32] ; /* rdi = out */
mov r9, [rsp+24] ; /* r9 = last */
mov r10, [rsp+48] ; /* r10 = end */
mov rbp, [rsp+64] ; /* rbp = lcode */
mov r11, [rsp+72] ; /* r11 = dcode */
mov rdx, [rsp+80] ; /* rdx = hold */
mov ebx, [rsp+88] ; /* ebx = bits */
mov r12d, [rsp+100] ; /* r12d = lmask */
mov r13d, [rsp+104] ; /* r13d = dmask */
; /* r14d = len */
; /* r15d = dist */
cld
cmp r10, rdi
je L_one_time ; /* if only one decode left */
cmp r9, rsi
jne L_do_loop
L_one_time:
mov r8, r12 ; /* r8 = lmask */
cmp bl, 32
ja L_get_length_code_one_time
lodsd ; /* eax = *(uint *)in++ */
mov cl, bl ; /* cl = bits, needs it for shifting */
add bl, 32 ; /* bits += 32 */
shl rax, cl
or rdx, rax ; /* hold |= *((uint *)in)++ << bits */
jmp L_get_length_code_one_time
ALIGN 4
L_while_test:
cmp r10, rdi
jbe L_break_loop
cmp r9, rsi
jbe L_break_loop
L_do_loop:
mov r8, r12 ; /* r8 = lmask */
cmp bl, 32
ja L_get_length_code ; /* if (32 < bits) */
lodsd ; /* eax = *(uint *)in++ */
mov cl, bl ; /* cl = bits, needs it for shifting */
add bl, 32 ; /* bits += 32 */
shl rax, cl
or rdx, rax ; /* hold |= *((uint *)in)++ << bits */
L_get_length_code:
and r8, rdx ; /* r8 &= hold */
mov eax, [rbp+r8*4] ; /* eax = lcode[hold & lmask] */
mov cl, ah ; /* cl = this.bits */
sub bl, ah ; /* bits -= this.bits */
shr rdx, cl ; /* hold >>= this.bits */
test al, al
jnz L_test_for_length_base ; /* if (op != 0) 45.7% */
mov r8, r12 ; /* r8 = lmask */
shr eax, 16 ; /* output this.val char */
stosb
L_get_length_code_one_time:
and r8, rdx ; /* r8 &= hold */
mov eax, [rbp+r8*4] ; /* eax = lcode[hold & lmask] */
L_dolen:
mov cl, ah ; /* cl = this.bits */
sub bl, ah ; /* bits -= this.bits */
shr rdx, cl ; /* hold >>= this.bits */
test al, al
jnz L_test_for_length_base ; /* if (op != 0) 45.7% */
shr eax, 16 ; /* output this.val char */
stosb
jmp L_while_test
ALIGN 4
L_test_for_length_base:
mov r14d, eax ; /* len = this */
shr r14d, 16 ; /* len = this.val */
mov cl, al
test al, 16
jz L_test_for_second_level_length ; /* if ((op & 16) == 0) 8% */
and cl, 15 ; /* op &= 15 */
jz L_decode_distance ; /* if (!op) */
L_add_bits_to_len:
sub bl, cl
xor eax, eax
inc eax
shl eax, cl
dec eax
and eax, edx ; /* eax &= hold */
shr rdx, cl
add r14d, eax ; /* len += hold & mask[op] */
L_decode_distance:
mov r8, r13 ; /* r8 = dmask */
cmp bl, 32
ja L_get_distance_code ; /* if (32 < bits) */
lodsd ; /* eax = *(uint *)in++ */
mov cl, bl ; /* cl = bits, needs it for shifting */
add bl, 32 ; /* bits += 32 */
shl rax, cl
or rdx, rax ; /* hold |= *((uint *)in)++ << bits */
L_get_distance_code:
and r8, rdx ; /* r8 &= hold */
mov eax, [r11+r8*4] ; /* eax = dcode[hold & dmask] */
L_dodist:
mov r15d, eax ; /* dist = this */
shr r15d, 16 ; /* dist = this.val */
mov cl, ah
sub bl, ah ; /* bits -= this.bits */
shr rdx, cl ; /* hold >>= this.bits */
mov cl, al ; /* cl = this.op */
test al, 16 ; /* if ((op & 16) == 0) */
jz L_test_for_second_level_dist
and cl, 15 ; /* op &= 15 */
jz L_check_dist_one
L_add_bits_to_dist:
sub bl, cl
xor eax, eax
inc eax
shl eax, cl
dec eax ; /* (1 << op) - 1 */
and eax, edx ; /* eax &= hold */
shr rdx, cl
add r15d, eax ; /* dist += hold & ((1 << op) - 1) */
L_check_window:
mov r8, rsi ; /* save in so from can use it's reg */
mov rax, rdi
sub rax, [rsp+40] ; /* nbytes = out - beg */
cmp eax, r15d
jb L_clip_window ; /* if (dist > nbytes) 4.2% */
mov ecx, r14d ; /* ecx = len */
mov rsi, rdi
sub rsi, r15 ; /* from = out - dist */
sar ecx, 1
jnc L_copy_two ; /* if len % 2 == 0 */
rep movsw
mov al, [rsi]
mov [rdi], al
inc rdi
mov rsi, r8 ; /* move in back to %rsi, toss from */
jmp L_while_test
L_copy_two:
rep movsw
mov rsi, r8 ; /* move in back to %rsi, toss from */
jmp L_while_test
ALIGN 4
L_check_dist_one:
cmp r15d, 1 ; /* if dist 1, is a memset */
jne L_check_window
cmp [rsp+40], rdi ; /* if out == beg, outside window */
je L_check_window
mov ecx, r14d ; /* ecx = len */
mov al, [rdi-1]
mov ah, al
sar ecx, 1
jnc L_set_two
mov [rdi], al
inc rdi
L_set_two:
rep stosw
jmp L_while_test
ALIGN 4
L_test_for_second_level_length:
test al, 64
jnz L_test_for_end_of_block ; /* if ((op & 64) != 0) */
xor eax, eax
inc eax
shl eax, cl
dec eax
and eax, edx ; /* eax &= hold */
add eax, r14d ; /* eax += len */
mov eax, [rbp+rax*4] ; /* eax = lcode[val+(hold&mask[op])]*/
jmp L_dolen
ALIGN 4
L_test_for_second_level_dist:
test al, 64
jnz L_invalid_distance_code ; /* if ((op & 64) != 0) */
xor eax, eax
inc eax
shl eax, cl
dec eax
and eax, edx ; /* eax &= hold */
add eax, r15d ; /* eax += dist */
mov eax, [r11+rax*4] ; /* eax = dcode[val+(hold&mask[op])]*/
jmp L_dodist
ALIGN 4
L_clip_window:
mov ecx, eax ; /* ecx = nbytes */
mov eax, [rsp+92] ; /* eax = wsize, prepare for dist cmp */
neg ecx ; /* nbytes = -nbytes */
cmp eax, r15d
jb L_invalid_distance_too_far ; /* if (dist > wsize) */
add ecx, r15d ; /* nbytes = dist - nbytes */
cmp dword ptr [rsp+96], 0
jne L_wrap_around_window ; /* if (write != 0) */
mov rsi, [rsp+56] ; /* from = window */
sub eax, ecx ; /* eax -= nbytes */
add rsi, rax ; /* from += wsize - nbytes */
mov eax, r14d ; /* eax = len */
cmp r14d, ecx
jbe L_do_copy ; /* if (nbytes >= len) */
sub eax, ecx ; /* eax -= nbytes */
rep movsb
mov rsi, rdi
sub rsi, r15 ; /* from = &out[ -dist ] */
jmp L_do_copy
ALIGN 4
L_wrap_around_window:
mov eax, [rsp+96] ; /* eax = write */
cmp ecx, eax
jbe L_contiguous_in_window ; /* if (write >= nbytes) */
mov esi, [rsp+92] ; /* from = wsize */
add rsi, [rsp+56] ; /* from += window */
add rsi, rax ; /* from += write */
sub rsi, rcx ; /* from -= nbytes */
sub ecx, eax ; /* nbytes -= write */
mov eax, r14d ; /* eax = len */
cmp eax, ecx
jbe L_do_copy ; /* if (nbytes >= len) */
sub eax, ecx ; /* len -= nbytes */
rep movsb
mov rsi, [rsp+56] ; /* from = window */
mov ecx, [rsp+96] ; /* nbytes = write */
cmp eax, ecx
jbe L_do_copy ; /* if (nbytes >= len) */
sub eax, ecx ; /* len -= nbytes */
rep movsb
mov rsi, rdi
sub rsi, r15 ; /* from = out - dist */
jmp L_do_copy
ALIGN 4
L_contiguous_in_window:
mov rsi, [rsp+56] ; /* rsi = window */
add rsi, rax
sub rsi, rcx ; /* from += write - nbytes */
mov eax, r14d ; /* eax = len */
cmp eax, ecx
jbe L_do_copy ; /* if (nbytes >= len) */
sub eax, ecx ; /* len -= nbytes */
rep movsb
mov rsi, rdi
sub rsi, r15 ; /* from = out - dist */
jmp L_do_copy ; /* if (nbytes >= len) */
ALIGN 4
L_do_copy:
mov ecx, eax ; /* ecx = len */
rep movsb
mov rsi, r8 ; /* move in back to %esi, toss from */
jmp L_while_test
L_test_for_end_of_block:
test al, 32
jz L_invalid_literal_length_code
mov dword ptr [rsp+116], 1
jmp L_break_loop_with_status
L_invalid_literal_length_code:
mov dword ptr [rsp+116], 2
jmp L_break_loop_with_status
L_invalid_distance_code:
mov dword ptr [rsp+116], 3
jmp L_break_loop_with_status
L_invalid_distance_too_far:
mov dword ptr [rsp+116], 4
jmp L_break_loop_with_status
L_break_loop:
mov dword ptr [rsp+116], 0
L_break_loop_with_status:
; /* put in, out, bits, and hold back into ar and pop esp */
mov [rsp+16], rsi ; /* in */
mov [rsp+32], rdi ; /* out */
mov [rsp+88], ebx ; /* bits */
mov [rsp+80], rdx ; /* hold */
mov rax, [rsp] ; /* restore rbp and rsp */
mov rbp, [rsp+8]
mov rsp, rax
mov rsi,[rsp-8]
mov rdi,[rsp-16]
mov r12,[rsp-24]
mov r13,[rsp-32]
mov r14,[rsp-40]
mov r15,[rsp-48]
mov rbx,[rsp-56]
ret 0
; :
; : "m" (ar)
; : "memory", "%rax", "%rbx", "%rcx", "%rdx", "%rsi", "%rdi",
; "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15"
; );
inffas8664fnc ENDP
;_TEXT ENDS
END
; inffasx64.asm is a hand tuned assembler version of inffast.c - fast decoding
; version for AMD64 on Windows using Microsoft C compiler
;
; inffasx64.asm is automatically convert from AMD64 portion of inffas86.c
; inffasx64.asm is called by inffas8664.c, which contain more info.
; to compile this file, I use option
; ml64.exe /Flinffasx64 /c /Zi inffasx64.asm
; with Microsoft Macro Assembler (x64) for AMD64
;
; This file compile with Microsoft Macro Assembler (x64) for AMD64
;
; ml64.exe is given with Visual Studio 2005/2008/2010 and Windows WDK
;
; (you can get Windows WDK with ml64 for AMD64 from
; http://www.microsoft.com/whdc/Devtools/wdk/default.mspx for low price)
;
.code
inffas8664fnc PROC
; see http://weblogs.asp.net/oldnewthing/archive/2004/01/14/58579.aspx and
; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp
;
; All registers must be preserved across the call, except for
; rax, rcx, rdx, r8, r-9, r10, and r11, which are scratch.
mov [rsp-8],rsi
mov [rsp-16],rdi
mov [rsp-24],r12
mov [rsp-32],r13
mov [rsp-40],r14
mov [rsp-48],r15
mov [rsp-56],rbx
mov rax,rcx
mov [rax+8], rbp ; /* save regs rbp and rsp */
mov [rax], rsp
mov rsp, rax ; /* make rsp point to &ar */
mov rsi, [rsp+16] ; /* rsi = in */
mov rdi, [rsp+32] ; /* rdi = out */
mov r9, [rsp+24] ; /* r9 = last */
mov r10, [rsp+48] ; /* r10 = end */
mov rbp, [rsp+64] ; /* rbp = lcode */
mov r11, [rsp+72] ; /* r11 = dcode */
mov rdx, [rsp+80] ; /* rdx = hold */
mov ebx, [rsp+88] ; /* ebx = bits */
mov r12d, [rsp+100] ; /* r12d = lmask */
mov r13d, [rsp+104] ; /* r13d = dmask */
; /* r14d = len */
; /* r15d = dist */
cld
cmp r10, rdi
je L_one_time ; /* if only one decode left */
cmp r9, rsi
jne L_do_loop
L_one_time:
mov r8, r12 ; /* r8 = lmask */
cmp bl, 32
ja L_get_length_code_one_time
lodsd ; /* eax = *(uint *)in++ */
mov cl, bl ; /* cl = bits, needs it for shifting */
add bl, 32 ; /* bits += 32 */
shl rax, cl
or rdx, rax ; /* hold |= *((uint *)in)++ << bits */
jmp L_get_length_code_one_time
ALIGN 4
L_while_test:
cmp r10, rdi
jbe L_break_loop
cmp r9, rsi
jbe L_break_loop
L_do_loop:
mov r8, r12 ; /* r8 = lmask */
cmp bl, 32
ja L_get_length_code ; /* if (32 < bits) */
lodsd ; /* eax = *(uint *)in++ */
mov cl, bl ; /* cl = bits, needs it for shifting */
add bl, 32 ; /* bits += 32 */
shl rax, cl
or rdx, rax ; /* hold |= *((uint *)in)++ << bits */
L_get_length_code:
and r8, rdx ; /* r8 &= hold */
mov eax, [rbp+r8*4] ; /* eax = lcode[hold & lmask] */
mov cl, ah ; /* cl = this.bits */
sub bl, ah ; /* bits -= this.bits */
shr rdx, cl ; /* hold >>= this.bits */
test al, al
jnz L_test_for_length_base ; /* if (op != 0) 45.7% */
mov r8, r12 ; /* r8 = lmask */
shr eax, 16 ; /* output this.val char */
stosb
L_get_length_code_one_time:
and r8, rdx ; /* r8 &= hold */
mov eax, [rbp+r8*4] ; /* eax = lcode[hold & lmask] */
L_dolen:
mov cl, ah ; /* cl = this.bits */
sub bl, ah ; /* bits -= this.bits */
shr rdx, cl ; /* hold >>= this.bits */
test al, al
jnz L_test_for_length_base ; /* if (op != 0) 45.7% */
shr eax, 16 ; /* output this.val char */
stosb
jmp L_while_test
ALIGN 4
L_test_for_length_base:
mov r14d, eax ; /* len = this */
shr r14d, 16 ; /* len = this.val */
mov cl, al
test al, 16
jz L_test_for_second_level_length ; /* if ((op & 16) == 0) 8% */
and cl, 15 ; /* op &= 15 */
jz L_decode_distance ; /* if (!op) */
L_add_bits_to_len:
sub bl, cl
xor eax, eax
inc eax
shl eax, cl
dec eax
and eax, edx ; /* eax &= hold */
shr rdx, cl
add r14d, eax ; /* len += hold & mask[op] */
L_decode_distance:
mov r8, r13 ; /* r8 = dmask */
cmp bl, 32
ja L_get_distance_code ; /* if (32 < bits) */
lodsd ; /* eax = *(uint *)in++ */
mov cl, bl ; /* cl = bits, needs it for shifting */
add bl, 32 ; /* bits += 32 */
shl rax, cl
or rdx, rax ; /* hold |= *((uint *)in)++ << bits */
L_get_distance_code:
and r8, rdx ; /* r8 &= hold */
mov eax, [r11+r8*4] ; /* eax = dcode[hold & dmask] */
L_dodist:
mov r15d, eax ; /* dist = this */
shr r15d, 16 ; /* dist = this.val */
mov cl, ah
sub bl, ah ; /* bits -= this.bits */
shr rdx, cl ; /* hold >>= this.bits */
mov cl, al ; /* cl = this.op */
test al, 16 ; /* if ((op & 16) == 0) */
jz L_test_for_second_level_dist
and cl, 15 ; /* op &= 15 */
jz L_check_dist_one
L_add_bits_to_dist:
sub bl, cl
xor eax, eax
inc eax
shl eax, cl
dec eax ; /* (1 << op) - 1 */
and eax, edx ; /* eax &= hold */
shr rdx, cl
add r15d, eax ; /* dist += hold & ((1 << op) - 1) */
L_check_window:
mov r8, rsi ; /* save in so from can use it's reg */
mov rax, rdi
sub rax, [rsp+40] ; /* nbytes = out - beg */
cmp eax, r15d
jb L_clip_window ; /* if (dist > nbytes) 4.2% */
mov ecx, r14d ; /* ecx = len */
mov rsi, rdi
sub rsi, r15 ; /* from = out - dist */
sar ecx, 1
jnc L_copy_two ; /* if len % 2 == 0 */
rep movsw
mov al, [rsi]
mov [rdi], al
inc rdi
mov rsi, r8 ; /* move in back to %rsi, toss from */
jmp L_while_test
L_copy_two:
rep movsw
mov rsi, r8 ; /* move in back to %rsi, toss from */
jmp L_while_test
ALIGN 4
L_check_dist_one:
cmp r15d, 1 ; /* if dist 1, is a memset */
jne L_check_window
cmp [rsp+40], rdi ; /* if out == beg, outside window */
je L_check_window
mov ecx, r14d ; /* ecx = len */
mov al, [rdi-1]
mov ah, al
sar ecx, 1
jnc L_set_two
mov [rdi], al
inc rdi
L_set_two:
rep stosw
jmp L_while_test
ALIGN 4
L_test_for_second_level_length:
test al, 64
jnz L_test_for_end_of_block ; /* if ((op & 64) != 0) */
xor eax, eax
inc eax
shl eax, cl
dec eax
and eax, edx ; /* eax &= hold */
add eax, r14d ; /* eax += len */
mov eax, [rbp+rax*4] ; /* eax = lcode[val+(hold&mask[op])]*/
jmp L_dolen
ALIGN 4
L_test_for_second_level_dist:
test al, 64
jnz L_invalid_distance_code ; /* if ((op & 64) != 0) */
xor eax, eax
inc eax
shl eax, cl
dec eax
and eax, edx ; /* eax &= hold */
add eax, r15d ; /* eax += dist */
mov eax, [r11+rax*4] ; /* eax = dcode[val+(hold&mask[op])]*/
jmp L_dodist
ALIGN 4
L_clip_window:
mov ecx, eax ; /* ecx = nbytes */
mov eax, [rsp+92] ; /* eax = wsize, prepare for dist cmp */
neg ecx ; /* nbytes = -nbytes */
cmp eax, r15d
jb L_invalid_distance_too_far ; /* if (dist > wsize) */
add ecx, r15d ; /* nbytes = dist - nbytes */
cmp dword ptr [rsp+96], 0
jne L_wrap_around_window ; /* if (write != 0) */
mov rsi, [rsp+56] ; /* from = window */
sub eax, ecx ; /* eax -= nbytes */
add rsi, rax ; /* from += wsize - nbytes */
mov eax, r14d ; /* eax = len */
cmp r14d, ecx
jbe L_do_copy ; /* if (nbytes >= len) */
sub eax, ecx ; /* eax -= nbytes */
rep movsb
mov rsi, rdi
sub rsi, r15 ; /* from = &out[ -dist ] */
jmp L_do_copy
ALIGN 4
L_wrap_around_window:
mov eax, [rsp+96] ; /* eax = write */
cmp ecx, eax
jbe L_contiguous_in_window ; /* if (write >= nbytes) */
mov esi, [rsp+92] ; /* from = wsize */
add rsi, [rsp+56] ; /* from += window */
add rsi, rax ; /* from += write */
sub rsi, rcx ; /* from -= nbytes */
sub ecx, eax ; /* nbytes -= write */
mov eax, r14d ; /* eax = len */
cmp eax, ecx
jbe L_do_copy ; /* if (nbytes >= len) */
sub eax, ecx ; /* len -= nbytes */
rep movsb
mov rsi, [rsp+56] ; /* from = window */
mov ecx, [rsp+96] ; /* nbytes = write */
cmp eax, ecx
jbe L_do_copy ; /* if (nbytes >= len) */
sub eax, ecx ; /* len -= nbytes */
rep movsb
mov rsi, rdi
sub rsi, r15 ; /* from = out - dist */
jmp L_do_copy
ALIGN 4
L_contiguous_in_window:
mov rsi, [rsp+56] ; /* rsi = window */
add rsi, rax
sub rsi, rcx ; /* from += write - nbytes */
mov eax, r14d ; /* eax = len */
cmp eax, ecx
jbe L_do_copy ; /* if (nbytes >= len) */
sub eax, ecx ; /* len -= nbytes */
rep movsb
mov rsi, rdi
sub rsi, r15 ; /* from = out - dist */
jmp L_do_copy ; /* if (nbytes >= len) */
ALIGN 4
L_do_copy:
mov ecx, eax ; /* ecx = len */
rep movsb
mov rsi, r8 ; /* move in back to %esi, toss from */
jmp L_while_test
L_test_for_end_of_block:
test al, 32
jz L_invalid_literal_length_code
mov dword ptr [rsp+116], 1
jmp L_break_loop_with_status
L_invalid_literal_length_code:
mov dword ptr [rsp+116], 2
jmp L_break_loop_with_status
L_invalid_distance_code:
mov dword ptr [rsp+116], 3
jmp L_break_loop_with_status
L_invalid_distance_too_far:
mov dword ptr [rsp+116], 4
jmp L_break_loop_with_status
L_break_loop:
mov dword ptr [rsp+116], 0
L_break_loop_with_status:
; /* put in, out, bits, and hold back into ar and pop esp */
mov [rsp+16], rsi ; /* in */
mov [rsp+32], rdi ; /* out */
mov [rsp+88], ebx ; /* bits */
mov [rsp+80], rdx ; /* hold */
mov rax, [rsp] ; /* restore rbp and rsp */
mov rbp, [rsp+8]
mov rsp, rax
mov rsi,[rsp-8]
mov rdi,[rsp-16]
mov r12,[rsp-24]
mov r13,[rsp-32]
mov r14,[rsp-40]
mov r15,[rsp-48]
mov rbx,[rsp-56]
ret 0
; :
; : "m" (ar)
; : "memory", "%rax", "%rbx", "%rcx", "%rdx", "%rsi", "%rdi",
; "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15"
; );
inffas8664fnc ENDP
;_TEXT ENDS
END

View File

@@ -1,31 +1,31 @@
Summary
-------
This directory contains ASM implementations of the functions
longest_match() and inflate_fast(), for 64 bits x86 (both AMD64 and Intel EM64t),
for use with Microsoft Macro Assembler (x64) for AMD64 and Microsoft C++ 64 bits.
gvmat64.asm is written by Gilles Vollant (2005), by using Brian Raiter 686/32 bits
assembly optimized version from Jean-loup Gailly original longest_match function
inffasx64.asm and inffas8664.c were written by Chris Anderson, by optimizing
original function from Mark Adler
Use instructions
----------------
Assemble the .asm files using MASM and put the object files into the zlib source
directory. You can also get object files here:
http://www.winimage.com/zLibDll/zlib124_masm_obj.zip
define ASMV and ASMINF in your project. Include inffas8664.c in your source tree,
and inffasx64.obj and gvmat64.obj as object to link.
Build instructions
------------------
run bld_64.bat with Microsoft Macro Assembler (x64) for AMD64 (ml64.exe)
ml64.exe is given with Visual Studio 2005, Windows 2003 server DDK
You can get Windows 2003 server DDK with ml64 and cl for AMD64 from
http://www.microsoft.com/whdc/devtools/ddk/default.mspx for low price)
Summary
-------
This directory contains ASM implementations of the functions
longest_match() and inflate_fast(), for 64 bits x86 (both AMD64 and Intel EM64t),
for use with Microsoft Macro Assembler (x64) for AMD64 and Microsoft C++ 64 bits.
gvmat64.asm is written by Gilles Vollant (2005), by using Brian Raiter 686/32 bits
assembly optimized version from Jean-loup Gailly original longest_match function
inffasx64.asm and inffas8664.c were written by Chris Anderson, by optimizing
original function from Mark Adler
Use instructions
----------------
Assemble the .asm files using MASM and put the object files into the zlib source
directory. You can also get object files here:
http://www.winimage.com/zLibDll/zlib124_masm_obj.zip
define ASMV and ASMINF in your project. Include inffas8664.c in your source tree,
and inffasx64.obj and gvmat64.obj as object to link.
Build instructions
------------------
run bld_64.bat with Microsoft Macro Assembler (x64) for AMD64 (ml64.exe)
ml64.exe is given with Visual Studio 2005, Windows 2003 server DDK
You can get Windows 2003 server DDK with ml64 and cl for AMD64 from
http://www.microsoft.com/whdc/devtools/ddk/default.mspx for low price)