summaryrefslogtreecommitdiff
path: root/source4/lib/zlib/contrib/masm686/match.asm
diff options
context:
space:
mode:
authorAndrew Bartlett <abartlet@samba.org>2008-08-12 17:46:17 +1000
committerAndrew Bartlett <abartlet@samba.org>2008-08-12 17:46:17 +1000
commit7177df5df95c9fc0d2bc36340b948697980f00d3 (patch)
tree43e9115d1a87dabe743cf5092de56a8bb239e05a /source4/lib/zlib/contrib/masm686/match.asm
parent5f873a4d8fbcd2eaeabb452ffba059338ed55dfc (diff)
parent0965b22ec561588201a3a79f1f1e316834c8ce0b (diff)
downloadsamba-7177df5df95c9fc0d2bc36340b948697980f00d3.tar.gz
samba-7177df5df95c9fc0d2bc36340b948697980f00d3.tar.bz2
samba-7177df5df95c9fc0d2bc36340b948697980f00d3.zip
Merge branch 'v4-0-test' of ssh://git.samba.org/data/git/samba into 4-0-local
(This used to be commit ce7b1424c711949e6feb0181d3759133b77391ff)
Diffstat (limited to 'source4/lib/zlib/contrib/masm686/match.asm')
-rw-r--r--source4/lib/zlib/contrib/masm686/match.asm413
1 files changed, 413 insertions, 0 deletions
diff --git a/source4/lib/zlib/contrib/masm686/match.asm b/source4/lib/zlib/contrib/masm686/match.asm
new file mode 100644
index 0000000000..4b03a71abd
--- /dev/null
+++ b/source4/lib/zlib/contrib/masm686/match.asm
@@ -0,0 +1,413 @@
+
+; match.asm -- Pentium-Pro optimized version of longest_match()
+;
+; Updated for zlib 1.1.3 and converted to MASM 6.1x
+; Copyright (C) 2000 Dan Higdon <hdan@kinesoft.com>
+; and Chuck Walbourn <chuckw@kinesoft.com>
+; Corrections by Cosmin Truta <cosmint@cs.ubbcluj.ro>
+;
+; This is free software; you can redistribute it and/or modify it
+; under the terms of the GNU General Public License.
+
+; Based on match.S
+; Written for zlib 1.1.2
+; Copyright (C) 1998 Brian Raiter <breadbox@muppetlabs.com>
+;
+; Modified by Gilles Vollant (2005) for add gzhead and gzindex
+
+ .686P
+ .MODEL FLAT
+
+;===========================================================================
+; EQUATES
+;===========================================================================
+
+MAX_MATCH EQU 258
+MIN_MATCH EQU 3
+MIN_LOOKAHEAD EQU (MAX_MATCH + MIN_MATCH + 1)
+MAX_MATCH_8 EQU ((MAX_MATCH + 7) AND (NOT 7))
+
+;===========================================================================
+; STRUCTURES
+;===========================================================================
+
+; This STRUCT assumes a 4-byte alignment
+
+DEFLATE_STATE STRUCT
+ds_strm dd ?
+ds_status dd ?
+ds_pending_buf dd ?
+ds_pending_buf_size dd ?
+ds_pending_out dd ?
+ds_pending dd ?
+ds_wrap dd ?
+; gzhead and gzindex are added in zlib 1.2.2.2 (see deflate.h)
+ds_gzhead dd ?
+ds_gzindex dd ?
+ds_data_type db ?
+ds_method db ?
+ db ? ; padding
+ db ? ; padding
+ds_last_flush dd ?
+ds_w_size dd ? ; used
+ds_w_bits dd ?
+ds_w_mask dd ? ; used
+ds_window dd ? ; used
+ds_window_size dd ?
+ds_prev dd ? ; used
+ds_head dd ?
+ds_ins_h dd ?
+ds_hash_size dd ?
+ds_hash_bits dd ?
+ds_hash_mask dd ?
+ds_hash_shift dd ?
+ds_block_start dd ?
+ds_match_length dd ? ; used
+ds_prev_match dd ? ; used
+ds_match_available dd ?
+ds_strstart dd ? ; used
+ds_match_start dd ? ; used
+ds_lookahead dd ? ; used
+ds_prev_length dd ? ; used
+ds_max_chain_length dd ? ; used
+ds_max_laxy_match dd ?
+ds_level dd ?
+ds_strategy dd ?
+ds_good_match dd ? ; used
+ds_nice_match dd ? ; used
+
+; Don't need anymore of the struct for match
+DEFLATE_STATE ENDS
+
+;===========================================================================
+; CODE
+;===========================================================================
+_TEXT SEGMENT
+
+;---------------------------------------------------------------------------
+; match_init
+;---------------------------------------------------------------------------
+ ALIGN 4
+PUBLIC _match_init
+_match_init PROC
+ ; no initialization needed
+ ret
+_match_init ENDP
+
+;---------------------------------------------------------------------------
+; uInt longest_match(deflate_state *deflatestate, IPos curmatch)
+;---------------------------------------------------------------------------
+ ALIGN 4
+
+PUBLIC _longest_match
+_longest_match PROC
+
+; Since this code uses EBP for a scratch register, the stack frame must
+; be manually constructed and referenced relative to the ESP register.
+
+; Stack image
+; Variables
+chainlenwmask = 0 ; high word: current chain len
+ ; low word: s->wmask
+window = 4 ; local copy of s->window
+windowbestlen = 8 ; s->window + bestlen
+scanend = 12 ; last two bytes of string
+scanstart = 16 ; first two bytes of string
+scanalign = 20 ; dword-misalignment of string
+nicematch = 24 ; a good enough match size
+bestlen = 28 ; size of best match so far
+scan = 32 ; ptr to string wanting match
+varsize = 36 ; number of bytes (also offset to last saved register)
+
+; Saved Registers (actually pushed into place)
+ebx_save = 36
+edi_save = 40
+esi_save = 44
+ebp_save = 48
+
+; Parameters
+retaddr = 52
+deflatestate = 56
+curmatch = 60
+
+; Save registers that the compiler may be using
+ push ebp
+ push edi
+ push esi
+ push ebx
+
+; Allocate local variable space
+ sub esp,varsize
+
+; Retrieve the function arguments. ecx will hold cur_match
+; throughout the entire function. edx will hold the pointer to the
+; deflate_state structure during the function's setup (before
+; entering the main loop).
+
+ mov edx, [esp+deflatestate]
+ASSUME edx:PTR DEFLATE_STATE
+
+ mov ecx, [esp+curmatch]
+
+; uInt wmask = s->w_mask;
+; unsigned chain_length = s->max_chain_length;
+; if (s->prev_length >= s->good_match) {
+; chain_length >>= 2;
+; }
+
+ mov eax, [edx].ds_prev_length
+ mov ebx, [edx].ds_good_match
+ cmp eax, ebx
+ mov eax, [edx].ds_w_mask
+ mov ebx, [edx].ds_max_chain_length
+ jl SHORT LastMatchGood
+ shr ebx, 2
+LastMatchGood:
+
+; chainlen is decremented once beforehand so that the function can
+; use the sign flag instead of the zero flag for the exit test.
+; It is then shifted into the high word, to make room for the wmask
+; value, which it will always accompany.
+
+ dec ebx
+ shl ebx, 16
+ or ebx, eax
+ mov [esp+chainlenwmask], ebx
+
+; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead;
+
+ mov eax, [edx].ds_nice_match
+ mov ebx, [edx].ds_lookahead
+ cmp ebx, eax
+ jl SHORT LookaheadLess
+ mov ebx, eax
+LookaheadLess:
+ mov [esp+nicematch], ebx
+
+;/* register Bytef *scan = s->window + s->strstart; */
+
+ mov esi, [edx].ds_window
+ mov [esp+window], esi
+ mov ebp, [edx].ds_strstart
+ lea edi, [esi+ebp]
+ mov [esp+scan],edi
+
+;/* Determine how many bytes the scan ptr is off from being */
+;/* dword-aligned. */
+
+ mov eax, edi
+ neg eax
+ and eax, 3
+ mov [esp+scanalign], eax
+
+;/* IPos limit = s->strstart > (IPos)MAX_DIST(s) ? */
+;/* s->strstart - (IPos)MAX_DIST(s) : NIL; */
+
+ mov eax, [edx].ds_w_size
+ sub eax, MIN_LOOKAHEAD
+ sub ebp, eax
+ jg SHORT LimitPositive
+ xor ebp, ebp
+LimitPositive:
+
+;/* int best_len = s->prev_length; */
+
+ mov eax, [edx].ds_prev_length
+ mov [esp+bestlen], eax
+
+;/* Store the sum of s->window + best_len in %esi locally, and in %esi. */
+
+ add esi, eax
+ mov [esp+windowbestlen], esi
+
+;/* register ush scan_start = *(ushf*)scan; */
+;/* register ush scan_end = *(ushf*)(scan+best_len-1); */
+;/* Posf *prev = s->prev; */
+
+ movzx ebx, WORD PTR[edi]
+ mov [esp+scanstart], ebx
+ movzx ebx, WORD PTR[eax+edi-1]
+ mov [esp+scanend], ebx
+ mov edi, [edx].ds_prev
+
+;/* Jump into the main loop. */
+
+ mov edx, [esp+chainlenwmask]
+ jmp SHORT LoopEntry
+
+;/* do {
+; * match = s->window + cur_match;
+; * if (*(ushf*)(match+best_len-1) != scan_end ||
+; * *(ushf*)match != scan_start) continue;
+; * [...]
+; * } while ((cur_match = prev[cur_match & wmask]) > limit
+; * && --chain_length != 0);
+; *
+; * Here is the inner loop of the function. The function will spend the
+; * majority of its time in this loop, and majority of that time will
+; * be spent in the first ten instructions.
+; *
+; * Within this loop:
+; * %ebx = scanend
+; * %ecx = curmatch
+; * %edx = chainlenwmask - i.e., ((chainlen << 16) | wmask)
+; * %esi = windowbestlen - i.e., (window + bestlen)
+; * %edi = prev
+; * %ebp = limit
+; */
+
+ ALIGN 4
+LookupLoop:
+ and ecx, edx
+ movzx ecx, WORD PTR[edi+ecx*2]
+ cmp ecx, ebp
+ jbe LeaveNow
+ sub edx, 000010000H
+ js LeaveNow
+
+LoopEntry:
+ movzx eax, WORD PTR[esi+ecx-1]
+ cmp eax, ebx
+ jnz SHORT LookupLoop
+
+ mov eax, [esp+window]
+ movzx eax, WORD PTR[eax+ecx]
+ cmp eax, [esp+scanstart]
+ jnz SHORT LookupLoop
+
+;/* Store the current value of chainlen. */
+
+ mov [esp+chainlenwmask], edx
+
+;/* Point %edi to the string under scrutiny, and %esi to the string we */
+;/* are hoping to match it up with. In actuality, %esi and %edi are */
+;/* both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and %edx is */
+;/* initialized to -(MAX_MATCH_8 - scanalign). */
+
+ mov esi, [esp+window]
+ mov edi, [esp+scan]
+ add esi, ecx
+ mov eax, [esp+scanalign]
+ mov edx, -MAX_MATCH_8
+ lea edi, [edi+eax+MAX_MATCH_8]
+ lea esi, [esi+eax+MAX_MATCH_8]
+
+;/* Test the strings for equality, 8 bytes at a time. At the end,
+; * adjust %edx so that it is offset to the exact byte that mismatched.
+; *
+; * We already know at this point that the first three bytes of the
+; * strings match each other, and they can be safely passed over before
+; * starting the compare loop. So what this code does is skip over 0-3
+; * bytes, as much as necessary in order to dword-align the %edi
+; * pointer. (%esi will still be misaligned three times out of four.)
+; *
+; * It should be confessed that this loop usually does not represent
+; * much of the total running time. Replacing it with a more
+; * straightforward "rep cmpsb" would not drastically degrade
+; * performance.
+; */
+
+LoopCmps:
+ mov eax, DWORD PTR[esi+edx]
+ xor eax, DWORD PTR[edi+edx]
+ jnz SHORT LeaveLoopCmps
+
+ mov eax, DWORD PTR[esi+edx+4]
+ xor eax, DWORD PTR[edi+edx+4]
+ jnz SHORT LeaveLoopCmps4
+
+ add edx, 8
+ jnz SHORT LoopCmps
+ jmp LenMaximum
+ ALIGN 4
+
+LeaveLoopCmps4:
+ add edx, 4
+
+LeaveLoopCmps:
+ test eax, 00000FFFFH
+ jnz SHORT LenLower
+
+ add edx, 2
+ shr eax, 16
+
+LenLower:
+ sub al, 1
+ adc edx, 0
+
+;/* Calculate the length of the match. If it is longer than MAX_MATCH, */
+;/* then automatically accept it as the best possible match and leave. */
+
+ lea eax, [edi+edx]
+ mov edi, [esp+scan]
+ sub eax, edi
+ cmp eax, MAX_MATCH
+ jge SHORT LenMaximum
+
+;/* If the length of the match is not longer than the best match we */
+;/* have so far, then forget it and return to the lookup loop. */
+
+ mov edx, [esp+deflatestate]
+ mov ebx, [esp+bestlen]
+ cmp eax, ebx
+ jg SHORT LongerMatch
+ mov esi, [esp+windowbestlen]
+ mov edi, [edx].ds_prev
+ mov ebx, [esp+scanend]
+ mov edx, [esp+chainlenwmask]
+ jmp LookupLoop
+ ALIGN 4
+
+;/* s->match_start = cur_match; */
+;/* best_len = len; */
+;/* if (len >= nice_match) break; */
+;/* scan_end = *(ushf*)(scan+best_len-1); */
+
+LongerMatch:
+ mov ebx, [esp+nicematch]
+ mov [esp+bestlen], eax
+ mov [edx].ds_match_start, ecx
+ cmp eax, ebx
+ jge SHORT LeaveNow
+ mov esi, [esp+window]
+ add esi, eax
+ mov [esp+windowbestlen], esi
+ movzx ebx, WORD PTR[edi+eax-1]
+ mov edi, [edx].ds_prev
+ mov [esp+scanend], ebx
+ mov edx, [esp+chainlenwmask]
+ jmp LookupLoop
+ ALIGN 4
+
+;/* Accept the current string, with the maximum possible length. */
+
+LenMaximum:
+ mov edx, [esp+deflatestate]
+ mov DWORD PTR[esp+bestlen], MAX_MATCH
+ mov [edx].ds_match_start, ecx
+
+;/* if ((uInt)best_len <= s->lookahead) return (uInt)best_len; */
+;/* return s->lookahead; */
+
+LeaveNow:
+ mov edx, [esp+deflatestate]
+ mov ebx, [esp+bestlen]
+ mov eax, [edx].ds_lookahead
+ cmp ebx, eax
+ jg SHORT LookaheadRet
+ mov eax, ebx
+LookaheadRet:
+
+; Restore the stack and return from whence we came.
+
+ add esp, varsize
+ pop ebx
+ pop esi
+ pop edi
+ pop ebp
+ ret
+
+_longest_match ENDP
+
+_TEXT ENDS
+END