inffastS.s   [plain text]


#if (defined __i386__)

/* this assembly was 1st compiled from inffast.c (assuming POSTINC defined, OFF=0) and then hand optimized */

	.cstring
LC0:
	.ascii "invalid distance too far back\0"
LC1:
	.ascii "invalid distance code\0"
LC2:
	.ascii "invalid literal/length code\0"
	.text
	.align 4,0x90


#ifdef  INFLATE_STRICT
	.byte 0
	.byte 0
	.byte 0
	.byte 0
	.byte 0
	.byte 0
	.byte 0
	.byte 0
	.byte 0
	.byte 0
#endif
.globl _inflate_fast
_inflate_fast:

	// set up ebp to refer to arguments strm and start
	pushl	%ebp
	movl	%esp, %ebp

	// push edi/esi/ebx into stack
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	// allocate for local variables 92-12=80, + 12 to align %esp to 16-byte boundary
	subl	$92, %esp
	movl	8(%ebp), %ebx					

	/* definitions to help code readability */

	#define	bits	%edi
	#define	strm	%ebx
	#define	state	28(strm)		// state = (struct inflate_state FAR *)strm->state;	
	#define	in		-84(%ebp)		// in = strm->next_in - OFF; OFF=0
	#define	last	-80(%ebp)		// last = in + (strm->avail_in - 5);
	#define	out		-28(%ebp)		// out = strm->next_out - OFF;
	#define	beg		-76(%ebp)		// beg = out - (start - strm->avail_out);
	#define	end		-72(%ebp)		// end = out + (strm->avail_out - 257);
	#define	wsize	-68(%ebp)		// wsize = state->wsize;
	#define whave	-64(%ebp)		// whave = state->whave;
	#define write	-60(%ebp)		// write = state->write;
	#define window	-56(%ebp)		// window = state->window;
	#define	hold	-52(%ebp)		// hold = state->hold;
	#define	lcode	-48(%ebp)		// lcode = state->lencode;
	#define	dcode	-44(%ebp)		// dcode = state->distcode;
	#define	lmask	-40(%ebp)		// lmask = (1U << state->lenbits) - 1; 
	#define	dmask	-36(%ebp)		// dmask = (1U << state->distbits) - 1; 
	#define	len		-32(%ebp)
	#define dmax	-20(%ebp)		
	#define	dist	-16(%ebp)		// dist
	#define	write_wsize	-24(%ebp)	// write+wsize
	#define	write_1		-88(%ebp)	// write-1
	#define	op		-92(%ebp)		// op

	movl	(strm), %eax			// strm->next_in
	movl	%eax, in				// in = strm->next_in - OFF; OFF=0

	subl	$5, %eax				// in - 5;
	movl	4(strm), %ecx			// strm->avail_in
	addl	%ecx, %eax				// in + (strm->avail_in - 5);
	movl	%eax, last				// last = in + (strm->avail_in - 5);

	movl	12(strm), %esi			// strm->next_out
	movl	%esi, out				// out = strm->next_out - OFF;

	movl	16(strm), %ecx			// strm->avail_out
	movl	%esi, %eax				// out		
	subl	12(%ebp), %eax			// out - start
	addl	%ecx, %eax				// out - (start - strm->avail_out);
	movl	%eax, beg				// beg = out - (start - strm->avail_out);

	leal	-257(%esi,%ecx), %ecx	// out + (strm->avail_out - 257);
	movl	%ecx, end				// end = out + (strm->avail_out - 257);

	movl	state, %edx

#ifdef	INFLATE_STRICT
	movl	20(%edx), %ecx			// state->dmax
	movl	%ecx, dmax				// dmax = state->dmax;
#endif

	movl	40(%edx), %ecx			// state->wsize
	movl	%ecx, wsize				// wsize = state->wsize;

	movl	44(%edx), %ecx			// state->whave
	movl	%ecx, whave				// whave = state->whave;

	movl	48(%edx), %esi			// state->write
	movl	%esi, write				// write = state->write;

	movl	52(%edx), %eax			// state->window
	movl	%eax, window			// window = state->window;


	movl	56(%edx), %ecx			// state->hold
	movl	%ecx, hold				// hold = state->hold

	movl	60(%edx), bits			// bits = state->bits;

	movl	76(%edx), %esi			// state->lencode
	movl	%esi, lcode				// lcode = state->lencode;

	movl	80(%edx), %eax			// state->distcode
	movl	%eax, dcode				// dcode = state->distcode;

	movl	84(%edx), %ecx			// state->lenbits
	movl	$1, %eax
	movl	%eax, %esi				// a copy of 1
	sall	%cl, %esi				// 1 << state->lenbits
	decl	%esi					// (1U << state->lenbits) - 1;
	movl	%esi, lmask				// lmask = (1U << state->lenbits) - 1;

	movl	88(%edx), %ecx			// state->distbits
	sall	%cl, %eax				// 1 << state->distbits
	decl	%eax					// (1U << state->distbits) - 1;
	movl	%eax, dmask				// dmask = (1U << state->distbits) - 1;


	// these 2 might be used often, precomputed and saved in stack	
	movl	write, %eax
	addl	wsize, %eax
	movl	%eax, write_wsize		// write+wsize

	movl	write, %edx
	decl	%edx
	movl	%edx, write_1			// write-1


L_do_while_loop:						// do {

	cmpl	$15, bits
	jae		bits_ge_15					//		if (bits < 15) {
#if 0
	leal	8(bits), %esi				// esi = bits+8
	movl	in, %eax					// eax = in
	movzbl	(%eax), %edx				// edx = *in++
	movl	bits, %ecx					// cl = bits
	sall	%cl, %edx					// 1st *in << bits
	addl	hold, %edx					// hold += 1st *in << bits
	movzbl	1(%eax), %eax				// 2nd *in
	movl	%esi, %ecx					// cl = bits+8
	sall	%cl, %eax					// 2nd *in << (bits+8)
	addl	%eax, %edx					// hold += 2nd *in << (bits+8) 
	movl	%edx, hold					// update hold
	addl	$2, in						// in += 2
	addl	$16, bits					// bits += 16;
#else
	/* from simulation, this code segment performs better than the other case
		possibly, we are more often hit with aligned memory access */
	movl	in, %ecx					//			unsigned short *inp = (unsigned short *) (in+OFF);
	movzwl	(%ecx), %eax				// 			*((unsigned short *) in);
	movl	bits, %ecx					//			bits
	sall	%cl, %eax					// 			*((unsigned short *) in) << bits
	addl	%eax, hold					// 			hold += (unsigned long) *((unsigned short *) in) << bits;
	addl	$2, in						// 			in += 2;
	addl	$16, bits					// 			bits += 16;
#endif

bits_ge_15:								// 		}	/* bits < 15 */

	movl	hold, %eax					// 		hold
	andl	lmask, %eax					// 		hold & lmask;
	movl	lcode, %esi					// 		lcode[] : 4-byte aligned
	movl	(%esi,%eax,4), %eax			// 		this = lcode[hold&lmask];
	jmp		dolen
	.align 4,0x90
op_nonzero:
	movzbl	%al, %ecx					// a copy of op to cl
	testb	$16, %cl					// if op&16
	jne		Llength_base				// 		branch to length_base

	testb	$64, %cl					// elif op&64
	jne		length_2nd_level_else		//		branch to 2nd level length code else conditions

	// 2nd level length code

	movl	$1, %eax
	sall	%cl, %eax					// 1 << op
	decl	%eax						// ((1<<op) - 1)
	andl	hold, %eax					// hold & ((1U << op) - 1)
	movzwl	%si, %ecx					// this.val
	addl	%ecx, %eax					// this.val + (hold & ((1U << op) - 1))

	movl	lcode, %ecx					// lcode[] : 4-byte aligned
	movl	(%ecx,%eax,4), %eax			// this = lcode[this.val + (hold & ((1U << op) - 1))];
										// goto dolen (compiler rearranged the order of code)
dolen:
	movl	%eax, %esi					// make a copy of this (val 16-bit, bits 8-bit, op 8-bit)
	shrl	$16, %esi					// %esi = this.val;
	movzbl	%ah, %ecx					// op = (unsigned)(this.bits); 
	shrl	%cl, hold					// hold >>= op; 
	subl	%ecx, bits					// bits -= op;
	testb	%al, %al					// op = (unsigned)(this.op);
	jne		op_nonzero					// if op!=0, branch to op_nonzero 

	movl	%esi, %ecx					// this.val;
	movl	out, %eax					// out
	movb	%cl, (%eax)					// PUP(out) = (unsigned char)(this.val);
	incl	%eax						// out++;
	movl	%eax, out					// save out

L_tst_do_while_loop_end:
	movl	last, %eax					// last
	cmpl	%eax, in					// in vs last
	jae		return_unused_bytes 		// branch to return_unused_bytes if in >= last
	movl	end, %edx					// end
	cmpl	%edx, out					// out vs end
	jb		L_do_while_loop				// branch to do loop if out < end

return_unused_bytes:

	movl	bits, %eax					// bits
	shrl	$3, %eax					// len = bits >> 3
	movl	in, %edx					// in
	subl	%eax, %edx					// in -= len
	sall	$3, %eax					// len << 3
	movl	bits, %ecx					// bits
	subl	%eax, %ecx					// bits -= len << 3

	movl	%edx, (strm)				// strm->next_in = in + OFF;
	movl	out, %eax
	movl	%eax, 12(strm)				// strm->next_out = out + OFF;

	cmpl	%edx, last					// last vs in
	jbe		L67							// if (last <= in) branch to L67 and return to L69
	movl	last, %eax					// last
	addl	$5, %eax					// 5 + last
	subl	%edx, %eax					// 5 + last - in	
L69:
	movl	%eax, 4(strm)				// update strm->avail_in

	movl	end, %eax
	cmpl	%eax, out					// out vs end
	jae		L70							// if (out>=end) branch to L70, and return to L72
	addl	$257, %eax					// 257 + end
	subl	out, %eax					// 257 + end - out
L72:
	movl	%eax, 16(strm)				// update strm->avail_out

	movl	$1, %eax
	sall	%cl, %eax					// 1 << bits
	decl	%eax						// (1 << bits) -1
	andl	hold, %eax					// hold &= (1U << bits) - 1;
	movl	state, %esi
	movl	%eax, 56(%esi)				// state->hold = hold;
	movl	%ecx, 60(%esi)				// state->bits = bits;

	addl	$92, %esp					// pop out local from stack

	// restore saved registers and return
	popl	%ebx
	popl	%esi
	popl	%edi
	leave
	ret

	// this code segment is branched in from op_nonzero, with op in cl and this.value in esi
Llength_base:
	movzwl	%si, %esi			// this instruction might not be needed, pad here to give better performance
	movl	%esi, len			// len = (unsigned)(this.val);
 
	movl	%ecx, %esi			// leave a copy of op at ecx
	andl	$15, %esi			// op&=15;
	je		Lop_is_zero			// if (op) {
	cmpl	bits, %esi			//		op vs bits
	jbe		Lop_be_bits			//		if (bits < op) {
	movl	in, %edx			//			in
	movzbl	(%edx), %eax		//			*in
	movl	bits, %ecx			//			bits
	sall	%cl, %eax			//			*in << bits
	addl	%eax, hold			// 			hold += (unsigned long)(PUP(in)) << bits;
	incl	%edx				//			in++
	movl	%edx, in			//			update in
	addl	$8, bits			//			bits += 8
Lop_be_bits:					//		}
	movl	$1, %eax			//		1
	movl	%esi, %ecx			//		op
	sall	%cl, %eax			//		1 << op
	decl	%eax				// 		(1<<op)-1	
	andl	hold, %eax			//		hold & ((1U << op) - 1)
	addl	%eax, len			//		len += (unsigned)hold & ((1U << op) - 1);
	shrl	%cl, hold			//		hold >>= op;
	subl	%esi, bits			//		bits -= op;
Lop_is_zero:					// }
	cmpl	$14, bits			// if (bits < 15) {
	jbe		bits_le_14			//		branch to refill 16-bit into hold, and branch back to next
L19:							// }
	movl	hold, %eax			// hold
	andl	dmask, %eax			// hold&dmask
	movl	dcode, %esi			// dcode[] : 4-byte aligned
	movl	(%esi,%eax,4), %eax	// this = dcode[hold & dmask];
	jmp		dodist

Lop_16_zero:
	testb	$64, %cl					// op&64
	jne		Linvalid_distance_code		// if (op&64)!=0, branch to invalid distance code
	movl	$1, %eax					// 1
	sall	%cl, %eax					// (1<<op)
	decl	%eax						// (1<<op)-1 
	andl	hold, %eax					// (hold & ((1U << op) - 1))
	movzwl	%dx, %edx					// this.val
	addl	%edx, %eax					// this.val + (hold & ((1U << op) - 1))
	movl	dcode, %edx					// dcode[] : 4 byte aligned
	movl	(%edx,%eax,4), %eax			// this = dcode[this.val + (hold & ((1U << op) - 1))];
dodist:
	movl	%eax, %edx					// this : (val 16-bit, bits 8-bit, op 8-bit)
	shrl	$16, %edx					// edx = this.val
	movzbl	%ah, %ecx					// op = (unsigned)(this.bits); 
	shrl	%cl, hold					// hold >>= op;
	subl	%ecx, bits					// bits -= op;
	movzbl	%al, %ecx					// op = (unsigned)(this.op);
	testb	$16, %cl					// op & 16
	je		Lop_16_zero					// if (op&16)==0 goto test op&64

Ldistance_base:							// if (op&16) {		/* distance base */
	andl	$15, %ecx					//	  op &= 15; edx = dist = this.val;
	movl	%ecx, op					// 		save a copy of op
	cmpl	bits, %ecx					//		op vs bits
	jbe		0f							//		if (bits < op) {
	movl	in, %ecx					//			in
	movzbl	(%ecx), %eax				//			*in
	movl	bits, %ecx					//			bits
	sall	%cl, %eax					//			*in << bits
	addl	%eax, hold					//			hold += (unsigned long)(PUP(in)) << bits;
	incl	in							//			in++
	addl	$8, bits					//			bits += 8
	cmpl	bits, op					//			op vs bits
	jbe		0f							//			if (bits < op) {
	movl	in, %esi					//				i
	movzbl	(%esi), %eax				// 				*in
	movl	bits, %ecx					//				cl = bits
	sall	%cl, %eax					//				*in << bits
	addl	%eax, hold					//				hold += (unsigned long)(PUP(in)) << bits;
	incl	%esi						//				in++
	movl	%esi, in					//				update in
	addl	$8, bits					//				bits += 8
0:										// }		}

	movzwl	%dx, %edx					// dist = (unsigned)(this.val); 
	movl	$1, %eax					// 1
	movzbl	op, %ecx					// cl = op
	sall	%cl, %eax					// 1 << op
	decl	%eax						// ((1U << op) - 1)
	andl	hold, %eax					// (unsigned)hold & ((1U << op) - 1)
	addl	%edx, %eax					// dist += (unsigned)hold & ((1U << op) - 1);

#ifdef INFLATE_STRICT

	cmpl	dmax, %eax						// dist vs dmax
	ja		Linvalid_distance_too_far_back	// if (dist > dmax) break for invalid distance too far back	

#endif

	movl	%eax, dist						// save a copy of dist in stack
	shrl	%cl, hold						// hold >>= op; 
	subl	%ecx, bits						// bits -= op;

	movl	out, %eax
	subl	beg, %eax						// eax = op = out - beg
	cmpl	%eax, dist						// dist vs op
	jbe		Lcopy_direct_from_output		// if (dist <= op) branch to copy direct from output	

											// if (dist > op) {
	movl	dist, %ecx						//	dist
	subl	%eax, %ecx						//	esi = op = dist - op;
	cmpl	%ecx, whave						//  whave vs op
	jb		Linvalid_distance_too_far_back	//  if (op > whave) break for error;

	movl	write, %edx
	testl	%edx, %edx
	jne		Lwrite_non_zero					// if (write==0) {
	movl	wsize, %eax						//		wsize
	subl	%ecx, %eax						//		wsize-op
	movl	window, %esi					//		from=window-OFF
	addl	%eax, %esi						//		from += wsize-op
	movl	out, %edx						//		out
	cmpl	%ecx, len						//		len vs op
	jbe		L38								// 		if !(op < len) skip
    subl    %ecx, len						// len - op
0:											// do {
	movzbl  (%esi), %eax					//
    movb    %al, (%edx)						//	
    incl    %edx							//
    incl    %esi							//  	PUP(out) = PUP(from);
    decl    %ecx							//		--op;
    jne     0b								// } while (op);

    movl    %edx, out						// update out
    movl    %edx, %esi						// out 
    subl    dist, %esi						// esi = from = out - dist;

L38:			/* copy from output */

			//		while (len > 2) {
            //            PUP(out) = PUP(from);
            //            PUP(out) = PUP(from);
            //            PUP(out) = PUP(from);
            //            len -= 3;
            //        }
            //        if (len) {
            //            PUP(out) = PUP(from);
            //            if (len > 1)
            //                PUP(out) = PUP(from);
            //       }

	movl	len, %ecx						// len
	movl	out, %edx						// out
	subl	$3, %ecx						// pre-decrement len by 3
	jl		1f								// if len < 3, branch to 1f for remaining processing
0:											// while (len>2) {
	movzbl	(%esi), %eax
	movb	%al, (%edx)						// 		PUP(out) = PUP(from);
	movzbl	1(%esi), %eax
	movb	%al, 1(%edx)					//		PUP(out) = PUP(from);
	movzbl	2(%esi), %eax
	movb	%al, 2(%edx)					//		PUP(out) = PUP(from);
	addl	$3, %esi						//		from += 3;
	addl	$3, %edx						//		out += 3;
	subl	$3, %ecx						//		len -= 3;
	jge		0b								// }
	movl	%edx, out						// update out, in case len == 0
1:
	addl	$3, %ecx						// post-increment len by 3
	je		L_tst_do_while_loop_end			// if (len) {
	movzbl	(%esi), %eax					//
	movb	%al, (%edx)						//		PUP(out) = PUP(from);
	incl	%edx							//		out++
	movl	%edx, out						//		update out, in case len == 1
	cmpl	$2, %ecx						//
	jne		L_tst_do_while_loop_end			//		if len==1, break
	movzbl	1(%esi), %eax
	movb	%al, (%edx)						//		PUP(out) = PUP(from);
	incl	%edx							//		out++
	movl	%edx, out						//		update out
	jmp		L_tst_do_while_loop_end			//	}
	
	.align 4,0x90
length_2nd_level_else:
	andl	$32, %ecx						// test end-of-block
	je		invalid_literal_length_code		// if (op&32)==0, branch for invalid literal/length code break
	movl	state, %edx						// if (op&32), end-of-block is detected
	movl	$11, (%edx)						// state->mode = TYPE
	jmp		return_unused_bytes

L70:
	movl	out, %edx						// out
	subl	%edx, end						// (end-out)
	movl	end, %esi						// %esi = (end-out) = -(out - end);
	leal	257(%esi), %eax					// %eax = 257 + %esi = 257 - (out -end)
	jmp		L72								// return to update state and return

L67:										// %edx = in, to return 5 - (in - last) in %eax
	subl	%edx, last						// last - in 
	movl	last, %edx						// %edx = last - in = - (in - last);
	leal	5(%edx), %eax					// %eax = 5 + %edx = 5 - (in - last);
	jmp		L69								// return to update state and return

bits_le_14:
#if 1
	leal	8(bits), %esi				// esi = bits+8
	movl	in, %eax					// eax = in
	movzbl	(%eax), %edx				// edx = *in++
	movl	bits, %ecx					// cl = bits
	sall	%cl, %edx					// 1st *in << bits
	addl	hold, %edx					// hold += 1st *in << bits
	movzbl	1(%eax), %eax				// 2nd *in
	movl	%esi, %ecx					// cl = bits+8
	sall	%cl, %eax					// 2nd *in << (bits+8)
	addl	%eax, %edx					// hold += 2nd *in << (bits+8) 
	movl	%edx, hold					// update hold
	addl	$2, in						// in += 2
	addl	$16, bits					// bits += 16;
	jmp	L19
#else
	/* this code segment does not run as fast as the other original code segment, possibly the processor
		need extra time to handle unaligned short access */
	movl    in, %edx                    //          unsigned short *inp = (unsigned short *) (in+OFF);
    movzwl  (%edx), %eax                //          *((unsigned short *) in);
    movl    bits, %ecx                  //          bits
    sall    %cl, %eax                   //          *((unsigned short *) in) << bits
    addl    %eax, hold                  //          hold += (unsigned long) *((unsigned short *) in) << bits;
    addl    $2, %edx                    //          in += 2;
    addl    $16, %ecx                   //          bits += 16;
	movl	%edx, in
	movl	%ecx, bits
	jmp	L19
#endif
invalid_literal_length_code:
    call    0f
0:	popl    %eax
	leal	LC2-0b(%eax), %eax
	movl	%eax, 24(strm)
	movl	state, %esi
	movl	$27, (%esi)
	jmp		return_unused_bytes
Linvalid_distance_code:
    call    0f
0:	popl    %eax
	leal	LC1-0b(%eax), %eax
	movl	%eax, 24(strm)
	movl	state, %eax
	movl	$27, (%eax)
	jmp		return_unused_bytes

#ifdef	INFLATE_STRICT
	.align	4,0x90
	.byte	0
	.byte	0
	.byte	0
	.byte	0
	.byte	0
	.byte	0
	.byte	0
	.byte	0
	.byte	0
#endif
Lcopy_direct_from_output:
	movl	out, %edx							// out
	subl	dist, %edx							// from = out - dist
	movl	out, %ecx							// out
	movl	len, %esi							// len
	subl	$3, %esi							// pre-decement len by 3
0:												// do {
	movzbl	(%edx), %eax
	movb	%al, (%ecx)							// 	PUP(out) = PUP(from);
	movzbl	1(%edx), %eax
	movb	%al, 1(%ecx)						// 	PUP(out) = PUP(from);
	movzbl	2(%edx), %eax
	movb	%al, 2(%ecx)						// 	PUP(out) = PUP(from);
	addl	$3, %edx							// 	from += 3
	addl	$3, %ecx							// 	out += 3	
	subl	$3, %esi							// 	len -= 3
	jge		0b									// } while (len > 2);
	movl	%ecx, out							// update out in case len == 0
	addl	$3, %esi							// post-increment len by 3
	je		L_tst_do_while_loop_end				// if (len) {
	movzbl	(%edx), %eax
	movb	%al, (%ecx)							//		PUP(out) = PUP(from);
	incl	%ecx
	movl	%ecx, out							//		out++
	cmpl	$2, %esi							//
	jne		L_tst_do_while_loop_end				//		if (len>2)
	movzbl	1(%edx), %eax
	movb	%al, (%ecx)							//			PUP(out) = PUP(from);
	incl	%ecx
	movl	%ecx, out							//			out++
	jmp		L_tst_do_while_loop_end				// }

	.align 4,0x90
Lwrite_non_zero:								// %edx = write, %ecx = op
	movl	window, %esi						// from = window - OFF;
	cmp		%ecx, %edx							// write vs op, test for wrap around window or contiguous in window
	jae		Lcontiguous_in_window				// if (write >= op) branch to contiguous in window 

Lwrap_around_window: 							// wrap around window
	addl	write_wsize, %esi					// from += write+wsize
	subl	%ecx, %esi							// from += wsize + write - op;		
	subl	%edx, %ecx							// op -= write
	cmpl	%ecx, len							// len vs op
	jbe		L38									// if (len <= op) break to copy from output
	subl	%ecx, len							// len -= op;
	movl	out, %edx							// out
0:												// do {
	movzbl	(%esi), %eax						// 	*from
	movb	%al, (%edx)							// 	*out
	incl	%esi								// 	from++
	incl	%edx								// 	out++	
	decl	%ecx								// 	--op
	jne		0b									// } while (op);

	movl	%edx, out							// save out in case we need to break to L38
	movl	window, %esi						// from = window - OFF;
	movl	len, %eax							// len
	cmpl	%eax, write							// write vs len
	jae		L38									// if (write >= len) break to L38 

	movl	write, %ecx							// op = write
	subl	%ecx, len							// len -= op;
0:												// do {
	movzbl	(%esi), %eax						//	*from
	movb	%al, (%edx)							//  *out
	incl	%esi								//  from++
	incl	%edx								//	out++
	decl	%ecx								//  --op
	jne		0b									// } while (op);

	movl	%edx, %esi							// from = out
	movl	%edx, out							// save a copy of out
	subl	dist, %esi							// from = out - dist;
	jmp		L38									// break to copy from output

Lcontiguous_in_window:								// contiguous in window, edx = write, %ecx = op
	subl	%ecx, %edx								// write - op
	addl	%edx, %esi								// from += write - op;
	cmpl	%ecx, len								// len vs op
	jbe		L38										// if (len <= op) break to copy from output 
	movl	out, %edx								// out
	subl	%ecx, len								// len -= op;

0:													// do {
	movzbl	(%esi), %eax							// 	*from
	movb	%al, (%edx)								// 	*out
	incl	%esi									// 	from++
	incl	%edx									// 	out++
	decl	%ecx									// 	op-- 
	jne		0b										// } while (op); 

	movl	%edx, out								// update out
	movl	%edx, %esi								// from = out
	subl	dist, %esi								// from = out - dist;
	jmp		L38

Linvalid_distance_too_far_back:
    call    0f
0:	popl    %eax
	leal	LC0-0b(%eax), %eax
	movl	%eax, 24(strm)
	movl	state, %ecx
	movl	$27, (%ecx)
	jmp		return_unused_bytes

#endif

#if (defined __x86_64__)
	.cstring
LC0:
	.ascii "invalid distance too far back\0"
LC1:
	.ascii "invalid distance code\0"
LC2:
	.ascii "invalid literal/length code\0"
	.text
	.align 4,0x90

#ifdef  INFLATE_STRICT
	.byte 0
	.byte 0
	.byte 0
	.byte 0
	.byte 0
	.byte 0
	.byte 0
	.byte 0
	.byte 0
	.byte 0
	.byte 0
	.byte 0
#endif

.globl _inflate_fast
_inflate_fast:

	// set up rbp
	pushq	%rbp
	movq	%rsp, %rbp

	// save registers in stack
	pushq	%r15
	pushq	%r14
	pushq	%r13
	pushq	%r12
	pushq	%rbx

	#define	strm		%r13
	#define	state		%rdi
	#define	in			%r12
	#define	in_d		%r12d
	#define	out			%r10
	#define	out_d		%r10d
	#define	write		%r15d
	#define hold		%r9
	#define holdd		%r9d
	#define	bits		%r8d
	#define	lcode		%r14
	#define	len			%ebx
	#define from		%rcx
	#define	dmax		%r11d

	#define	last		-104(%rbp)
	#define	beg			-96(%rbp)
	#define	end			-88(%rbp)
	#define	wsize		-80(%rbp)
	#define	whave		-76(%rbp)
	#define	window		-72(%rbp)
	#define	dcode		-64(%rbp)
	#define	lmask		-56(%rbp)
	#define	dmask		-112(%rbp)
	#define	wsize_write	-116(%rbp)
	#define	write_1		-128(%rbp)
	#define	dist		-44(%rbp)

	// reserve stack memory for local variables 128-40=88
	subq	$88, %rsp

	movq	%rdi, strm
	movq	56(%rdi), state						// state = (struct inflate_state FAR *)strm->state;	
	movq	(strm), in							// in = strm->next_in - OFF;
	movl	8(strm), %eax						// strm->avail_in
	subl	$5, %eax							// (strm->avail_in - 5)
	addq	in, %rax							// in + (strm->avail_in - 5)
	movq	%rax, last							// last = in + (strm->avail_in - 5)
	movq	24(strm), out						// out = strm->next_out
	movl	32(strm), %eax						// strm->avail_out
	subl	%eax, %esi							// (start - strm->avail_out);
	movq	out, %rdx							// strm->next_out
	subq	%rsi, %rdx							// out - (start - strm->avail_out); 
	movq	%rdx, beg							// beg = out - (start - strm->avail_out);
	subl	$257, %eax							// (strm->avail_out - 257)
	addq	out, %rax							// out + (strm->avail_out - 257); 
	movq	%rax, end							// end = out + (strm->avail_out - 257);

#ifdef INFLATE_STRICT
	movl	20(state), dmax						// dmax = state->dmax;
#endif

	movl	52(state), %ecx						// state->wsize
	movl	%ecx, wsize							// wsize = state->wsize;
	movl	56(state), %ebx						// state->whave;
	movl	%ebx, whave							// whave = state->whave;
	movl	60(state), write					// write = state->write;
	movq	64(state), %rax						// state->window
	movq	%rax, window						// window = state->window;
	movq	72(state), hold						// hold = state->hold;
	movl	80(state), bits						// bits = state->bits;

	movq	96(state), lcode					// lcode = state->lencode;
	movq	104(state), %rdx					// state->distcode;
	movq	%rdx, dcode							// dcode = state->distcode;

	movl	116(state), %ecx					// state->distbits
	movl	$1, %eax
	movl	%eax, %edx							// 1
	sall	%cl, %edx							// (1U << state->distbits)
	movl	112(state), %ecx					// state->lenbits
	sall	%cl, %eax							// (1U << state->lenbits)
	decl	%eax								// (1U << state->lenbits) - 1
	movq	%rax, lmask							// lmask = (1U << state->lenbits) - 1
	decl	%edx								// (1U << state->distbits) - 1
	movq	%rdx, dmask							// dmask = (1U << state->distbits) - 1

	movl	wsize, %ecx							// wsize
	addl	write, %ecx							// wsize + write
	movl	%ecx, wsize_write					// wsize_write = wsize + write

	leal	-1(%r15), %ebx						// write - 1
	movq	%rbx, write_1						// write_1 = write - 1

L_do_while_loop:
	cmpl	$14, bits							// bits vs 14
	ja		0f									// if (bits < 15) {
	movzwl	(in), %eax							//		read 2 bytes from in
	movl	bits, %ecx							//		set up cl = bits
	salq	%cl, %rax							//		(*in) << bits	
	addq	%rax, hold							// 		hold += (*in) << bits
	addq	$2, in								//		in += 2
	addl	$16, bits							//		bits += 16
0:												// }
	movq	lmask, %rax							//	lmask
	andq	hold, %rax							//	hold & lmask
	jmp		1f
	.align 4,0x90
Lop_nonzero:
	movzbl	%al, %ecx							// op in al and cl 
	testb	$16, %cl							// check for length base processing (op&16)
	jne		L_length_base						// if (op&16) branch to length base processing	
	testb	$64, %cl							// check for 2nd level length code (op&64==0)
	jne		L_end_of_block						// if (op&64)!=0, branch for end-of-block processing

	/* 2nd level length code : (op&64) == 0*/
L_2nd_level_length_code:
	movl	$1, %eax							// 1
	sall	%cl, %eax							// 1 << op
	decl	%eax								// ((1U << op) - 1)
	andq	hold, %rax							// (hold & ((1U << op) - 1))
	movzwl	%dx, %edx
	addq	%rdx, %rax							// this = lcode[this.val + (hold & ((1U << op) - 1))];
1:	
	movl	(lcode,%rax,4), %eax				// this = lcode[hold & lmask];
Ldolen:
	movl	%eax, %edx							// a copy of this
	shrl	$16, %edx							// edx = this.val;
	movzbl	%ah, %ecx							// op = this.bits
	shrq	%cl, hold							// hold >>= op; 
	subl	%ecx, bits							// bits -= op;
	testb	%al, %al							// op = (unsigned)(this.op);
	jne		Lop_nonzero							// if (op!-0) branch for copy operation
L_literal:
	movb	%dl, (out)							// *out = this.val
	incq	out									// out ++
L_do_while_loop_check:
	cmpq	last, in							// in vs last
	jae		L_return_unused_byte				// if in >= last, break to return unused byte processing
	cmpq	end, out							// out vs end
	jb		L_do_while_loop						// back to do_while_loop if out < end

	/* return unused bytes (on entry, bits < 8, so in won't go too far back) */

L_return_unused_byte:
	movl	out_d, %esi
	jmp		L34

L_length_base:				/* al = cl = op, edx = this.val, op&16 = 16 */ 
	movzwl	%dx, len							// len = (unsigned)(this.val);
	movl	%ecx, %edx							// op
	andl	$15, %edx							// op &= 15;
	je		1f									// if (op) {
	cmpl	bits, %edx							//		op vs bits
	jbe		0f									//		if (bits < op) {
	movzbl	(in), %eax							//			*in
	movl	bits, %ecx							//			cl = bits
	salq	%cl, %rax							//			*in << bits
	addq	%rax, hold							//			hold += (unsigned long)(PUP(in)) << bits;
	incq	in									//			in++
	addl	$8, bits							//			bits += 8
0:												//		}
	movl	$1, %eax							//		1
	movl	%edx, %ecx							//		cl = op
	sall	%cl, %eax							//		1 << op
	decl	%eax								//		(1 << op) - 1
	andl	holdd, %eax							//		 (unsigned)hold & ((1U << op) - 1);
	addl	%eax, len							//		len += (unsigned)hold & ((1U << op) - 1);
	shrq	%cl, hold							//		hold >>= op;
	subl	%edx, bits							//		bits -= op;
1:												// }
	cmpl	$14, bits							// bits vs 14
	jbe		L99									// if (bits < 15) go to loading to hold and return to L19
L19:												// }
	movq	dmask, %rax							// dmask
	andq	hold, %rax							// hold & dmask
	movq	dcode, %rdx							// dcode[]
	movl	(%rdx,%rax,4), %eax					// this = dcode[hold & dmask];
	jmp		L_dodist
	.align 4,0x90
0:												// op&16 == 0, test (op&64)==0 for 2nd level distance code
	testb	$64, %cl							// op&64	
	jne		L_invalid_distance_code				// if ((op&64)==0) { /* 2nd level distance code */
	movl	$1, %eax							//	1
	sall	%cl, %eax							//  1 << op 
	decl	%eax								// (1 << op) - 1
	andq	hold, %rax							// (hold & ((1U << op) - 1))
	movzwl	%dx, %edx							// this.val	
	addq	%rdx, %rax							// this.val + (hold & ((1U << op) - 1))
	movq	dcode, %rcx							// dcode[]
	movl	(%rcx,%rax,4), %eax					// this = dcode[this.val + (hold & ((1U << op) - 1))];
L_dodist:
	movl	%eax, %edx							// this
	shrl	$16, %edx							// dist = (unsigned)(this.val);
	movzbl	%ah, %ecx							// cl = op = this.bits
	shrq	%cl, hold							// hold >>= op;
	subl	%ecx, bits							// bits -= op;
	movzbl	%al, %ecx							// op = (unsigned)(this.op);
	testb	$16, %cl							// (op & 16)	test for distance base
	je		0b									// if (op&16) == 0, branch to check for 2nd level distance code

L_distance_base:								/* distance base */

	movl	%ecx, %esi							// op
	andl	$15, %esi							// op&=15
	cmpl	bits, %esi							// op vs bits
	jbe		1f									// if (bits < op) {
	movzbl	(in), %eax							//		*in
	movl	bits, %ecx							//		cl = bits
	salq	%cl, %rax							//		*in << bits
	addq	%rax, hold							//		hold += (unsigned long)(PUP(in)) << bits;
	incq	in									//		in++
	addl	$8, bits							//		bits += 8
	cmpl	bits, %esi							//		op vs bits
	jbe		1f									//		if (bits < op) {
	movzbl	(in), %eax							//			*in
	movl	bits, %ecx							//			cl = bits
	salq	%cl, %rax							//			*in << bits
	addq	%rax, hold							//			hold += (unsigned long)(PUP(in)) << bits;
	incq	in									//			in++
	addl	$8, bits							//			bits += 8
1:												// }	}

	movzwl	%dx, %edx							// dist
	movl	$1, %eax							// 1
	movl	%esi, %ecx							// cl = op
	sall	%cl, %eax							// (1 << op)
	decl	%eax								// (1 << op) - 1
	andl	holdd, %eax							// (unsigned)hold & ((1U << op) - 1)
	addl	%edx, %eax							// dist += (unsigned)hold & ((1U << op) - 1);
	movl	%eax, dist							// save a copy of dist in stack

#ifdef INFLATE_STRICT
	cmp		%eax, dmax							// dmax vs dist 
	jb		L_invalid_distance_too_far_back		// if (dmax < dist) break for invalid distance too far back
#endif

	shrq	%cl, hold							// hold >>= op;
	subl	%esi, bits							// bits -= op;
	movl	out_d, %esi							// out
	movl	out_d, %eax							// out
	subl	beg, %eax							// op = out - beg
	cmpl	%eax, dist							// dist vs op,  /* see if copy from window */
	jbe		L_copy_direct_from_output			// if (dist <= op) branch to copy direct from output

L_distance_back_in_window:			

	movl	dist, %edx							// dist
	subl	%eax, %edx							// op = dist - op;	/* distance back in window */

	cmpl	%edx, whave							// whave vs op
	jb		L_invalid_distance_too_far_back		// if (op > whave), break for invalid distance too far back

	testl	write, write						// if (write!=0)
	jne		L_wrap_around_window				//		branch to wrap around window

L_very_common_case:

	movl	wsize, %eax							//	wsize
	subl	%edx, %eax							//	wsize - op
	movq	window, from						//	from = window - OFF;
	addq	%rax, from							//	from += wsize - op;

	movl	%edx, %esi							//  op
	cmpl	%edx, len							//  len vs op
	ja		L_some_from_window					//  if (len > op), branch for aligned code block L_some_from_window
L38:
	subl	$3, len								// pre-decrement len by 3
	jge		0f									// if len >= 3, branch to the aligned code block 
1:	addl	$3, len								// post-increment len by 3
	je		L_do_while_loop_check				// if (len==0) break to L_do_while_loop_check
	movzbl	(from), %eax						// *from
	movb	%al, (out)							// *out
	incq	out									// out++
	cmpl	$2, len								// len vs 2
	jne		L_do_while_loop_check				// if len!=2 break to L_do_while_loop_check
	movzbl	1(from), %eax						// *from
	movb	%al, (out)							// *out
	incq	out									// out++
	jmp		L_do_while_loop_check				// break to L_do_while_loop_check

	.align 4,0x90
0:												// do {				
	movzbl	(from), %eax						//		*from
	movb	%al, (out)							//		*out
	movzbl	1(from), %eax						//		*from
	movb	%al, 1(out)							//		*out
	movzbl	2(from), %eax						//		*from
	movb	%al, 2(out)							//		*out
	addq	$3, out								//		out += 3
	addq	$3, from							//		from += 3
	subl	$3, len								//		len -= 3
	jge		0b									// } while (len>=0);
	jmp		1b									// branch back to the possibly unaligned code

	.align 4,0x90
L_end_of_block:
	andl	$32, %ecx							// op & 32
	jne		L101								// if (op&32) branch to end-of-block break
	leaq	LC2(%rip), from
	movq	from, 48(strm)						// state->mode
	movl	$27, (state)						// state->mode = BAD;
	movl	out_d, %esi

L34:
	movl	bits, %eax							// bits
	shrl	$3, %eax							// len = bits >> 3;
	mov		%eax, %edx							// len
	subq	%rdx, in							// in -= len
	sall	$3, %eax							// len << 3
	movl	bits, %ecx							// bits
	subl	%eax, %ecx							// bits -= len << 3
	movq	in, (strm)							// strm->next_in = in + OFF;
	movq	out, 24(strm)						// strm->next_out = out + OFF;
	cmpq	in, last							// last vs in
	jbe		L67									// if (last <= in) branch to L67 and return to L69
	movl	last, %eax							// last
	addl	$5, %eax							// last + 5
	subl	in_d, %eax							// 5 + last - in
L69:
	movl	%eax, 8(strm)						// update strm->avail_in

	cmpq	end, out							// out vs end
	jae		L70									// if out<=end branch to L70 and return to L72
	movl	end, %eax							// end
	addl	$257, %eax							// 257 + end
	subl	%esi, %eax							// 257 + end - out;
L72:
	movl	%eax, 32(strm)						// update strm->avail_out

	movl	$1, %eax							// 1
	sall	%cl, %eax							// 1 << bits
	decl	%eax								// (1U << bits) - 1
	andq	hold, %rax							// hold &= (1U << bits) - 1;
	movq	%rax, 72(state)						// state->hold = hold;
	movl	%ecx, 80(state)						// state->bits = bits;

	// clear stack memory for local variables
	addq	$88, %rsp

	// restore registers from stack 
	popq	%rbx
	popq	%r12
	popq	%r13
	popq	%r14
	popq	%r15

	// return to caller
	leave
	ret

	.align 4,0x90
L99:
	leal	8(bits), %esi						//		esi = bits+8
	movzbl	(in), %edx							//		1st *in	
	movl	bits, %ecx							//		cl = bits
	salq	%cl, %rdx							//		1st *in << 8
	addq	%rdx, hold							// 		1st hold += (unsigned long)(PUP(in)) << bits;
	movzbl	1(in), %eax							//		2nd *in
	movl	%esi, %ecx							//		cl = bits + 8
	salq	%cl, %rax							//		2nd *in << bits+8	
	addq	%rax, hold							// 		2nd hold += (unsigned long)(PUP(in)) << bits;
	addq	$2, in								//		in += 2
	addl	$16, bits							//		bits += 16
	jmp		L19

L101:
	movl	$11, (state)
	movl	out_d, %esi
	jmp	L34
	.align 4,0x90
L70:
	movl	end, %eax							// end
	subl	%esi, %eax							// end - out
	addl	$257, %eax							// 257 + end - out
	jmp		L72
	.align 4,0x90
L67:
	movl	last, %eax							// last
	subl	in_d, %eax							// last - in
	addl	$5, %eax							// 5 + last - in
	jmp		L69


	.align 4,0x90

	// stuffing the following 4 bytes to align the major loop to a 16-byte boundary to give the better performance
	.byte 0
	.byte 0
	.byte 0
	.byte 0
L_copy_direct_from_output:
	mov		dist, %eax						// dist
	movq	out, %rdx						// out
	subq	%rax, %rdx						// from = out - dist;
	subl	$3, len							// pre-decrement len by 3
											// do {
0:	movzbl	(%rdx), %eax					// 	*from
	movb	%al, (out)						//	*out
	movzbl	1(%rdx), %eax					//	*from
	movb	%al, 1(out)						//	*out
	movzbl	2(%rdx), %eax					//	*from
	movb	%al, 2(out)						//	*out
	addq	$3, out							//	out+=3
	addq	$3, %rdx						//  from+=3
	subl	$3, len							//  len-=3
	jge		0b								// } while (len>=0);
1:	addl	$3, len							// post-increment len by 3
	je		L_do_while_loop_check			// if len==0, branch to do_while_loop_check

	movzbl	(%rdx), %eax					// *from
	movb	%al, (out)						// *out
	incq	out								// out++
	cmpl	$2, len							// len == 2 ?
	jne		L_do_while_loop_check			// if len==1, branch to do_while_loop_check

	movzbl	1(%rdx), %eax					// *from
	movb	%al, (out)						// *out
	incq	out								// out++
	jmp	L_do_while_loop_check				// branch to do_while_loop_check

	.align 4,0x90
L_some_from_window:		// from : from, out, %esi/%edx = op
									// do {
	movzbl	(from), %eax			// 	*from
	movb	%al, (out)				// 	*out
	incq	from					// 	from++
	incq	out						// 	out++
	decl	%esi					// 	--op
	jne		L_some_from_window		// } while (op);
	subl	%edx, len				// len -= op;
	mov		dist, %eax				// dist
	movq	out, from				// out
	subq	%rax, from				// from = out - dist;
	jmp		L38						// copy from output

	.align 4,0x90
L_wrap_around_window:
	cmpl	%edx, write					// write vs op
	jae		L_contiguous_in_window		// if (write >= op) branch to contiguous in window
	movl	wsize_write, %eax			// wsize+write
	subl	%edx, %eax					// wsize+write-op
	movq	window, from				// from = window - OFF
	addq	%rax, from					// from += wsize+write-op
	subl	write, %edx					// op -= write
	cmpl	%edx, len					// len vs op
	jbe		L38							// if (len<=op) branch to copy from output
 
	subl	%edx, len					// len -= op;
0:										// do {
	movzbl	(from), %eax				//		*from
	movb	%al, (out)					//		*out
	incq	from						//		from++
	incq	out							//		out++
	decl	%edx						//		op--
	jne		0b							// } while (op);
	movq	window, from

	cmpl	len, write					// write vs len
	jae		L38							// if (write >= len) branch to copy from output
	movl	write, %esi					// op = write
	subl	write, len					// len -= op
1:										// do {
	movzbl	(from), %eax				//		*from	
	movb	%al, (out)					//		*out
	incq	from						//		from++
	incq	out							//		out++
	decl	%esi						//		op--
	jne		1b							// } while (op);
	mov		dist, %eax					// dist
	movq	out, from					// out
	subq	%rax, from					// from = out - dist;
	jmp		L38

	.align 4,0x90
L_contiguous_in_window:
	movl	write, %eax					// write
	subl	%edx, %eax					// write - op
	movq	window, from				// from = window - OFF
	addq	%rax, from					// from += write - op
	cmpl	%edx, len					// len vs op
	jbe		L38							// if (len <= op) branch to copy from output
	subl    %edx, len					// len -= op;
2:										// do {
	movzbl	(from), %eax				// 	*from
	movb	%al, (out)					// 	*out
	incq	from						// 	from++
	incq	out							// 	out++
	decl	%edx						// 	op--
	jne		2b							// } while (op);

	mov		dist, %eax					// dist
	movq	out, from					// out
	subq	%rax, from					// from = out - dist;
	jmp		L38							// copy from output

	.align 4,0x90
L_invalid_distance_code:
	leaq	LC1(%rip), %rdx
	movq	%rdx, 48(strm)
	movl	$27, (state)
	movl	out_d, %esi
	jmp		L34

L_invalid_distance_too_far_back:
	leaq	LC0(%rip), %rbx
	movq	%rbx, 48(strm)				// error message
	movl	$27, (state)				// state->mode = BAD
	jmp		L34

#endif