;-----------------------------------------------------------------------------
;S404 highly optimized data_decruncher_lib 1.0_turbo for use in stc.library
;23.11.93 by Marcus 'Cozine' Ottosson
;
; This is a highly optimized library decruncher.  The code does not fit into
; the instruction cache (286 bytes).  Generally, it's NOT possible to use all
; the 256 bytes available in the instruction cache.  The caches in the 68020
; and the 68030 consist of 16 rows, each of which contains 16 bytes.
; AllocMem() and AllocPooled(), the memory allocation routines used for
; loading executables, guarantee 8-byte alignment.
; On the 68000 my large testfile decrunches 3.7% faster in this version than
; the standard version.  Some files may decrunch much slower on the 68020 and
; the 68030.  I don't know the maximum loss, probably about 10-20%.  The loss
; will not be very high when we are multitasking, since the interrupts cause a
; partial or complete cache flush anyway.  I want some test reports!!!
;
;-----------------------------------------------------------------------------
;S404 data_decruncher v0.2
;(c) 1993 by Jouni 'Mr.Spiv' Korhonen (SWSW)
;-----------------------------------------------------------------------------
;call with registers: a2 = destination address
;                     a1 = crunched data
;-----------------------------------------------------------------------------

;Uses d0-d7/a0-a6


;-----------------------------------------------------------------------------
; decrunch -- unpack S404-crunched data.  Both the packed stream and the
; output are walked from high addresses to low ones: every stream read is
; "-(a1)" and every output write is "-(a2)".
;
; In:   a1 = crunched data, a2 = destination address.
; Out:  no result register is set up; the routine returns with RTS (lexit)
;       once the write pointer a2 is no longer above the destination start.
; Regs: d0-d7/a0-a6 are all overwritten and none of them is saved here.
;
; Register roles inside the routine:
;   a0  source pointer of a match copy (a2 + base + offset value)
;   a1  read pointer into the packed stream
;   a2  write pointer into the destination
;   a3  = loff6, a4 = llen5a: continuation addresses kept in registers
;   a5  copy of the destination start; "cmp.l a2,a5" is the end test
;   a6  continuation address that lbits jumps to when it is done
;   d0  constant 1 (operand of the addx at llen7)
;   d1  lbits: in = number of bits to fetch, out = fetched value (low word)
;   d2  bit width of the long offset field, loaded from the stream
;   d3  dbf counter of the copy loops; its low word is -1 whenever lmain is
;       entered (initial moveq #-1, and every dbf loop leaves it at -1)
;   d4  constant $0000FFFF, used to clear the upper word of d6
;   d5  constant 16
;   d6  bit buffer; the next unread bit is bit 15 of the low word
;   d7  number of unread bits in d6 at each dbf/dbhs/dbeq refill point
;
; Code table as implemented below (bits are taken MSB first):
;   0                     one literal byte (8 bits follow)
;   1 1 b                 copy 2+b bytes
;   1 0 1 vv              copy 4+v bytes
;   1 0 0 1 vvvv          v<15: copy 8+v bytes
;                         v=15: 5 more bits w, then 14+w literal bytes
;   1 0 0 0 cccccccc...   copy 23+sum(c) bytes; a chunk c=255 means that
;                         another 8-bit chunk follows
; Every "copy" is followed by an offset field:
;   1    d2-bit value, base 544
;   0 0  9-bit value,  base 32
;   0 1  5-bit value,  base 0
;-----------------------------------------------------------------------------
decrunch:
; Step over the first 8 bytes of the crunched data.
; NOTE(review): presumably an ID/header that the caller has already checked --
; confirm against the S404 file format.
l0:		addq	#8,a1
; NOTE(review): edDCopyableStart/edDCopyableStart2 are not referenced anywhere
; in this file; they look like entry/marker labels for an external user --
; confirm before renaming or removing them.
edDCopyableStart:
; a5 = destination start, kept as the lower bound for the end test.
		move.l	a2,a5
; a2 += longword from the stream (presumably the unpacked length, so a2 ends
; up just past the last output byte -- confirm).  a1 advances by 4.
		add.l	(a1)+,a2
; a1 += longword at (a1).  All later stream reads use -(a1), so this
; presumably lands on the trailer at the end of the packed data -- confirm.
		add.l	(a1),a1
edDCopyableStart2:
		moveq	#0,d4
		moveq	#16,d5
; Word-sized movem without post-increment: d2, d6 and d7 are loaded from the
; three consecutive words at (a1) and a1 itself is left unchanged.
		movem	(a1),d2/d6/d7
; Word-sized not on the cleared register: d4 = $0000FFFF.
		not	d4
		lea	loff6(pc),a3
		lea	llen5a(pc),a4
		moveq	#1,d0
		moveq	#-1,d3
; Enter through the end test, so that an empty output (a2 == a5) returns
; immediately and the initial d7 is counted down like any other.
		bra.s	ltest1

		cnop	0,8			; Use if main loop>=244 bytes

;*** Here's the start of the instruction cache

; ---- Literal byte -----------------------------------------------------------
; Reached from lmain when the control bit was 0.  d7 = unread bits in d6.
lins:		subq	#8,d7
		bpl.s	lins2
; Fewer than 8 bits left: push the leftover bits into the upper word of d6,
; fetch the next stream word and shift a further (8 - leftover) bits up, so
; that the complete byte sits in the low byte of the upper word.
lins1:		move	d7,d1
		addq	#8,d7
		lsl.l	d7,d6
		move	-(a1),d6
		neg	d1
		lsl.l	d1,d6
; d7 = leftover + 8 = unread bits remaining in the freshly loaded word.
		addq	#8,d7
		swap	d6
		move.b	d6,-(a2)
		swap	d6
; Same end test as ltest1, with the final branch shared through lexma.
		cmp.l	a2,a5
		dbhs	d7,lmain
		bra.s	lexma

; At least 8 bits left: rotate the top byte of the low word down into the low
; byte and store it; the still unread bits end up at the top of the word.
lins2:		rol	#8,d6
		move.b	d6,-(a2)
; ---- End test + bit counter -------------------------------------------------
; cmp.l sets "hs" when a5 >= a2 (unsigned), i.e. the output is complete.
; dbhs: if hs, fall through; otherwise count d7 down and loop to lmain unless
; it has just expired.  dbhs leaves the flags from cmp.l intact, so the bhs
; below separates "finished" from "bit buffer empty".
ltest1:		cmp.l	a2,a5
		dbhs	d7,lmain
lexma		bhs.s	lexit

; ---- Refill and main dispatch -----------------------------------------------
; Bit buffer empty: fetch the next 16 bits; d7 = 15 bits left after the one
; that lmain is about to consume.
lmain1:		move	-(a1),d6
		moveq	#15,d7
; Shift the next control bit into carry: 0 = literal byte, 1 = copy / run.
lmain:		add	d6,d6
		bcc.s	lins

; Every "dbf d7,next / move -(a1),d6 / moveq #15,d7" triple below is the same
; refill idiom: count the consumed bit and reload d6 when it is empty.
		dbf	d7,llen1
		move	-(a1),d6
		moveq	#15,d7
; Second bit: 1 = short copy of 2 or 3 bytes (llen6).
llen1:		add	d6,d6
		bcs.s	llen6
; Preset for the "1 0 1" code: 2 length bits, minimum length 4.  d3 holds
; (length - 2) because the copy moves one byte before the dbf loop and the
; loop itself runs d3+1 times.
		moveq	#2,d1
		moveq	#4-2,d3
		dbf	d7,llen2
		move	-(a1),d6
		moveq	#15,d7
; Third bit: 1 = take the 2-bit length preset above.
llen2:		add	d6,d6
		bcs.s	llen5
		dbf	d7,llen3
		move	-(a1),d6
		moveq	#15,d7
; Fourth bit: 0 = long length built from 8-bit chunks (llen4).
llen3:		add	d6,d6
		bcc.s	llen4
; "1 0 0 1": 4 length bits, minimum length 8.
		moveq	#4,d1
		moveq	#8-2,d3
		lea	llen3a(pc),a6
		bra.s	lbits
; Back from lbits with the 4-bit value in d1.  Values below 15 are a copy
; length; continue with the offset field.
llen3a:		add	d1,d3
		cmp	#15,d1
		blo.s	loff1

; Value 15 selects a run of literal bytes instead of a copy: 5 more bits are
; read and added to 13, and l2ins then stores d3+1 bytes (14 or more).
		moveq	#5,d1
		moveq	#14-1,d3
		lea	llen3b(pc),a6
		bra.s	lbits

; "1 0 0 0": minimum length 23, extended by 8-bit chunks.
llen4:		moveq	#23-2,d3
lloop:		moveq	#8,d1
; Shared tail of the 2-bit and the 8-bit length codes: continue at llen5a
; (address kept in a4) once lbits has fetched d1 bits.
llen5:		move.l	a4,a6
		bra.s	lbits
; Back from lbits.  not.b gives zero only when the low byte of the value was
; $FF, which the 2-bit code can never produce.
;   value != $FF: dbeq acts as the refill countdown for the first offset bit
;                 (-> loff2); if d7 has just expired, bne goes to the refill
;                 at loff2a instead.
;   value == $FF: dbeq does nothing, bne is not taken, and another 8-bit
;                 chunk is fetched through lloop.
llen5a:		add	d1,d3
		not.b	d1
		dbeq	d7,loff2
		bne.s	loff2a
		bra.s	lloop

; ---- Copy -------------------------------------------------------------------
; Back from lbits with the offset value in d1 and the base already in a0.
; The word in d1 is added to the address register a0.  The first byte is
; copied from (a0) itself, the remaining d3+1 bytes from descending addresses.
loff6:		add	d1,a0
		move.b	(a0),-(a2)
lcopy:		move.b	-(a0),-(a2)
		dbf	d3,lcopy
; End test as at ltest1: loop while output remains and d7 has not expired,
; refill through lmain1 when only d7 expired, otherwise return.
ltest:		cmp.l	a2,a5
		dbhs	d7,lmain
		blo.s	lmain1
lexit:		rts

; ---- Short copy (2 or 3 bytes) ----------------------------------------------
llen6:		dbf	d7,llen7
		move	-(a1),d6
		moveq	#15,d7
; The add moves the next stream bit into X; addx then gives
; d3 = d3 + 1 + X.  With d3.w = -1 on entry this is 0 or 1, i.e. length - 2.
llen7:		add	d6,d6
		addx	d0,d3
; ---- Offset field -----------------------------------------------------------
loff1:		dbf	d7,loff2
loff2a:		move	-(a1),d6
		moveq	#15,d7
; First offset bit: 1 = long offset (loff3).
loff2:		add	d6,d6
		bcs.s	loff3

		dbf	d7,loff4
		move	-(a1),d6
		moveq	#15,d7
; Preset the 9-bit / base 32 case, then let the second offset bit decide:
; 0 keeps the preset, 1 switches to 5 bits with base 0.
loff4:		moveq	#9,d1
		lea	32(a2),a0
		add	d6,d6
		bcc.s	loff5
		moveq	#5,d1
		move.l	a2,a0
		bra.s	loff5
; Long offset: base 544 (= 32 + 512), field width taken from d2.
loff3:		lea	544(a2),a0
		move	d2,d1
; All offset codes continue at loff6 (address kept in a3) after lbits.
loff5:		move.l	a3,a6

; ---- lbits: fetch d1 bits from the stream -----------------------------------
; In:  d1 = number of bits, a6 = continuation address, d7 = unread bits.
; Out: d1.w = the bits, right-aligned; d6/d7 updated; jumps to (a6).
; The upper word of d6 is cleared first so that the bits shifted into it can
; be read back as a plain number.
lbits:		and.l	d4,d6
		sub	d1,d7
		bpl.s	lbits2
; Not enough bits left: shift the leftover bits up, load the next stream
; word, and let lbits2 shift the missing (count - leftover) bits after them.
; d7 becomes 16 - (count - leftover).
		add	d7,d1
		lsl.l	d1,d6
		move	d7,d1
		move	-(a1),d6
		neg	d1
		add	d5,d7
lbits2:		lsl.l	d1,d6
; The result is the upper word of d6; swap brings it into the low word of d1.
		move.l	d6,d1
		swap	d1
		jmp	(a6)

; This part is not executed very often.  Some files may decrunch much slower
; on the 68020/68030.

; ---- Run of literal bytes ---------------------------------------------------
; Back from lbits with the 5-bit value in d1; d3 + 1 bytes follow in the
; stream as plain 8-bit fields.  l2ins/l2ins1 mirror lins2/lins1, with d3 as
; the loop counter.
llen3b:		add	d1,d3
l2ins:		subq	#8,d7
		bmi.s	l2ins1
		rol	#8,d6
		move.b	d6,-(a2)
		dbf	d3,l2ins
		bra.s	ltest

; Fewer than 8 bits left in d6: same refill-and-merge sequence as lins1.
l2ins1:		move	d7,d1
		addq	#8,d7
		lsl.l	d7,d6
		move	-(a1),d6
		neg	d1
		lsl.l	d1,d6
		addq	#8,d7
		swap	d6
		move.b	d6,-(a2)
		swap	d6
		dbf	d3,l2ins
		bra	ltest