git.sur5r.net Git - cc65/commitdiff
Optimize inflatemem. 379/head
author Piotr Fusik <fox@scene.pl>
Sun, 12 Feb 2017 10:41:11 +0000 (11:41 +0100)
committer Piotr Fusik <fox@scene.pl>
Sun, 12 Feb 2017 10:41:11 +0000 (11:41 +0100)
The new version is 30% shorter and 10% faster.
It also avoids the indirect-X addressing mode, which was a problem for PC-Engine.

libsrc/zlib/inflatemem.s

index b3a0510c5d91b64649427b2650b9ae9909d19820..27802fbff13bc9b4b6fc109384fa8700dc31d4aa 100644 (file)
@@ -1,43 +1,41 @@
 ;
-; 2003-09-21, Piotr Fusik
-; 2017-02-07, Greg King
+; 2017-02-12, Piotr Fusik
 ;
 ; unsigned __fastcall__ inflatemem (char* dest, const char* source);
 ;
-
-; Two "lda (0,x)" instructions can't be assembled for the PC-Engine library
-; because an implied ".setdp $2000" changes $00 into a non-zero-page address.
-; Therefore, this file isn't assembled for that target.
-.ifndef        __PCE__
+; NOTE: Be extremely careful with modifications, because this code is heavily
+; optimized for size (for example assumes certain register and flag values
+; when its internal routines return). Test with the gunzip65 sample.
+;
 
         .export         _inflatemem
 
         .import         incsp2
-        .importzp       sp, sreg, ptr1, ptr2, ptr3, ptr4, tmp1
-
-        .macpack        cpu
+        .importzp       sp, sreg, ptr1, ptr2, ptr3, ptr4
 
 ; --------------------------------------------------------------------------
 ;
 ; Constants
 ;
 
-; Maximum length of a Huffman code.
-MAX_BITS      = 15
-
-; All Huffman trees are stored in the bitsCount, bitsPointer_l
-; and bitsPointer_h arrays.  There may be two trees: the literal/length tree
-; and the distance tree, or just one - the temporary tree.
+; Argument values for getBits.
+GET_1_BIT           = $81
+GET_2_BITS          = $82
+GET_3_BITS          = $84
+GET_4_BITS          = $88
+GET_5_BITS          = $90
+GET_6_BITS          = $a0
+GET_7_BITS          = $c0
 
-; Index in the mentioned arrays for the beginning of the literal/length tree
-; or the temporary tree.
-PRIMARY_TREE  = 0
+; Huffman trees.
+TREE_SIZE           = 16
+PRIMARY_TREE        = 0
+DISTANCE_TREE       = TREE_SIZE
 
-; Index in the mentioned arrays for the beginning of the distance tree.
-DISTANCE_TREE = MAX_BITS
-
-; Size of each array.
-TREES_SIZE    = 2*MAX_BITS
+; Alphabet.
+LENGTH_SYMBOLS      = 1+29+2    ; EOF, 29 length symbols, two unused symbols
+DISTANCE_SYMBOLS    = 30
+CONTROL_SYMBOLS     = LENGTH_SYMBOLS+DISTANCE_SYMBOLS
 
 
 ; --------------------------------------------------------------------------
@@ -46,30 +44,26 @@ TREES_SIZE    = 2*MAX_BITS
 ;
 
 ; Pointer to the compressed data.
-inputPointer            :=      ptr1    ; 2 bytes
+inputPointer                :=  ptr1    ; 2 bytes
 
 ; Pointer to the uncompressed data.
-outputPointer           :=      ptr2    ; 2 bytes
+outputPointer               :=  ptr2    ; 2 bytes
 
 ; Local variables.
 ; As far as there is no conflict, same memory locations are used
 ; for different variables.
 
-inflateDynamicBlock_cnt :=      ptr3    ; 1 byte
-inflateCodes_src        :=      ptr3    ; 2 bytes
-buildHuffmanTree_src    :=      ptr3    ; 2 bytes
-getNextLength_last      :=      ptr3    ; 1 byte
-getNextLength_index     :=      ptr3+1  ; 1 byte
-
-buildHuffmanTree_ptr    :=      ptr4    ; 2 bytes
-fetchCode_ptr           :=      ptr4    ; 2 bytes
-getBits_tmp             :=      ptr4    ; 1 byte
-
-moveBlock_len           :=      sreg    ; 2 bytes
-inflateDynamicBlock_np  :=      sreg    ; 1 byte
-inflateDynamicBlock_nd  :=      sreg+1  ; 1 byte
-
-getBit_hold             :=      tmp1    ; 1 byte
+inflateStored_pageCounter   :=  ptr3    ; 1 byte
+inflateDynamic_symbol       :=  ptr3    ; 1 byte
+inflateDynamic_lastLength   :=  ptr3+1  ; 1 byte
+        .assert ptr4 = ptr3 + 2, error, "Need three bytes for inflateDynamic_tempCodes"
+inflateDynamic_tempCodes    :=  ptr3+1  ; 3 bytes
+inflateDynamic_allCodes     :=  inflateDynamic_tempCodes+1 ; 1 byte
+inflateDynamic_primaryCodes :=  inflateDynamic_tempCodes+2 ; 1 byte
+inflateCodes_sourcePointer  :=  ptr3    ; 2 bytes
+inflateCodes_lengthMinus2   :=  ptr4    ; 1 byte
+getBits_base                :=  sreg    ; 1 byte
+getBit_buffer               :=  sreg+1  ; 1 byte
 
 
 ; --------------------------------------------------------------------------
@@ -83,45 +77,59 @@ _inflatemem:
         sta     inputPointer
         stx     inputPointer+1
 ; outputPointer = dest
-.if (.cpu & CPU_ISET_65SC02)
-        lda     (sp)
         ldy     #1
-.else
-        ldy     #0
-        lda     (sp),y
-        iny
-.endif
-        sta     outputPointer
         lda     (sp),y
         sta     outputPointer+1
+        dey
+        lda     (sp),y
+        sta     outputPointer
+
+;       ldy     #0
+        sty     getBit_buffer
 
-;       ldy     #1
-        sty     getBit_hold
-inflatemem_1:
+inflate_blockLoop:
 ; Get a bit of EOF and two bits of block type
-        ldx     #3
-        lda     #0
+;       ldy     #0
+        sty     getBits_base
+        lda     #GET_3_BITS
         jsr     getBits
         lsr     a
 ; A and Z contain block type, C contains EOF flag
 ; Save EOF flag
         php
-; Go to the routine decompressing this block
-        jsr     callExtr
+        bne     inflateCompressed
+
+; Decompress a 'stored' data block.
+;       ldy     #0
+        sty     getBit_buffer   ; ignore bits until byte boundary
+        jsr     getWord         ; skip the length we don't need
+        jsr     getWord         ; get the one's complement length
+        sta     inflateStored_pageCounter
+        bcs     inflateStored_firstByte ; jmp
+inflateStored_copyByte:
+        jsr     getByte
+;       sec
+inflateStoreByte:
+        jsr     storeByte
+        bcc     inflateCodes_loop
+inflateStored_firstByte:
+        inx
+        bne     inflateStored_copyByte
+        inc     inflateStored_pageCounter
+        bne     inflateStored_copyByte
+
+; Block decompressed.
+inflate_nextBlock:
         plp
-        bcc     inflatemem_1
-; C flag is set!
+        bcc     inflate_blockLoop
 
-; return outputPointer - dest;
+; Decompression complete.
+; return outputPointer - dest
         lda     outputPointer
-.if (.cpu & CPU_ISET_65SC02)
-        sbc     (sp)            ; C flag is set
-        ldy     #1
-.else
-        ldy     #0
-        sbc     (sp),y          ; C flag is set
+;       ldy     #0
+;       sec
+        sbc     (sp),y
         iny
-.endif
         pha
         lda     outputPointer+1
         sbc     (sp),y
@@ -130,442 +138,346 @@ inflatemem_1:
 ; pop dest
         jmp     incsp2
 
-; --------------------------------------------------------------------------
-; Go to proper block decoding routine.
-
-callExtr:
-        bne     inflateCompressedBlock
-
-; --------------------------------------------------------------------------
-; Decompress a 'stored' data block.
-
-inflateCopyBlock:
-; Ignore bits until byte boundary
-        ldy     #1
-        sty     getBit_hold
-; Get 16-bit length
-        ldx     #inputPointer
-        lda     (0,x)
-        sta     moveBlock_len
-        lda     (inputPointer),y
-        sta     moveBlock_len+1
-; Skip the length and one's complement of it
-        lda     #4
-        clc
-        adc     inputPointer
-        sta     inputPointer
-        bcc     moveBlock
-        inc     inputPointer+1
-;       jmp     moveBlock
-
-; --------------------------------------------------------------------------
-; Copy block of length moveBlock_len from (0,x) to the output.
-
-moveBlock:
-        ldy     moveBlock_len
-        beq     moveBlock_1
-.if (.cpu & CPU_ISET_65SC02)
-.else
-        ldy     #0
-.endif
-        inc     moveBlock_len+1
-moveBlock_1:
-        lda     (0,x)
-.if (.cpu & CPU_ISET_65SC02)
-        sta     (outputPointer)
-.else
-        sta     (outputPointer),y
-.endif
-        inc     0,x
-        bne     moveBlock_2
-        inc     1,x
-moveBlock_2:
-        inc     outputPointer
-        bne     moveBlock_3
-        inc     outputPointer+1
-moveBlock_3:
-.if (.cpu & CPU_ISET_65SC02)
-        dey
-.else
-        dec     moveBlock_len
-.endif
-        bne     moveBlock_1
-        dec     moveBlock_len+1
-        bne     moveBlock_1
-        rts
-
-; --------------------------------------------------------------------------
+inflateCompressed:
 ; Decompress a Huffman-coded data block
-; (A = 1: fixed, A = 2: dynamic).
+; A=1: fixed block, initialize with fixed codes
+; A=2: dynamic block, start by clearing all code lengths
+; A=3: invalid compressed data, not handled in this routine
+        eor     #2
 
-inflateCompressedBlock:
-        lsr     a
-        bne     inflateDynamicBlock
-; Note: inflateDynamicBlock may assume that A = 1
-
-; --------------------------------------------------------------------------
-; Decompress a Huffman-coded data block with default Huffman trees
-; (defined by the DEFLATE format):
-; literalCodeLength:  144 times 8, 112 times 9
-; endCodeLength:      7
-; lengthCodeLength:   23 times 7, 6 times 8
-; distanceCodeLength: 30 times 5+DISTANCE_TREE, 2 times 8
-;                     (two 8-bit codes from the primary tree are not used).
-
-inflateFixedBlock:
-        ldx     #159
-        stx     distanceCodeLength+32
-        lda     #8
-inflateFixedBlock_1:
-        sta     literalCodeLength-1,x
-        sta     literalCodeLength+159-1,x
-        dex
-        bne     inflateFixedBlock_1
-        ldx     #112
-;       lda     #9
-inflateFixedBlock_2:
-        inc     literalCodeLength+144-1,x       ; sta
-        dex
-        bne     inflateFixedBlock_2
-        ldx     #24
-;       lda     #7
-inflateFixedBlock_3:
-        dec     endCodeLength-1,x       ; sta
-        dex
-        bne     inflateFixedBlock_3
-        ldx     #30
-        lda     #5+DISTANCE_TREE
-inflateFixedBlock_4:
-        sta     distanceCodeLength-1,x
-        dex
-        bne     inflateFixedBlock_4
-        beq     inflateCodes            ; branch always
-
-; --------------------------------------------------------------------------
-; Decompress a Huffman-coded data block, reading Huffman trees first.
-
-inflateDynamicBlock:
-; numberOfPrimaryCodes = 257 + getBits(5)
-        ldx     #5
-;       lda     #1
-        jsr     getBits
-        sta     inflateDynamicBlock_np
-; numberOfDistanceCodes = 1 + getBits(5)
-        ldx     #5
-        lda     #1+29+1
-        jsr     getBits
-        sta     inflateDynamicBlock_nd
-; numberOfTemporaryCodes = 4 + getBits(4)
-        lda     #4
+;       ldy     #0
+inflateCompressed_setCodeLengths:
         tax
-        jsr     getBits
-        sta     inflateDynamicBlock_cnt
-; Get lengths of temporary codes in the order stored in tempCodeLengthOrder
-        txa                     ; lda #0
-        tay
-inflateDynamicBlock_1:
-        ldx     #3              ; A = 0
-        jsr     getBits         ; does not change Y
-inflateDynamicBlock_2:
-        ldx     tempCodeLengthOrder,y
-        sta     literalCodeLength,x
-        lda     #0
+        beq     inflateCompressed_setLiteralCodeLength
+; fixed Huffman literal codes:
+; 144 8-bit codes
+; 112 9-bit codes
+        lda     #4
+        cpy     #144
+        rol     a
+inflateCompressed_setLiteralCodeLength:
+        sta     literalSymbolCodeLength,y
+        beq     inflateCompressed_setControlCodeLength
+; fixed Huffman control codes:
+; 24 7-bit codes
+;  6 8-bit codes
+;  2 meaningless 8-bit codes
+; 30 5-bit distance codes
+        lda     #5+DISTANCE_TREE
+        cpy     #LENGTH_SYMBOLS
+        bcs     inflateCompressed_setControlCodeLength
+        cpy     #24
+        adc     #$100+2-DISTANCE_TREE
+inflateCompressed_setControlCodeLength:
+        cpy     #CONTROL_SYMBOLS
+        bcs     inflateCompressed_noControlSymbol
+        sta     controlSymbolCodeLength,y
+inflateCompressed_noControlSymbol:
         iny
-        cpy     inflateDynamicBlock_cnt
-        bcc     inflateDynamicBlock_1
-        cpy     #19
-        bcc     inflateDynamicBlock_2
-        ror     literalCodeLength+19    ; C flag is set, so this will set b7
-; Build the tree for temporary codes
-        jsr     buildHuffmanTree
+        bne     inflateCompressed_setCodeLengths
 
-; Use temporary codes to get lengths of literal/length and distance codes
-        ldx     #0
-        ldy     #1
-        stx     getNextLength_last
-inflateDynamicBlock_3:
-        jsr     getNextLength
-        sta     literalCodeLength,x
-        inx
-        bne     inflateDynamicBlock_3
-inflateDynamicBlock_4:
-        jsr     getNextLength
-inflateDynamicBlock_5:
-        sta     endCodeLength,x
-        inx
-        cpx     inflateDynamicBlock_np
-        bcc     inflateDynamicBlock_4
-        lda     #0
-        cpx     #1+29
-        bcc     inflateDynamicBlock_5
-inflateDynamicBlock_6:
-        jsr     getNextLength
-        cmp     #0
-        beq     inflateDynamicBlock_7
-        adc     #DISTANCE_TREE-1        ; C flag is set
-inflateDynamicBlock_7:
-        sta     endCodeLength,x
-        inx
-        cpx     inflateDynamicBlock_nd
-        bcc     inflateDynamicBlock_6
-        ror     endCodeLength,x         ; C flag is set, so this will set b7
-;       jmp     inflateCodes
-
-; --------------------------------------------------------------------------
-; Decompress a data block basing on given Huffman trees.
+        tax
+        beq     inflateDynamic
 
+; Decompress a block
 inflateCodes:
         jsr     buildHuffmanTree
-inflateCodes_1:
+inflateCodes_loop:
         jsr     fetchPrimaryCode
-        bcs     inflateCodes_2
-; Literal code
-.if (.cpu & CPU_ISET_65SC02)
-        sta     (outputPointer)
-.else
-        ldy     #0
-        sta     (outputPointer),y
-.endif
-        inc     outputPointer
-        bne     inflateCodes_1
-        inc     outputPointer+1
-        bcc     inflateCodes_1  ; branch always
-; End of block
-inflateCodes_ret:
-        rts
-inflateCodes_2:
-        beq     inflateCodes_ret
-; Restore a block from the look-behind buffer
-        jsr     getValue
-        sta     moveBlock_len
+        bcc     inflateStoreByte
+        beq     inflate_nextBlock
+; Copy sequence from look-behind buffer
+;       ldy     #0
+        sty     getBits_base
+        cmp     #9
+        bcc     inflateCodes_setSequenceLength
         tya
-        jsr     getBits
-        sta     moveBlock_len+1
+;       lda     #0
+        cpx     #1+28
+        bcs     inflateCodes_setSequenceLength
+        dex
+        txa
+        lsr     a
+        ror     getBits_base
+        inc     getBits_base
+        lsr     a
+        rol     getBits_base
+        jsr     getAMinus1BitsMax8
+;       sec
+        adc     #0
+inflateCodes_setSequenceLength:
+        sta     inflateCodes_lengthMinus2
         ldx     #DISTANCE_TREE
         jsr     fetchCode
-        jsr     getValue
-        sec
+        cmp     #4
+        bcc     inflateCodes_setOffsetLowByte
+        inc     getBits_base
+        lsr     a
+        jsr     getAMinus1BitsMax8
+inflateCodes_setOffsetLowByte:
         eor     #$ff
-        adc     outputPointer
-        sta     inflateCodes_src
-        php
-        tya
+        sta     inflateCodes_sourcePointer
+        lda     getBits_base
+        cpx     #10
+        bcc     inflateCodes_setOffsetHighByte
+        lda     getNPlus1Bits_mask-10,x
         jsr     getBits
-        plp
+        clc
+inflateCodes_setOffsetHighByte:
         eor     #$ff
+;       clc
         adc     outputPointer+1
-        sta     inflateCodes_src+1
-        ldx     #inflateCodes_src
-        jsr     moveBlock
-        beq     inflateCodes_1  ; branch always
-
-; --------------------------------------------------------------------------
-; Build Huffman trees basing on code lengths (in bits).
-; stored in the *CodeLength arrays.
-; A byte with its highest bit set marks the end.
+        sta     inflateCodes_sourcePointer+1
+        jsr     copyByte
+        jsr     copyByte
+inflateCodes_copyByte:
+        jsr     copyByte
+        dec     inflateCodes_lengthMinus2
+        bne     inflateCodes_copyByte
+        beq     inflateCodes_loop ; jmp
+
+inflateDynamic:
+; Decompress a block reading Huffman trees first
+;       ldy     #0
+; numberOfPrimaryCodes = 257 + getBits(5)
+; numberOfDistanceCodes = 1 + getBits(5)
+; numberOfTemporaryCodes = 4 + getBits(4)
+        ldx     #3
+inflateDynamic_getHeader:
+        lda     inflateDynamic_headerBits-1,x
+        jsr     getBits
+;       sec
+        adc     inflateDynamic_headerBase-1,x
+        sta     inflateDynamic_tempCodes-1,x
+        dex
+        bne     inflateDynamic_getHeader
 
-buildHuffmanTree:
-        lda     #<literalCodeLength
-        sta     buildHuffmanTree_src
-        lda     #>literalCodeLength
-        sta     buildHuffmanTree_src+1
-; Clear bitsCount and bitsPointer_l
-        ldy     #2*TREES_SIZE+1
-        lda     #0
-buildHuffmanTree_1:
-        sta     bitsCount-1,y
-        dey
-        bne     buildHuffmanTree_1
-        beq     buildHuffmanTree_3      ; branch always
-; Count number of codes of each length
-buildHuffmanTree_2:
-        tax
-        inc     bitsPointer_l,x
-        iny
-        bne     buildHuffmanTree_3
-        inc     buildHuffmanTree_src+1
-buildHuffmanTree_3:
-        lda     (buildHuffmanTree_src),y
-        bpl     buildHuffmanTree_2
-; Calculate a pointer for each length
-        ldx     #0
-        lda     #<sortedCodes
-        ldy     #>sortedCodes
-        clc
-buildHuffmanTree_4:
-        sta     bitsPointer_l,x
-        tya
-        sta     bitsPointer_h,x
-        lda     bitsPointer_l+1,x
-        adc     bitsPointer_l,x         ; C flag is zero
-        bcc     buildHuffmanTree_5
-        iny
-buildHuffmanTree_5:
-        inx
-        cpx     #TREES_SIZE
-        bcc     buildHuffmanTree_4
-        lda     #>literalCodeLength
-        sta     buildHuffmanTree_src+1
+; Get lengths of temporary codes in the order stored in inflateDynamic_tempSymbols
+;       ldx     #0
+inflateDynamic_getTempCodeLengths:
+        lda     #GET_3_BITS
+        jsr     getBits
+        ldy     inflateDynamic_tempSymbols,x
+        sta     literalSymbolCodeLength,y
         ldy     #0
-        bcs     buildHuffmanTree_9      ; branch always
-; Put codes into their place in sorted table
-buildHuffmanTree_6:
-        beq     buildHuffmanTree_7
-        tax
-        lda     bitsPointer_l-1,x
-        sta     buildHuffmanTree_ptr
-        lda     bitsPointer_h-1,x
-        sta     buildHuffmanTree_ptr+1
-        tya
-        ldy     bitsCount-1,x
-        inc     bitsCount-1,x
-        sta     (buildHuffmanTree_ptr),y
-        tay
-buildHuffmanTree_7:
-        iny
-        bne     buildHuffmanTree_9
-        inc     buildHuffmanTree_src+1
-        ldx     #MAX_BITS-1
-buildHuffmanTree_8:
-        lda     bitsCount,x
-        sta     literalCount,x
-        dex
-        bpl     buildHuffmanTree_8
-buildHuffmanTree_9:
-        lda     (buildHuffmanTree_src),y
-        bpl     buildHuffmanTree_6
-        rts
+        inx
+        cpx     inflateDynamic_tempCodes
+        bcc     inflateDynamic_getTempCodeLengths
 
-; --------------------------------------------------------------------------
-; Decode next code length using temporary codes.
+; Build the tree for temporary codes
+        jsr     buildHuffmanTree
 
-getNextLength:
-        stx     getNextLength_index
-        dey
-        bne     getNextLength_1
+; Use temporary codes to get lengths of literal/length and distance codes
+;       ldx     #0
+;       sec
+inflateDynamic_decodeLength:
+; C=1: literal codes
+; C=0: control codes
+        stx     inflateDynamic_symbol
+        php
 ; Fetch a temporary code
         jsr     fetchPrimaryCode
 ; Temporary code 0..15: put this length
-        ldy     #1
-        cmp     #16
-        bcc     getNextLength_2
+        bpl     inflateDynamic_storeLengths
 ; Temporary code 16: repeat last length 3 + getBits(2) times
 ; Temporary code 17: put zero length 3 + getBits(3) times
 ; Temporary code 18: put zero length 11 + getBits(7) times
-        tay
-        ldx     tempExtraBits-16,y
-        lda     tempBaseValue-16,y
+        tax
         jsr     getBits
-        cpy     #17
+        cpx     #GET_3_BITS
+        bcc     inflateDynamic_code16
+        beq     inflateDynamic_code17
+;       sec
+        adc     #7
+inflateDynamic_code17:
+;       ldy     #0
+        sty     inflateDynamic_lastLength
+inflateDynamic_code16:
         tay
-        txa                     ; lda #0
-        bcs     getNextLength_2
-getNextLength_1:
-        lda     getNextLength_last
-getNextLength_2:
-        sta     getNextLength_last
-        ldx     getNextLength_index
-        rts
+        lda     inflateDynamic_lastLength
+        iny
+        iny
+inflateDynamic_storeLengths:
+        iny
+        plp
+        ldx     inflateDynamic_symbol
+inflateDynamic_storeLength:
+        bcc     inflateDynamic_controlSymbolCodeLength
+        sta     literalSymbolCodeLength,x
+        inx
+        cpx     #1
+inflateDynamic_storeNext:
+        dey
+        bne     inflateDynamic_storeLength
+        sta     inflateDynamic_lastLength
+        beq     inflateDynamic_decodeLength ; jmp
+inflateDynamic_controlSymbolCodeLength:
+        cpx     inflateDynamic_primaryCodes
+        bcc     inflateDynamic_storeControl
+; the code lengths we skip here were zero-initialized
+; in inflateCompressed_setControlCodeLength
+        bne     inflateDynamic_noStartDistanceTree
+        ldx     #LENGTH_SYMBOLS
+inflateDynamic_noStartDistanceTree:
+        ora     #DISTANCE_TREE
+inflateDynamic_storeControl:
+        sta     controlSymbolCodeLength,x
+        inx
+        cpx     inflateDynamic_allCodes
+        bcc     inflateDynamic_storeNext
+        dey
+;       ldy     #0
+        jmp     inflateCodes
 
-; --------------------------------------------------------------------------
-; Read a code basing on the primary tree.
+; Build Huffman trees based on the code lengths (in bits)
+; stored in the *SymbolCodeLength arrays
+buildHuffmanTree:
+; Clear nBitCode_totalCount, nBitCode_literalCount, nBitCode_controlCount
+        tya
+;       lda     #0
+buildHuffmanTree_clear:
+        sta     nBitCode_clearFrom,y
+        iny
+        bne     buildHuffmanTree_clear
+; Count number of codes of each length
+;       ldy     #0
+buildHuffmanTree_countCodeLengths:
+        ldx     literalSymbolCodeLength,y
+        inc     nBitCode_literalCount,x
+        inc     nBitCode_totalCount,x
+        cpy     #CONTROL_SYMBOLS
+        bcs     buildHuffmanTree_noControlSymbol
+        ldx     controlSymbolCodeLength,y
+        inc     nBitCode_controlCount,x
+        inc     nBitCode_totalCount,x
+buildHuffmanTree_noControlSymbol:
+        iny
+        bne     buildHuffmanTree_countCodeLengths
+; Calculate offsets of symbols sorted by code length
+;       lda     #0
+        ldx     #$100-3*TREE_SIZE
+buildHuffmanTree_calculateOffsets:
+        sta     nBitCode_literalOffset+3*TREE_SIZE-$100,x
+        clc
+        adc     nBitCode_literalCount+3*TREE_SIZE-$100,x
+        inx
+        bne     buildHuffmanTree_calculateOffsets
+; Put symbols in their place in the sorted array
+;       ldy     #0
+buildHuffmanTree_assignCode:
+        tya
+        ldx     literalSymbolCodeLength,y
+        ldy     nBitCode_literalOffset,x
+        inc     nBitCode_literalOffset,x
+        sta     codeToLiteralSymbol,y
+        tay
+        cpy     #CONTROL_SYMBOLS
+        bcs     buildHuffmanTree_noControlSymbol2
+        ldx     controlSymbolCodeLength,y
+        ldy     nBitCode_controlOffset,x
+        inc     nBitCode_controlOffset,x
+        sta     codeToControlSymbol,y
+        tay
+buildHuffmanTree_noControlSymbol2:
+        iny
+        bne     buildHuffmanTree_assignCode
+        rts
 
+; Read Huffman code using the primary tree
 fetchPrimaryCode:
         ldx     #PRIMARY_TREE
-;       jmp     fetchCode
-
-; --------------------------------------------------------------------------
-; Read a code from input basing on the tree specified in X.
+; Read a code from input using the tree specified in X.
 ; Return low byte of this code in A.
-; For the literal/length tree, the C flag is set if the code is non-literal.
-
+; Return C flag reset for literal code, set for length code.
 fetchCode:
-        lda     #0
-fetchCode_1:
+;       ldy     #0
+        tya
+fetchCode_nextBit:
         jsr     getBit
         rol     a
         inx
         sec
-        sbc     bitsCount-1,x
-        bcs     fetchCode_1
-        adc     bitsCount-1,x   ; C flag is zero
-        cmp     literalCount-1,x
-        sta     fetchCode_ptr
-        ldy     bitsPointer_l-1,x
-        lda     bitsPointer_h-1,x
-        sta     fetchCode_ptr+1
-        lda     (fetchCode_ptr),y
+        sbc     nBitCode_totalCount,x
+        bcs     fetchCode_nextBit
+;       clc
+        adc     nBitCode_controlCount,x
+        bcs     fetchCode_control
+;       clc
+        adc     nBitCode_literalOffset,x
+        tax
+        lda     codeToLiteralSymbol,x
+        clc
+        rts
+fetchCode_control:
+;       sec
+        adc     nBitCode_controlOffset-1,x
+        tax
+        lda     codeToControlSymbol-1,x
+        and     #$1f    ; make distance symbols zero-based
+        tax
+        sec
         rts
 
-; --------------------------------------------------------------------------
-; Decode low byte of a value (length or distance), basing on the code in A.
-; The result is the base value for this code plus some bits read from input.
-
-getValue:
-        tay
-        ldx     lengthExtraBits-1,y
-        lda     lengthBaseValue_l-1,y
-        pha
-        lda     lengthBaseValue_h-1,y
-        tay
-        pla
-;       jmp     getBits
-
-; --------------------------------------------------------------------------
-; Read X-bit number from the input and add it to A.
-; Increment Y if overflow.
-; If X > 8, read only 8 bits.
-; On return X holds number of unread bits: X = (X > 8 ? X - 8 : 0);
-
+; Read A minus 1 bits, but no more than 8
+getAMinus1BitsMax8:
+        rol     getBits_base
+        tax
+        cmp     #9
+        bcs     getByte
+        lda     getNPlus1Bits_mask-2,x
 getBits:
-        cpx     #0
-        beq     getBits_ret
-.if (.cpu & CPU_ISET_65SC02)
-        stz     getBits_tmp
-        dec     getBits_tmp
-.else
-        pha
-        lda     #$ff
-        sta     getBits_tmp
-        pla
-.endif
-getBits_1:
-        jsr     getBit
-        bcc     getBits_2
-        sbc     getBits_tmp     ; C flag is set
-        bcc     getBits_2
-        iny
-getBits_2:
-        dex
-        beq     getBits_ret
-        asl     getBits_tmp
-        bmi     getBits_1
-getBits_ret:
+        jsr     getBits_loop
+getBits_normalizeLoop:
+        lsr     getBits_base
+        ror     a
+        bcc     getBits_normalizeLoop
         rts
 
-; --------------------------------------------------------------------------
-; Read a single bit from input, return it in the C flag.
+; Read 16 bits
+getWord:
+        jsr     getByte
+        tax
+; Read 8 bits
+getByte:
+        lda     #$80
+getBits_loop:
+        jsr     getBit
+        ror     a
+        bcc     getBits_loop
+        rts
 
+; Read one bit, return in the C flag
 getBit:
-        lsr     getBit_hold
-        bne     getBit_ret
+        lsr     getBit_buffer
+        bne     getBit_return
         pha
-.if (.cpu & CPU_ISET_65SC02)
-        lda     (inputPointer)
-.else
-        sty     getBit_hold
-        ldy     #0
+;       ldy     #0
         lda     (inputPointer),y
-        ldy     getBit_hold
-.endif
         inc     inputPointer
-        bne     getBit_1
+        bne     getBit_samePage
         inc     inputPointer+1
-getBit_1:
-        ror     a               ; (C flag was set)
-        sta     getBit_hold
+getBit_samePage:
+        sec
+        ror     a
+        sta     getBit_buffer
         pla
-getBit_ret:
+getBit_return:
+        rts
+
+; Copy a previously written byte
+copyByte:
+        ldy     outputPointer
+        lda     (inflateCodes_sourcePointer),y
+        ldy     #0
+; Write a byte
+storeByte:
+;       ldy     #0
+        sta     (outputPointer),y
+        inc     outputPointer
+        bne     storeByte_return
+        inc     outputPointer+1
+        inc     inflateCodes_sourcePointer+1
+storeByte_return:
         rts
 
 
@@ -575,57 +487,17 @@ getBit_ret:
 ;
 
         .rodata
-; --------------------------------------------------------------------------
-; Arrays for the temporary codes.
-
-; Order, in which lengths of the temporary codes are stored.
-tempCodeLengthOrder:
-        .byte   16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15
 
-; Base values.
-tempBaseValue:
-        .byte   3,3,11
+getNPlus1Bits_mask:
+        .byte   GET_1_BIT,GET_2_BITS,GET_3_BITS,GET_4_BITS,GET_5_BITS,GET_6_BITS,GET_7_BITS
 
-; Number of extra bits to read.
-tempExtraBits:
-        .byte   2,3,7
+inflateDynamic_tempSymbols:
+        .byte   GET_2_BITS,GET_3_BITS,GET_7_BITS,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15
 
-; --------------------------------------------------------------------------
-; Arrays for the length and distance codes.
-
-; Base values.
-lengthBaseValue_l:
-        .byte   <3,<4,<5,<6,<7,<8,<9,<10
-        .byte   <11,<13,<15,<17,<19,<23,<27,<31
-        .byte   <35,<43,<51,<59,<67,<83,<99,<115
-        .byte   <131,<163,<195,<227,<258
-distanceBaseValue_l:
-        .byte   <1,<2,<3,<4,<5,<7,<9,<13
-        .byte   <17,<25,<33,<49,<65,<97,<129,<193
-        .byte   <257,<385,<513,<769,<1025,<1537,<2049,<3073
-        .byte   <4097,<6145,<8193,<12289,<16385,<24577
-lengthBaseValue_h:
-        .byte   >3,>4,>5,>6,>7,>8,>9,>10
-        .byte   >11,>13,>15,>17,>19,>23,>27,>31
-        .byte   >35,>43,>51,>59,>67,>83,>99,>115
-        .byte   >131,>163,>195,>227,>258
-distanceBaseValue_h:
-        .byte   >1,>2,>3,>4,>5,>7,>9,>13
-        .byte   >17,>25,>33,>49,>65,>97,>129,>193
-        .byte   >257,>385,>513,>769,>1025,>1537,>2049,>3073
-        .byte   >4097,>6145,>8193,>12289,>16385,>24577
-
-; Number of extra bits to read.
-lengthExtraBits:
-        .byte   0,0,0,0,0,0,0,0
-        .byte   1,1,1,1,2,2,2,2
-        .byte   3,3,3,3,4,4,4,4
-        .byte   5,5,5,5,0
-distanceExtraBits:
-        .byte   0,0,0,0,1,1,2,2
-        .byte   3,3,4,4,5,5,6,6
-        .byte   7,7,8,8,9,9,10,10
-        .byte   11,11,12,12,13,13
+inflateDynamic_headerBits:
+        .byte   GET_4_BITS,GET_5_BITS,GET_5_BITS
+inflateDynamic_headerBase:
+        .byte   3,LENGTH_SYMBOLS,0
 
 
 ; --------------------------------------------------------------------------
@@ -635,46 +507,28 @@ distanceExtraBits:
 
         .bss
 
-; Number of literal codes of each length in the primary tree
-; (MAX_BITS bytes, overlap with literalCodeLength).
-literalCount:
-
-; --------------------------------------------------------------------------
-; Data for building the primary tree.
+; Data for building trees.
 
-; Lengths of literal codes.
-literalCodeLength:
+literalSymbolCodeLength:
         .res    256
-; Length of the end code.
-endCodeLength:
-        .res    1
-; Lengths of length codes.
-lengthCodeLength:
-        .res    29
-
-; --------------------------------------------------------------------------
-; Data for building the distance tree.
-
-; Lengths of distance codes.
-distanceCodeLength:
-        .res    30
-; For two unused codes in the fixed trees and an 'end' mark.
-        .res    3
-
-; --------------------------------------------------------------------------
-; The Huffman trees.
-
-; Number of codes of each length.
-bitsCount:
-        .res    TREES_SIZE
-; Pointers to sorted codes of each length.
-bitsPointer_l:
-        .res    TREES_SIZE+1
-bitsPointer_h:
-        .res    TREES_SIZE
-
-; Sorted codes.
-sortedCodes:
-        .res    256+1+29+30+2
-
-.endif
+controlSymbolCodeLength:
+        .res    CONTROL_SYMBOLS
+
+; Huffman trees.
+
+nBitCode_clearFrom:
+nBitCode_totalCount:
+        .res    2*TREE_SIZE
+nBitCode_literalCount:
+        .res    TREE_SIZE
+nBitCode_controlCount:
+        .res    2*TREE_SIZE
+nBitCode_literalOffset:
+        .res    TREE_SIZE
+nBitCode_controlOffset:
+        .res    2*TREE_SIZE
+
+codeToLiteralSymbol:
+        .res    256
+codeToControlSymbol:
+        .res    CONTROL_SYMBOLS