Updated 2017-11-19 15:24:27 by dbohdan

LZ4 is a byte-oriented lossless compression algorithm optimized for speed. Its low-level implementations provide extremely fast decompression compared to the competitors.

Tcl implementation  edit

The following pure Tcl implementation of the LZ4 decoder has been tested in Tcl 8.5, Tcl 8.6, and Jim Tcl (now in builds with as well as without UTF-8 support).

To decompress data compressed with LZ4 use the command ::lz4::decode $data $verify. If $verify is true and the command ::xxhash::xxhash32 is available the decoder will verify the header and the content checksum in LZ4 frames, which lets you detect data corruption. A module implementing ::xxhash::xxhash32 is available on the wiki page xxHash. It requires Critcl to be fast. An alternative is to set $verify to zero, checksum the data before compression with a checksum algorithm implemented in C (e.g., zlib crc32 in Tcl 8.6/Jim Tcl 0.77+ or one of the C-accelerated checksum commands in Tcllib) and verify its integrity yourself afterwards.

Download with wiki-reaper: wiki-reaper 48789 0 > lz4-0.2.4.tm

Code

# A pure Tcl LZ4 decoder.
# Copyright (c) 2017 dbohdan
# License: MIT
# This decoder implements version 1.5 of the LZ4 frame spec. It can verify
# checksums if the command ::xxhash::xxhash32 is available. It is known to work
# in Tcl 8.5, Tcl 8.6, and Jim Tcl 0.74-0.77.
namespace eval ::lz4 {
    variable version 0.2.4
    # The following variable will be true in Jim Tcl and false in Tcl 8.x.
    variable jim [expr {![catch {info version}]}]
}

if {$::lz4::jim} {
    proc ::lz4::byte-range {bytes start end} {
        tailcall string byterange $bytes $start $end
    }
} else {
    # Benchmarking shows this version to be faster than a tailcall in Tcl 8.6.7.
    proc ::lz4::byte-range {bytes start end} {
        return [string range $bytes $start $end]
    }
}

proc ::lz4::decode-block {data ptr endPtr window} {
    set result {}
    while 1 {
        if {![binary scan $data "@$ptr cu" token]} {
            error {data truncated}
        }
        incr ptr 1
        set litLen   [expr {($token >> 4) & 0x0F}]
        set matchLen [expr {$token & 0x0F}]
        if {$litLen == 15} {
            while 1 {
                if {![binary scan $data "@$ptr cu" byte]} {
                    error {data truncated}
                }
                incr ptr 1
                incr litLen $byte
                if {$byte < 255} break
            }
        }
        if {![binary scan $data "@$ptr a$litLen" literals]} {
            error {data truncated}
        }
        incr ptr $litLen
        append window $literals
        append result $literals
        # The last sequence is incomplete.
        if {$ptr < $endPtr} {
            if {![binary scan $data "@$ptr su" offset]} {
                error {data truncated}
            }
            incr ptr 2
            if {$matchLen == 15} {
                while 1 {
                    if {![binary scan $data "@$ptr cu" byte]} {
                        error {data truncated}
                    }
                    incr ptr 1
                    incr matchLen $byte
                    if {$byte < 255} break
                }
            }
            incr matchLen 4
            incr offset -1
            set endOffset [expr {
                $offset - $matchLen > 0 ? $offset - $matchLen : 0
            }]
            set overlapLen [expr {
                $offset - $matchLen > 0 ? 0 : $matchLen - $offset
            }]
            set match [byte-range $window end-$offset end-$endOffset]
            set matchRepeated [string repeat $match [expr {
                ($overlapLen / ($offset - $endOffset + 1)) + 2
            }]]
            set matchWithOverlap [byte-range $matchRepeated 0 $matchLen-1]
            append window $matchWithOverlap
            append result $matchWithOverlap
        }
        if {$ptr == $endPtr} break
        if {$ptr > $endPtr} {
            error {read beyond block end}
        }
    }
    return [list $ptr $window $result]
}

proc ::lz4::decode-frame {data ptr verify} {
    # Decode and validate the header.
    if {![binary scan $data "@$ptr i" magic]} {
        error {data truncated}
    }
    incr ptr 4
    set fieldsStartPtr $ptr
    if {$magic == 0x184D2204} {
        # Normal frame.
    } elseif {(0x184D2A50 <= $magic) && ($magic <= 0x184D2A5F)} {
        # Skippable frame.
        if {![binary scan $data "@$ptr iu" frameSize]} {
            error {data truncated}
        }
        incr ptr 4
        incr ptr $frameSize
        return [list $ptr {}]
    } else {
        error "unexpected magic number: $magic"
    }
    set flags {}
    if {![binary scan $data "@$ptr cu cu" flags blockDescr]} {
        error {data truncated}
    }
    incr ptr 2
    set flagsReserved      [expr {($flags & 0b00000011) == 0}]
    set hasContentChecksum [expr {($flags & 0b00000100) == 0b00000100}]
    set hasContentSize     [expr {($flags & 0b00001000) == 0b00001000}]
    set hasBlockChecksums  [expr {($flags & 0b00010000) == 0b00010000}]
    set blockIndep         [expr {($flags & 0b00100000) == 0b00100000}]
    set version            [expr {($flags & 0b11000000) == 0b01000000}]
    if {!$flagsReserved} {
        error {FLG reserved bits aren't zero}
    }
    if {!$version} {
        error {frame version isn't "01"}
    }
    set blockDescrReserved [expr {($blockDescr & 0b10001111) == 0}]
    set blockMaxSize       [expr {$blockDescr >> 4}]
    if {!$blockDescrReserved} {
        error {BD reserved bits aren't zero}
    }
    if {$blockMaxSize < 4} {
        error "invalid block maximum size ($blockMaxSize < 4)"
    }
    if {$hasContentSize} {
        if {![binary scan $data "@$ptr wu" uncompressedSize]} {
            error {data truncated}
        }
        incr ptr 8
    }
    if {![binary scan $data "@$ptr cu" headerChecksum]} {
        error {data truncated}
    }
    if {$verify} {
        if {![binary scan $data \
                          "@$fieldsStartPtr a[expr {$ptr - $fieldsStartPtr}]" \
                          header]} {
            error {can't scan header fields to verify checksum\
                   (this shouldn't happen)}
        }
        if {(([::xxhash::xxhash32 $header 0] >> 8) & 0xff) != $headerChecksum} {
            error {frame header doesn't match checksum}
        }
    }
    incr ptr 1

    # Decode the blocks.
    set window {}
    while 1 {
        if {![binary scan $data "@$ptr iu" blockSize]} {
            error {data truncated}
        }
        incr ptr 4
        set compressed [expr {!($blockSize >> 31)}]
        set blockSize [expr {$blockSize & 0x7fffffff}] ;# Zero the highest bit.
        if {$blockSize == 0} break

        if {$compressed} {
            lassign [decode-block $data \
                                  $ptr \
                                  [expr {$ptr + $blockSize}] $window] \
                    ptr \
                    window \
                    decodedBlock
            if {$blockIndep} {
                set window {}
            } else {
                set window [string range $window end-0xFFFF end]
            }
        } else {
            if {![binary scan $data "@$ptr a$blockSize" decodedBlock]} {
                error {data truncated}
            }
            incr ptr $blockSize
        }
        append result $decodedBlock
    }

    # Decode the checksum.
    if {$hasContentChecksum} {
        if {![binary scan $data "@$ptr iu" contentChecksum]} {
            error {data truncated}
        }
        incr ptr 4
        if {$verify && ([::xxhash::xxhash32 $result 0] != $contentChecksum)} {
            error {decoded data doesn't match checksum}
        }
    }

    return [list $ptr $result]
}

proc ::lz4::decode {data verify} {
    if {$verify && ([info commands ::xxhash::xxhash32] eq {})} {
        error {asked to verify checksums but [::xxhash::xxhash32] is absent}
    }
    set ptr 0
    set result {}
    set len [string length $data]
    while {$ptr < $len} {
        lassign [decode-frame $data $ptr $verify] ptr frame
        append result $frame
    }
    return $result
}

proc ::lz4::assert-equal {actual expected} {
    if {$actual ne $expected} {
        if {[string length $actual] > 200} {
            set actual [string range $actual 0 199]...
        }
        if {[string length $expected] > 200} {
            set expected [string range $expected 0 199]...
        }
        error "expected \"$expected\",\n\
               but got \"$actual\""
    }
}

proc ::lz4::file-test {path canHash} {
    if {![file exists $path]} {
        puts stderr "can't find file \"$path\" -- skipping test"
        return
    }
    # Can't use -ignorestderr because Jim Tcl doesn't support it.
    if {[catch {exec lz4 --version 2>@1}]} {
        puts stderr {can't run lz4 -- skipping test}
        return
    }
    set ch [open $path rb]
    set data [read $ch]
    close $ch
    set ch [open [list |lz4 -c -9 $path]]
    fconfigure $ch -translation binary
    set dataCompressed [read $ch]
    close $ch
    assert-equal [decode $dataCompressed 0] $data
    if {$canHash} {
        assert-equal [decode $dataCompressed 1] $data
    }
}

proc ::lz4::value-test {compressed original canHash} {
    assert-equal [decode $compressed 0] $original
    if {$canHash} {
        assert-equal [decode $compressed 1] $original
    }
}

proc ::lz4::test {} {
    set canHash 1
    if {[info commands ::xxhash::xxhash32] eq {}} {
        set canHash 0
        puts stderr [join {
            {warning: [::xxhash::xxhash32] is absent; }
            {tests with checksum verification will be skipped}
        } {}]
    }

    set hello {Hello, World!}
    set helloCompressed [join {
        \x04\x22 \x4d\x18 \x64\x40 \xa7\x0d \x00\x00 \x80\x48 \x65\x6c \x6c\x6f
        \x2c\x20 \x57\x6f \x72\x6c \x64\x21 \x00\x00 \x00\x00 \xe8\x43 \xd0\x9e
    } {}]

    set seq {}
    for {set i 0} {$i < 4} {incr i} {
        append seq [string repeat $i 64]
    }
    set seqCompressed [join {
        \x04\x22 \x4d\x18 \x64\x40 \xa7\x1a \x00\x00 \x00\x1f \x30\x01 \x00\x2c
        \x1f\x31 \x01\x00 \x2c\x1f \x32\x01 \x00\x2c \x1f\x33 \x01\x00 \x27\x50
        \x33\x33 \x33\x33 \x33\x00 \x00\x00 \x00\x80 \xf5\x97 \x31
    } {}]

    set abc "abcabcabc 123123123 abcabcabc 123123123 abcabcabc123123123\n"
    set abcCompressed [join {
        \x04 \x22 \x4d \x18 \x64 \x40 \xa7 \x1d \x00 \x00 \x00 \x32
        \x61 \x62 \x63 \x03 \x00 \x42 \x20 \x31 \x32 \x33 \x03 \x00
        \x1f \x20 \x14 \x00 \x0a \xa0 \x31 \x32 \x33 \x31 \x32 \x33
        \x31 \x32 \x33 \x0a \x00 \x00 \x00 \x00 \xc3 \x67 \x9d \xbf
    } {}]

    set blah [join {
        {1blah2HELLOblah3blah4blah foo bar blah !!!! }
        {213218372132-------------------}
    } {}]
    set blahCompressed [join {
        \x04 \x22 \x4d \x18 \x64 \x40 \xa7 \x3c \x00 \x00 \x00 \xb0
        \x31 \x62 \x6c \x61 \x68 \x32 \x48 \x45 \x4c \x4c \x4f \x0a
        \x00 \x10 \x33 \x05 \x00 \x10 \x34 \x05 \x00 \x91 \x20 \x66
        \x6f \x6f \x20 \x62 \x61 \x72 \x20 \x0d \x00 \xd0 \x21 \x21
        \x21 \x21 \x20 \x32 \x31 \x33 \x32 \x31 \x38 \x33 \x37 \x08
        \x00 \x19 \x2d \x01 \x00 \x50 \x2d \x2d \x2d \x2d \x2d \x00
        \x00 \x00 \x00 \xa4 \xeb \xf6 \xac
    } {}]

    puts stderr {running tests}
    puts stderr {--- hello}
    value-test $helloCompressed $hello 0
    if {$canHash} {
        puts stderr {--- checksum error}
        catch {decode $helloCompressed 1} err
        assert-equal $err {decoded data doesn't match checksum}
    }
    puts stderr {--- seq}
    value-test $seqCompressed $seq $canHash
    puts stderr {--- abc}
    value-test $abcCompressed $abc $canHash
    puts stderr {--- blah}
    value-test $blahCompressed $blah $canHash
    puts stderr {--- empty}
    value-test {} {} $canHash
    puts stderr {--- truncated}
    catch {decode [string range $seqCompressed 0 20] 0} err
    assert-equal $err {data truncated}
    puts stderr {--- multiple frames}
    assert-equal [decode $helloCompressed$seqCompressed$helloCompressed 0] \
                 $hello$seq$hello
    puts stderr {--- passwd}
    file-test /etc/passwd $canHash
    puts stderr {--- sh (binary)}
    file-test /bin/sh $canHash
}

# If this is the main script...
if {[info exists argv0] && ([file tail [info script]] eq [file tail $argv0])} {
    if {$argv eq {--test}} {
        ::lz4::test
    } elseif {[lindex $argv 0] ne {}} {
        set verify 1
        set argv [lassign $argv a]
        if {$a eq {--ignore-checksums}} {
            set verify 0
            set argv [lassign $argv a]
        }
        if {$a eq {-}} {
            set ch stdin
            fconfigure stdin -translation binary
        } else {
            set ch [open $a rb]
        }
        set data [read $ch]
        close $ch
        fconfigure stdout -translation binary
        puts -nonewline [::lz4::decode $data $verify]
    } else {
        puts "usage: [info script] (--test | \[--ignore-checksums\] filename)"
        puts "Use \"-\" as the filename to read from the standard input."
    }
}

Performance comparison

> lz4 --version
*** LZ4 command line interface 64-bits r128, by Yann Collet (Apr 18 2016) ***
> echo 'puts [info patchlevel]' | tclsh
8.6.5
> jimsh -e 'info patchlevel' 
0.75
> lz4 --keep -9 ffmpeg.exe
Compressed 40859136 bytes into 18846155 bytes ==> 46.12%
> time lz4 --decompress --to-stdout --no-frame-crc ffmpeg.exe.lz4 > /dev/null
0.07user 0.00system 0:00.08elapsed 92%CPU
> time tclsh lz4.tcl --ignore-checksums ffmpeg.exe.lz4 > /dev/null
21.81user 0.35system 0:22.36elapsed 99%CPU
> time jimsh lz4.tcl --ignore-checksums ffmpeg.exe.lz4 > /dev/null
195.18user 0.45system 3:17.84elapsed 98%CPU

See also  edit