Adaptive
huffman coding has the advantage over static coding that the entire dataset does not need to be known in advance and the dictionary does not need to be transmitted separately from the data. It is also trickier to implement.
It would be wrong to call this implementation adaptive huffman coding because well, it isn't. It is
an adaptive encoding, but my tree balancing routines are buggy and I've already spent enough time on this. To make it a true huffman encoding, just fix
balance-FGK or
balance-V.
The performance is hideously slow — around 200 characters per second on my machine. I think most of this is due to all the tree manipulation.
I intended to mate this code with the code in
LZW, but the combination works pretty badly, probably because of my lack of a good idea for encoding the higher-ordered symbols effeciently.
JR
# this uses the tree data structure from tcllib
package require struct
namespace eval ::huffman {
# reverse a string
# This is used to store partial words in big-endian order. 'binary
# format' will only store bits starting at the msb or lsb of a word, so we
# do the formatting little-endian and reverse it
proc reverse {str} {
set rv {}
foreach c [split $str ""] {
set rv [linsert $rv 0 $c]
}
return [join $rv ""]
}
# some tree helper functions
# create a tree and set up symbol->node and node->symbol arrays
proc mktree {} {
variable trees
variable symbols
set tree [::struct::tree]
set trees(e0) root
set symbols(root) e0
$tree set root 0
return $tree
}
# delete the tree and arrays
proc deltree {tree} {
variable trees
variable symbols
$tree destroy
unset trees
unset symbols
}
# given a node, find the bit path to that node from the root
proc getpath {tree node} {
set p {}
while {$node != "root"} {
append p [$tree index $node]
set node [$tree parent $node]
}
return [reverse $p]
}
# 'tree swap' doesn't move child trees, so we move the two nodes
# separately using this helper
proc swap {tree node1 node2} {
set np [$tree parent $node1]
set ni [$tree index $node1]
$tree move [$tree parent $node2] [$tree index $node2] $node1
$tree move $np $ni $node2
}
# dump out the tree
# for debugging
proc dump-tree {tree} {
variable symbols
$tree walk root -type bfs -order pre -command {
if {[info exists symbols(%n)]} {set sym $symbols(%n)} else {set sym "-"}
puts "[string repeat " " [%t depth %n]]%n($sym [%t get %n]): [%t children %n]"
}
}
# increment weight of node, rebalance tree
proc update-tree {tree node} {
balance-FGK $tree $node
}
# update the node weights and rebalance the tree using algorithm FGK.
# not quite right, doesn't always find the correct sibling.
proc balance-FGK {tree node} {
set cn $node
while {$cn != "root"} {
set next [$tree parent $cn]
$tree set $cn [expr {[$tree get $cn]+1}]
set sib [$tree next $cn]
if {$sib == ""} {
# if we're at the end of a row then the next sibling is the
# start of the previous row.
# this works if the tree is only 2 deep, otherwise it's really
# wrong
set sib [$tree previous $next]
}
if {$sib != "" && [$tree get $cn] > [$tree get $sib]} {
swap $tree $cn $sib
}
set cn $next
}
# update root node
$tree set $cn [expr {[$tree get $cn]+1}]
}
# update the node weights and rebalance the tree using algorithm V.
# pretty much completely wrong
proc balance-V {tree node} {
# update the weights on the path to the added node
set cn $node
while {$cn != ""} {
$tree set $cn [expr {[$tree get $cn]+1}]
set cn [$tree parent $cn]
}
set pn {}
# this is an attempt to walk the tree in 'implicit' order.
# it doesn't work.
$tree walk root -order post -type bfs -command {
# if weight of current node is less than weight of previous node,
# then swap
if {$pn != "" && [%t get %n] < [%t get $pn]} {
swap %t %n $pn
}
set pn %n
}
}
# encode symbol (char/integer) as bitstring
# handle int range 0-2^11
# the range is beyond 0-255 because this is intended to handle a LZW
# dictionary rather than just characters.
# there's got to be a better way.
# symbol is encoded as a length bit (0 = 8 bits, 1 = 11 bits) followed by
# the data in big-endian order
proc encsym {sym} {
set ebs "" ;# encoded bit-string
if {$sym < 256} {
append ebs 0
set si [binary format c $sym]
binary scan $si B8 sv
append ebs $sv
} else {
append ebs 1
set si [binary format s $sym]
# format in little-endian then reverse
binary scan $si b11 sv
append ebs [reverse $sv]
}
return $ebs
}
# decode the next symbol in the bitstring starting at index 'ind' using
# the given tree
# updates the tree
# returns the symbol and the new index
proc getsym {tree bits ind} {
variable symbols
set n root
set bl [llength $bits]
while {![$tree isleaf $n] && $ind < $bl} {
set n [lindex [$tree children $n] [lindex $bits $ind]]
incr ind
}
if {$symbols($n) == "e0"} {
# new symbol
set lb [lindex $bits $ind] ;# length bit
incr ind
if {$lb == "0"} {
set sb [join [lrange $bits $ind [expr {$ind+7}]] ""]
incr ind 8
binary scan [binary format B8 $sb] c sym
# mask to keep result unsigned
set sym [expr {$sym & 0xff}]
} else {
set sb [join [lrange $bits $ind [expr {$ind+10}]] ""]
incr ind 11
# reverse the bitstring and decode as little-endian
binary scan [binary format b11 [reverse $sb]] s sym
# mask to keep result unsigned
set sym [expr {$sym & 0x7ff}]
}
set symbols($n) $sym
} else {
set sym $symbols($n)
}
# add the symbol to the tree and/or update the tree
addsym $tree $sym
return [list $sym $ind]
}
# insert the symbol into the tree if it doesn't exist
# update the tree and the mapping arrays
# returns code for symbol
proc addsym {tree sym} {
variable trees
variable symbols
set rs {}
if {![info exists trees($sym)]} {
# if symbol doesn't exist: split e0 into new node and new e0
append rs [getpath $tree $trees(e0)] [encsym $sym]
set n $trees(e0)
set trees($sym) [$tree insert $n 0]
$tree set $trees($sym) 0
set trees(e0) [$tree insert $n 1]
$tree set $trees(e0) 0
set symbols($trees($sym)) $sym
set symbols($trees(e0)) e0
} else {
# symbol exists, just emit code and update
append rs [getpath $tree $trees($sym)]
}
update-tree $tree $trees($sym)
return $rs
}
# input is a list of integers
proc encode {rval} {
set tree [mktree]
set l [llength $rval]
set c 0
foreach rv $rval {
append bs [addsym $tree $rv]
incr c
# progress meter
puts -nonewline stderr [format "%3.2f%%\r" [expr {double($c)/$l*100}]]
}
deltree $tree
return $bs
}
# decode bitstring into list of integers
proc decode {bs} {
set tree [mktree]
set bits [split $bs ""]
set ind 0
set ds {}
set bl [llength $bits]
while {$ind < $bl} {
set si [getsym $tree $bits $ind]
lappend ds [lindex $si 0]
set ind [lindex $si 1]
# progress meter
puts -nonewline stderr [format "%3.2f%%\r" [expr {double($ind)/$bl*100}]]
}
return $ds
}
}
# end namespace eval ::huffman
# convert string to list
proc s2l {str} {
set l {}
foreach c [split $str ""] {
binary scan $c c num
lappend l $num
}
return $l
}
# convert list to string
proc l2s {list} {
set s ""
foreach num $list {
append s [binary format c $num]
}
return $s
}
# some testing
set st "Hello, world!"
# set st [read stdin]
puts "input length: [string length $st] bytes"
set cst [huffman::encode [s2l $st]]
puts "compressed length: [expr {[string length $cst]/8}] (bytes) [format "%3.2f%%" [expr {(1-([string length $cst]/8)/double([string length $st]))*100}]]"
set dst [l2s [huffman::decode $cst]]
if {[string compare $dst $st] == 0} {
puts "input and output match"
} else {
puts "input and output differ"
}