# tdelta.tcl -- # # Produce an rdiff-style delta signature of one file with respect to another, # and re-create one file by applying the delta to the other. # # Written by Stephen Huntley (stephen.huntley@alum.mit.edu) # # Usage: # # tdelta <reference file | channel> <target file | channel> [sizecheck [fingerprint]] # Returns a delta of the target file with respect to the reference file. # i.e., using patch to apply the delta to the target file will re-create the reference file. # # sizecheck and fingerprint are booleans which enable time-saving checks: # # if sizecheck is True then if the file size is # less than five times the block size, then no delta calculation is done and the # signature contains the full reference file contents. # # if fingerprint is True then 10 small strings ("fingerprints") are taken from the target # file and searched for in the reference file. If at least three aren't found, then # no delta calculation is done and the signature contains the full reference file contents. # # tpatch <target file | channel> <delta signature> <output file (duplicate of reference file) | channel> # Reconstitute original reference file by applying delta to target file. # # # global variables: # # blockSize # Size of file segments to compare. # Smaller blockSize tends to create smaller delta. # Larger blockSize tends to take more time to compute delta. # md5Size # Substring of md5 checksum to store in delta signature. # If security is less of a concern, set md5Size to a number # between 1-32 to create a more compact signature. namespace eval ::trsync { if ![info exists blockSize] {variable blockSize 100} if ![info exists Mod] {variable Mod [expr pow(2,16)]} if ![info exists md5Size] {variable md5Size 32} variable temp if ![info exists temp] { catch {set temp $::env(TMP)} catch {set temp $::env(TEMP)} catch {set temp $::env(TRSYNC_TEMP)} if [catch {file mkdir $temp}] {set temp [pwd]} } if ![file writable $temp] {error "temp location not writable"} proc Backup {args} { return } proc ConstructFile {copyinstructions {eolNative 0} {backup {}}} { if [catch {package present md5 2}] {package forget md5 ; package require md5 2} set fileToConstruct [lindex $copyinstructions 0] set existingFile [lindex $copyinstructions 1] set blockSize [lindex $copyinstructions 2] array set fileStats [lindex $copyinstructions 3] array set digestInstructionArray [DigestInstructionsExpand [lindex $copyinstructions 4] $blockSize] array set dataInstructionArray [lindex $copyinstructions 5] unset copyinstructions if {[lsearch [file channels] $existingFile] == -1} { set existingFile [FileNameNormalize $existingFile] if {$fileToConstruct == {}} {file delete -force $existingFile ; return} catch { set existingID [open $existingFile r] fconfigure $existingID -translation binary } } else { set existingID $existingFile fconfigure $existingID -translation binary } set temp $::trsync::temp if {[lsearch [file channels] $fileToConstruct] == -1} { set fileToConstruct [FileNameNormalize $fileToConstruct] set constructTag "trsync.[md5::md5 -hex "[clock seconds] [clock clicks]"]" set constructID [open $temp/$constructTag w] } else { set constructID $fileToConstruct } fconfigure $constructID -translation binary if $eolNative {set eolNative [string is ascii -strict [array get dataInstructionArray]]} set filePointer 1 while {$filePointer <= $fileStats(size)} { if {[array names dataInstructionArray $filePointer] != {}} { puts -nonewline $constructID $dataInstructionArray($filePointer) set segmentLength [string length $dataInstructionArray($filePointer)] array unset dataInstructionArray $filePointer set filePointer [expr $filePointer + $segmentLength] } elseif {[array names digestInstructionArray $filePointer] != {}} { if ![info exists existingID] {error "Corrupt copy instructions."} set blockNumber [lindex $digestInstructionArray($filePointer) 0] set blockMd5Sum [lindex $digestInstructionArray($filePointer) 1] seek $existingID [expr $blockNumber * $blockSize] set existingBlock [read $existingID $blockSize] set existingBlockMd5Sum [string range [md5::md5 -hex -- $existingBlock] 0 [expr [string length $blockMd5Sum] - 1]] if ![string equal -nocase $blockMd5Sum $existingBlockMd5Sum] {error "digest file contents mismatch"} puts -nonewline $constructID $existingBlock if $eolNative {set eolNative [string is ascii -strict $existingBlock]} unset existingBlock set filePointer [expr $filePointer + $blockSize] } else { error "Corrupt copy instructions." } } catch {close $existingID} set fileStats(eolNative) $eolNative if {[lsearch [file channels] $fileToConstruct] > -1} {return [array get fileStats]} close $constructID if $eolNative { fcopy [set fin [open $temp/$constructTag r]] [set fout [open $temp/${constructTag}fcopy w]] close $fin close $fout file delete -force $temp/$constructTag set constructTag "${constructTag}fcopy" } catch {file attributes $temp/$constructTag -readonly 0} result catch {file attributes $temp/$constructTag -permissions rw-rw-rw-} result catch {file attributes $temp/$constructTag -owner $fileStats(uid)} result catch {file attributes $temp/$constructTag -group $fileStats(gid)} result catch {file mtime $temp/$constructTag $fileStats(mtime)} result catch {file atime $temp/$constructTag $fileStats(atime)} result if [string equal $fileToConstruct $existingFile] { catch {file attributes $existingFile -readonly 0} result catch {file attributes $existingFile -permissions rw-rw-rw-} result } Backup $backup $fileToConstruct file mkdir [file dirname $fileToConstruct] file rename -force $temp/$constructTag $fileToConstruct array set attributes $fileStats(attributes) array set attrConstruct [file attributes $fileToConstruct] foreach attr [array names attributes] { if [string equal [array get attributes $attr] [array get attrConstruct $attr]] {continue} if {[string equal $attr "-longname"] || [string equal $attr "-shortname"] || [string equal $attr "-permissions"]} {continue} catch {file attributes $fileToConstruct $attr $attributes($attr)} result } catch {file attributes $fileToConstruct -permissions $fileStats(mode)} result return } proc CopyInstructions {filename digest} { if [catch {package present md5 2}] {package forget md5 ; package require md5 2} if {[lsearch [file channels] $filename] == -1} { set filename [FileNameNormalize $filename] file stat $filename fileStats array set fileAttributes [file attributes $filename] array unset fileAttributes -longname array unset fileAttributes -shortname set arrayadd attributes ; lappend arrayadd [array get fileAttributes] ; array set fileStats $arrayadd set f [open $filename r] } else { set f $filename set fileStats(attributes) {} } fconfigure $f -translation binary seek $f 0 end set fileSize [tell $f] seek $f 0 set fileStats(size) $fileSize set digestFileName [lindex $digest 0] set blockSize [lindex $digest 1] set digest [lrange $digest 2 end] if {[lsearch -exact $digest fingerprints] > -1} { set fingerPrints [lindex $digest end] set digest [lrange $digest 0 end-2] set fileContents [read $f] set matchCount 0 foreach fP $fingerPrints { if {[string first $fP $fileContents] > -1} {incr matchCount} if {$matchCount > 3} {break} } unset fileContents seek $f 0 if {$matchCount < 3} {set digest {}} } set digestLength [llength $digest] for {set i 0} {$i < $digestLength} {incr i} { set arrayadd [lindex [lindex $digest $i] 1] lappend arrayadd $i array set Checksums $arrayadd } set digestInstructions {} set dataInstructions {} set weakChecksum {} set startBlockPointer 0 set endBlockPointer 0 if ![array exists Checksums] { set dataInstructions 1 lappend dataInstructions [read $f] set endBlockPointer $fileSize } while {$endBlockPointer < $fileSize} { set endBlockPointer [expr $startBlockPointer + $blockSize] incr startBlockPointer if {$weakChecksum == {}} { set blockContents [read $f $blockSize] set blockNumberSequence [SequenceBlock $blockContents] set weakChecksumInfo [WeakChecksum $blockNumberSequence] set weakChecksum [format %.0f [lindex $weakChecksumInfo 0]] set startDataPointer $startBlockPointer set endDataPointer $startDataPointer set dataBuffer {} } if {[array names Checksums $weakChecksum] != {}} { set md5Sum [md5::md5 -hex -- $blockContents] set blockIndex $Checksums($weakChecksum) set digestmd5Sum [lindex [lindex $digest $blockIndex] 0] if [string equal -nocase $digestmd5Sum $md5Sum] { if {$endDataPointer > $startDataPointer} { lappend dataInstructions $startDataPointer lappend dataInstructions $dataBuffer } lappend digestInstructions $startBlockPointer lappend digestInstructions "$blockIndex [string range $md5Sum 0 [expr $::trsync::md5Size - 1]]" set weakChecksum {} set startBlockPointer $endBlockPointer continue } } if {$endBlockPointer >= $fileSize} { lappend dataInstructions $startDataPointer lappend dataInstructions $dataBuffer$blockContents break } set rollChar [read $f 1] binary scan $rollChar c* rollNumber set rollNumber [expr ($rollNumber + 0x100)%0x100] lappend blockNumberSequence $rollNumber set blockNumberSequence [lrange $blockNumberSequence 1 end] binary scan $blockContents a1a* rollOffChar blockContents set blockContents $blockContents$rollChar set dataBuffer $dataBuffer$rollOffChar incr endDataPointer set weakChecksumInfo "[eval RollChecksum [lrange $weakChecksumInfo 1 5] $rollNumber] [lindex $blockNumberSequence 0]" set weakChecksum [format %.0f [lindex $weakChecksumInfo 0]] } close $f lappend copyInstructions $filename lappend copyInstructions $digestFileName lappend copyInstructions $blockSize lappend copyInstructions [array get fileStats] lappend copyInstructions [DigestInstructionsCompress $digestInstructions $blockSize] lappend copyInstructions $dataInstructions return $copyInstructions } proc Digest {filename blockSize {sizecheck 0} {fingerprint 0}} { if [catch {package present md5 2}] {package forget md5 ; package require md5 2} set digest "[list $filename] $blockSize" if {[lsearch [file channels] $filename] == -1} { set filename [FileNameNormalize $filename] set digest "[list $filename] $blockSize" if {!([file isfile $filename] && [file readable $filename])} {return $digest} set f [open $filename r] } else { set f $filename } fconfigure $f -translation binary seek $f 0 end set fileSize [tell $f] seek $f 0 if {$sizecheck && ($fileSize < [expr $blockSize * 5])} {close $f ; return $digest} while {![eof $f]} { set blockContents [read $f $blockSize] set md5Sum [md5::md5 -hex -- $blockContents] set blockNumberSequence [SequenceBlock $blockContents] set weakChecksum [lindex [WeakChecksum $blockNumberSequence] 0] lappend digest "$md5Sum [format %.0f $weakChecksum]" } if $fingerprint { set fileIncrement [expr $fileSize/10] set fpLocation [expr $fileSize - 21] set i 0 while {$i < 10} { if {$fpLocation < 0} {set fpLocation 0} seek $f $fpLocation lappend fingerPrints [read $f 20] set fpLocation [expr $fpLocation - $fileIncrement] incr i } lappend digest fingerprints lappend digest [lsort -unique $fingerPrints] } close $f return $digest } proc DigestInstructionsCompress {digestInstructions blockSize} { if [string equal $digestInstructions {}] {return {}} set blockSpan $blockSize foreach {pointer blockInfo} $digestInstructions { if ![info exists currentBlockInfo] { set currentPointer $pointer set currentBlockInfo $blockInfo set md5Size [string length [lindex $blockInfo 1]] continue } if {$pointer == [expr $currentPointer + $blockSpan]} { set md5 [lindex $blockInfo 1] lappend currentBlockInfo $md5 incr blockSpan $blockSize } else { lappend newDigestInstructions $currentPointer lappend newDigestInstructions "[lindex $currentBlockInfo 0] [list "$md5Size [string map {{ } {}} [lrange $currentBlockInfo 1 end]]"]" set currentPointer $pointer set currentBlockInfo $blockInfo set blockSpan $blockSize } } lappend newDigestInstructions $currentPointer lappend newDigestInstructions "[lindex $currentBlockInfo 0] [list "$md5Size [string map {{ } {}} [lrange $currentBlockInfo 1 end]]"]" return $newDigestInstructions } proc DigestInstructionsExpand {digestInstructions blockSize} { if [string equal $digestInstructions {}] {return {}} foreach {pointer blockInfo} $digestInstructions { set blockNumber [lindex $blockInfo 0] set md5Size [lindex [lindex $blockInfo 1] 0] set blockString [lindex [lindex $blockInfo 1] 1] set blockLength [string length $blockString] set expandedBlock {} for {set i $md5Size} {$i <= $blockLength} {incr i $md5Size} { append expandedBlock " [string range $blockString [expr $i - $md5Size] [expr $i - 1]]" } set blockInfo "$blockNumber $expandedBlock" foreach md5 [lrange $blockInfo 1 end] { lappend newDigestInstructions $pointer lappend newDigestInstructions "$blockNumber $md5" incr pointer $blockSize incr blockNumber } } return $newDigestInstructions } proc FileNameNormalize {filename} { file normalize $filename } proc RollChecksum {a(k,l)_ b(k,l)_ k l Xsub_k Xsub_l+1 } { set Mod $trsync::Mod set a(k+1,l+1)_ [expr ${a(k,l)_} - $Xsub_k + ${Xsub_l+1}] set b(k+1,l+1)_ [expr ${b(k,l)_} - (($l - $k + 1) * $Xsub_k) + ${a(k+1,l+1)_}] set a(k+1,l+1)_ [expr fmod(${a(k+1,l+1)_},$Mod)] set b(k+1,l+1)_ [expr fmod(${b(k+1,l+1)_},$Mod)] set s(k+1,l+1)_ [expr ${a(k+1,l+1)_} + ($Mod * ${b(k+1,l+1)_})] return "${s(k+1,l+1)_} ${a(k+1,l+1)_} ${b(k+1,l+1)_} [incr k] [incr l]" } proc SequenceBlock {blockcontents} { binary scan $blockcontents c* blockNumberSequence set blockNumberSequenceLength [llength $blockNumberSequence] for {set i 0} {$i < $blockNumberSequenceLength} {incr i} { set blockNumberSequence [lreplace $blockNumberSequence $i $i [expr ([lindex $blockNumberSequence $i] + 0x100)%0x100]] } return $blockNumberSequence } proc WeakChecksum {Xsub_k...Xsub_l} { set a(k,i)_ 0 set b(k,i)_ 0 set Mod $trsync::Mod set k 1 set l [llength ${Xsub_k...Xsub_l}] for {set i $k} {$i <= $l} {incr i} { set Xsub_i [lindex ${Xsub_k...Xsub_l} [expr $i - 1]] set a(k,i)_ [expr ${a(k,i)_} + $Xsub_i] set b(k,i)_ [expr ${b(k,i)_} + (($l - $i + 1) * $Xsub_i)] } set a(k,l)_ [expr fmod(${a(k,i)_},$Mod)] set b(k,l)_ [expr fmod(${b(k,i)_},$Mod)] set s(k,l)_ [expr ${a(k,l)_} + ($Mod * ${b(k,l)_})] return "${s(k,l)_} ${a(k,l)_} ${b(k,l)_} $k $l [lindex ${Xsub_k...Xsub_l} 0]" } proc tdelta {referenceFile targetFile blockSize {sizecheck 0} {fingerprint 0}} { if {$::trsync::md5Size < 1} {error "md5Size must be greater than zero."} set signature [Digest $targetFile $blockSize $sizecheck $fingerprint] return [CopyInstructions $referenceFile $signature] } proc tpatch {targetFile copyInstructions fileToConstruct {eolNative 0}} { set copyInstructions [lreplace $copyInstructions 0 1 $fileToConstruct $targetFile] return [ConstructFile $copyInstructions $eolNative] } namespace export tdelta tpatch } # end namespace eval ::trsync
This is very cool ... how does it work? :) BTW, did you see in tcllib there's a longest-sublist-match thing which is talked about in this connection, and might be useful.Are you thinking what I'm thinking? First a delta versioning FS, then a networked delta FS with optimistic checkin (CVS' big innovation, IMHO) leading to a really good pure-tcl s/w distribution mechanism? Or even a distributed VFS for text? - CMcC 20041029SEH -- There are many possibilities, I wish I could code them faster. Right now I'm going to use this code to create a complementary virtual filesystem on which to stack a versioning virtual filesystem, which can serve as the backbone of a personal backup-archive utility. From there a distributed development-archive service shouldn't be a big leap.jcw - By hacking things and adding the following at the end, I was able to try it out as command-line utility:
source kitten.kit switch [llength $argv] { 2 { set cmd tdelta lappend argv $blockSize } 3 { set cmd patch set argv [lreplace $argv 1 1 [read [open [lindex $argv 1]]]] } default { puts stderr " Usage: $argv0 rfile tfile >delta or: $argv0 tfile delta ofile " exit 1 } } puts -nonewline [eval [linsert $argv 0 $cmd]]
Could this be a useful mechanism to bring wikit difference history into the wikit.tkd database file? It sure looks intriguing...