Richard Suchenwirth 2007-02-02 - Just to prove that I'm not always writing very short procs and scripts, here's a utility to cross-tabulate data, in which each line stands for one "case" characterized by attributes in fields separated by a specific character, e.g.
CSV. For example, this little data file (saved as test.csv)
John;M;soccer
Jane;F;tennis
Tom;M;football
Dick;M;soccer
Harry;M;tennis
Mary;F;baseball
Jeff;M;baseball
Jane;F;tennis
can yield the following tabulation:
$ crosstab.tcl test.csv -c 2,3
1\2 F M Total
------------------------------------------------
baseball 1 1 2
football 0 1 1
soccer 0 2 2
tennis 2 1 3
Total 3 5 8
which might make the point that tennis is more popular with females than with males, etc.
A later addition is that you can also specify a third column as "z axis", where for all values of z a table like the above is produced. The script is also a demonstration of the
set usage ...; proc main ...; ...; main pattern that I usually follow.
#!/usr/bin/env tclsh
set usage {$Id: 17641,v 1.7 2007-02-05 19:00:22 jcw Exp $
usage: crosstab.tcl infile ?-c x,y? ?-sep '\t'? -w 12
Reads the infile (or stdin if infile is "-") for cases
Prints on stdout a cross-tabulation for the specified columns.
-c x,y : columns to use (1...), default is 1,2
-sep x : column separator, default is ";"
-w n : output column width, default is 12
}
if {[llength $argv] == 0} {puts stderr $usage; exit 1}
proc main argv {
getopt argv -sep sep ";"
getopt argv -c cols 1,2
getopt argv -w w 12
set infile [lindex $argv 0]
if {$infile eq "-"} {
set f stdin
} elseif {[file exists $infile]} {
set f [open $infile]
} else {puts stderr "no such file $infile\n$::usage"; exit 1}
foreach {x y z} [split $cols ,] break
incr x -1; incr y -1
if {$z ne ""} {incr z -1}
while {[gets $f line] >= 0} {
set fields [split $line $sep]
set key [lindex $fields $x],[lindex $fields $y]
if {$z ne ""} {append key ,[lindex $fields $z]}
inc N($key)
}
if {$f ne "stdout"} {close $f}
set xs [get_values N 0]
set ys [get_values N 1]
if {$z ne ""} {set zs [get_values N 2]} else {set zs {{}}}
foreach zv $zs {
puts $zv
set header [format %-${w}s $x\\$y]
foreach xv [concat $xs Total] {append header [format %${w}s $xv]}
puts $header
puts [string repeat - [string length $header]]
foreach yv [concat $ys Total] {
set line [format %-${w}s $yv]
set sum 0
foreach xv $xs {
set key $xv,$yv
if {$z ne ""} {append key ,$zv}
set n [get N($key)]
append line [format %${w}d $n]
incr sum $n
set key $xv,Total
if {$z ne ""} {append key ,$zv}
inc N($key) $n
}
append line [format %${w}d $sum]
if {$yv eq "Total"} {puts ""}
puts $line
}
puts ""
}
}
#-- retrieve a variable value, if existing, else return 0
proc get _var {
upvar 1 $_var var
if {[info exists var]} {set var} else {return 0}
}
#-- enumerate values used in array keys at one position (comma-separated)
proc get_values {_arr pos} {
upvar 1 $_arr arr
set values {}
foreach i [array names arr] {
lappend values [lindex [split $i ,] $pos]
}
lsort -unique $values
}
#-- option retriever, see [getopt] page
proc getopt {_argv name {_var ""} {default ""}} {
upvar 1 $_argv argv $_var var
set pos [lsearch -regexp $argv ^$name]
if {$pos>=0} {
set to $pos
if {$_var ne ""} {
set var [lindex $argv [incr to]]
}
set argv [lreplace $argv $pos $to]
return 1
} else {
if {[llength [info level 0]] == 5} {set var $default}
return 0
}
}
#-- From Tcl 8.5, [incr] will auto-initialize, so this workaround will no longer be needed
proc inc {_var {amount 1}} {
upvar 1 $_var var
if ![info exists var] {set var 0}
incr var $amount
}
main $argv
See also
crosstab again for a more functional rewrite.