# htmlutil.tcl - by Jean-Claude Wippler, September 2001
# Declare that sourcing this file provides version 0.1 of the "htmlutil"
# package.
package provide htmlutil 0.1
# htmlparse - parse HTML text, setting array elements along the way
#
# The parser produces no data structure of its own.  Every item it finds is
# written to an element of the caller's array, so a write trace on that
# array sees the document as a stream of events:
#
#   avar()        set to "" once, before anything else (start marker)
#   avar(<text>)  the text found in front of each tag (may be empty)
#   avar(NAME)    an opening tag; the value is its attribute string
#   avar(/NAME)   an element being closed; the value is the attribute
#                 string its opening tag was seen with
#
# Arguments:
#   text        the HTML to scan
#   aref        name of the array in the caller's scope (default "html")
#   ignorecase  when true (default), tag names are folded to upper case
#
# Result: an empty string; all information is delivered via the array.
#
# Closing a tag also closes everything opened after it.  A closing tag whose
# name is not currently open is ignored and leaves the open elements alone.
proc htmlparse {text {aref html} {ignorecase 1}} {
    upvar $aref avar
    # start marker, lets a trace handler notice the beginning of a parse
    set avar() ""
    # comments are dropped before tag scanning
    regsub -all {<!--.*?-->} $text {} text
    # sentinel: an empty closing tag makes the loop below see the trailing
    # text and close whatever is still open at the end of the input
    append text </>
    set tags ""  ;# stack of currently open tag names
    set hist ""  ;# attribute strings, parallel to the tags stack
    foreach {whole chunk tag} [regexp -all -inline {(.*?)<(.*?)>} $text] {
        set avar(<text>) $chunk
        set attrs ""
        # split "name attributes"; \s+ swallows the entire separator so the
        # attribute string never starts with left-over whitespace
        regexp {^(\w+)\s+(.*)} $tag - tag attrs
        if {$ignorecase} {
            set tag [string toupper $tag]
        }
        if {[regexp {^/(.*)} $tag - name]} {
            # tolerate whitespace inside the closing tag, as in "</p >"
            set name [string trim $name]
            set top "/"
            # unwind only if this name is really open (or for the empty
            # sentinel, which closes everything); a stray closing tag must
            # not throw away the elements that are still open
            if {[string equal $name ""] || [lsearch -exact $tags $name] >= 0} {
                while {[llength $tags]} {
                    set top [lindex $tags end]
                    set avar(/$top) [lindex $hist end]
                    set tags [lreplace $tags end end]
                    set hist [lreplace $hist end end]
                    if {[string equal $top $name]} break
                }
            }
            # unbalanced closing tags are ignored; enable the line below
            # to have them reported as an array write instead
            #if {![string equal $top $name]} { set avar(/$name) {} }
        } else {
            set avar($tag) $attrs
            lappend tags $tag
            lappend hist $attrs
        }
    }
}
# code below runs when this is launched as the main script
# (argv0 is checked for existence first, so that sourcing this file in an
# interpreter that does not define it cannot raise an error)
if {[info exists argv0] && [string equal [file root [file tail $argv0]] "htmlutil"]} {
    # write-trace callback: print each array write made by htmlparse as a
    # "set html(element) value" command
    proc show {r e op} {
        upvar $r a
        puts [list set html($e) $a($e)]
    }
    trace var html w show
    set in {a<b c>d<e f>g<e h>i</e>j</e>k<e l>m</b>n</o>p}
    puts "Parsing: $in"
    # htmlparse returns an empty string, so this ends the output with a
    # blank line
    puts [htmlparse $in]
}
# Output:
# Parsing: a<b c>d<e f>g<e h>i</e>j</e>k<e l>m</b>n</o>p
set html() {}
set html(<text>) a
set html(B) c
set html(<text>) d
set html(E) f
set html(<text>) g
set html(E) h
set html(<text>) i
set html(/E) h
set html(<text>) j
set html(/E) f
set html(<text>) k
set html(E) l
set html(<text>) m
set html(/E) l
set html(/B) c
set html(<text>) n
set html(<text>) p

Dec 2003: Found Carsten Zerbst's article about tDOM in Linux Magazine (2002), which also shows an HTML -> DOM -> XPath example: http://www.linux-magazine.com/issue/20/tDOM.pdf

Tclgumbo is a Tcl extension for parsing HTML.

