package require tdom package require http # get the html page set token [http::geturl http://aspn.activestate.com/ASPN/Cookbook/Tcl/] set data [http::data $token] # parse the html set doc [dom parse -html $data] set root [$doc documentElement] # get all option nodes set optionList [$root selectNodes {//select/option}] set result {} # loop through all the options foreach option $optionList { set text [[$option nextSibling] nodeValue] set value [$option getAttribute value] lappend result [list $text $value] } puts $resultif 0 {which shows all the options
{{this section} Subsection} {{all ASPN} ASPN} {Products Products} {Recipes Recipes} {News NewsFeeds} {Modules Modules} {{Mailing Lists} Archive} {{The Perl Journal} TPJ} {Reference Reference} from this html code fragment ... <select name="type"> <option value="Subsection">this section</option> <option value="ASPN">all ASPN</option> <option value="Products">Products</option> <option value="Recipes">Recipes</option> <option value="NewsFeeds">News</option> <option value="Modules">Modules</option> <option value="Archive">Mailing Lists</option> <option value="TPJ">The Perl Journal</option> <option value="Reference">Reference</option> </select> ...The result can be used in a tcltest proc or however.of course can code can be shorter, but I think it explains more this way.This is my first wiki contribution, any feedback is appreciatedDMG 20-Aug-2003 asks: Offhand (and this is a general tdom/XML query) why use:
set text [[$option nextSibling] nodeValue]versus
set text [$option text]??}
Here's something DG did trying to inline fix bad HTML from RSS newsfeeds, which tends to be the norm from the big news sites these days.
itcl::body newsFeedDecoder::validateHTML {body {norecurse 0}} { if {[catch {dom parse -html $body} htmlDoc]} { # un-parsable! return "<!-- BROKEN HTML! (tmlrss) -->$body" } set htmlRoot [$htmlDoc documentElement] if {$htmlRoot == ""} { # have arbitrary text, not html.. return [encTxt $body] } # Check for partial HTML content where a true root node is missing, # but was mis-interpreted (slashdot's rss feed). # if {!$norecurse && "[string index $body 1]" != "[string index [$htmlRoot nodeName] 0]"} { $htmlDoc delete return [validateHTML "<span>$body</span>" 1] } # If the root node is a <p>, replace it with a <span> as I don't like # how it affects the formatting. # if {"[$htmlRoot nodeName]" == "p"} { set newDoc [dom createDocument span] set newRoot [$newDoc documentElement] deepCopy $newRoot $htmlRoot $htmlDoc delete set htmlDoc $newDoc set htmlRoot $newRoot } set imgNodes [$htmlRoot selectNodes //img] # make sure all <img> tags have a require alt attribute foreach imgNode $imgNodes { if {![$imgNode hasAttribute alt]} { $imgNode setAttribute alt {} } } # make sure all <img> tags use the title attribute for textual info foreach imgNode $imgNodes { if {![$imgNode hasAttribute title] && "[$imgNode @alt]" != ""} { $imgNode setAttribute title [$imgNode @alt] } } # replace all <nobr> container elements with standards complient # <span style="white-space: nowrap"> # set nobrNodes [$htmlRoot selectNodes //nobr] foreach nobrNode $nobrNodes { set parent [$nobrNode parentNode] set newSpan [$htmlDoc createElement span] $newSpan setAttribute style "white-space: nowrap" deepCopy $newSpan $nobrNode $parent replaceChild $newSpan $nobrNode } set html [$htmlDoc asHTML -htmlEntities] $htmlDoc delete return $html } itcl::body newsFeedDecoder::encTxt {txt} { return [string map { & & < < > > \" " } $txt] } itcl::body newsFeedDecoder::deepCopy {to from} { foreach child [$from childNodes] { $to appendChild [$child cloneNode -deep] } }