package require tdom
package require http
# get the html page
set token [http::geturl http://aspn.activestate.com/ASPN/Cookbook/Tcl/]
set data [http::data $token]
# parse the html
set doc [dom parse -html $data]
set root [$doc documentElement]
# get all option nodes
set optionList [$root selectNodes {//select/option}]
set result {}
# loop through all the options
foreach option $optionList {
set text [[$option nextSibling] nodeValue]
set value [$option getAttribute value]
lappend result [list $text $value]
}
puts $resultif 0 {which shows all the options
{{this section} Subsection}
{{all ASPN} ASPN}
{Products Products}
{Recipes Recipes}
{News NewsFeeds}
{Modules Modules}
{{Mailing Lists} Archive}
{{The Perl Journal} TPJ}
{Reference Reference}
from this html code fragment
...
<select name="type">
<option value="Subsection">this section</option>
<option value="ASPN">all ASPN</option>
<option value="Products">Products</option>
<option value="Recipes">Recipes</option>
<option value="NewsFeeds">News</option>
<option value="Modules">Modules</option>
<option value="Archive">Mailing Lists</option>
<option value="TPJ">The Perl Journal</option>
<option value="Reference">Reference</option>
</select>
...The result can be used in a tcltest proc or however.of course can code can be shorter, but I think it explains more this way.This is my first wiki contribution, any feedback is appreciatedDMG 20-Aug-2003 asks: Offhand (and this is a general tdom/XML query) why use:set text [[$option nextSibling] nodeValue]versus
set text [$option text]??}
Here's something DG did trying to inline fix bad HTML from RSS newsfeeds, which tends to be the norm from the big news sites these days.
itcl::body newsFeedDecoder::validateHTML {body {norecurse 0}} {
if {[catch {dom parse -html $body} htmlDoc]} {
# un-parsable!
return "<!-- BROKEN HTML! (tmlrss) -->$body"
}
set htmlRoot [$htmlDoc documentElement]
if {$htmlRoot == ""} {
# have arbitrary text, not html..
return [encTxt $body]
}
# Check for partial HTML content where a true root node is missing,
# but was mis-interpreted (slashdot's rss feed).
#
if {!$norecurse && "[string index $body 1]" != "[string index [$htmlRoot nodeName] 0]"} {
$htmlDoc delete
return [validateHTML "<span>$body</span>" 1]
}
# If the root node is a <p>, replace it with a <span> as I don't like
# how it affects the formatting.
#
if {"[$htmlRoot nodeName]" == "p"} {
set newDoc [dom createDocument span]
set newRoot [$newDoc documentElement]
deepCopy $newRoot $htmlRoot
$htmlDoc delete
set htmlDoc $newDoc
set htmlRoot $newRoot
}
set imgNodes [$htmlRoot selectNodes //img]
# make sure all <img> tags have a require alt attribute
foreach imgNode $imgNodes {
if {![$imgNode hasAttribute alt]} {
$imgNode setAttribute alt {}
}
}
# make sure all <img> tags use the title attribute for textual info
foreach imgNode $imgNodes {
if {![$imgNode hasAttribute title] && "[$imgNode @alt]" != ""} {
$imgNode setAttribute title [$imgNode @alt]
}
}
# replace all <nobr> container elements with standards complient
# <span style="white-space: nowrap">
#
set nobrNodes [$htmlRoot selectNodes //nobr]
foreach nobrNode $nobrNodes {
set parent [$nobrNode parentNode]
set newSpan [$htmlDoc createElement span]
$newSpan setAttribute style "white-space: nowrap"
deepCopy $newSpan $nobrNode
$parent replaceChild $newSpan $nobrNode
}
set html [$htmlDoc asHTML -htmlEntities]
$htmlDoc delete
return $html
}
itcl::body newsFeedDecoder::encTxt {txt} {
return [string map { & & < < > > \" " } $txt]
}
itcl::body newsFeedDecoder::deepCopy {to from} {
foreach child [$from childNodes] {
$to appendChild [$child cloneNode -deep]
}
}
