using tdom to check generated html

if 0 {phk 2003-08-18 Let's assume your application is generating html pages.

tdom can help in a nice way to test the output.

Let's get all options from a html select tag:}

 package require tdom
 package require http

 # get the html page
 set token [http::geturl http://aspn.activestate.com/ASPN/Cookbook/Tcl/]
 set data [http::data $token]

 # parse the html
 set doc [dom parse -html $data]
 set root [$doc documentElement]
 
 # get all option nodes
 set optionList [$root selectNodes {//select/option}]

 set result {}
 # loop through all the options
 foreach option $optionList {
    set text [[$option nextSibling] nodeValue]
    set value [$option getAttribute value]
    lappend result [list $text $value]

 }

 puts $result

if 0 {which shows all the options

 {{this section} Subsection}
 {{all ASPN} ASPN}
 {Products Products}
 {Recipes Recipes}
 {News NewsFeeds}
 {Modules Modules}
 {{Mailing Lists} Archive}
 {{The Perl Journal} TPJ}
 {Reference Reference}

 from this html code fragment

 ...

 <select name="type">
  <option value="Subsection">this section</option>
  <option value="ASPN">all ASPN</option>
  <option value="Products">Products</option>
  <option value="Recipes">Recipes</option>
  <option value="NewsFeeds">News</option>
  <option value="Modules">Modules</option>
  <option value="Archive">Mailing Lists</option>
  <option value="TPJ">The Perl Journal</option>
  <option value="Reference">Reference</option>
 </select>

 ...

The result can be used in a tcltest proc or however.

of course can code can be shorter, but I think it explains more this way.

This is my first wiki contribution, any feedback is appreciated

DMG 20-Aug-2003 asks: Offhand (and this is a general tdom/XML query) why use:

   set text [[$option nextSibling] nodeValue]

versus

   set text [$option text]

}

Here's something DG did trying to inline fix bad HTML from RSS newsfeeds, which tends to be the norm from the big news sites these days.

 itcl::body newsFeedDecoder::validateHTML {body {norecurse 0}} {

    if {[catch {dom parse -html $body} htmlDoc]} {
        # un-parsable!
        return "<!-- BROKEN HTML! (tmlrss) -->$body"
    }

    set htmlRoot [$htmlDoc documentElement]
    if {$htmlRoot == ""} {
        # have arbitrary text, not html..
        return [encTxt $body]
    }

    # Check for partial HTML content where a true root node is missing,
    # but was mis-interpreted (slashdot's rss feed).
    #
    if {!$norecurse && "[string index $body 1]" != "[string index [$htmlRoot nodeName] 0]"} {
        $htmlDoc delete
        return [validateHTML "<span>$body</span>" 1]
    }

    # If the root node is a <p>, replace it with a <span> as I don't like
    # how it affects the formatting.
    #
    if {"[$htmlRoot nodeName]" == "p"} {
        set newDoc [dom createDocument span]
        set newRoot [$newDoc documentElement]
        deepCopy $newRoot $htmlRoot
        $htmlDoc delete
        set htmlDoc $newDoc
        set htmlRoot $newRoot
    }

    set imgNodes [$htmlRoot selectNodes //img]

    # make sure all <img> tags have a require alt attribute
    foreach imgNode $imgNodes {
        if {![$imgNode hasAttribute alt]} {
            $imgNode setAttribute alt {}
        }
    }

    # make sure all <img> tags use the title attribute for textual info
    foreach imgNode $imgNodes {
        if {![$imgNode hasAttribute title] && "[$imgNode @alt]" != ""} {
            $imgNode setAttribute title [$imgNode @alt]
        }
    }

    # replace all <nobr> container elements with standards complient
    # <span style="white-space: nowrap">
    #
    set nobrNodes [$htmlRoot selectNodes //nobr]
    foreach nobrNode $nobrNodes {
        set parent [$nobrNode parentNode]
        set newSpan [$htmlDoc createElement span]
        $newSpan setAttribute style "white-space: nowrap"
        deepCopy $newSpan $nobrNode
        $parent replaceChild $newSpan $nobrNode
    }

    set html [$htmlDoc asHTML -htmlEntities]
    $htmlDoc delete
    return $html
 }

 itcl::body newsFeedDecoder::encTxt {txt} {
    return [string map { & &amp; < &lt; > &gt; \" &quot; } $txt]
 }

 itcl::body newsFeedDecoder::deepCopy {to from} {
    foreach child [$from childNodes] {
        $to appendChild [$child cloneNode -deep]
    }
 }

Category XML