treeselect is a module prototype for accessing
htmlparse data stored in a
[tree
] using a syntax a lot like that of CSS selectors. There is one semantic difference, however: each selector operates only on the direct descendants of the selected nodes, so
.class1 .class2 works like
.class1 > .class2 would in CSS.
The query syntax is
?selector ...? where each
selector can be either
tag.class#id:nth-child(n) or
*, which selects everything. In
tag.class#id:nth-child(n) each of
tag,
.class,
#id and
:nth-child(n) can be omitted but the ones used must follow in the specified order. If your data was produced by htmlparse the first selectors will be
hmstart as in
hmstart html body ....
# treeselect version 0.3.2
# Copyright (c) 2015 dbohdan
# License: MIT
package require Tcl 8.5
package require struct
package require htmlparse
package require http
package require fileutil
catch {
package require tls
http::register https 443 [list ::tls::socket -tls1 1]
}
namespace eval ::treeselect {
variable version [lindex [split [file root [file tail [info script]]] -] 1]
variable debug 0
proc file-to-tree {path} {
set documentTree [::struct::tree]
set html [::fileutil::cat $path]
htmlparse::2tree $html $documentTree
return $documentTree
}
proc url-to-tree {url} {
set documentTree [::struct::tree]
set conn [::http::geturl $url]
set html [::http::data $conn]
::http::cleanup $conn
htmlparse::2tree $html $documentTree
return $documentTree
}
proc get {tree nodes key} {
set result {}
foreach node $nodes {
lappend result [$tree get $node $key]
}
return $result
}
proc parse-attributes {data} {
set attributes {}
foreach pair $data {
lassign [split $pair =] key value
set firstChar [string index $value 0]
# Unquote value.
if {($firstChar eq [string index $value end]) &&
(($firstChar eq "'") || ($firstChar eq "\""))} {
set value [string range $value 1 end-1]
}
dict set attributes $key $value
}
return $attributes
}
proc matches-selector? {selector tree node} {
variable debug
if {$selector eq "*"} {
return 1
}
regexp {([^ .#:]*)?(?:\.([^ .#:]+))?(?:#([^ .#:]+))?} \
$selector _ tag class id nth
set requirements {}
foreach varName {class id} {
set value [set $varName]
if {$value ne ""} {
dict set requirements $varName $value
}
}
set all [$tree getall $node]
set type [dict get $all type]
if {($type ne $tag) && ($tag ne "")} {
return 0
}
set attributes {}
if {[dict exists $all data]} {
set attributes [parse-attributes [dict get $all data]]
}
if {$debug} {
puts "matches-selector: $type [list $requirements $attributes]"
}
foreach {key value} $requirements {
if {![dict exists $attributes $key] ||
([dict get $attributes $key] ne $value)} {
return 0
}
}
return 1
}
# Usage: tree {?selector ...?} where each selector can be
# either tag.class#id:nth-child(n) with n >= 1 or "*".
proc query {tree query {start {}}} {
variable debug
if {$start eq ""} {
set nodes [$tree rootname]
} else {
set nodes $start
}
while {[llength $query] > 0} {
set newNodes {}
set selector [lindex $query 0]
foreach node $nodes {
lappend newNodes {*}[$tree children $node filter \
[list ::treeselect::matches-selector? $selector]]
}
if {[regexp {[^:]*(?:\:nth-child\(([1-9][0-9]*)\))} $selector _ n]} {
set newNodes [lindex $newNodes [expr {$n - 1}]]
}
set nodes $newNodes
if ($debug) {
puts "query: $nodes"
}
set query [lrange $query 1 end]
}
if ($debug) {
puts "query: result: $nodes"
}
return $nodes
}
}
Use examples can be found on
Web Scraping with htmlparse and
Hacker News.
See also edit
---
RLE (2015-01-22): Added http::cleanup to url-to-tree proc to release resources consumed by the http::geturl call.
dbohdan 2015-01-22: Thanks. I've bumped the version to account for that.