# Lie like a senator for google to stop giving me a 403..
http::config -useragent {Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8) Gecko/20051111 Firefox/1.5}
# Should the server uses this info, give it what we want to receive.
http::config -accept "text/xml,application/xml,application/rss+xml,application/rdf+xml,application/atom+xml"
# Returns the DOM object of the RSS feed.
proc tmlrss::fetchXML {uri {recurse_limit 4}} {
set token [http::geturl $uri]
upvar #0 $token state
if {[http::status $token] != "ok" || [http::ncode $token] != 200} {
# was the error a redirect? If so, do it..
if {[http::ncode $token] == 302 && [incr recurse_limit -1] > 0} {
array set meta $state(meta)
set result [fetchXML $meta(Location) $recurse_limit]
http::cleanup $token
return $result
}
set err [http::code $token]
http::cleanup $token
return -code error $err
}
set xml [http::data $token]
array set meta $state(meta)
http::cleanup $token
# Do we need to do encoding conversions or was it already done
# in transit?
if {[info exist meta(Content-Type)] && \
[regexp -nocase {charset\s*=\s*(\S+)} $meta(Content-Type)]} {
# Socket channel encodings already performed! No Work to do
# here. See section 5.2.2 of the html spec. Server set
# encodings win.
} else {
# Read and perform charset mappings of the document rather than
# hand off the literal string to [dom parse] as Tcl itself is
# more encoding capable.
set xml [encoding convertfrom [getXmlEncoding $xml] $xml]
}
# Strip the XML declaration, should it exist. The encoding
# conversions have already been performed. Let's not let Expat
# try to convert it a second time as the work has already been
# performed. As the parser assumes utf-8 by default without a
# declaration, and the fact that tdom is handing expat the utf-8
# string rep, this will work even though it doesn't look correct.
return [dom parse -baseurl [uriBase $uri] [stripXmlDecl $xml]]
}
proc tmlrss::uriBase {uri} {
array set info [uri::split $uri]
set info(path) [file dirname $info(path)]
return [eval uri::join [array get info]]
}
proc tmlrss::stripXmlDecl {xml} {
if {![binary scan [string range $xml 0 3] "H8" firstBytes]} {
# very short (< 4 Bytes) string
return $xml
}
# If the entity has an XML Declaration, the first four characters
# must be "<?xm".
switch $firstBytes {
"3c3f786d" {
# Try to find the end of the XML Declaration
set closeIndex [string first ">" $xml]
if {$closeIndex == -1} {
error "Weird XML data or not XML data at all"
}
set xml [string range $xml [expr {$closeIndex+1}] end]
}
default {
# no declaration.
}
}
return $xml
}
proc tmlrss::getXmlEncoding {xml} {
# The autodetection of the encoding follows
# XML Recomendation, Appendix F
if {![binary scan $xml "H8" firstBytes]} {
# very short (< 4 Bytes) file
return iso8859-1
}
# If the entity has a XML Declaration, the first four characters
# must be "<?xm".
switch $firstBytes {
"3c3f786d" {
# UTF-8, ISO 646, ASCII, some part of ISO 8859, Shift-JIS,
# EUC, or any other 7-bit, 8-bit, or mixed-width encoding which
# ensures that the characters of ASCII have their normal positions,
# width and values; the actual encoding declaration must be read to
# detect which of these applies, but since all of these encodings
# use the same bit patterns for the ASCII characters, the encoding
# declaration itself can be read reliably.
# Try to find the end of the XML Declaration
set closeIndex [string first ">" $xml]
if {$closeIndex == -1} {
error "Weird XML data or not XML data at all"
}
set xmlDeclaration [string range $xml 0 [expr {$closeIndex}]]
# extract the encoding information
set pattern {^[^>]+encoding=[\x20\x9\xd\xa]*["']([^ "']+)['"]}
# emacs: "
if {![regexp $pattern $xml - encStr]} {
# Probably something like <?xml version="1.0"?>.
# Without encoding declaration, pass-thru
return iso8859-1
} else {
set encoding [tDOM::IANAEncoding2TclEncoding $encStr]
}
}
default {
# pass-thru
set encoding iso8859-1
}
}
return $encoding
}HaO What happens, if the server sends an XML-Document in unicode starting with a BOM (byte order mark) ? Or is this not relevant in this context ?

