# Lie like a senator for google to stop giving me a 403.. http::config -useragent {Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8) Gecko/20051111 Firefox/1.5} # Should the server uses this info, give it what we want to receive. http::config -accept "text/xml,application/xml,application/rss+xml,application/rdf+xml,application/atom+xml" # Returns the DOM object of the RSS feed. proc tmlrss::fetchXML {uri {recurse_limit 4}} { set token [http::geturl $uri] upvar #0 $token state if {[http::status $token] != "ok" || [http::ncode $token] != 200} { # was the error a redirect? If so, do it.. if {[http::ncode $token] == 302 && [incr recurse_limit -1] > 0} { array set meta $state(meta) set result [fetchXML $meta(Location) $recurse_limit] http::cleanup $token return $result } set err [http::code $token] http::cleanup $token return -code error $err } set xml [http::data $token] array set meta $state(meta) http::cleanup $token # Do we need to do encoding conversions or was it already done # in transit? if {[info exist meta(Content-Type)] && \ [regexp -nocase {charset\s*=\s*(\S+)} $meta(Content-Type)]} { # Socket channel encodings already performed! No Work to do # here. See section 5.2.2 of the html spec. Server set # encodings win. } else { # Read and perform charset mappings of the document rather than # hand off the literal string to [dom parse] as Tcl itself is # more encoding capable. set xml [encoding convertfrom [getXmlEncoding $xml] $xml] } # Strip the XML declaration, should it exist. The encoding # conversions have already been performed. Let's not let Expat # try to convert it a second time as the work has already been # performed. As the parser assumes utf-8 by default without a # declaration, and the fact that tdom is handing expat the utf-8 # string rep, this will work even though it doesn't look correct. return [dom parse -baseurl [uriBase $uri] [stripXmlDecl $xml]] } proc tmlrss::uriBase {uri} { array set info [uri::split $uri] set info(path) [file dirname $info(path)] return [eval uri::join [array get info]] } proc tmlrss::stripXmlDecl {xml} { if {![binary scan [string range $xml 0 3] "H8" firstBytes]} { # very short (< 4 Bytes) string return $xml } # If the entity has an XML Declaration, the first four characters # must be "<?xm". switch $firstBytes { "3c3f786d" { # Try to find the end of the XML Declaration set closeIndex [string first ">" $xml] if {$closeIndex == -1} { error "Weird XML data or not XML data at all" } set xml [string range $xml [expr {$closeIndex+1}] end] } default { # no declaration. } } return $xml } proc tmlrss::getXmlEncoding {xml} { # The autodetection of the encoding follows # XML Recomendation, Appendix F if {![binary scan $xml "H8" firstBytes]} { # very short (< 4 Bytes) file return iso8859-1 } # If the entity has a XML Declaration, the first four characters # must be "<?xm". switch $firstBytes { "3c3f786d" { # UTF-8, ISO 646, ASCII, some part of ISO 8859, Shift-JIS, # EUC, or any other 7-bit, 8-bit, or mixed-width encoding which # ensures that the characters of ASCII have their normal positions, # width and values; the actual encoding declaration must be read to # detect which of these applies, but since all of these encodings # use the same bit patterns for the ASCII characters, the encoding # declaration itself can be read reliably. # Try to find the end of the XML Declaration set closeIndex [string first ">" $xml] if {$closeIndex == -1} { error "Weird XML data or not XML data at all" } set xmlDeclaration [string range $xml 0 [expr {$closeIndex}]] # extract the encoding information set pattern {^[^>]+encoding=[\x20\x9\xd\xa]*["']([^ "']+)['"]} # emacs: " if {![regexp $pattern $xml - encStr]} { # Probably something like <?xml version="1.0"?>. # Without encoding declaration, pass-thru return iso8859-1 } else { set encoding [tDOM::IANAEncoding2TclEncoding $encStr] } } default { # pass-thru set encoding iso8859-1 } } return $encoding }
HaO What happens, if the server sends an XML-Document in unicode starting with a BOM (byte order mark) ? Or is this not relevant in this context ?