US You need a sitemap file, listing all URLs of your site?
This Tcl program generates a sitemap.txt file, which is also suitable for the
Google sitemap programme. It's very simple and probably needs a lot of improvement, but it works. Suggestions, improvements and bug fixes welcome. If you don't want to watch it working, just remove the puts to stdout.
To validate all pages in your sitemap file use the
Sitemap Validator.
Of course, it is public domain, do whatever you want with it, but don't blame me if it doesn't do what it should.
#!/usr/bin/env tclsh
package require http
proc get_urls {url} {
global urls
global inv_urls
global exc_urls
global urlptr
# get protocol and site
if {![regexp {^(http://[^/]+)(/.*)?$} $url --> psite x]} {
# reject incomplete urls
return
}
puts "getting $url"
# get page 'url'
set p [::http::geturl $url]
if {[set status [::http::ncode $p]] != 200} {
# update visit counter and status
lappend inv_urls $url
::http::cleanup $p
return
}
set new_urls [list]
# find all hrefs to same domain
set re {<a\s.*?href="?([^" >]+)[" >]} ;# "
foreach {href new_url} [regexp -all -inline -- $re [::http::data $p]] {
lappend new_urls [lindex [split $new_url ?] 0]
}
# Uncomment the following lines if you need to reach a page
# behind a query form via its action.
# set re {<form\s.*?action="?([^" >]+)[" >]} ;# "
# foreach {href new_url} [regexp -all -inline -- $re [::http::data $p]] {
# lappend new_urls [lindex [split $new_url ?] 0]
# }
foreach new_url $new_urls {
puts "found $new_url"
# don't visit excluded urls
set drop 0
foreach ex $exc_urls {
if {[string match *${ex}* $new_url]} {
puts "excluded $new_url"
set drop 1
}
}
if {$drop} {
continue
}
if {[string index $new_url 0] eq "/"} {
puts "completing $new_url"
set new_url "${psite}$new_url"
}
if {![string match ${psite}* $new_url]} {
puts "dropped $new_url"
continue
}
# insert into db (unique!)
if {[lsearch -exact $urls $new_url] == -1} {
lappend urls $new_url
}
}
::http::cleanup $p
# select the first unvisited url from db
if {[llength $urls] > $urlptr} {
set next_url [lindex $urls $urlptr]
incr urlptr
# call geturls
get_urls $next_url
}
return
}
# init_db urls.db
set urls [list]
set inv_urls [list]
set urlptr 0
# A list of pages you don't want to have scanned
set exc_urls {contact}
get_urls [lindex $argv 0]
# cleanup invalid urls
foreach iu $inv_urls {
set idx [lsearch -exact $urls $iu]
set urls [lreplace $urls $idx $idx]
}
set fd [open sitemap.txt w]
foreach url [lsort $urls] {
puts $fd $url
}
close $fd