Updated 2016-07-10 11:30:43 by dkf

US Do you need a sitemap file listing all the URLs of your site?

This Tcl program generates a sitemap.txt file, which is also suitable for the Google sitemap programme. It's very simple and probably needs a lot of improvement, but it works. Suggestions, improvements and bug fixes are welcome. If you don't want to watch it working, just remove the puts commands that write progress messages to stdout.

To validate all pages in your sitemap file use the Sitemap Validator.

Of course, it is in the public domain: do whatever you want with it, but don't blame me if it doesn't do what it should.
#!/usr/bin/env tclsh

package require http

# get_urls --
#
#   Crawl a site starting at 'url' and collect every same-site link that
#   is found into the global list 'urls'.
#
#   Globals used:
#     urls      list of discovered URLs; doubles as the visit queue
#     urlptr    index into 'urls' of the next URL still to be visited
#     inv_urls  URLs that could not be fetched with HTTP status 200
#     exc_urls  glob fragments; links containing one of them are skipped
#
# Arguments:
#   url   absolute http:// URL of the first page to scan
#
# Results:
#   Returns an empty string; the outcome is left in 'urls' and 'inv_urls'.
proc get_urls {url} {
   global urls
   global urlptr
   # Walk the queue with a loop instead of recursing once per page: a
   # page that fails to load must not end the whole crawl, and a large
   # site must not run into Tcl's nested-evaluation limit.
   while {1} {
      get_urls_scan_page $url
      if {$urlptr >= [llength $urls]} {
         # every discovered url has been visited
         return
      }
      # select the first unvisited url from db
      set url [lindex $urls $urlptr]
      incr urlptr
   }
}

# get_urls_scan_page --
#
#   Private helper of get_urls: fetch one page and append the same-site
#   links it contains to the global list 'urls' (without duplicates).
#   A page that cannot be fetched, or that does not answer with status
#   200, is recorded in the global list 'inv_urls' instead.
#
# Arguments:
#   url   absolute http:// URL of the page to scan
#
# Results:
#   Returns an empty string.
proc get_urls_scan_page {url} {
   global urls
   global inv_urls
   global exc_urls
   # get protocol and site
   if {![regexp {^(http://[^/]+)(/.*)?$} $url --> psite]} {
      # reject incomplete urls
      return
   }
   puts "getting $url"
   # get page 'url'; a transport error (unknown host, refused
   # connection, ...) is treated like any other invalid page
   if {[catch {::http::geturl $url} p]} {
      puts "failed $url: $p"
      lappend inv_urls $url
      return
   }
   if {[::http::ncode $p] != 200} {
      lappend inv_urls $url
      ::http::cleanup $p
      return
   }
   # keep the body and release the http token right away
   set html [::http::data $p]
   ::http::cleanup $p

   set new_urls [list]
   # find all hrefs; the query part of each link is cut off
   set re {<a\s.*?href="?([^"     >]+)["         >]} ;# "
   foreach {href new_url} [regexp -all -inline -- $re $html] {
      lappend new_urls [lindex [split $new_url ?] 0]
   }
   # Uncomment the following lines if you need to reach a page
   # behind a query form via its action.
   # set re {<form\s.*?action="?([^"     >]+)["         >]} ;# "
   # foreach {href new_url} [regexp -all -inline -- $re $html] {
   #   lappend new_urls [lindex [split $new_url ?] 0]
   # }

   set site_prefix "${psite}/"
   set prefix_len [string length $site_prefix]
   foreach new_url $new_urls {
      puts "found $new_url"
      # don't visit excluded urls
      set drop 0
      foreach ex $exc_urls {
         if {[string match *${ex}* $new_url]} {
            puts "excluded $new_url"
            set drop 1
            break
         }
      }
      if {$drop} {
         continue
      }
      # turn site-absolute paths into full urls
      if {[string index $new_url 0] eq "/"} {
         puts "completing $new_url"
         set new_url "${psite}$new_url"
      }
      # Keep only links to the same site. The comparison is literal and
      # includes the slash after the host name, so a host that merely
      # begins with the same characters is not mistaken for this site.
      if {$new_url ne $psite
          && ![string equal -length $prefix_len $site_prefix $new_url]} {
         puts "dropped $new_url"
         continue
      }
      # insert into db (unique!)
      if {[lsearch -exact $urls $new_url] == -1} {
         lappend urls $new_url
      }
   }
   return
}


# Crawl state shared with get_urls through global variables.
set urls [list]      ;# every same-site url discovered so far
set inv_urls [list]  ;# urls that were not answered with status 200
set urlptr 0         ;# index into urls of the next url to visit

# A list of pages you don't want to have scanned
set exc_urls {contact}

# Without a start url there is nothing to crawl; say so instead of
# silently writing an empty sitemap.
if {[llength $argv] < 1} {
   puts stderr "usage: $argv0 http://your.site/"
   exit 1
}

get_urls [lindex $argv 0]

# cleanup invalid urls
foreach iu $inv_urls {
   set idx [lsearch -exact $urls $iu]
   # an invalid url (e.g. the start url itself) may never have been
   # added to urls; only remove what is really there
   if {$idx >= 0} {
      set urls [lreplace $urls $idx $idx]
   }
}

# write the remaining urls, sorted, one per line
set fd [open sitemap.txt w]
foreach url [lsort $urls] {
   puts $fd $url
}
close $fd