if false {
A colleague of mine publishes technical sheets not only on our intranet but also on CD-ROM.
Last Friday, she asked me how to implement the search page on CD-ROM. I answered: maybe there is ASP, but on a CD-ROM there is no little dwarf to process the ASP, so
no way ...
But the next day was Saturday ... maybe it was the weather ... I found a
quick'n'dirty solution in JavaScript. It is a single file, search.js, to be included by the search page. Here is the source:
}
#! /usr/bin/tclsh
proc echo {args} {
    # Print all arguments, formatted as one Tcl list, on a line of stdout.
    puts stdout $args
}
proc cat {file} {
    # Return the complete contents of $file.
    # An error from open or read is passed on to the caller; the channel
    # is closed on every path, so a failing read cannot leak it.
    set port [open $file]
    if {[catch {read $port} contents]} {
        # Save the error state first: close may overwrite the globals.
        set savedInfo $::errorInfo
        set savedCode $::errorCode
        catch {close $port}
        error $contents $savedInfo $savedCode
    }
    close $port
    return $contents
}
# Choose the page where crawling starts: the first command line argument,
# or else the alphabetically first *.htm* file of the current directory.
if {[llength $argv] > 0} {
    set startFile [lindex $argv 0]
} else {
    # -nocomplain: an empty directory yields an empty list instead of a
    # raw glob error; lsort makes the choice reproducible.
    set candidates [lsort [glob -nocomplain *.htm*]]
    if {[llength $candidates] == 0} {
        puts stderr "usage: $argv0 startpage.htm\
            (no *.htm* file found in [pwd])"
        exit 1
    }
    set startFile [lindex $candidates 0]
}
proc leadFragments word {
    # Return a list holding $word followed by each of its shorter
    # non-empty prefixes, longest first: abc -> abc ab a.
    # An empty $word yields a list with one empty element.
    #
    # Each prefix is one character shorter than the previous one, so no
    # duplicate can occur and no membership test is needed.  A glob-style
    # lsearch here would wrongly drop prefixes that contain glob
    # characters, e.g. a* after a*b.
    set result [list $word]
    while {[string length $word] > 1} {
        set word [string range $word 0 end-1]
        lappend result $word
    }
    return $result
}
proc trailFragments word {
    # Return a list holding $word followed by each of its shorter
    # non-empty suffixes, longest first: abc -> abc bc c.
    set suffixes [list $word]
    set size [string length $word]
    for {set start 1} {$start < $size} {incr start} {
        lappend suffixes [string range $word $start end]
    }
    return $suffixes
}
proc fragments word {
    # Return every suffix of every prefix of $word, i.e. all of its
    # contiguous substrings.  Prefixes come from leadFragments, their
    # suffixes from trailFragments; the order is prefix-major.
    set pieces [list]
    foreach prefix [leadFragments $word] {
        # Append all suffixes of this prefix in one lappend call.
        eval [linsert [trailFragments $prefix] 0 lappend pieces]
    }
    return $pieces
}
proc docFragments doc {
    # Return the sorted, duplicate-free list of all word fragments in $doc.
    # Every run of non-alphanumeric characters counts as one separator.
    # A separator at the very start or end of $doc produces an empty
    # token, so the empty string can be part of the result.
    regsub -all -- {[^[:alnum:]]+} $doc " " spaced
    set collected {}
    foreach token [split $spaced " "] {
        foreach piece [fragments $token] {
            lappend collected $piece
        }
    }
    return [lsort -unique $collected]
}
proc relPathFromTo {fromDir toDir} {
    # Build the relative path that leads from directory $fromDir to $toDir.
    # $fromDir is taken to be a directory, never a file.
    # Returns the empty string when both arguments normalize to the same path.
    set origin [file normalize $fromDir]
    set goal [file normalize $toDir]
    if {$::tcl_platform(platform) eq "windows"} {
        # Give both paths a drive prefix and upper-case the drive letter,
        # so that c: and C: compare equal below.
        set driveMap {
            a: A:  b: B:  c: C:  d: D:  e: E:  f: F:  g: G:
            h: H:  i: I:  j: J:  k: K:  l: L:  m: M:  n: N:
            o: O:  p: P:  q: Q:  r: R:  s: S:  t: T:  u: U:
            v: V:  w: W:  x: X:  y: Y:  z: Z:
        }
        regexp {^[a-zA-Z]:} [pwd] drive
        foreach name {origin goal} {
            set path [set $name]
            if {![regexp {^[a-zA-Z]:} $path]} {
                set path $drive$path
            }
            set $name [string map $driveMap $path]
        }
    }
    set originParts [file split $origin]
    set goalParts [file split $goal]
    # Count the leading path components that both paths share.
    set shared 0
    foreach a $originParts b $goalParts {
        if {$a ne $b} {
            break
        }
        incr shared
    }
    # One step up for every remaining component of the origin ...
    set steps {}
    foreach ignored [lrange $originParts $shared end] {
        lappend steps ..
    }
    # ... then down along the remaining components of the goal.
    foreach part [lrange $goalParts $shared end] {
        lappend steps $part
    }
    if {[llength $steps] == 0} {
        return ""
    }
    return [eval [linsert $steps 0 file join]]
}
# Crawl state shared by parseFile and the code that writes search.js:
#   links    - relative paths of all visited files, escaped for use inside
#              a JavaScript string literal, in visiting order
#   titles   - page titles, escaped likewise; entry i belongs to links i
#   database - maps a word fragment to the relative paths containing it
set links {}
array set database {}
set titles {}
proc parseFile file {
    # Index $file and, recursively, every local file it links to.
    #
    # A file already recorded in links is skipped.  For HTML-like files
    # the title and all word fragments are recorded and the <a href>
    # targets are followed; every other file is recorded only as visited.
    # Calling the proc with $startFile resets all crawl state first.
    #
    # Returns the list of word fragments found in $file, or the empty
    # string when the file was already visited.
    variable startFile
    variable links
    variable database
    variable titles
    # Escapes backslash and double quote for a JavaScript string literal.
    set jsEscapes [list \\ \\\\ \" \\\"]
    if {$startFile eq $file} then {
        # A new crawl begins: forget whatever an earlier crawl collected.
        set links {}
        set titles {}
        array unset database
        array set database {}
    }
    set startDir [file dirname $startFile]
    set file [file normalize $file]
    set myPath [relPathFromTo [file normalize $startDir] $file]
    # links holds escaped paths, so the visited test must compare the
    # escaped form, and exactly: glob matching would misread [ * ? in names.
    set escapedPath [string map $jsEscapes $myPath]
    if {[lsearch -exact $links $escapedPath] >= 0} then {
        return
    }
    lappend links $escapedPath
    echo ... processing $myPath ...
    # Links inside this file are relative to the file's own directory.
    set fileDir [file dirname $file]
    # words
    set words {}
    switch -- [string tolower [file extension $file]] {
        .htm - .html - .shtm - .shtml - .xhtm - .xhtml - .txt -
        .php - .php4 - .php5 {
            set contents [cat $file]
            # title: taken from the <title> element in any letter case,
            # with whitespace collapsed because a raw newline would break
            # the generated JavaScript string literal
            set title ""
            if {[regexp -nocase {<title>([^<]+)</title>} $contents -> title]} then {
                set title [string trim [regsub -all {\s+} $title " "]]
            }
            if {$title eq ""} then {
                set title [file rootname [file tail $file]]
            }
            lappend titles [string map $jsEscapes $title]
            set contents1 [regsub -all -- {<.*?>} $contents ""]
            # Decode the German umlaut entities so the words are indexed
            # the way they are spelled.
            set contents2 [string map {
                &auml; ä
                &ouml; ö
                &uuml; ü
                &szlig; ß
                &Auml; Ä
                &Ouml; Ö
                &Uuml; Ü
            } $contents1]
            set words [docFragments [string tolower $contents2]]
            # follow links
            foreach src [regexp -nocase -inline -all\
                    {<a [^>]*?href=['"][^:?]+["']>} $contents] {
                # Capture the attribute value itself; one of the two
                # groups stays empty depending on the quote style.
                if {![regexp -nocase {href=(?:"([^"]*)"|'([^']*)')} $src\
                        -> doubleQuoted singleQuoted]} then {
                    continue
                }
                set href [string trim $doubleQuoted$singleQuoted]
                # A #fragment names a place inside the page, not a file.
                regsub {#.*$} $href "" href
                if {$href eq ""} then {
                    continue
                }
                set target [file normalize [file join $fileDir $href]]
                if {[file isfile $target]} then {
                    if {[catch {parseFile $target} err]} then {
                        puts stderr [list problems parsing $target, but don't panic ...]
                        puts stderr "    $err"
                    }
                } else {
                    puts stderr [list not processed: $target]
                }
            }
        }
        default {
            echo ... skip non-html file $myPath ...
            # Keep titles index-aligned with links, which has already
            # received an entry for this file.
            lappend titles [string map $jsEscapes [file tail $file]]
        }
    }
    foreach word $words {
        if {![info exists database($word)]
            || [lsearch -exact $database($word) $myPath] < 0} then {
            lappend database($word) $myPath
        }
    }
    return $words
}
# Crawl the site, then serialise the collected data as JavaScript source:
#   files    - array of relative file paths
#   titles   - array of page titles, parallel to files
#   database - object mapping each word fragment to indices into files
parseFile $startFile
echo parsing is done, creating javascript database ...
set src {var files = }
append src {[} \n\t\" [join $links \",\n\t\"] \"\n {]} \n\n
append src {var titles = } {[} \n\t\" [join $titles \",\n\t\"] \"\n {]} \n\n
append src {var database = } \{
# Map every entry of links to its position once, instead of searching the
# list again for each word/file pair.
set position 0
foreach link $links {
    if {![info exists linkIndex($link)]} {
        set linkIndex($link) $position
    }
    incr position
}
# links holds paths escaped for JavaScript while database holds them raw,
# so a database path is escaped the same way before it is looked up.
set jsEscapes [list \\ \\\\ \" \\\"]
set entries {}
foreach key [lsort [array names database]] {
    set indices {}
    foreach target $database($key) {
        set escapedTarget [string map $jsEscapes $target]
        if {[info exists linkIndex($escapedTarget)]} {
            lappend indices $linkIndex($escapedTarget)
        }
    }
    lappend entries "\n\t\"$key\": \[[join $indices {, }]\]"
}
# Joining the entries avoids trimming a trailing comma, which would eat the
# opening brace when the database is empty.
append src [join $entries ,] \n \}
# Write search.js into the current directory: first the generated data
# held in $src, then the fixed JavaScript that evaluates the query string
# of the page and writes the list of matching pages into the document.
set out [open search.js w]
puts $out $src
puts $out {
// from here on fixed javascript

// arrayContainsElement (arr, el)
// return true if el is element of arr
function arrayContainsElement (arr, el)
{
  if (!arr) return false
  for (var i = 0; i < arr .length; i++) if (arr[i] == el) return true
  return false
}

// commonElementsOf (arr1, arr2)
// return new array containing the elements common to both arrays
function commonElementsOf (arr1, arr2)
{
  var result = []
  if (!arr1) return result
  for (var i = 0; i < arr1 .length; i++)
  {
    var el = arr1[i]
    if (arrayContainsElement (arr2, el)) result .push(el)
  }
  return result
}

// lookup (word)
// return the file indices recorded for word, or an empty array
// hasOwnProperty keeps words such as "constructor" or "toString" from
// picking up members inherited from Object.prototype
function lookup (word)
{
  if (Object .prototype .hasOwnProperty .call(database, word))
    return database[word]
  return []
}

// the query string is split before it is decoded, so that an encoded
// "&", "=" or "+" inside a value is not mistaken for a separator
var formFields = location .search .slice(1) .split("&")
var queryList = []
var lang = "de"
for (var f = 0; f < formFields .length; f++)
{
  var field = formFields[f]
  var eq = field .indexOf("=")
  var key = unescape (eq < 0 ? field : field .slice(0, eq))
  // a field without "=" has an empty value
  var rawVal = eq < 0 ? "" : field .slice(eq + 1)
  if (key == "query" || key == "q")
  {
    var terms = rawVal .split("+")
    queryList = []
    for (var t = 0; t < terms .length; t++)
    {
      terms[t] = unescape (terms[t])
      queryList .push(terms[t] .toLowerCase())
    }
    // if there should be the search form on position 0 ...
    if (document .forms .length && document .forms[0] [key])
      document .forms [0] [key] .value = terms .join(" ")
  }
  if (key == "lang") lang = unescape (rawVal)
}

// write some feedback to HTML
var resultList = []
for (var q = 0; q < queryList .length; q++)
{
  if (q == 0)
    resultList = lookup (queryList[q])
  else if (queryList[q] != "")
    resultList = commonElementsOf (resultList, lookup (queryList[q]))
}
if (resultList && resultList .length)
{
  document .write( "<ol>")
  for (var r = 0; r < resultList .length; r++)
  {
    var result = resultList[r]
    document .write ("\n <li><a href='", files[result], "'>",
      titles[result], "</a></li>")
  }
  document .write( "\n</ol>")
}
}
close $out
if false {
Here are the contents of the search page:
<html>
<head>
<title>Suchseite</title>
</head>
<body>
<h1>Suchseite</h1>
<form>
<input type="text" name="query" />
<input type="submit" name="suche" />
</form>
<script type="text/javascript" src="search.js"></script>
</body>
</html>
Usage:
- cd to the directory where the start page of your site resides.
- Start the script with the start page as argument, e.g. "tclsh makeSearch.tcl index.htm". The script takes this HTML file as the starting point for crawling the site, then writes the collected data to a file named search.js.
- Put the search page shown above, e.g. as an HTML file named search.htm, into the same directory.
- You can reach this page not only through its own form but also from any other page via a link such as search.htm?query=My+Request+from+last+year
From now on, you can "google" your private site. Cool.
}