The indexer
#!/usr/bin/env tclsh
#
# Index the Tcl'ers Wiki
#
# (c) 2006 Michael Schlenker <mic42@users.sourceforge.net>
#
# Index the Tcl'ers wiki with the xapian fulltext searching package.
#
package require Tcl 8.4
package require xapian 0.9.6
package require Mk4tcl
package require logger
# Version of this indexer script.
set version 0.1
# Logger service "windex"; its commands are imported with a log_ prefix
# (log_error, log_notice, log_info, log_debug are used below).
set log [logger::init windex]
logger::import -prefix log_ windex
${log}::setlevel notice
# path to the indexing dir
set indexpath db
# path to the uncompressed wikit metakit file
# (from http://mini.net/cgi-bin/wikit.gz )
#
set wikifile wikit
# Xapian database location inside the indexing dir.
set dbfile [file join $indexpath xapindex]
# Prefix prepended to the page number to build the URL stored per document.
set urlprefix http://wiki.tcl.tk/
# Longest term (in characters) that indexTextBlock will add to the index.
set MAX_PROB_TERM_LENGTH 64
# Make sure the indexing dir exists before the database is opened.
if {![file isdirectory $indexpath]} {
    if {[file exists $indexpath]} {
        # Something that is not a directory is in the way: the database
        # cannot be created below it, so stop here instead of failing later.
        log_error "Cannot create index cache dir"
        exit 1
    } else {
        file mkdir $indexpath
    }
}
# indexWiki --
#
#   Index every row of the "pages" view of a wikit Metakit datafile into
#   the Xapian database at $::dbfile, then log the number of indexed
#   documents and the elapsed time.
#
# Arguments:
#   file   path to the uncompressed wikit Metakit datafile
#
# Results:
#   Whatever log_notice returns; callers do not use it.
proc indexWiki {file} {
    ::mk::file open wk $file -readonly
    set start [clock seconds]
    set db [openIndexDatabase $::dbfile]
    set count 0
    # indexOneWikiPage returns 1 for an indexed page and 0 for a skipped one.
    mk::loop i wk.pages {
        incr count [indexOneWikiPage wk $db $i]
    }
    closeIndexDatabase $db
    # Release the Metakit datafile that was opened above.
    ::mk::file close wk
    set stop [clock seconds]
    set diff [expr {$stop - $start}]
    # clock seconds has one-second resolution, so a quick run yields a
    # difference of 0; avoid the divide-by-zero error in that case.
    if {$diff > 0} {
        set persec [expr {$count / $diff}]
    } else {
        set persec $count
    }
    log_notice "Indexed $count documents in $diff seconds ($persec doc/sec)"
}
# indexOneWikiPage --
#
#   Add a single wiki page to the index.
#
# Arguments:
#   wiki   tag of the open Metakit datafile (not used by the body)
#   db     command name of the writable Xapian database
#   cur    Metakit cursor of the page row, e.g. wk.pages!42
#
# Results:
#   1 when the page was added, 0 when it was skipped because it is empty.
proc indexOneWikiPage {wiki db cur} {
    set fields [mk::get $cur name page]
    set name [lindex $fields 0]
    set page [lindex $fields 1]
    if {[string length $page] == 0} {
        return 0
    }
    log_info "Processing \"$name\""

    # The cursor starts with the nine characters wk.pages! - what follows
    # is the page number used in the URL.
    set idx [string range $cur 9 end]

    xapian::Document doc
    # The result list needs the page URL and its title.
    doc set_data [list $::urlprefix$idx $name]
    # Term positions of the page body start at 0.
    indexTextBlock doc $page 0
    $db add_document doc
    doc -delete
    return 1
}
# indexTextBlock --
#
#   Split a text into alphanumeric terms and add them to a document.
#   Every accepted term is lowercased and added as a posting at the next
#   position; its stem (from the global estem stemmer command) is added
#   as a plain term. Terms longer than $::MAX_PROB_TERM_LENGTH characters
#   are skipped and do not consume a position.
#
# Arguments:
#   doc    command name of the Xapian document to add terms to
#   text   the text to index
#   pos    position of the first term, defaults to 0
#
# Results:
#   The position following the last posting that was added.
proc indexTextBlock {doc text {pos 0}} {
    # A term is one or more alphanumerics, with optional trailing
    # + and/or - (e.g. C++). But "hyphen-ated" should generate
    # "hyphen" not "hyphen-".
    # The richer pattern for that is kept for reference:
    #   ([[:alnum:]]+(?:[-+]*(?![[:alnum:]]))?)
    set re {[[:alnum:]]+}
    set start 0
    while {[regexp -indices -start $start $re $text span]} {
        set first [lindex $span 0]
        set last [lindex $span 1]
        # Both indices are inclusive, so the length is last - first + 1.
        set length [expr {$last - $first + 1}]
        if {$length <= $::MAX_PROB_TERM_LENGTH} {
            set term [string tolower [string range $text $first $last]]
            set sterm [estem stem_word $term]
            log_debug "Indexing $term"
            $doc add_posting $term $pos
            incr pos
            $doc add_term $sterm
        }
        # Continue the scan right behind the term just seen.
        set start [expr {$last + 1}]
    }
    return $pos
}
# openIndexDatabase --
#
#   Open the Xapian database for writing and set up the stemmer.
#   Creates the commands xapiandb (the database) and estem (an
#   "english" stemmer).
#
# Arguments:
#   file   path of the Xapian database
#
# Results:
#   The command name of the database, xapiandb.
proc openIndexDatabase {file} {
    # The mode flag comes from the xapian package namespace.
    set mode $::xapian::DB_CREATE_OR_OVERWRITE
    set handle xapiandb
    xapian::WritableDatabase $handle $file $mode
    xapian::Stem estem "english"
    return $handle
}
# closeIndexDatabase --
#
#   Dispose of a database object by invoking its -delete subcommand.
#
# Arguments:
#   db   command name of the database, as returned by openIndexDatabase
#
# Results:
#   The result of the -delete subcommand.
proc closeIndexDatabase {db} {
    return [$db -delete]
}
indexWiki $wikifile
The simple command line searching utility
#!/usr/bin/env tclsh
#
# Search the Wiki
#
# (c) 2006 Michael Schlenker <mic42@users.sourceforge.net>
#
package require Tcl 8.4
package require xapian 0.9.6
package require logger
# Logger service "wsearch"; its commands are imported with a log_ prefix
# (log_info, log_debug and log_error are used below).
set log [logger::init wsearch]
logger::import -prefix log_ wsearch
log_info "Using Xapian API version [package present xapian]"
# Directory holding the index and the Xapian database path inside it.
set indexpath db
set dbfile [file join $indexpath xapindex]
# NOTE(review): baseurl and MAX_PROB_TERM_LENGTH are not referenced anywhere
# in the visible search script - presumably carried over from the indexer.
set baseurl http://wiki.tcl.tk/
set MAX_PROB_TERM_LENGTH 64
# openIndexDatabase --
#
#   Open the Xapian database for searching and set up the stemmer.
#   Creates the commands xapiandb (the database) and estem (an
#   "english" stemmer).
#
# Arguments:
#   file   path of the Xapian database
#
# Results:
#   The command name of the database, xapiandb.
proc openIndexDatabase {file} {
    set handle xapiandb
    xapian::Database $handle $file
    xapian::Stem estem "english"
    return $handle
}
# closeIndexDatabase --
#
#   Dispose of a database object by invoking its -delete subcommand.
#
# Arguments:
#   db   command name of the database, as returned by openIndexDatabase
#
# Results:
#   The result of the -delete subcommand.
proc closeIndexDatabase {db} {
    return [$db -delete]
}
# A query is mandatory; bail out early when the command line is empty.
if {[llength $argv] == 0} {
    log_error "Empty commandline"
    exit 1
}
set db [openIndexDatabase $dbfile]
xapian::Enquire enquire $db
log_info "Commandline is $argv"
log_debug "Building query"
# Parse the command line words into a query, using the database and the
# estem stemmer created by openIndexDatabase.
xapian::QueryParser qparse
set qp qparse
$qp set_database $db
$qp set_stemmer estem
set query [$qp parse_query [join $argv]]
log_debug "Performing query [$query get_description]"
enquire set_query $query
# Ask for at most 100 matches, starting with the first one.
set matches [enquire get_mset 0 100]
log_info "[$matches get_matches_estimated] results found"
# Print one line per match: document id, match percentage and the data
# stored with the document.
for {set i [$matches begin]} {![$i equals [$matches end]]} {$i next} {
    xapian::Document document [$i get_document]
    puts [format {ID %s %s%% [%s]} \
        [$i get_docid] [$i get_percent] [document get_data]]
    # Release the per-match document object, as the indexer does with
    # its documents.
    document -delete
}
closeIndexDatabase $db
exit 0
Example usage
- First get the wikit.gz file, then run the indexer on the file.
- Start searching:
./search.tcl schlenk AND xapian
[Do Jun 01 23:40:45 CEST 2006] [wsearch] [info] 'Using Xapian API version 0.9.6'
[Do Jun 01 23:40:45 CEST 2006] [wsearch] [info] 'Commandline is schlenk AND xapian'
[Do Jun 01 23:40:45 CEST 2006] [wsearch] [debug] 'Building query'
[Do Jun 01 23:40:45 CEST 2006] [wsearch] [debug] 'Performing query Xapian::Query((schlenk:(pos=1) AND xapian:(pos=2)))''
[Do Jun 01 23:40:45 CEST 2006] [wsearch] [info] '2 results found'
ID 6873 100% [http://wiki.tcl.tk/13173 Xapian]
ID 8077 96% [http://wiki.tcl.tk/15637 {Package feature map}]


