The indexer
#!/usr/bin/env tclsh
#
# Index the Tcl'ers Wiki
#
# (c) 2006 Michael Schlenker <mic42@users.sourceforge.net>
#
# Index the Tcl'ers wiki with the xapian fulltext searching package.
#
# Reads the pages of the wikit Metakit file named by $wikifile and writes
# a Xapian database to $dbfile. The database is opened with
# DB_CREATE_OR_OVERWRITE, so every run builds the index anew.
#

package require Tcl 8.4
package require xapian 0.9.6
package require Mk4tcl
package require logger

set version 0.1

set log [logger::init windex]
logger::import -prefix log_ windex
${log}::setlevel notice

# path to the indexing dir
set indexpath db

# path to the uncompressed wikit metakit file
# (from http://mini.net/cgi-bin/wikit.gz )
#
set wikifile wikit

set dbfile [file join $indexpath xapindex]
set urlprefix http://wiki.tcl.tk/

# longest term (in characters) that indexTextBlock adds to the index
set MAX_PROB_TERM_LENGTH 64

if {![file isdirectory $indexpath]} {
    if {[file exists $indexpath]} {
        # Something that is not a directory occupies the path, so the
        # database cannot be created below it; stop instead of failing
        # later when the database is opened.
        log_error "Cannot create index cache dir"
        exit 1
    } else {
        file mkdir $indexpath
    }
}

# indexWiki --
#
#   Index every page of a wikit Metakit file and log how many documents
#   were indexed and how long it took.
#
# Arguments:
#   file    path to the uncompressed wikit Metakit file
#
# Results:
#   None. Writes the index to $::dbfile.
#
proc indexWiki {file} {
    ::mk::file open wk $file -readonly
    set start [clock seconds]
    set db [openIndexDatabase $::dbfile]
    set count 0
    mk::loop i wk.pages {
        # indexOneWikiPage returns 1 for an indexed page, 0 for a skipped one
        incr count [indexOneWikiPage wk $db $i]
    }
    closeIndexDatabase $db
    ::mk::file close wk
    set stop [clock seconds]
    set diff [expr {$stop-$start}]
    # [clock seconds] has one-second resolution, so a short run gives
    # diff == 0. Report the plain count then instead of dividing by zero.
    if {$diff > 0} {
        set persec [expr {$count/$diff}]
    } else {
        set persec $count
    }
    log_notice "Indexed $count documents in $diff seconds ($persec doc/sec)"
}

# indexOneWikiPage --
#
#   Add a single wiki page to the index. Pages with empty text are skipped.
#
# Arguments:
#   wiki    Metakit tag of the open wiki file (currently unused)
#   db      command name of the writable Xapian database
#   cur     Metakit cursor of the page row (of the form wk.pages!N)
#
# Results:
#   1 if the page was indexed, 0 if it was skipped.
#
proc indexOneWikiPage {wiki db cur} {
    foreach {name page} [mk::get $cur name page] {break}
    if {![string length $page]} {return 0}
    log_info "Processing \"$name\""
    xapian::Document doc
    # strip wk.pages! from the index name
    set idx [string range $cur 9 end]
    # store the url of the page and its title for the result list
    doc set_data [list $::urlprefix$idx $name]
    set pos 0
    indexTextBlock doc $page $pos
    $db add_document doc
    doc -delete
    return 1
}

# indexTextBlock --
#
#   Split a text into lower-cased alphanumeric terms and add them to a
#   document: each term as a posting at consecutive positions, and its
#   stem (from the global estem stemmer) as a plain term.
#
# Arguments:
#   doc     command name of the xapian::Document to fill
#   text    text to index
#   pos     position assigned to the first term (default 0)
#
# Results:
#   The position following the last term that was added.
#
proc indexTextBlock {doc text {pos 0}} {
    # A term is one or more alphanumerics, with optional trailing
    # + and/or - (e.g. C++). But "hyphen-ated" should generate
    # "hyphen" not "hyphen-".
    # set re {([[:alnum:]]+(?:[-+]*(?![[:alnum:]]))?)}
    #
    # The simpler expression below is the one in use; it matches runs
    # of alphanumerics only.
    set re {([[:alnum:]]+)}
    set j 0
    while {[regexp -indices -start $j $re $text -> word]} {
        set i [lindex $word 0]
        set j [lindex $word 1]
        # Both indices are inclusive, so the term is j-i+1 characters long.
        if {($j-$i+1) <= $::MAX_PROB_TERM_LENGTH} {
            set term [string range $text $i $j]
            set term [string tolower $term]
            set sterm [estem stem_word $term]
            log_debug "Indexing $term"
            $doc add_posting $term $pos
            incr pos
            $doc add_term $sterm
        }
        # continue the search behind the current match
        incr j
    }
    return $pos
}

# openIndexDatabase --
#
#   Open the Xapian database for writing and create the global English
#   stemmer command estem used by indexTextBlock.
#
# Arguments:
#   file    path of the Xapian database
#
# Results:
#   The command name of the database object (xapiandb).
#
proc openIndexDatabase {file} {
    xapian::WritableDatabase xapiandb $file $::xapian::DB_CREATE_OR_OVERWRITE
    xapian::Stem estem "english"
    return xapiandb
}

# closeIndexDatabase --
#
#   Delete the database object returned by openIndexDatabase.
#
# Arguments:
#   db      command name of the database object
#
proc closeIndexDatabase {db} {
    $db -delete
}

indexWiki $wikifile
A simple command-line search utility
#!/usr/bin/env tclsh
#
# Search the Wiki
#
# (c) 2006 Michael Schlenker <mic42@users.sourceforge.net>
#
# Runs the command line arguments as one query against the Xapian index
# in $dbfile and prints one line per hit: document id, match percentage
# and the data stored with the document.
#

package require Tcl 8.4
package require xapian 0.9.6
package require logger

set log [logger::init wsearch]
logger::import -prefix log_ wsearch

log_info "Using Xapian API version [package present xapian]"

set indexpath db
set dbfile [file join $indexpath xapindex]
set baseurl http://wiki.tcl.tk/
set MAX_PROB_TERM_LENGTH 64

# openIndexDatabase --
#
#   Open the Xapian database read-only and create the global English
#   stemmer command estem that is handed to the query parser.
#
# Arguments:
#   file    path of the Xapian database
#
# Results:
#   The command name of the database object (xapiandb).
#
proc openIndexDatabase {file} {
    xapian::Database xapiandb $file
    xapian::Stem estem "english"
    return xapiandb
}

# closeIndexDatabase --
#
#   Delete the database object returned by openIndexDatabase.
#
# Arguments:
#   db      command name of the database object
#
proc closeIndexDatabase {db} {
    $db -delete
}

if {[llength $argv] == 0} {
    log_error "Empty commandline"
    exit 1
}

set db [openIndexDatabase $dbfile]
xapian::Enquire enquire $db

log_info "Commandline is $argv"
log_debug "Building query"

xapian::QueryParser qparse
set qp qparse
$qp set_database $db
$qp set_stemmer estem
# all arguments together form a single query string
set query [$qp parse_query [join $argv]]

log_debug "Performing query [$query get_description]'"
enquire set_query $query
# fetch at most the first 100 matches
set matches [enquire get_mset 0 100]

log_info "[$matches get_matches_estimated] results found"

for {set i [$matches begin]} {![$i equals [$matches end]]} {$i next} {
    xapian::Document document [$i get_document]
    puts [format {ID %s %s%% [%s]} \
        [$i get_docid] [$i get_percent] [document get_data]]
    # Release the per-hit document explicitly, as the indexer does with
    # its documents, before the name is reused in the next iteration.
    document -delete
}

closeIndexDatabase $db
exit 0
Example usage
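- First get the wikit.gz file and uncompress it, then run the indexer in the directory that contains the resulting file (the indexer expects the uncompressed file to be named wikit).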
- Start searching:
./search.tcl schlenk AND xapian
[Do Jun 01 23:40:45 CEST 2006] [wsearch] [info] 'Using Xapian API version 0.9.6'
[Do Jun 01 23:40:45 CEST 2006] [wsearch] [info] 'Commandline is schlenk AND xapian'
[Do Jun 01 23:40:45 CEST 2006] [wsearch] [debug] 'Building query'
[Do Jun 01 23:40:45 CEST 2006] [wsearch] [debug] 'Performing query Xapian::Query((schlenk:(pos=1) AND xapian:(pos=2)))''
[Do Jun 01 23:40:45 CEST 2006] [wsearch] [info] '2 results found'
ID 6873 100% [http://wiki.tcl.tk/13173 Xapian]
ID 8077 96% [http://wiki.tcl.tk/15637 {Package feature map}]