# bibtex.tcl -- # # A basic parser for BibTeX bibliography databases. # # Copyright (c) 2005 Neil Madden. # License: Tcl/BSD style. package require Tcl 8.4 package provide bibtex 0.3 # A rough grammar (case-insensitive): # # Database ::= (Junk '@' Entry)* # Junk ::= .*? # Entry ::= Record # | Comment # | String # | Preamble # Comment ::= "comment" [^\n]* \n -- ignored # String ::= "string" '{' Field* '}' # Preamble ::= "preamble" '{' .* '}' -- (balanced) # Record ::= Type '{' Key ',' Field* '}' # | Type '(' Key ',' Field* ')' -- not handled # Type ::= Name # Key ::= Name # Field ::= Name '=' Value # Name ::= [^\s\"#%'(){}]* # Value ::= [0-9]+ # | '"' ([^'"']|\\'"')* '"' # | '{' .* '}' -- (balanced) namespace eval bibtex { variable id 0 variable data array set data { } # bibtex::parse -- # # Parse a bibtex file. # # parse ?options? ?bibtex? # where options can be: # -recordcommand cmd -- callback for each record # -preamblecommand cmd -- callback for @preamble blocks # -stringcommand cmd -- callback for @string macros # -commentcommand cmd -- callback for @comment blocks # -progresscommand cmd -- callback to indicate progress of parse proc parse {args} { variable data variable id # Argument processing if {[llength $args] < 1} { set err "[lindex [info level 0] 0] ?options? ?bibtex?" return -code error "wrong # args: should be \"$err\"" } set token bibtex[incr id] array set options { -async 0 -blocksize 1024 } set options(-stringcommand) [list [namespace current]::addStrings $token] if {[llength $args] % 2 == 1} { set data($token,buffer) [lindex $args end] set data($token,eof) 1 array set options [lrange $args 0 end-1] } else { set data($token,buffer) "" set data($token,eof) 0 array set options [lrange $args 0 end] if {![info exists options(-channel)]} { cancel $token return -code error "no channel and no data given" } if {$options(-async)} { fileevent $options(-channel) readable \ [list [namespace current]::ReadChan $token]] } else { # Snarf it all up in one go for now set data($token,buffer) [read $options(-channel)] set data($token,eof) 1 } } foreach {k v} [array get options] { set data($token,$k) $v } # String mappings set data($token,strings) { } if {$options(-async)} { cancel $token error "not implemented" } else { ParseRecords $token } } # Cleanup a parser, cancelling any callbacks etc. proc cancel {token} { variable data if {[info exists $data($token,channel)]} { fileevent $data($token,channel) readable {} } foreach key [array names data $name,*] { unset data($key) } } # bibtex::addStrings -- # # Add strings to the map for a particular parser. All strings are # expanded at parse time. proc addStrings {token strings} { variable data eval [list lappend data($token,strings)] $strings } # Private utility routines ================================= proc Callback {token type args} { variable data if {[info exists data($token,-${type}command)]} { if {$data($token,-async)} { after 0 $data($token,-${type}command) $args } else { eval $data($token,-${type}command) $args } } } proc ReadChan {token} { variable data set chan $data($token,-channel) append data($token,buffer) [read $chan] if {[eof $chan]} { set data($token,eof) 1 } } proc ParseRecords {token} { variable data set bibtex $data($token,buffer) # Run through each block set db [regexp -all -inline {((?:[^@]|\S@|[^\n][\r\t\f ]*@)*)\s?@} $bibtex] set total [expr {([llength $db]-2)/2}] set step [expr {double($total) / 100.0}] set istep [expr {$step > 1 ? int($step) : 1}] set count 0 foreach {_ block} [lrange $db 2 end] { if {([incr count] % $istep) == 0} { Callback $token progress [expr {int($count / $step)}] } if {[regexp -nocase {\s*comment([^\n])*\n(.*)} $block \ -> cmnt rest]} { # Are @comments blocks, or just 1 line? # Does anyone care? Callback $token comment $cmnt } elseif {[regexp -nocase {\s*string[^\{]*\{(.*)\}[^\}]*} \ $block -> rest]} { # string macro defs Callback $token string [ParseBlock $rest] } elseif {[regexp -nocase {\s*preamble[^\{]*\{(.*)\}[^\}]*} \ $block -> rest]} { Callback $token preamble $rest } elseif {[regexp {([^\{]+)\{([^,]*),(.*)\}[^\}]*} $block -> \ type key rest]} { # Do any @string mappings (these are case insensitive) set rest [string map -nocase $data($token,strings) $rest] Callback $token record [Tidy $type] [string trim $key] \ [ParseBlock $rest] } else { puts stderr "Skipping: $block" } } } proc Tidy {str} { string tolower [string trim $str] } proc ParseBlock {block} { set ret [list] set index 0 while {1} { if {[regexp -start $index -indices -- {(\S+)[^=]*=(.*)} $block -> \ key rest]} { foreach {ks ke} $key { break } set k [Tidy [string range $block $ks $ke]] foreach {rs re} $rest { break } foreach {v index} \ [ParseBibString $rs [string range $block $rs $re]] \ { break } lappend ret $k $v } else { break } } return $ret } proc ParseBibString {index str} { set count 0 set retstr "" set escape 0 foreach char [split $str ""] { incr index if {$escape} { set escape 0 } else { if {$char eq "\{"} { incr count continue } elseif {$char eq "\}"} { incr count -1 if {$count < 0} { incr index -1; break } continue } elseif {$char eq ","} { if {$count == 0} { break } } elseif {$char eq "\\"} { set escape 1; continue } elseif {$char eq "\""} { continue } } append retstr $char } regsub -all {\s+} $retstr { } retstr return [list [string trim $retstr] $index] } } proc readfile file { set fd [open $file] set cn [read $fd] close $fd return $cn } proc progress {percent} { set str [format "Processing: \[%3d%%\]" $percent] puts -nonewline "\r$str"; flush stdout } proc count {type key data} { global count total if {[info exists count($type)]} { incr count($type) } else { set count($type) 1 } incr total } array set count { } set total 0 puts -nonewline "Processing: \[ 0%\]"; flush stdout bibtex::parse \ -recordcommand count \ -progresscommand progress [readfile [lindex $argv 0]] puts "" puts "Summary ======" puts "Total: $total" parray count
schlenk Well spoken, the file format is just insane...Lars H: I agree, and (AFAICT) the only documentation of the BibTeX .bib format is "by example". A staggering contrast to the very comprehensive BNF grammars given for the syntax of TeX.