# bibtex.tcl --
#
# A basic parser for BibTeX bibliography databases.
#
# Copyright (c) 2005 Neil Madden.
# License: Tcl/BSD style.
package require Tcl 8.4
package provide bibtex 0.3
# A rough grammar (case-insensitive):
#
# Database ::= (Junk '@' Entry)*
# Junk ::= .*?
# Entry ::= Record
# | Comment
# | String
# | Preamble
# Comment ::= "comment" [^\n]* \n -- ignored
# String ::= "string" '{' Field* '}'
# Preamble ::= "preamble" '{' .* '}' -- (balanced)
# Record ::= Type '{' Key ',' Field* '}'
# | Type '(' Key ',' Field* ')' -- not handled
# Type ::= Name
# Key ::= Name
# Field ::= Name '=' Value
# Name ::= [^\s\"#%'(){}]*
# Value ::= [0-9]+
# | '"' ([^'"']|\\'"')* '"'
# | '{' .* '}' -- (balanced)
namespace eval bibtex {
variable id 0
variable data
array set data { }
# bibtex::parse --
#
# Parse a bibtex file.
#
# parse ?options? ?bibtex?
# where options can be:
# -recordcommand cmd -- callback for each record
# -preamblecommand cmd -- callback for @preamble blocks
# -stringcommand cmd -- callback for @string macros
# -commentcommand cmd -- callback for @comment blocks
# -progresscommand cmd -- callback to indicate progress of parse
proc parse {args} {
variable data
variable id
# Argument processing
if {[llength $args] < 1} {
set err "[lindex [info level 0] 0] ?options? ?bibtex?"
return -code error "wrong # args: should be \"$err\""
}
set token bibtex[incr id]
array set options {
-async 0
-blocksize 1024
}
set options(-stringcommand) [list [namespace current]::addStrings $token]
if {[llength $args] % 2 == 1} {
set data($token,buffer) [lindex $args end]
set data($token,eof) 1
array set options [lrange $args 0 end-1]
} else {
set data($token,buffer) ""
set data($token,eof) 0
array set options [lrange $args 0 end]
if {![info exists options(-channel)]} {
cancel $token
return -code error "no channel and no data given"
}
if {$options(-async)} {
fileevent $options(-channel) readable \
[list [namespace current]::ReadChan $token]]
} else {
# Snarf it all up in one go for now
set data($token,buffer) [read $options(-channel)]
set data($token,eof) 1
}
}
foreach {k v} [array get options] { set data($token,$k) $v }
# String mappings
set data($token,strings) { }
if {$options(-async)} {
cancel $token
error "not implemented"
} else {
ParseRecords $token
}
}
# Cleanup a parser, cancelling any callbacks etc.
proc cancel {token} {
variable data
if {[info exists $data($token,channel)]} {
fileevent $data($token,channel) readable {}
}
foreach key [array names data $name,*] {
unset data($key)
}
}
# bibtex::addStrings --
#
# Add strings to the map for a particular parser. All strings are
# expanded at parse time.
proc addStrings {token strings} {
variable data
eval [list lappend data($token,strings)] $strings
}
# Private utility routines =================================
proc Callback {token type args} {
variable data
if {[info exists data($token,-${type}command)]} {
if {$data($token,-async)} {
after 0 $data($token,-${type}command) $args
} else {
eval $data($token,-${type}command) $args
}
}
}
proc ReadChan {token} {
variable data
set chan $data($token,-channel)
append data($token,buffer) [read $chan]
if {[eof $chan]} {
set data($token,eof) 1
}
}
proc ParseRecords {token} {
variable data
set bibtex $data($token,buffer)
# Run through each block
set db [regexp -all -inline {((?:[^@]|\S@|[^\n][\r\t\f ]*@)*)\s?@} $bibtex]
set total [expr {([llength $db]-2)/2}]
set step [expr {double($total) / 100.0}]
set istep [expr {$step > 1 ? int($step) : 1}]
set count 0
foreach {_ block} [lrange $db 2 end] {
if {([incr count] % $istep) == 0} {
Callback $token progress [expr {int($count / $step)}]
}
if {[regexp -nocase {\s*comment([^\n])*\n(.*)} $block \
-> cmnt rest]} {
# Are @comments blocks, or just 1 line?
# Does anyone care?
Callback $token comment $cmnt
} elseif {[regexp -nocase {\s*string[^\{]*\{(.*)\}[^\}]*} \
$block -> rest]} {
# string macro defs
Callback $token string [ParseBlock $rest]
} elseif {[regexp -nocase {\s*preamble[^\{]*\{(.*)\}[^\}]*} \
$block -> rest]} {
Callback $token preamble $rest
} elseif {[regexp {([^\{]+)\{([^,]*),(.*)\}[^\}]*} $block -> \
type key rest]} {
# Do any @string mappings (these are case insensitive)
set rest [string map -nocase $data($token,strings) $rest]
Callback $token record [Tidy $type] [string trim $key] \
[ParseBlock $rest]
} else {
puts stderr "Skipping: $block"
}
}
}
proc Tidy {str} {
string tolower [string trim $str]
}
proc ParseBlock {block} {
set ret [list]
set index 0
while {1} {
if {[regexp -start $index -indices -- {(\S+)[^=]*=(.*)} $block -> \
key rest]} {
foreach {ks ke} $key { break }
set k [Tidy [string range $block $ks $ke]]
foreach {rs re} $rest { break }
foreach {v index} \
[ParseBibString $rs [string range $block $rs $re]] \
{ break }
lappend ret $k $v
} else { break }
}
return $ret
}
proc ParseBibString {index str} {
set count 0
set retstr ""
set escape 0
foreach char [split $str ""] {
incr index
if {$escape} {
set escape 0
} else {
if {$char eq "\{"} {
incr count
continue
} elseif {$char eq "\}"} {
incr count -1
if {$count < 0} { incr index -1; break }
continue
} elseif {$char eq ","} {
if {$count == 0} { break }
} elseif {$char eq "\\"} {
set escape 1; continue
} elseif {$char eq "\""} { continue }
}
append retstr $char
}
regsub -all {\s+} $retstr { } retstr
return [list [string trim $retstr] $index]
}
}
proc readfile file {
set fd [open $file]
set cn [read $fd]
close $fd
return $cn
}
proc progress {percent} {
set str [format "Processing: \[%3d%%\]" $percent]
puts -nonewline "\r$str"; flush stdout
}
proc count {type key data} {
global count total
if {[info exists count($type)]} {
incr count($type)
} else {
set count($type) 1
}
incr total
}
array set count { }
set total 0
puts -nonewline "Processing: \[ 0%\]"; flush stdout
bibtex::parse \
-recordcommand count \
-progresscommand progress [readfile [lindex $argv 0]]
puts ""
puts "Summary ======"
puts "Total: $total"
parray countschlenk Well spoken, the file format is just insane...Lars H: I agree, and (AFAICT) the only documentation of the BibTeX .bib format is "by example". A staggering contrast to the very comprehensive BNF grammars given for the syntax of TeX.

