Arjen Markus (23 december 2003) I am fascinated and awed by the - for me - frequent task of reading input files of all kinds of complexity. Even though most input files I deal with have a simple structure, the program units to read them (in as robust a way as necessary) can be devastatingly contorted.
So, any tool I can muster that makes life easier is welcome. These currently include:
- Fortran routines, as many of the files I deal with are written by Fortran programs and Fortran is surprisingly well fit for the job. (I intend to write another page about this)
- Yacc/Lex parsers if I need to do the job in C - C itself is, IMHO, hopeless.
- Designing the input as Tcl source, if I can get away with that
But I am interested too in the theoretical part of parsing. So, after some altercations with a very simple file that turned out to be tough to read directly in C, and reading a book about concepts of programming languages, I decided to have a go with the method of
recursive descent.
The script below is not perfect - it does not handle empty lists of dependents for a rule, it does not check for ambiguities, it does not pay any attention to errors in the input - but I do consider it a proof of concept.
It takes an LL grammar and parses a list of lexemes (the stuff input is made of) based on that grammar. No code generation is necessary, it is all done in memory.
Enjoy and comment!
AM Found the bug that prevented empty dependents to work - it had to do with the | procedure.
AM I realised I can use this approach with, say, Fortran, too ...
Frank Pilhofer Note that there is
Yeti, too, a Yacc clone in Tcl.
jt And
taccle, an even better clone of Yacc for Tcl.
# parser.tcl --
# Experiment with creating a parser based on a simple grammar
#
# Note:
# The type of parser is LL, so not as general as possible
#
# Parser --
# Namespace for the variables and procedures
#
namespace eval ::Parser {
variable rule_dependents
variable rule_code
variable lexeme ""
variable prev_lexeme
variable lexeme_list
variable lexeme_count
variable token
variable end
namespace export init define | rule getLexeme
}
# init --
# Initialise the parser by giving it a list of lexemes
# Arguments:
# input Input for the parser
# Result:
# None
# Side effects:
# Set the variables defining the state of the parser
#
proc ::Parser::init { input } {
variable end
variable lexeme_count
variable lexeme_list
set end 0
set lexeme_count -1
set lexeme_list $input
set lexeme ""
NextLexeme
}
# getLexeme --
# Get the lexeme that was last examined (for access in user-code)
# Arguments:
# None
# Result:
# Value of the lexeme (actually the previous one!)
# Side effects:
# Store the rule
#
proc ::Parser::getLexeme {} {
variable prev_lexeme
return $prev_lexeme
}
# define --
# Define the first rule for an item
# Arguments:
# item Name of the item to be defined
# depends Dependents for the rule
# code (Optional) code to be run if the rule matches
# Result:
# None
# Side effects:
# Store the rule
#
proc ::Parser::define { item depends {code {}} } {
variable rule_dependents
variable rule_code
variable last_item
set last_item $item
set rule_dependents($item) [list $depends]
set rule_code($item) [list $code]
}
# | --
# Define alternatives for the first rule for an item
# Arguments:
# depends Dependents for the rule
# code (Optional) code to be run if the rule matches
# Result:
# None
# Side effects:
# Append the new information to the rule
#
proc ::Parser::| { depends {code {}} } {
variable rule_dependents
variable rule_code
variable last_item
lappend rule_dependents($last_item) $depends
lappend rule_code($last_item) $code
}
# rule --
# Match the input to the given rule
# Arguments:
# item Root item that starts the parsing
# Result:
# 1 if matched, 0 if end of input, "error" if no match
#
proc ::Parser::rule { item } {
variable rule_dependents
variable rule_code
variable lexeme
variable end
if { $end } { return 0 }
#
# Try all the rules in turn
#
puts "Rule: $item"
set rule_count 0
foreach dependents $rule_dependents($item) code $rule_code($item) {
puts " Dependents: $dependents"
#
# Work our way along the dependents
#
set retcode 1
incr rule_count
foreach dep $dependents {
puts " Dependent: $dep"
#
# By convention: upper-case names mean terminals
#
if { [string toupper $dep] != $dep } {
set retcode [rule $dep]
if { $retcode == 0 } {
puts "==> did not work"
break ;# Try the next rule
} elseif { $retcode == "error" } {
return "error"
}
} else {
#
# We are dealing with a terminal - does the token match?
# If so, accept it
#
if { [getToken $lexeme] == $dep } {
puts "$item: $dep = $lexeme"
NextLexeme
} else {
#
# No match - tell the caller
#
return 0
}
}
}
#
# We have completed the list of dependents, so
# the rule is satisfied
#
if { $retcode == 1 } {
puts " Completed"
namespace eval :: $code
return 1
} elseif { $rule_count == [llength $rule_dependents($item)] } {
#
# This is a hack - my grammar should be expanded to include
# empty rules ...
#
if { $dep == $item } {
return 1
} else {
return 0
}
}
}
#
# We have tried all the rules - no match. This means
# an error in the input
#
return "error"
}
# NextLexeme --
# Get the next lexeme from the list
# Arguments:
# None
# Result:
# None
# Side effects:
# Sets the variable "lexeme" and if the end of the input is
# reached, sets the variable "end"
#
proc ::Parser::NextLexeme {} {
variable lexeme_count
variable lexeme_list
variable prev_lexeme
variable lexeme
variable end
incr lexeme_count
set prev_lexeme $lexeme
if { $lexeme_count < [llength $lexeme_list] } {
set lexeme [lindex $lexeme_list $lexeme_count]
puts "NextLexeme: $lexeme"
} else {
puts "NextLexeme: -- end --"
set end 1
}
}
# getToken --
# Identify the token to the given lexeme
# Arguments:
# lexeme Lexeme to be identified
# Result:
# The symbolic name for the token
# Note:
# Should become a user-definable procedure
# Right now:
# - if "*", return "MARKER"
# - if integer, return "INTEGER"
# - else return "STRING"
#
proc getToken { lexeme } {
if { $lexeme == "*" } {
puts "getToken: $lexeme = MARKER"
return "MARKER"
}
if { [string is integer -strict $lexeme] } {
puts "getToken: $lexeme = INTEGER"
return "INTEGER"
}
puts "getToken: $lexeme = STRING"
return "STRING"
}
# main --
# The grammar should parse this input:
# A 4 ; a list of attributes for "nodes"
# B 5
# C 2
# * ; a separator
# A B 3 ; a list of connections between "nodes" with a weight
# A C 1
#
namespace import ::Parser::*
define input {nodes separator links}
define nodes {node nodes}
define separator MARKER
define node {name weight} {
set Node($name1) $weight
}
define name STRING {
set name1 [getLexeme]
}
define weight INTEGER {
set weight [getLexeme]
}
define links {link links}
define link {name name2 weight} {
set Link($name1,$name2) $weight
}
define name2 STRING {
set name2 [getLexeme]
}
init {A 4 B 5 C 2 * A B 3 A C 1}
puts "Result: [rule input]"
parray Node
parray Link