Updated 2011-06-10 12:51:04 by RLE

Arjen Markus (23 december 2003) I am fascinated and awed by the - for me - frequent task of reading input files of all kinds of complexity. Even though most input files I deal with have a simple structure, the program units to read them (in as robust a way as necessary) can be devastatingly contorted.

So, any tool I can muster that makes life easier is welcome. These currently include:

  • Fortran routines, as many of the files I deal with are written by Fortran programs and Fortran is surprisingly well fit for the job. (I intend to write another page about this)
  • Yacc/Lex parsers if I need to do the job in C - C itself is, IMHO, hopeless.
  • Designing the input as Tcl source, if I can get away with that

But I am interested too in the theoretical part of parsing. So, after some altercations with a very simple file that turned out to be tough to read directly in C, and reading a book about concepts of programming languages, I decided to have a go with the method of recursive descent.

The script below is not perfect - it does not handle empty lists of dependents for a rule, it does not check for ambiguities, it does not pay any attention to errors in the input - but I do consider it a proof of concept.

It takes an LL grammar and parses a list of lexemes (the stuff input is made of) based on that grammar. No code generation is necessary, it is all done in memory.

Enjoy and comment!

AM Found the bug that prevented empty dependents to work - it had to do with the | procedure.

AM I realised I can use this approach with, say, Fortran, too ...

Frank Pilhofer Note that there is Yeti, too, a Yacc clone in Tcl.

jt And taccle, an even better clone of Yacc for Tcl.
 # parser.tcl --
 #    Experiment with creating a parser based on a simple grammar
 #
 # Note:
 #    The type of parser is LL, so not as general as possible
 #

 # Parser --
 #    Namespace for the variables and procedures
 #
 namespace eval ::Parser {
    variable rule_dependents
    variable rule_code
    variable lexeme        ""
    variable prev_lexeme
    variable lexeme_list
    variable lexeme_count
    variable token
    variable end

    namespace export init define | rule getLexeme
 }

 # init --
 #    Initialise the parser by giving it a list of lexemes
 # Arguments:
 #    input      Input for the parser
 # Result:
 #    None
 # Side effects:
 #    Set the variables defining the state of the parser
 #
 proc ::Parser::init { input } {
    variable end
    variable lexeme_count
    variable lexeme_list

    set end           0
    set lexeme_count -1
    set lexeme_list  $input
    set lexeme       ""

    NextLexeme
 }

 # getLexeme --
 #    Get the lexeme that was last examined (for access in user-code)
 # Arguments:
 #    None
 # Result:
 #    Value of the lexeme (actually the previous one!)
 # Side effects:
 #    Store the rule
 #
 proc ::Parser::getLexeme {} {
    variable prev_lexeme
    return $prev_lexeme
 }

 # define --
 #    Define the first rule for an item
 # Arguments:
 #    item       Name of the item to be defined
 #    depends    Dependents for the rule
 #    code       (Optional) code to be run if the rule matches
 # Result:
 #    None
 # Side effects:
 #    Store the rule
 #
 proc ::Parser::define { item depends {code {}} } {
    variable rule_dependents
    variable rule_code
    variable last_item

    set last_item              $item
    set rule_dependents($item) [list $depends]
    set rule_code($item)       [list $code]
 }

 # | --
 #    Define alternatives for the first rule for an item
 # Arguments:
 #    depends    Dependents for the rule
 #    code       (Optional) code to be run if the rule matches
 # Result:
 #    None
 # Side effects:
 #    Append the new information to the rule
 #
 proc ::Parser::| { depends {code {}} } {
    variable rule_dependents
    variable rule_code
    variable last_item

    lappend rule_dependents($last_item) $depends
    lappend rule_code($last_item)       $code
 }

 # rule --
 #    Match the input to the given rule
 # Arguments:
 #    item       Root item that starts the parsing
 # Result:
 #    1 if matched, 0 if end of input, "error" if no match
 #
 proc ::Parser::rule { item } {
    variable rule_dependents
    variable rule_code
    variable lexeme
    variable end

    if { $end } { return 0 }

    #
    # Try all the rules in turn
    #
    puts "Rule: $item"
    set rule_count 0
    foreach dependents $rule_dependents($item) code $rule_code($item) {
       puts "   Dependents: $dependents"
       #
       # Work our way along the dependents
       #
       set retcode 1
       incr rule_count
       foreach dep $dependents {
          puts "      Dependent: $dep"
          #
          # By convention: upper-case names mean terminals
          #
          if { [string toupper $dep] != $dep } {
             set retcode [rule $dep]
             if { $retcode == 0 } {
                puts "==> did not work"
                break ;# Try the next rule
             } elseif { $retcode == "error" } {
                return "error"
             }
          } else {
             #
             # We are dealing with a terminal - does the token match?
             # If so, accept it
             #
             if { [getToken $lexeme] == $dep } {
                puts "$item: $dep = $lexeme"
                NextLexeme
             } else {
                #
                # No match - tell the caller
                #
                return 0
             }
          }
       }
       #
       # We have completed the list of dependents, so
       # the rule is satisfied
       #
       if { $retcode == 1 } {
          puts "      Completed"
          namespace eval :: $code
          return 1
       } elseif { $rule_count == [llength $rule_dependents($item)] } {
          #
          # This is a hack - my grammar should be expanded to include
          # empty rules ...
          #
          if { $dep == $item } {
             return 1
          } else {
             return 0
          }
       }
    }

    #
    # We have tried all the rules - no match. This means
    # an error in the input
    #
    return "error"
 }

 # NextLexeme --
 #    Get the next lexeme from the list
 # Arguments:
 #    None
 # Result:
 #    None
 # Side effects:
 #    Sets the variable "lexeme" and if the end of the input is
 #    reached, sets the variable "end"
 #
 proc ::Parser::NextLexeme {} {
    variable lexeme_count
    variable lexeme_list
    variable prev_lexeme
    variable lexeme
    variable end

    incr lexeme_count
    set  prev_lexeme $lexeme
    if { $lexeme_count < [llength $lexeme_list] } {
       set lexeme [lindex $lexeme_list $lexeme_count]
       puts "NextLexeme: $lexeme"
    } else {
       puts "NextLexeme: -- end --"
       set end 1
    }
 }

 # getToken --
 #    Identify the token to the given lexeme
 # Arguments:
 #    lexeme        Lexeme to be identified
 # Result:
 #    The symbolic name for the token
 # Note:
 #    Should become a user-definable procedure
 #    Right now:
 #    - if "*", return "MARKER"
 #    - if integer, return "INTEGER"
 #    - else return "STRING"
 #
 proc getToken { lexeme } {
    if { $lexeme == "*" } {
       puts "getToken: $lexeme = MARKER"
       return "MARKER"
    }
    if { [string is integer -strict $lexeme] } {
       puts "getToken: $lexeme = INTEGER"
       return "INTEGER"
    }

    puts "getToken: $lexeme = STRING"
    return "STRING"
 }

 # main --
 #   The grammar should parse this input:
 #   A 4       ; a list of attributes for "nodes"
 #   B 5
 #   C 2
 #   *         ; a separator
 #   A B 3     ; a list of connections between "nodes" with a weight
 #   A C 1
 #

 namespace import ::Parser::*

 define input     {nodes separator links}
 define nodes     {node nodes}
 define separator MARKER
 define node      {name weight} {
    set Node($name1) $weight
 }
 define name      STRING {
    set name1 [getLexeme]
 }
 define weight    INTEGER {
    set weight [getLexeme]
 }
 define links     {link links}
 define link      {name name2 weight} {
    set Link($name1,$name2) $weight
 }
 define name2     STRING {
    set name2 [getLexeme]
 }

 init {A 4 B 5 C 2 * A B 3 A C 1}
 puts "Result: [rule input]"

 parray Node
 parray Link