Docs GODI Archive
Projects Blog Link DB

Search GODI:


More options
File lib/felix/rtl/elk_glr.h GODI Package apps-felix
 
   elk_glr.h  
#line 708 "lpsrc/elk.pak"
// glr.h            see license.txt for copyright and terms of use
// GLR parsing algorithm

/*
 * Author: Scott McPeak, April 2000
 *
 * The fundamental concept in Generalized LR (GLR) parsing
 * is to permit (at least local) ambiguity by "forking" the
 * parse stack.  If the input is actually unambiguous, then
 * all but one of the forked parsers will, at some point,
 * fail to shift a symbol, and die.  If the input is truly
 * ambiguous, forked parsers rejoin at some point, and the
 * parse tree becomes a parse DAG, representing all possible
 * parses.  (In fact, since cyclic grammars are supported,
 * which can have an infinite number of parse trees for
 * some inputs, we may end up with a cyclic parse *graph*.)
 *
 * In the larger scheme of things, this level of support for
 * ambiguity is useful because it lets us use simpler and
 * more intuitive grammars, more sophisticated disambiguation
 * techniques, and parsing in the presence of incomplete
 * or incorrect information (e.g. in an editor).
 *
 * The downside is that parsing is slower, and whatever tool
 * processes the parse graph needs to have ways of dealing
 * with the multiple parse interpretations.
 *
 * references:
 *
 *   [GLR]  J. Rekers.  Parser Generation for Interactive
 *          Environments.  PhD thesis, University of
 *          Amsterdam, 1992.  Available by ftp from
 *          ftp://ftp.cwi.nl/pub/gipe/reports/Rek92.ps.Z .
 *          [Contains a good description of the Generalized
 *          LR (GLR) algorithm.]
 */

#ifndef GLR_H
#define GLR_H

#include "elk_glrconfig.h"
#include "elk_parsetables.h"
#include "elk_rcptr.h"
#include "elk_useract.h"
#include "sm_objpool.h"
#include "sm_objlist.h"
#include "sm_srcloc.h"
#include "sm_sobjlist.h"

#include <stdio.h>         // FILE
#include <iostream>      // std::ostream
#include "flx_rtl_config.hpp"

// fwds from other files
class LexerInterface;      // lexerint.h

// forward decls for things declared below
class StackNode;           // unit of parse state
class SiblingLink;         // connections between stack nodes
class PendingShift;        // for postponing shifts.. may remove
class FLX_RTL_EXTERN GLR;                 // main class for GLR parsing


// a pointer from a stacknode to one 'below' it (in the LR
// parse stack sense); also has a link to the parse graph
// we're constructing
class SiblingLink {
public:
  // the stack node being pointed-at; it was created eariler
  // than the one doing the pointing
  RCPtr<StackNode> sib;

  // this is the semantic value associated with this link
  // (parse tree nodes are *not* associated with stack nodes --
  // that's now it was originally, but I figured out the hard
  // way that's wrong (more info in compiler.notes.txt));
  // this is an *owner* pointer
  SemanticValue sval;

  // the source location of the left edge of the subtree rooted
  // at this stack node; this is in essence part of the semantic
  // value, but automatically propagated by the parser
  SOURCELOC( SourceLoc loc; )

  // number of times this 'sval' has been yielded; this is used
  // to track cases where we yield a value and then merge it
  // (which means the induced parse forest is incomplete)
  YIELD_COUNT( int yieldCount; )

  // if you add additional fields, they need to be inited in the
  // constructor *and* in StackNode::addFirstSiblingLink_noRefCt

public:
  SiblingLink(StackNode *s, SemanticValue sv
              SOURCELOCARG( SourceLoc L ) );
  ~SiblingLink();

  #if GLR_SOURCELOC
    bool validLoc() const { return loc != SL_UNKNOWN; }
  #else
    bool validLoc() const { return false; }
  #endif
};


// the GLR parse state is primarily made up of a graph of these
// nodes, which play a role analogous to the stack nodes of a
// normal LR parser; GLR nodes form a graph instead of a linear
// stack because choice points (real or potential ambiguities)
// are represented as multiple left-siblings
class StackNode {
public:
  // the LR state the parser is in when this node is at the
  // top ("at the top" means that nothing, besides perhaps itself,
  // is pointing to it)
  //ItemSet const * const state;                 // (serf)
  StateId state;       // now it is an id

  // each leftSibling points to a stack node in one possible LR stack.
  // if there is more than one, it means two or more LR stacks have
  // been joined at this point.  this is the parse-time representation
  // of ambiguity (actually, unambiguous grammars or inputs do
  // sometimes lead to multiple siblings)
  ObjList<SiblingLink> leftSiblings;           // this is a set

  // the *first* sibling is simply embedded directly into the
  // stack node, to avoid list overhead in the common case of
  // only one sibling; when firstSib.sib==NULL, there are no
  // siblings
  SiblingLink firstSib;

  // number of sibling links pointing at 'this', plus the number
  // of worklists on which 'this' appears (some liberty is taken
  // in the mini-LR parser, but it is carefully documented there)
  int referenceCount;

  // how many stack nodes can I pop before hitting a nondeterminism?
  // if this node itself has >1 sibling link, determinDepth==0; if
  // this node has 1 sibling, but that sibling has >1 sibling, then
  // determinDepth==1, and so on; if this node has 0 siblings, then
  // determinDepth==1
  int determinDepth;

  union {
    // somewhat nonideal: I need access to the 'userActions' to
    // deallocate semantic values when refCt hits zero, and I need
    // to map states to state-symbols for the same reason.
    // update: now I'm also using this to support pool-based
    // deallocation in decRefCt()
    GLR *glr;

    // this is used by the ObjectPool which handles allocation of
    // StackNodes
    StackNode *nextInFreeList;
  };

  // ordinal position of the token that was being processed
  // when this stack node was created; this information is useful
  // for laying out the nodes when visualizing the GSS, but is
  // not used by the parsing algorithm itself
  NODE_COLUMN( int column; )

  // count and high-water for stack nodes
  static int numStackNodesAllocd;
  static int maxStackNodesAllocd;


private:    // funcs
  SiblingLink *
    addAdditionalSiblingLink(StackNode *leftSib, SemanticValue sval
                             SOURCELOCARG( SourceLoc loc ) );

public:     // funcs
  StackNode();
  ~StackNode();

  // ctor/dtor from point of view of the object pool user
  void init(StateId state, GLR *glr);
  void deinit();

  // internal workings of 'deinit', exposed for performance reasons
  inline void decrementAllocCounter();
  void deallocSemanticValues();

  // add a new link with the given tree node; return the link
  SiblingLink *addSiblingLink(StackNode *leftSib, SemanticValue sval
                              SOURCELOCARG( SourceLoc loc ) );

  // specialized version for performance-critical sections
  inline void
    addFirstSiblingLink_noRefCt(StackNode *leftSib, SemanticValue sval
                                SOURCELOCARG( SourceLoc loc ) );

  // return the symbol represented by this stack node;  it's
  // the symbol shifted or reduced-to to get to this state
  // (this used to be a data member, but there are at least
  // two ways to compute it, so there's no need to store it)
  SymbolId getSymbolC() const;

  // reference count stuff
  void incRefCt() { referenceCount++; }
  void decRefCt();

  // sibling count queries (each one answerable in constant time)
  bool hasZeroSiblings() const { return firstSib.sib==NULL; }
  bool hasOneSibling() const { return firstSib.sib!=NULL && leftSiblings.isEmpty(); }
  bool hasMultipleSiblings() const { return leftSiblings.isNotEmpty(); }

  // when you expect there's only one sibling link, get it this way
  SiblingLink const *getUniqueLinkC() const;
  SiblingLink *getUniqueLink() { return const_cast<SiblingLink*>(getUniqueLinkC()); }

  // retrieve pointer to the sibling link to a given node, or NULL if none
  SiblingLink *getLinkTo(StackNode *another);

  // recompute my determinDepth based on siblings,
  // but don't actually change the state
  int computeDeterminDepth() const;

  // debugging
  static void printAllocStats();
  void checkLocalInvariants() const;
};


// this is a priority queue of stack node paths that are candidates to
// reduce, maintained such that we can select paths in an order which
// will avoid yield-then-merge
class ReductionPathQueue {
public:       // types
  // a single path in the stack
  class Path {
  public:     // data
    // ---- right edge info ----
    // the rightmost state's id; we're reducing in this state
    StateId startStateId;

    // id of the production with which we're reducing
    int prodIndex;

    // ---- left edge info ----
    // the token column (ordinal position of a token in the token
    // stream) of the leftmost stack node; the smaller the
    // startColumn, the more tokens this reduction spans
    int startColumn;

    // stack node at the left edge; our reduction will push a new
    // stack node on top of this one
    StackNode *leftEdgeNode;

    // ---- path in between ----
    // array of sibling links, naming the path; 'sibLink[0]' is the
    // leftmost link; array length is given by the rhsLen of
    // prodIndex's production
    GrowArray<SiblingLink*> sibLinks;    // (array of serfs)

    // corresponding array of symbol ids so we know how to interpret
    // the semantic values in the links
    GrowArray<SymbolId> symbols;

    union {
      // link between nodes for construction of a linked list,
      // kept in sorted order
      Path *next;

      // link for free list in the object pool
      Path *nextInFreeList;
    };

  public:     // funcs
    Path();
    ~Path();

    void init(StateId startStateId, int prodIndex, int rhsLen);
    void deinit() {}
  };

private:      // data
  // head of the list
  Path *top;

  // allocation pool of Path objects
  ObjectPool<Path> pathPool;

  // parse tables, so we can decode prodIndex and also compare
  // production ids for sorting purposes
  ParseTables *tables;

private:      // funcs
  bool goesBefore(Path const *p1, Path const *p2) const;

public:       // funcs
  ReductionPathQueue(ParseTables *t);
  ~ReductionPathQueue();

  // get another Path object, inited with these values
  Path *newPath(StateId startStateId, int prodIndex, int rhsLen);

  // make a copy of the prototype 'src', fill in its left-edge
  // fields using 'leftEdge', and insert it into sorted order
  // in the queue
  void insertPathCopy(Path const *src, StackNode *leftEdge);

  // true if there are no more paths
  bool isEmpty() const { return top == NULL; }
  bool isNotEmpty() const { return !isEmpty(); }

  // remove the next path to reduce from the list, and return it
  Path *dequeue();

  // mark a path as not being used, so it will be recycled into the pool
  void deletePath(Path *p);
};


// each GLR object is a parser for a specific grammar, but can be
// used to parse multiple token streams
class FLX_RTL_EXTERN GLR {
public:
  // ---- grammar-wide data ----
  // user-specified actions
  UserActions *userAct;                     // (serf)

  // parse tables derived from the grammar
  ParseTables *tables;                      // (serf)

  // ---- parser state between tokens ----
  // I keep a pointer to this so I can ask for token descriptions
  // inside some of the helper functions
  LexerInterface *lexerPtr;                 // (serf)

  // Every node in this set is (the top of) a parser that might
  // ultimately succeed to parse the input, or might reach a
  // point where it cannot proceed, and therefore dies.  (See
  // comments at top of glr.cc for more details.)
  ArrayStack<StackNode*> topmostParsers;     // (refct list)

  // index: StateId -> index in 'topmostParsers' of unique parser
  // with that state, or INDEX_NO_PARSER if none has that state
  typedef unsigned char ParserIndexEntry;
  enum { INDEX_NO_PARSER = 255 };
  ParserIndexEntry *parserIndex;            // (owner)

  // this is for assigning unique ids to stack nodes
  int nextStackNodeId;
  enum { initialStackNodeId = 1 };

  // ---- parser state during each token ----
  // I used to have fields:
  //   int currentTokenType;
  //   SemanticValue currentTokenValue;
  //   SourceLoc currentTokenLoc;
  // but these have been now replaced by, respectively,
  //   lexerPtr->type
  //   lexerPtr->sval
  //   lexerPtr->loc

  // ---- scratch space re-used at token-level (or finer) granularity ----
  // to be regarded as a local variable of GLR::rwlProcessWorklist
  GrowArray<SemanticValue> toPass;

  // persistent array that I swap with 'topmostParsers' during
  // 'rwlShiftTerminals' to avoid extra copying or allocation;
  // this should be regarded as variable local to that function
  ArrayStack<StackNode*> prevTopmost;        // (refct list)

  // ---- allocation pools ----
  // this is a pointer to the same-named local variable in innerGlrParse
  ObjectPool<StackNode> *stackNodePool;

  // pool and list for the RWL implementation
  ReductionPathQueue pathQueue;

  // ---- user options ----
  // when true, failed parses are accompanied by some rudimentary
  // diagnosis; when false, failed parses are silent (default: true)
  bool noisyFailedParse;

  // ---- debugging trace ----
  // these are computed during GLR::GLR since the profiler reports
  // there is significant expense to computing the debug sm_strings
  // (that are then usually not printed)
  bool trParse;                             // tracingSys("parse")
  std::ostream &trsParse;                        // trace("parse")

  // track column for new nodes
  NODE_COLUMN( int globalNodeColumn; )

  // statistics on parser actions
  int detShift, detReduce, nondetShift, nondetReduce;

  // count of # of times yield-then-merge happens
  int yieldThenMergeCt;

private:    // funcs
  // comments in glr.cc
  SemanticValue duplicateSemanticValue(SymbolId sym, SemanticValue sval);
  void deallocateSemanticValue(SymbolId sym, SemanticValue sval);
  SemanticValue grabTopSval(StackNode *node);

  StackNode *findTopmostParser(StateId state);
  StackNode *makeStackNode(StateId state);
  void writeParseGraph(char const *input) const;
  void clearAllStackNodes();
  void addTopmostParser(StackNode *parser);
  void pullFromTopmostParsers(StackNode *parser);
  bool canMakeProgress(StackNode *parser);
  void dumpGSS(int tokenNumber) const;
  void dumpGSSEdge(FILE *dest, StackNode const *src,
                               StackNode const *target) const;
  void printConfig() const;
  void buildParserIndex();
  void printParseErrorMessage(StateId lastToDie);
  bool cleanupAfterParse(SemanticValue &treeTop);
  bool nondeterministicParseToken();
  static bool innerGlrParse(GLR &glr, LexerInterface &lexer, SemanticValue &treeTop);
  SemanticValue doReductionAction(
    int productionId, SemanticValue const *svals
    SOURCELOCARG( SourceLoc loc ) );

  void rwlProcessWorklist();
  SiblingLink *rwlShiftNonterminal(StackNode *leftSibling, int lhsIndex,
                                   SemanticValue /*owner*/ sval
                                   SOURCELOCARG( SourceLoc loc ) );
  int rwlEnqueueReductions(StackNode *parser, ActionEntry action,
                           SiblingLink *sibLink);
  void rwlCollectPathLink(
    ReductionPathQueue::Path *proto, int popsRemaining,
    StackNode *currentNode, SiblingLink *mustUseLink, SiblingLink *linkToAdd);
  void rwlRecursiveEnqueue(
    ReductionPathQueue::Path *proto,
    int popsRemaining,
    StackNode *currentNode,
    SiblingLink *mustUseLink);
  void rwlShiftTerminals();

  void configCheck(char const *option, bool core, bool table);

  sm_string stackSummary() const;
  void nodeSummary(sm_stringBuilder &sb, StackNode const *node) const;
  void innerStackSummary(sm_stringBuilder &sb,
                         SObjList<StackNode const> &printed,
                         StackNode const *node) const;

public:     // funcs
  GLR(UserActions *userAct, ParseTables *tables);
  ~GLR();

  // ------- primary interface -------
  // read the named grammar file (.bin extension, typically)
  void readBinaryGrammar(char const *grammarFname);

  // parse, using the token stream in 'lexer', and store the final
  // semantic value in 'treeTop'
  bool glrParse(LexerInterface &lexer, SemanticValue &treeTop);

};


#endif // GLR_H

This web site is published by Informatikbüro Gerd Stolpmann
Powered by Caml