bison-patches
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH 5/6] lalr1.cc: add LAC support


From: Akim Demaille
Subject: [PATCH 5/6] lalr1.cc: add LAC support
Date: Fri, 9 Aug 2019 06:47:01 -0500

From: Adrian Vogelsgesang <address@hidden>

Implement lookahead correction (LAC) for the C++ skeleton.  LAC is a
mechanism to make sure that we report the correct list of expected
tokens if a syntax error occurs.  So far, LAC was only supported for
the C skeleton "yacc.c".

* data/skeletons/lalr1.cc: Add LAC support.
* doc/bison.texi: Update.
---
 data/skeletons/lalr1.cc | 231 +++++++++++++++++++++++++++++++++++++---
 doc/bison.texi          |   4 +-
 2 files changed, 216 insertions(+), 19 deletions(-)

diff --git a/data/skeletons/lalr1.cc b/data/skeletons/lalr1.cc
index ddee9569..37c12f7d 100644
--- a/data/skeletons/lalr1.cc
+++ b/data/skeletons/lalr1.cc
@@ -20,6 +20,14 @@ m4_include(b4_skeletonsdir/[c++.m4])
 # api.value.type=variant is valid.
 m4_define([b4_value_type_setup_variant])
 
+# Check the value of %define parse.lac, where LAC stands for lookahead
+# correction.
+b4_percent_define_default([[parse.lac]], [[none]])
+b4_define_flag_if([lac])
+m4_define([b4_lac_flag],
+          [m4_if(b4_percent_define_get([[parse.lac]]),
+                 [none], [[0]], [[1]])])
+
 
 # b4_integral_parser_table_declare(TABLE-NAME, CONTENT, COMMENT)
 # --------------------------------------------------------------
@@ -220,7 +228,18 @@ m4_define([b4_shared_declarations],
   private:
     /// This class is not copyable.
     ]b4_parser_class[ (const ]b4_parser_class[&);
-    ]b4_parser_class[& operator= (const ]b4_parser_class[&);
+    ]b4_parser_class[& operator= (const ]b4_parser_class[&);]b4_lac_if([[
+
+    /// Check the lookahead yytoken.
+    /// \returns  true iff the token will be eventually shifted.
+    bool yy_lac_check_ (int yytoken) const;
+    /// Establish the initial context if no initial context currently exists.
+    /// \returns  true iff the token will be eventually shifted.
+    bool yy_lac_establish_ (int yytoken);
+    /// Discard any previous initial lookahead context because of event.
+    /// \param event  the event which caused the lookahead to be discarded.
+    ///               Only used for debbuging output.
+    void yy_lac_discard_ (const char* event);]])[
 
     /// State numbers.
     typedef int state_type;
@@ -344,7 +363,16 @@ m4_define([b4_shared_declarations],
     typedef stack<stack_symbol_type> stack_type;
 
     /// The stack.
-    stack_type yystack_;
+    stack_type yystack_;]b4_lac_if([[
+    /// The stack for LAC.
+    /// Logically, the yy_lac_stack's lifetime is confined to the function
+    /// yy_lac_check_. We just store it as a member of this class to hold
+    /// on to the memory and to avoid frequent reallocations.
+    /// Since yy_lac_check_ is const, this member must be mutable.
+    mutable std::vector<state_type> yylac_stack_;
+    /// Whether an initial LAC context was established.
+    bool yy_lac_established_;
+]])[
 
     /// Push a new state on the stack.
     /// \param m    a debug message to display
@@ -774,7 +802,11 @@ m4_if(b4_prefix, [yy], [],
     stack_symbol_type yyerror_range[3];]])[
 
     /// The return value of parse ().
-    int yyresult;
+    int yyresult;]b4_lac_if([[
+
+    /// Discard the LAC context in case there still is one left from a
+    /// previous invocation.
+    yy_lac_discard_ ("init");]])[
 
 #if YY_EXCEPTIONS
     try
@@ -843,14 +875,21 @@ b4_dollar_popdef])[]dnl
        to detect an error, take that action.  */
     yyn += yyla.type_get ();
     if (yyn < 0 || yylast_ < yyn || yycheck_[yyn] != yyla.type_get ())
-      goto yydefault;
+      {]b4_lac_if([[
+        if (!yy_lac_establish_ (yyla.type_get ()))
+           goto yyerrlab;]])[
+        goto yydefault;
+      }
 
     // Reduce or error.
     yyn = yytable_[yyn];
     if (yyn <= 0)
       {
         if (yy_table_value_is_error_ (yyn))
-          goto yyerrlab;
+          goto yyerrlab;]b4_lac_if([[
+        if (!yy_lac_establish_ (yyla.type_get ()))
+           goto yyerrlab;
+]])[
         yyn = -yyn;
         goto yyreduce;
       }
@@ -860,7 +899,8 @@ b4_dollar_popdef])[]dnl
       --yyerrstatus_;
 
     // Shift the lookahead token.
-    yypush_ ("Shifting", yyn, YY_MOVE (yyla));
+    yypush_ ("Shifting", yyn, YY_MOVE (yyla));]b4_lac_if([[
+    yy_lac_discard_ ("shift");]])[
     goto yynewstate;
 
 
@@ -1020,7 +1060,8 @@ b4_dollar_popdef])[]dnl
       yyerror_range[2].location = yyla.location;
       YYLLOC_DEFAULT (error_token.location, yyerror_range, 2);]])[
 
-      // Shift the error token.
+      // Shift the error token.]b4_lac_if([[
+      yy_lac_discard_ ("error recovery");]])[
       error_token.state = yyn;
       yypush_ ("Shifting", YY_MOVE (error_token));
     }
@@ -1085,8 +1126,148 @@ b4_dollar_popdef])[]dnl
   {
     error (]b4_join(b4_locations_if([yyexc.location]),
                     [[yyexc.what ()]])[);
+  }]b4_lac_if([[
+
+  bool
+  ]b4_parser_class[::yy_lac_check_ (int yytoken) const
+  {
+    // Logically, the yylac_stack's lifetime is confined to this function.
+    // Clear it, to get rid of potential left-overs from previous call.
+    yylac_stack_.clear ();
+    // Reduce until we encounter a shift and thereby accept the token.
+#if ]b4_api_PREFIX[DEBUG
+    YYCDEBUG << "LAC: checking lookahead " << yytname_[yytoken] << ':';
+#endif
+    size_t lac_top = 0;
+    while (true)
+      {
+        state_type top_state = (yylac_stack_.empty ()
+                                ? yystack_[lac_top].state
+                                : yylac_stack_.back ());
+        int yyrule = yypact_[top_state];
+        if (yy_pact_value_is_default_ (yyrule)
+            || (yyrule += yytoken) < 0 || yylast_ < yyrule
+            || yycheck_[yyrule] != yytoken)
+          {
+            // Use the default action.
+            yyrule = yydefact_[top_state];
+            if (yyrule == 0)
+              {
+                YYCDEBUG << " Err\n";
+                return false;
+              }
+          }
+        else
+          {
+            // Use the action from yytable.
+            yyrule = yytable_[yyrule];
+            if (yy_table_value_is_error_ (yyrule))
+              {
+                YYCDEBUG << " Err\n";
+                return false;
+              }
+            if (0 < yyrule)
+              {
+                YYCDEBUG << " S" << yyrule << '\n';
+                return true;
+              }
+            yyrule = -yyrule;
+          }
+        // By now we know we have to simulate a reduce.
+        YYCDEBUG << " R" << yyrule - 1;
+        // Pop the corresponding number of values from the stack.
+        {
+          size_t yylen = yyr2_[yyrule];
+          // First pop from the LAC stack as many tokens as possible.
+          size_t lac_size = yylac_stack_.size ();
+          if (yylen < lac_size)
+            {
+              for (size_t i = 0; i < yylen; ++i)
+                yylac_stack_.pop_back ();
+              yylen = 0;
+            }
+          else if (lac_size)
+            {
+              yylac_stack_.clear ();
+              yylen -= lac_size;
+            }
+          // Only aftwerwards look at the main stack.
+          // We simulate popping elements by incrementing lac_top.
+          lac_top += yylen;
+        }
+        // Keep top_state in sync with the updated stack.
+        top_state = (yylac_stack_.empty ()
+                     ? yystack_[lac_top].state
+                     : yylac_stack_.back ());
+        // Push the resulting state of the reduction.
+        state_type state = yy_lr_goto_state_ (top_state, yyr1_[yyrule]);
+        YYCDEBUG << " G" << state;
+        yylac_stack_.push_back (state);
+      }
+  }
+
+  // Establish the initial context if no initial context currently exists.
+  bool
+  ]b4_parser_class[::yy_lac_establish_ (int yytoken)
+  {
+    /* Establish the initial context for the current lookahead if no initial
+       context is currently established.
+
+       We define a context as a snapshot of the parser stacks.  We define
+       the initial context for a lookahead as the context in which the
+       parser initially examines that lookahead in order to select a
+       syntactic action.  Thus, if the lookahead eventually proves
+       syntactically unacceptable (possibly in a later context reached via a
+       series of reductions), the initial context can be used to determine
+       the exact set of tokens that would be syntactically acceptable in the
+       lookahead's place.  Moreover, it is the context after which any
+       further semantic actions would be erroneous because they would be
+       determined by a syntactically unacceptable token.
+
+       yy_lac_establish_ should be invoked when a reduction is about to be
+       performed in an inconsistent state (which, for the purposes of LAC,
+       includes consistent states that don't know they're consistent because
+       their default reductions have been disabled).
+
+       For parse.lac=full, the implementation of yy_lac_establish_ is as
+       follows.  If no initial context is currently established for the
+       current lookahead, then check if that lookahead can eventually be
+       shifted if syntactic actions continue from the current context.  */
+    if (!yy_lac_established_)
+      {
+#if ]b4_api_PREFIX[DEBUG
+        YYCDEBUG << "LAC: initial context established for "
+                 << yytname_[yytoken] << '\n';
+#endif
+        yy_lac_established_ = true;
+        return yy_lac_check_ (yytoken);
+      }
+    return true;
   }
 
+  // Discard any previous initial lookahead context.
+  void
+  ]b4_parser_class[::yy_lac_discard_ (const char* evt)
+  {
+   /* Discard any previous initial lookahead context because of Event,
+      which may be a lookahead change or an invalidation of the currently
+      established initial context for the current lookahead.
+
+      The most common example of a lookahead change is a shift.  An example
+      of both cases is syntax error recovery.  That is, a syntax error
+      occurs when the lookahead is syntactically erroneous for the
+      currently established initial context, so error recovery manipulates
+      the parser stacks to try to find a new initial context in which the
+      current lookahead is syntactically acceptable.  If it fails to find
+      such a context, it discards the lookahead.  */
+    if (yy_lac_established_)
+      {
+        YYCDEBUG << "LAC: initial context discarded due to "
+                 << evt << '\n';
+        yy_lac_established_ = false;
+      }
+  }]])[
+
   // Generate an error message.
   std::string
   ]b4_parser_class[::yysyntax_error_ (]dnl
@@ -1115,24 +1296,40 @@ b4_error_verbose_if([state_type yystate, const 
symbol_type& yyla],
          a consistent state with a default action.  There might have
          been a previous inconsistent state, consistent state with a
          non-default action, or user semantic action that manipulated
-         yyla.  (However, yyla is currently not documented for users.)
+         yyla.  (However, yyla is currently not documented for 
users.)]b4_lac_if([[
+         In the first two cases, it might appear that the current syntax
+         error should have been detected in the previous state when
+         yy_lac_check was invoked.  However, at that time, there might
+         have been a different syntax error that discarded a different
+         initial context during error recovery, leaving behind the
+         current lookahead.]], [[
        - Of course, the expected token list depends on states to have
          correct lookahead information, and it depends on the parser not
          to perform extra reductions after fetching a lookahead from the
-         scanner and before detecting a syntax error.  Thus, state
-         merging (from LALR or IELR) and default reductions corrupt the
-         expected token list.  However, the list is correct for
-         canonical LR with one exception: it will still contain any
-         token that will not be accepted due to an error action in a
-         later state.
+         scanner and before detecting a syntax error.  Thus, state merging
+         (from LALR or IELR) and default reductions corrupt the expected
+         token list.  However, the list is correct for canonical LR with
+         one exception: it will still contain any token that will not be
+         accepted due to an error action in a later state.]])[
     */
     if (!yyla.empty ())
       {
         int yytoken = yyla.type_get ();
-        yyarg[yycount++] = yytname_[yytoken];
+        yyarg[yycount++] = yytname_[yytoken];]b4_lac_if([[
+
+#if ]b4_api_PREFIX[DEBUG
+        // Execute LAC once. We don't care if it is succesful, we
+        // only do it for the sake of debugging output.
+        if (!yy_lac_established_)
+          yy_lac_check_ (yytoken);
+#endif]])[
+
         int yyn = yypact_[yystate];
         if (!yy_pact_value_is_default_ (yyn))
-          {
+          {]b4_lac_if([[
+            for (int yyx = 0; yyx < yyntokens_; ++yyx)
+              if (yyx != yyterror_ && yy_lac_check_(yyx))
+                {]], [[
             /* Start YYX at -YYN if negative to avoid negative indexes in
                YYCHECK.  In other words, skip the first -YYN actions for
                this state because they are default actions.  */
@@ -1143,7 +1340,7 @@ b4_error_verbose_if([state_type yystate, const 
symbol_type& yyla],
             for (int yyx = yyxbegin; yyx < yyxend; ++yyx)
               if (yycheck_[yyx + yyn] == yyx && yyx != yyterror_
                   && !yy_table_value_is_error_ (yytable_[yyx + yyn]))
-                {
+                {]])[
                   if (yycount == YYERROR_VERBOSE_ARGS_MAXIMUM)
                     {
                       yycount = 1;
diff --git a/doc/bison.texi b/doc/bison.texi
index c0b14383..9b6981d3 100644
--- a/doc/bison.texi
+++ b/doc/bison.texi
@@ -8697,7 +8697,7 @@ Enable LAC to improve syntax error handling.
 @item @code{none} (default)
 @item @code{full}
 @end itemize
-This feature is currently only available for deterministic parsers in C.
+This feature is currently only available for deterministic parsers in C and 
C++.
 @end deffn
 
 Conceptually, the LAC mechanism is straight-forward.  Whenever the parser
@@ -11685,7 +11685,7 @@ location (if enabled) being @var{yylval} and 
@var{yylloc}.  Invocations of
 Note that when using variants, the interface for @code{yylex} is the same,
 but @code{yylval} is handled differently.
 
-Regular union-based code in Lex scanner typically look like:
+Regular union-based code in Lex scanner typically looks like:
 
 @example
 [0-9]+   @{
-- 
2.22.0




reply via email to

[Prev in Thread] Current Thread [Next in Thread]