From 1de738013890245b10db12770daf1c3398e8030d Mon Sep 17 00:00:00 2001
From: Marko Durkovic <marko@ternaris.com>
Date: Thu, 28 Jul 2022 19:14:39 +0200
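Subject: [PATCH] Fix comment parsing in message definitions

Comments are no longer grammar rules of their own. The MSG and IDL visitors
now hand parse_grammar() a pattern that matches comments together with
whitespace, and rules skip it via skip_ws(). The added tests place comments
next to other comments, inside a struct and inside a member declaration.
---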
 src/rosbags/typesys/idl.py | 16 ++++-------
 src/rosbags/typesys/msg.py | 12 ++------
 src/rosbags/typesys/peg.py | 57 ++++++++++++++++++++++++++------------
 tests/test_parse.py        |  9 +++++-
 4 files changed, 57 insertions(+), 37 deletions(-)

diff --git a/src/rosbags/typesys/idl.py b/src/rosbags/typesys/idl.py
index 84522269..cd8d3b47 100644
--- a/src/rosbags/typesys/idl.py
+++ b/src/rosbags/typesys/idl.py
@@ -11,6 +11,7 @@ Grammar, parse tree visitor and conversion functions for message definitions in
 
 from __future__ import annotations
 
+import re
 from typing import TYPE_CHECKING
 
 from .base import Nodetype, parse_message_definition
@@ -31,17 +32,12 @@ specification
   = definition+
 
 definition
-  = comment
-  / macro
+  = macro
   / include
   / module_dcl ';'
   / const_dcl ';'
   / type_dcl ';'
 
-comment
-  = r'/\*.*?\*/'
-  / r'[/][/][^\n]*'
-
 macro
   = ifndef
   / define
@@ -254,7 +250,10 @@ string_literal
 class VisitorIDL(Visitor):  # pylint: disable=too-many-public-methods
     """IDL file visitor."""
 
-    RULES = parse_grammar(GRAMMAR_IDL)
+    RULES = parse_grammar(  # 2nd arg is skipped like whitespace: \s, /* */ and // comments
+        GRAMMAR_IDL,
+        re.compile(r'(\s|/[*]([^*]|[*](?!/))*[*]/|//[^\n]*$)+', re.M | re.S),
+    )
 
     def __init__(self) -> None:
         """Initialize."""
@@ -299,9 +298,6 @@ class VisitorIDL(Visitor):  # pylint: disable=too-many-public-methods
         return {k: (consts[k], v) for k, v in structs.items()}
     # yapf: enable
 
-    def visit_comment(self, _: str) -> None:
-        """Process comment, suppress output."""
-
     def visit_macro(self, _: Union[LiteralMatch, tuple[LiteralMatch, str]]) -> None:
         """Process macro, suppress output."""
 
diff --git a/src/rosbags/typesys/msg.py b/src/rosbags/typesys/msg.py
index 0ba942b4..61245dec 100644
--- a/src/rosbags/typesys/msg.py
+++ b/src/rosbags/typesys/msg.py
@@ -12,6 +12,7 @@ Rosbag1 connection information.
 
 from __future__ import annotations
 
+import re
 from hashlib import md5
 from pathlib import PurePosixPath as Path
 from typing import TYPE_CHECKING
@@ -43,13 +44,9 @@ msgsep
   = r'================================================================================'
 
 definition
-  = comment
-  / const_dcl
+  = const_dcl
   / field_dcl
 
-comment
-  = r'#[^\n]*'
-
 const_dcl
   = 'string' identifier '=' r'(?!={79}\n)[^\n]+'
   / type_spec identifier '=' float_literal
@@ -205,7 +202,7 @@ def denormalize_msgtype(typename: str) -> str:
 class VisitorMSG(Visitor):
     """MSG file visitor."""
 
-    RULES = parse_grammar(GRAMMAR_MSG)
+    RULES = parse_grammar(GRAMMAR_MSG, re.compile(r'(\s|#[^\n]*$)+', re.M | re.S))
 
     BASETYPES = {
         'bool',
@@ -222,9 +219,6 @@ class VisitorMSG(Visitor):
         'string',
     }
 
-    def visit_comment(self, _: str) -> None:
-        """Process comment, suppress output."""
-
     def visit_const_dcl(
         self,
         children: tuple[StringNode, StringNode, LiteralMatch, ConstValue],
diff --git a/src/rosbags/typesys/peg.py b/src/rosbags/typesys/peg.py
index bb2b9dba..1c296ea9 100644
--- a/src/rosbags/typesys/peg.py
+++ b/src/rosbags/typesys/peg.py
@@ -24,12 +24,12 @@ class Rule:
     """Rule base class."""
 
     LIT = 'LITERAL'
-    WS = re.compile(r'\s+', re.M | re.S)
 
     def __init__(
         self,
         value: Union[str, Pattern[str], Rule, list[Rule]],
         rules: dict[str, Rule],
+        whitespace: Pattern[str],
         name: Optional[str] = None,
     ):
         """Initialize.
@@ -37,16 +37,18 @@ class Rule:
         Args:
             value: Value of this rule.
             rules: Grammar containing all rules.
+            whitespace: Pattern whose match skip_ws() jumps over.
             name: Name of this rule.
 
         """
         self.value = value
         self.rules = rules
         self.name = name
+        self.whitespace = whitespace
 
     def skip_ws(self, text: str, pos: int) -> int:
         """Skip whitespace."""
-        match = self.WS.match(text, pos)
+        match = self.whitespace.match(text, pos)
         return match.span()[1] if match else pos
 
     def make_node(self, data: T) -> Union[T, dict[str, Union[str, T]]]:
@@ -61,16 +63,23 @@ class Rule:
 class RuleLiteral(Rule):
     """Rule to match string literal."""
 
-    def __init__(self, value: str, rules: dict[str, Rule], name: Optional[str] = None):
+    def __init__(
+        self,
+        value: str,
+        rules: dict[str, Rule],
+        whitespace: Pattern[str],
+        name: Optional[str] = None,
+    ):
         """Initialize.
 
         Args:
             value: Value of this rule.
             rules: Grammar containing all rules.
+            whitespace: Pattern whose match skip_ws() jumps over.
             name: Name of this rule.
 
         """
-        super().__init__(value, rules, name)
+        super().__init__(value, rules, whitespace, name)
         self.value = value[1:-1].replace('\\\'', '\'')
 
     def parse(self, text: str, pos: int) -> tuple[int, Any]:
@@ -89,16 +98,23 @@ class RuleRegex(Rule):
 
     value: Pattern[str]
 
-    def __init__(self, value: str, rules: dict[str, Rule], name: Optional[str] = None):
+    def __init__(
+        self,
+        value: str,
+        rules: dict[str, Rule],
+        whitespace: Pattern[str],
+        name: Optional[str] = None,
+    ):
         """Initialize.
 
         Args:
             value: Value of this rule.
             rules: Grammar containing all rules.
+            whitespace: Pattern whose match skip_ws() jumps over.
             name: Name of this rule.
 
         """
-        super().__init__(value, rules, name)
+        super().__init__(value, rules, whitespace, name)
         self.value = re.compile(value[2:-1], re.M | re.S)
 
     def parse(self, text: str, pos: int) -> tuple[int, Any]:
@@ -234,7 +250,11 @@ def split_token(tok: str) -> list[str]:
     return list(filter(None, re.split(r'(^\()|(\)(?=[*+?]?$))|([*+?]$)', tok)))
 
 
-def collapse_tokens(toks: list[Optional[Rule]], rules: dict[str, Rule]) -> Rule:
+def collapse_tokens(
+    toks: list[Optional[Rule]],
+    rules: dict[str, Rule],
+    whitespace: Pattern[str],
+) -> Rule:
     """Collapse linear list of tokens to oneof of sequences."""
     value: list[Rule] = []
     seq: list[Rule] = []
@@ -242,13 +262,16 @@ def collapse_tokens(toks: list[Optional[Rule]], rules: dict[str, Rule]) -> Rule:
         if tok:
             seq.append(tok)
         else:
-            value.append(RuleSequence(seq, rules) if len(seq) > 1 else seq[0])
+            value.append(RuleSequence(seq, rules, whitespace) if len(seq) > 1 else seq[0])
             seq = []
-    value.append(RuleSequence(seq, rules) if len(seq) > 1 else seq[0])
-    return RuleOneof(value, rules) if len(value) > 1 else value[0]
+    value.append(RuleSequence(seq, rules, whitespace) if len(seq) > 1 else seq[0])
+    return RuleOneof(value, rules, whitespace) if len(value) > 1 else value[0]
 
 
-def parse_grammar(grammar: str) -> dict[str, Rule]:
+def parse_grammar(
+    grammar: str,
+    whitespace: Pattern[str] = re.compile(r'\s+', re.M | re.S),
+) -> dict[str, Rule]:
     """Parse grammar into rule dictionary."""
     rules: dict[str, Rule] = {}
     for token in grammar.split('\n\n'):
@@ -268,24 +291,24 @@ def parse_grammar(grammar: str) -> dict[str, Rule]:
                     '*': RuleZeroPlus,
                     '+': RuleOnePlus,
                     '?': RuleZeroOne,
-                }[tok](stack[-1], rules)
+                }[tok](stack[-1], rules, whitespace)
             elif tok == '/':
                 stack.append(None)
             elif tok == '(':
                 parens.append(len(stack))
             elif tok == ')':
                 index = parens.pop()
-                rule = collapse_tokens(stack[index:], rules)
+                rule = collapse_tokens(stack[index:], rules, whitespace)
                 stack = stack[:index]
                 stack.append(rule)
             elif len(tok) > 2 and tok[:2] == 'r\'':
-                stack.append(RuleRegex(tok, rules))
+                stack.append(RuleRegex(tok, rules, whitespace))
             elif tok[0] == '\'':
-                stack.append(RuleLiteral(tok, rules))
+                stack.append(RuleLiteral(tok, rules, whitespace))
             else:
-                stack.append(RuleToken(tok, rules))
+                stack.append(RuleToken(tok, rules, whitespace))
 
-        res = collapse_tokens(stack, rules)
+        res = collapse_tokens(stack, rules, whitespace)
         res.name = name
         rules[name] = res
     return rules
diff --git a/tests/test_parse.py b/tests/test_parse.py
index 662aecfd..129f16d2 100644
--- a/tests/test_parse.py
+++ b/tests/test_parse.py
@@ -118,6 +118,12 @@ module test_msgs {
   // comment in module
   typedef std_msgs::msg::Bool Bool;
 
+  /**/ /***/ /* block comment */
+
+  /*
+   * block comment
+   */
+
   module msg {
     // comment in submodule
     typedef Bool Balias;
@@ -131,10 +137,11 @@ module test_msgs {
 
     @comment(type="text", text="ignore")
     struct Foo {
+        // comment in struct
         std_msgs::msg::Header header;
         Balias bool;
         Bar sibling;
-        double x;
+        double/* comment in member declaration */x;
         sequence<double> seq1;
         sequence<double, 4> seq2;
         d4 array;