1 /**
2 This module contains function to inspect a Pegged grammar.
3 */
4 module pegged.introspection;
5 
6 import std.algorithm : equal;
7 import std.typecons;
8 
9 import pegged.parser;
10 
11 @safe:
12 
13 /**
14 The different kinds of recursion for a rule.
15 'direct' means the rule name appears in its own definition. 'indirect' means the rule calls itself through another rule (the call chain can be long).
16 */
17 enum Recursive { no, direct, indirect }
18 
19 /**
20 Left-recursion diagnostic for a rule. A rule is left-recursive when its own name appears at the beginning of its definition or behind possibly-null-matching rules (see below for null matches).
21 For example A <- A 'a' is left-recursive, whereas A <- 'a' A is not. *But* A <- 'a'? A is left-recursive, since if the input does not begin
22 with 'a', then the parsing will continue by invoking A again, at the same position.
23 
24 'direct' means the rule invokes itself in the first position of its definition (A <- A ...). 'hidden' means the rule names appears after a possibly-null other rule (A <- 'a'? A ...). 'indirect' means the rule calls itself trough another rule.
25 */
26 enum LeftRecursive { no, direct, hidden, indirect }
27 
28 /**
29 NullMatch.yes means a rule can succeed while consuming no input. For example e? or e*, for all expressions e.
30 Nullmatch.no means a rule will always consume at least a token while succeeding.
31 Nullmatch.indeterminate means the algorithm could not converge.
32 */
33 enum NullMatch { no, yes, indeterminate }
34 
35 /**
36 InfiniteLoop.yes means a rule can loop indefinitely while consuming nothing.
37 InfiniteLoop.no means a rule cannot loop indefinitely.
38 InfiniteLoop.indeterminate means the algorithm could not converge.
39 */
40 enum InfiniteLoop { no, yes, indeterminate }
41 
42 /**
43 Struct holding the introspection info on a rule.
44 */
45 struct RuleInfo
46 {
47     Recursive recursion; /// Is the rule recursive?
48     LeftRecursive leftRecursion; /// Is the rule left-recursive?
49     NullMatch nullMatch; /// Can the rule succeed while consuming nothing?
50     InfiniteLoop infiniteLoop; /// Can the rule loop indefinitely, while consuming nothing?
51     string[] leftRecursiveCycle; /// The path of rules traversed before indirect left-recursion recurses.
52 }
53 
54 /**
55 Struct holding the introspection info on a grammar.
56 */
57 struct GrammarInfo
58 {
59     RuleInfo[string] ruleInfo;
60     immutable(string[])[] leftRecursiveCycles;
61 }
62 
63 pure void appendCycleIfUnique(ref immutable(string[])[] cycles, const string[] cycle)
64 {
65     bool exists()
66     {
67         import std.algorithm : countUntil;
68         foreach (c; cycles)
69         {
70             if (c.length != cycle.length)
71                 continue;
72             auto pos = countUntil(c, cycle[0]);
73             if (pos < 0)
74                 continue;
75             if (equal!equal(c[pos .. $] ~ c[0 .. pos], cycle))
76                 return true;
77         }
78         return false;
79     }
80     if (!exists())
81         cycles ~= cycle.idup;
82 }
83 
84 /**
85 Returns for all grammar rules:
86 
87 - the recursion type (no recursion, direct or indirect recursion).
88 - the left-recursion type (no left-recursion, direct left-recursion, hidden, or indirect)
89 - the null-match for a grammar's rules: whether the rule can succeed while consuming nothing.
90 - the possibility of an infinite loop (if 'e' can null-match, then 'e*' can enter an infinite loop).
91 
92 This kind of potential problem can be detected statically and should be transmitted to the grammar designer.
93 */
94 pure GrammarInfo grammarInfo(ParseTree p)
95 {
96     if (p.name == "Pegged")
97         return grammarInfo(p.children[0]);
98     assert(p.name == "Pegged.Grammar");
99 
100     GrammarInfo result;
101     ParseTree[string] rules;
102 
103     /**
104     Returns the call graph of a grammar: the list of rules directly called by each rule of the grammar.
105     The graph is represented as a bool[string][string] associative array, the string holding
106     the rules names. graph["ruleName"] contains all rules called by ruleName, as a set (a bool[string] AA).
107 
108     graph.keys thus gives the grammar's rules names.
109 
110     If a rule calls itself, its own name will appear in the called set. If a rule calls an external rule, it will
111     also appear in the call graph when the rule has a name: hence, calls to predefined rules like 'identifier' or
112     'digit' will appear, but not a call to '[0-9]+', considered here as an anonymous rule.
113     */
114     bool[string][string] callGraph(ParseTree p) @safe
115     {
116         bool[string] findIdentifiers(ParseTree p) @safe
117         {
118             bool[string] idList;
119             if (p.name == "Pegged.Identifier")
120                 idList[p.matches[0]] = true;
121             else
122                 foreach(child; p.children)
123                 {
124                     auto ids = findIdentifiers(child);
125                     static if (hasSystemAAKeys)
126                         auto keys = () @trusted { return ids.keys; } ();
127                     else
128                         auto keys = ids.keys;
129                     foreach(name; keys)
130                         idList[name] = true;
131                 }
132 
133             return idList;
134         }
135 
136         bool[string][string] graph;
137 
138         foreach(definition; p.children)
139             if (definition.name == "Pegged.Definition")
140             {
141                 auto ids = findIdentifiers(definition.children[2]);
142                 graph[definition.matches[0]] = ids;
143                 foreach(id, _; ids) // getting possible external calls
144                     if (id !in graph)
145                         graph[id] = (bool[string]).init;
146             }
147 
148         return graph;
149     }
150 
151     /**
152     The transitive closure of a call graph.
153     It will propagate the calls to find all rules called by a given rule,
154     directly (already in the call graph) or indirectly (through another rule).
155     */
156     bool[string][string] closure(bool[string][string] graph) @safe
157     {
158         bool[string][string] path;
159         foreach(rule, children; graph) // deep-dupping, to avoid children aliasing
160             path[rule] = children.dup;
161 
162         bool changed = true;
163 
164         while(changed)
165         {
166             changed = false;
167             static if (hasSystemAAKeys)
168                 auto keys = () @trusted { return graph.keys; } ();
169             else
170                 auto keys = graph.keys;
171             foreach(rule1; keys)
172                 foreach(rule2; keys)
173                     if (rule2 in path[rule1])
174                         foreach(rule3; keys)
175                             if (rule3 in path[rule2] && rule3 !in path[rule1])
176                             {
177                                 path[rule1][rule3] = true;
178                                 changed = true;
179                             }
180         }
181 
182         return path;
183     }
184 
185     Recursive[string] recursions(bool[string][string] graph) @safe
186     {
187         bool[string][string] path = closure(graph);
188 
189         Recursive[string] result;
190         foreach(rule, children; path)
191         {
192             result[rule] = Recursive.no;
193             if (rule in children)
194             {
195                 if (rule in graph[rule])
196                     result[rule] = Recursive.direct;
197                 else
198                     result[rule] = Recursive.indirect;
199             }
200         }
201 
202         return result;
203     }
204 
205     NullMatch nullMatching(ParseTree p) @safe
206     {
207         switch (p.name)
208         {
209             case "Pegged.Expression":
210                 return nullMatching(p.children[0]);
211             case "Pegged.FirstExpression",
212                  "Pegged.LongestExpression": // choice expressions null-match whenever one of their components can null-match
213                 foreach(seq; p.children)
214                     if (nullMatching(seq) == NullMatch.yes)
215                         return NullMatch.yes;
216                 foreach(seq; p.children)
217                     if (nullMatching(seq) == NullMatch.indeterminate)
218                         return NullMatch.indeterminate;
219                 return NullMatch.no;
220             case "Pegged.Sequence": // sequence expressions can null-match when all their components can null-match
221                 foreach(pref; p.children)
222                     if (nullMatching(pref) == NullMatch.no)
223                         return NullMatch.no;
224                 foreach(pref; p.children)
225                     if (nullMatching(pref) == NullMatch.indeterminate)
226                         return NullMatch.indeterminate;
227                 return NullMatch.yes;
228             case "Pegged.Prefix":
229                 foreach(pref; p.children[0..$-1])
230                     if (pref.name == "Pegged.POS" || pref.name == "Pegged.NEG")
231                         return NullMatch.yes;
232                 return nullMatching(p.children[$-1]);
233             case "Pegged.Suffix":
234                 foreach(suff; p.children[1..$])
235                     if (suff.name == "Pegged.ZEROORMORE" || suff.name == "Pegged.OPTION")
236                         return NullMatch.yes;
237                 return nullMatching(p.children[0]);
238             case "Pegged.Primary":
239                 return nullMatching(p.children[0]);
240             case "Pegged.RhsName":
241                 if (p.matches[0] in result.ruleInfo)
242                     if (result.ruleInfo[p.matches[0]].nullMatch != NullMatch.indeterminate)
243                         return result.ruleInfo[p.matches[0]].nullMatch;
244                 return nullMatching(p.children[0]);
245             case "Pegged.Identifier":
246                 if (p.matches[0] == "eps" ||
247                     p.matches[0] == "eoi")
248                     return NullMatch.yes;
249                 return NullMatch.indeterminate;
250             case "Pegged.Literal":
251                 if (p.matches[0].length == 0) // Empty literal, '' or ""
252                     return NullMatch.yes;
253                 else
254                     return NullMatch.no;
255             case "Pegged.CharClass":
256             case "Pegged.ANY":
257                 return NullMatch.no;
258             case "eps":
259             case "eoi":
260                 return NullMatch.yes;
261             default:
262                 return NullMatch.indeterminate;
263         }
264     }
265 
266     InfiniteLoop infiniteLooping(ParseTree p) @safe
267     {
268         switch (p.name)
269         {
270             case "Pegged.Expression": // choice expressions loop whenever one of their components can loop
271             case "Pegged.Sequence": // sequence expressions can loop when one of their components can loop
272                 foreach(seq; p.children)
273                 {
274                     auto nm = infiniteLooping(seq);
275                     if (nm == InfiniteLoop.yes)
276                         return InfiniteLoop.yes;
277                     if (nm == InfiniteLoop.indeterminate)
278                         return InfiniteLoop.indeterminate;
279                 }
280                 return InfiniteLoop.no;
281             case "Pegged.Prefix":
282                 return infiniteLooping(p.children[$-1]);
283             case "Pegged.Suffix":
284                 foreach(pref; p.children[1..$])
285                     if ((  pref.name == "Pegged.ZEROORMORE" || pref.name == "Pegged.ONEORMORE")
286                         && p.matches[0] in result.ruleInfo
287                         && result.ruleInfo[p.matches[0]].nullMatch == NullMatch.yes)
288                         return InfiniteLoop.yes;
289                 return infiniteLooping(p.children[0]);
290             case "Pegged.Primary":
291                 return infiniteLooping(p.children[0]);
292             case "Pegged.RhsName":
293                 if (p.matches[0] in result.ruleInfo)
294                     return result.ruleInfo[p.matches[0]].infiniteLoop;
295                 else
296                     return infiniteLooping(p.children[0]);
297             case "Pegged.Literal":
298             case "Pegged.CharClass":
299             case "Pegged.ANY":
300             case "eps":
301             case "eoi":
302                 return InfiniteLoop.no;
303             default:
304                 return InfiniteLoop.indeterminate;
305         }
306     }
307 
308     LeftRecursive leftRecursion(ParseTree p, ref string[] cycle) @safe
309     {
310         import std.algorithm.searching: countUntil;
311         switch (p.name)
312         {
313             case "Pegged.Expression":
314                 return leftRecursion(p.children[0], cycle);
315             case "Pegged.FirstExpression",
316                  "Pegged.LongestExpression": // Choices are left-recursive if any choice is left-recursive
317                 // Because memoized left-recursion handling needs to know about all left-recursive cycles,
318                 // we consider all choices, not just one.
319                 auto any_lr = LeftRecursive.no;
320                 size_t current = cycle.length;
321                 foreach(seq; p.children)
322                 {
323                     cycle = cycle[0..current];
324                     auto lr = leftRecursion(seq, cycle);
325                     if (lr != LeftRecursive.no && any_lr == LeftRecursive.no)
326                         any_lr = lr;
327                 }
328                 cycle = cycle[0..current];
329                 return any_lr;
330             case "Pegged.Sequence": // Sequences are left-recursive when the leftmost member is left-recursive
331                                     // or behind null-matching members
332                 size_t current = cycle.length;
333                 foreach(i, seq; p.children)
334                 {
335                     auto lr = leftRecursion(seq, cycle);
336                     if (lr == LeftRecursive.direct)
337                         return (i == 0 ? LeftRecursive.direct : LeftRecursive.hidden);
338                     if (lr == LeftRecursive.hidden || lr == LeftRecursive.indirect)
339                         return lr;
340                     if (nullMatching(seq) == NullMatch.yes)
341                         continue;
342                     else
343                     {
344                         cycle = cycle[0..current];
345                         return LeftRecursive.no;
346                     }
347                 }
348                 cycle = cycle[0..current];
349                 return LeftRecursive.no; // found only null-matching rules!
350             case "Pegged.Prefix":
351                 return leftRecursion(p.children[$-1], cycle);
352             case "Pegged.Suffix":
353             case "Pegged.Primary":
354                 return leftRecursion(p.children[0], cycle);
355             case "Pegged.RhsName":
356                 auto dejavu = cycle.countUntil(p.matches[0]);
357                 if (dejavu >= 0)
358                 {
359                     if (cycle.length == 1)
360                     {
361                         result.leftRecursiveCycles.appendCycleIfUnique(cycle);
362                         return LeftRecursive.direct;
363                     }
364                     // Strictly spoken, the rules before dejavu are not actively left-recursive, but since they
365                     // can call into a left-recursive cycle, they would still left-recurse if left-recursion
366                     // wasn't handled.
367                     result.leftRecursiveCycles.appendCycleIfUnique(cycle[dejavu..$]);
368                     return LeftRecursive.indirect;
369                 }
370                 size_t current = cycle.length;
371                 cycle ~= p.matches[0];
372                 if ((p.matches[0] in rules) && (leftRecursion(rules[p.matches[0]], cycle) != LeftRecursive.no))
373                     return LeftRecursive.indirect;
374                 cycle = cycle[0..current];
375                 return LeftRecursive.no;
376             default:
377                 return LeftRecursive.no;
378         }
379     }
380 
381     // Initialize rules and result.
382     foreach(definition; p.children)
383         if (definition.name == "Pegged.Definition")
384         {
385             rules[definition.matches[0]] = definition.children[2];
386             result.ruleInfo[definition.matches[0]] = RuleInfo(Recursive.no, LeftRecursive.no,
387                                                               NullMatch.indeterminate, InfiniteLoop.indeterminate);
388         }
389 
390     // Detect recursions.
391     foreach(rule, recursionType; recursions(callGraph(p)))
392         if (rule in result.ruleInfo) // external rules are in recursions, but not in result
393             result.ruleInfo[rule].recursion = recursionType;
394 
395     // Detect left-recursions.
396     foreach(name, tree; rules)
397         if (result.ruleInfo[name].recursion != Recursive.no)
398         {
399             result.ruleInfo[name].leftRecursiveCycle ~= name;
400             result.ruleInfo[name].leftRecursion = leftRecursion(tree, result.ruleInfo[name].leftRecursiveCycle);
401         }
402 
403     // Detect null matches.
404     bool changed = true;
405     while(changed) // while something new happened, the process is not over
406     {
407         changed = false;
408         foreach(name, tree; rules)
409             if (result.ruleInfo[name].nullMatch == NullMatch.indeterminate) // not done yet
410             {
411                 result.ruleInfo[name].nullMatch = nullMatching(tree); // try to find if it's null-matching
412                 if (result.ruleInfo[name].nullMatch != NullMatch.indeterminate)
413                     changed = true;
414             }
415     }
416 
417     // Detect infinite loops.
418     changed = true;
419     while(changed) // while something new happened, the process is not over
420     {
421         changed = false;
422         foreach(name, tree; rules)
423             if (result.ruleInfo[name].infiniteLoop == InfiniteLoop.indeterminate) // not done yet
424             {
425                 result.ruleInfo[name].infiniteLoop = infiniteLooping(tree); // try to find if it's looping
426                 if (result.ruleInfo[name].infiniteLoop != InfiniteLoop.indeterminate)
427                     changed = true;
428             }
429     }
430 
431     return result;
432 }
433 
434 /**
435 Returns for all grammar rules:
436 
437 - the recursion type (no recursion, direct or indirect recursion).
438 - the left-recursion type (no left-recursion, direct left-recursion, hidden, or indirect)
439 - the null-match for a grammar's rules: whether the rule can succeed while consuming nothing.
440 - the possibility of an infinite loop (if 'e' can null-match, then 'e*' can enter an infinite loop).
441 
442 This kind of potential problem can be detected statically and should be transmitted to the grammar designer.
443 */
444 pure RuleInfo[string] ruleInfo(ParseTree p)
445 {
446     return grammarInfo(p).ruleInfo;
447 }
448 
449 /** ditto */
450 RuleInfo[string] ruleInfo(string grammar)
451 {
452     return ruleInfo(Pegged(grammar).children[0]);
453 }
454 
455 unittest
456 {
457     auto info = ruleInfo(`
458         Test:
459             A <- A 'a'
460     `);
461     assert(info["A"].leftRecursion == LeftRecursive.direct);
462 
463     info = ruleInfo(`
464         Test:
465             A <- B? A 'a'
466             B <- 'b'
467     `);
468     assert(info["A"].leftRecursion == LeftRecursive.hidden);
469 
470     info = ruleInfo(`
471         Test:
472             A <- B 'a'
473             B <- A
474     `);
475     assert(info["A"].leftRecursion == LeftRecursive.indirect);
476 }
477 
478 // Test against infinite recursion in detection of indirect left-recursion.
479 unittest
480 {
481     auto info = ruleInfo(`
482         Test:
483             A <- B / C 'a'
484             B <- A
485             C <- A
486     `);
487     assert(info["A"].leftRecursion == LeftRecursive.indirect);
488 }
489 
490 // Test against compile-time infinite recursion.
491 unittest // Mutual left-recursion
492 {
493     enum ct = ruleInfo(`
494       Left:
495         A <- L
496         L <- P
497         P <- P / L
498     `);
499     static assert(ct["A"].leftRecursion == LeftRecursive.no);
500     static assert(ct["L"].leftRecursion == LeftRecursive.indirect);
501     static assert(ct["P"].leftRecursion == LeftRecursive.direct);
502 
503     auto rt = ruleInfo(`
504         Left:
505           A <- L
506           L <- P
507           P <- P / L
508     `);
509     assert(rt["A"].leftRecursion == LeftRecursive.no);
510     assert(rt["L"].leftRecursion == LeftRecursive.indirect);
511     assert(rt["P"].leftRecursion == LeftRecursive.direct);
512 }
513 
514 unittest // Intersecting cycles of left-recursion
515 {
516     enum ct = ruleInfo(`
517       Left:
518         C <- A
519         A <- B* C
520         B <- A
521     `);
522     static assert(ct["C"].leftRecursion == LeftRecursive.indirect);
523     static assert(ct["A"].leftRecursion == LeftRecursive.indirect);
524     static assert(ct["B"].leftRecursion == LeftRecursive.indirect);
525     auto rt = ruleInfo(`
526       Left:
527         C <- A
528         A <- B* C
529         B <- A
530     `);
531     assert(rt["C"].leftRecursion == LeftRecursive.indirect);
532     assert(rt["A"].leftRecursion == LeftRecursive.indirect);
533     assert(rt["B"].leftRecursion == LeftRecursive.indirect);
534 }
535 
536 unittest // Null-matching
537 {
538     enum ct = ruleInfo(`
539       NM:
540         NMM <- NML eoi
541         NML <- 'x'?
542     `);
543     static assert(ct["NML"].nullMatch == NullMatch.yes);
544     static assert(ct["NMM"].nullMatch == NullMatch.yes);
545     auto rt = ruleInfo(`
546       NM:
547         NMM <- NML eoi
548         NML <- 'x'?
549     `);
550     assert(rt["NML"].nullMatch == NullMatch.yes);
551     assert(rt["NMM"].nullMatch == NullMatch.yes);
552 }
553 
554 unittest // Not null-matching
555 {
556     enum ct = ruleInfo(`
557       Left:
558         M <- L eoi
559         L <- P '.x' / 'x'
560         P <- P '(n)' / L
561     `);
562     static assert(ct["M"].nullMatch == NullMatch.no);
563     static assert(ct["L"].nullMatch == NullMatch.no);
564     static assert(ct["P"].nullMatch == NullMatch.no);
565     auto rt = ruleInfo(`
566       Left:
567         M <- L eoi
568         L <- P '.x' / 'x'
569         P <- P '(n)' / L
570     `);
571     assert(rt["M"].nullMatch == NullMatch.no);
572     assert(rt["L"].nullMatch == NullMatch.no);
573     assert(rt["P"].nullMatch == NullMatch.no);
574 }
575 
576 unittest // Left-recursive null-matching
577 {
578     enum ct = ruleInfo(`
579       Left:
580         M <- L eoi
581         L <- P? '.x'? / 'x'
582         P <- P '(n)' / L
583     `);
584     static assert(ct["M"].nullMatch == NullMatch.yes);
585     static assert(ct["L"].nullMatch == NullMatch.yes);
586     static assert(ct["P"].nullMatch == NullMatch.yes);
587     auto rt = ruleInfo(`
588       Left:
589         M <- L eoi
590         L <- P? '.x'? / 'x'
591         P <- P '(n)' / L
592     `);
593     assert(rt["M"].nullMatch == NullMatch.yes);
594     assert(rt["L"].nullMatch == NullMatch.yes);
595     assert(rt["P"].nullMatch == NullMatch.yes);
596 }
597 
598 
599 /**
600 Act on rules parse tree as produced by pegged.parser.
601 Replace every occurence of child in parent by child's parse tree
602 */
603 ParseTree replaceInto(ParseTree parent, ParseTree child)
604 {
605     if (parent.name == "Pegged.RhsName" && parent.matches[0] == child.matches[0])
606         return ParseTree("Pegged.Named", true, child.matches[0..1], "",0,0,
607                        [child.children[2],
608                         ParseTree("Pegged.Identifier", true, child.matches[0..1])]);
609     else
610         foreach(ref branch; parent.children)
611             branch = replaceInto(branch, child);
612     return parent;
613 }
614 
615 /* .keys is @safe after compiler version >= 2.098.0.
616  *
617  * See:
618  * https://dlang.org/changelog/2.098.0.html#bugfix-list and
619  * https://issues.dlang.org/show_bug.cgi?id=14439 */
620 enum bool hasSystemAAKeys = __VERSION__ < 2098;