-
Notifications
You must be signed in to change notification settings - Fork 14
Expand file tree
/
Copy pathLearnPrettyPrinter.rsc
More file actions
145 lines (118 loc) · 7.28 KB
/
LearnPrettyPrinter.rsc
File metadata and controls
145 lines (118 loc) · 7.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
module analysis::m3::LearnPrettyPrinter
import IO;
import Node;
import Type;
import List;
import Location;
import String;
import Set;
import util::FileSystem;
import lang::cpp::AST;
@synopsis{this is a demo/test function that learns a C pretty printer from the C files in the `example` folder.}
void generateFormatterForC() {
writeFile(|project://tree-pattern-generator/src/main/rascal/analysis/diff/GeneratedFormatter.rsc|,
"module analysis::diff::GeneratedFormatter
'
'import lang::cpp::AST;
'
'<filesToPrettyPrinter(find(|project://tree-pattern-generator/examples|, "c"), #Declaration, parseC)>
'");
}
@synopsis{Takes a corpus of example files and an (external) parser for the given language and produces a pretty printing function as Rascal code.}
@description{
If the external parser satisfies the specification in `analysis::m3::AST` then this function will learn a basic
`unparser` (a.k.a. pretty printer) from the corpus of files.
The code that is generated is Rascal code which can be expanded into a module that imports the right AST format, and nothing else.
The "pretty" aspect should not be over-estimated:
* The generated formatter does not deal with nested indentation.
* It has one rule for every unique AST node kind (constructor), so special combinations do not get special treatment
* It is not able to generate syntactically correct files for layout-sensitive languages
* If the core example for a node contains source code comments, these will end up in the output
This function is typically used in source-to-source transformations where an intermediate AST to AST transformation
takes place. To derive source code from fresh syntax nodes, the format function is required.
}
str filesToPrettyPrinter(set[loc] files, type[node] grammar, node (loc) parser)
= treesToPrettyPrinter({parser(f) | f <- files}, grammar);
@synopsis{Takes a corpus of syntax trees (for the same language!) and produces Rascal code that can generate code back from ASTs.}
str treesToPrettyPrinter(set[node] asts, type[node] _grammar) {
// First we collect some general information for quick and easy reference later
// This the string contents of each file
rel[loc, str] files = {<l.top, readFile(l.top)> | n <- asts, loc l := n.src};
// These are _all_ the sub-trees and their location in the input file
rel[loc, node] nodes = {<l, n> | /node n := asts, n.src?, loc l := n.src};
// these are all the sub-strings for all the sub-strees, indexed by location in the input file
rel[loc, str] yields = {<l, content[l.offset..l.offset+l.length]> | <l, _> <- nodes, str content <- files[l.top]};
// now we have a function generated for each node type in the language. This also removes duplicates.
rel[str, str] formatters = { <"<getName(n)>(<"<for (i <- [0..arity(n)]) {><\type(n[i])> arg_<i+1>, <}>"[..-2]>)", formatFunction(y, n)> | <l,y> <- yields, n <- nodes[l]};
// we prefer the longest alternatives, given that those would retain more information from the example code.
// different lengths my appear when some constructors do not use optionals or have empty lists in the corpus
rel[str, str] bestFormatters = {<key, sort(formatters[key], bool (str x, str y) { return x > y; })[-1]> | key <- formatters<0> };
return
"@synopsis{Formats abstract syntax trees back to source code.}
'<for (<_, f> <- bestFormatters) {><f>
'
'<}>";
}
str formatFunction(str yield, node n)
= "str format(<getName(n)>(" + "<for (i <- [0..arity(n)]) {><\type(n[i])> arg_<i+1>, <}>"[..-2] + ")) =
'\"<for (p <- cutOut(yield, n)) {><print(p)><}>\";";
private data Template
= hole(int n)
| sep(str chars)
| \list(str sep, int n)
| const(str chars, int n)
;
@synopsis{get a template out of an example node by replacing the children with holes}
list[Template] cutOut(str yield, node n) {
children = getChildren(n);
positions = position(\loc(n), children);
ind = index(children);
begin = \loc(n).offset;
pattern = for (<value ch, loc pos, int i> <- reverse(zip3(children, positions, ind))) {
append sep(yield[-begin + pos.offset + pos.length ..]);
append template(begin, ch, i+1, yield);
yield = yield[..-begin + pos.offset];
}
if (positions[0]?) {
pattern += sep(yield[..positions[0].offset]);
}
else {
pattern += sep(yield);
}
return reverse(pattern);
}
@synopsis{Turns a child into a template element}
Template template(int _begin, int _, int i, str _yield) = const("<i>", i); // normal nodes become holes
Template template(int _begin, str s, int i, str _yield) = const(s, i); // normal nodes become holes
Template template(int _begin, node _, int i, str _yield) = hole(i); // normal nodes become holes
Template template(int _begin, [], int i, str _yield) = sep(""); // empty lists are empty separators
Template template(int _begin, [node a], int i, str yield) = \list("", i); // singletons do not have separators
Template template(int begin, [node a, node b, *_], int i, str yield) = \list(yield[-begin+\loc(a).offset+\loc(a).length..-begin+\loc(b).offset], i); // we learn a separator from the non-empty lists
@synopsis{Prints a rascal code template part, including recursive calls.}
str print(hole(int i)) = "\<format(arg_<i>)\>";
str print(sep(str chars)) = escape(chars);
str print(const(str _chars, int i)) = "\<arg_<i>\>";
str print(\list(str sep, int i)) = "\<\"\<for (value v \<- arg_<i>) {\>\<format(v)\><escape(sep)>\<}\>\"<if (sep != "") {>[..-<size(sep)>]<}>\>";
@synopsis{Give every element a true location for later processing.}
list[loc] position(loc span, list[value] l) = infer(span, [pos(span, x) | x <- l]);
@synsopsis{An element either knows its position, or it does not.}
loc pos(loc span, int _) = span;
loc pos(loc span, str _) = span;
loc pos(loc _span, []) = |empty:///|;
loc pos(loc _span, node n) = \loc(n);
loc pos(loc _span, [node n]) = \loc(n);
loc pos(loc _span, [node a, *_, node b]) = cover([\loc(a), \loc(b)]);
@synopsis{Replaces all |empty:///| with a correct loc inferred from the surroundings}
list[loc] infer(loc span, [loc l, *loc rest]) = infer(span, [span[length=0], *rest]) when l == |empty:///|;
list[loc] infer(loc span, [*loc rest, loc l]) = infer(span, [*rest, span[offset=span.offset+span.length-1][length=0]]) when l == |empty:///|;
list[loc] infer(loc span, [*loc pre, loc before, loc l, *loc post]) = infer(span, [*pre, before, before[offset=before.offset+before.length][length = 0], *post]) when l == |empty:///|;
list[loc] infer(loc span, [*loc pre, loc l, loc after, *loc post]) = infer(span, [*pre, after[offset=after.offset][length = 0], after, *post]) when l == |empty:///|;
default list[loc] infer(loc _span, list[loc] done) = done;
@synopsis{Print the type of a value, as Rascal code}
str \type(value n) = "<type(typeOf(n), ())>";
@synopsis{Escape string elements for use in a Rascal template string}
str escape(str s) = visit("<[s]>"[2..-2]) {
case /\\n/ => "\n\'"
};
@synopsis{Waiting for `node.src` to be available in Rascal for good...}
loc \loc(node n) = l when loc l := n.src;