Skip to content

Commit 7dfce64

Browse files
committed
LALR(1) parser lib/yaml
1 parent 4cbe73b commit 7dfce64

3 files changed

Lines changed: 266 additions & 0 deletions

File tree

lib/yaml/README.md

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# C YAML Parser
2+
3+
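This implementation uses standard compiler tools (Lex and Yacc) to parse YAML. The lexer relies on flex extensions (`%option`, `<<EOF>>`, exclusive start conditions), so the `lex` command must be provided by `flex`.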
4+
5+
Current Status: 155 tests passing and 196 failing.
6+
7+
## Usage
8+
9+
You can build and run the parser using the terminal:
10+
11+
```bash
12+
lex yaml.l # Generate lexer (lex.yy.c)
13+
yacc -d yaml.y # Generate parser (y.tab.c, y.tab.h)
14+
cc lex.yy.c y.tab.c -lfl -o yaml # Compile
15+
./yaml < input.yaml # Run
16+
```
17+
18+
## Design
19+
20+
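We employ a standard LALR(1) grammar, similar to those defining languages like C or Python. To accommodate YAML's significant indentation, the lexer converts each line's leading whitespace into explicit `INDENT` and `DEDENT` tokens (merged with the preceding line break into `NEWLINE_INDENT` / `NEWLINE_DEDENT` where applicable), so the grammar itself never has to count columns. This approach allows us to use mathematically well-understood tools (`flex` and `bison`) rather than ad-hoc parsing strategies.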
21+
22+
### References
23+
24+
- [The YAML Specification (1.2.2)](https://yaml.org/spec/1.2.2/)
25+
- `man 1 bison`, `man 1 flex`

lib/yaml/yaml.l

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
%{
2+
/* YAML Lexer - Refined for LALR(1) compatibility */
3+
#include <stdio.h>
4+
#include <string.h>
5+
#include <stdbool.h>
6+
#include "y.tab.h"
7+
8+
/* Semantic-value slot of the yacc-generated parser; lexer actions store
 * token text into it before returning a token. */
extern YYSTYPE yylval;

/* Indentation widths of the currently open block levels.  Entry 0 is the
 * zero-width top level; indent_level indexes the innermost open entry. */
int indent_stack[100] = {0};
int indent_level = 0;

/* Nesting depth of flow collections; incremented on '[' / '{' and
 * decremented on ']' / '}'.  Indentation is ignored while it is > 0. */
int flow_level = 0;
10+
/*
 * FIFO of tokens synthesised by the lexer and waiting to be handed to the
 * parser (the lexer only ever enqueues DEDENT).  Live entries occupy
 * token_queue[q_head .. q_tail-1]; the queue is empty when q_head == q_tail.
 */
int token_queue[1024], q_head = 0, q_tail = 0;

/*
 * Append token t to the queue.
 *
 * The storage is linear, so the indices are rewound whenever the queue has
 * been fully drained.  Without the rewind, q_tail only ever grows and every
 * token after the 1024th cumulative enqueue would be silently discarded.
 * A token is still dropped if more than 1024 entries are pending at once.
 */
void enqueue(int t) {
    enum { QUEUE_CAP = sizeof token_queue / sizeof token_queue[0] };

    if (q_head == q_tail) {
        q_head = 0;
        q_tail = 0;
    }
    if (q_tail < QUEUE_CAP) {
        token_queue[q_tail++] = t;
    }
}

/* Remove and return the oldest queued token, or 0 when the queue is empty. */
int dequeue(void) {
    if (q_head < q_tail) {
        return token_queue[q_head++];
    }
    return 0;
}
14+
15+
int process_indent(int indent) {
16+
if (flow_level > 0) return 0;
17+
if (indent > indent_stack[indent_level]) {
18+
indent_stack[++indent_level] = indent;
19+
return INDENT;
20+
}
21+
while (indent < indent_stack[indent_level] && indent_level > 0) {
22+
indent_level--; enqueue(DEDENT);
23+
}
24+
return dequeue();
25+
}
26+
27+
int real_yylex();
28+
int next_tok = -1;
29+
30+
int yylex() {
31+
int tok;
32+
if (next_tok != -1) { tok = next_tok; next_tok = -1; return tok; }
33+
tok = (q_head < q_tail) ? dequeue() : real_yylex();
34+
if (tok == NEWLINE) {
35+
int following = real_yylex();
36+
if (following == INDENT) return NEWLINE_INDENT;
37+
if (following == DEDENT) return NEWLINE_DEDENT;
38+
next_tok = following;
39+
}
40+
return tok;
41+
}
42+
43+
const char* tok_name(int tok) {
44+
switch(tok) {
45+
case DOC_START: return "DOC_START"; case DOC_END: return "DOC_END";
46+
case LBRACKET: return "LBRACKET"; case RBRACKET: return "RBRACKET";
47+
case LBRACE: return "LBRACE"; case RBRACE: return "RBRACE";
48+
case COMMA: return "COMMA"; case SEQ_ENTRY: return "SEQ_ENTRY";
49+
case MAP_KEY: return "MAP_KEY"; case COLON: return "COLON";
50+
case NEWLINE: return "NEWLINE"; case INDENT: return "INDENT";
51+
case DEDENT: return "DEDENT"; case NEWLINE_DEDENT: return "NEWLINE_DEDENT";
52+
case NEWLINE_INDENT: return "NEWLINE_INDENT"; case ANCHOR: return "ANCHOR";
53+
case ALIAS: return "ALIAS"; case TAG: return "TAG";
54+
case PLAIN_SCALAR: return "PLAIN_SCALAR"; case DQUOTE_STRING: return "DQUOTE_STRING";
55+
case SQUOTE_STRING: return "SQUOTE_STRING"; case LITERAL: return "LITERAL";
56+
case FOLDED: return "FOLDED"; case LITERAL_CONTENT: return "LITERAL_CONTENT";
57+
default: return "CHAR";
58+
}
59+
}
60+
#define yylex real_yylex
61+
%}
62+
63+
/* noyywrap: no yywrap() is needed at end of input.
 * yylineno: flex maintains the current line number, which yyerror() prints. */
%option noyywrap yylineno
64+
/* Exclusive start conditions: BOL is entered after a line break seen in
 * INITIAL; FLOW is entered on '[' or '{' and left when flow_level returns
 * to 0.  Being exclusive, rules without a <...> prefix are not active in
 * either of them. */
%x BOL FLOW
65+
/* A line break: LF, CRLF or a lone CR. */
NEWLINE \n|\r\n|\r
66+
/* Characters that may follow an indicator ("-", ":", "---", "..."):
 * white space, line breaks, comma and the flow brackets/braces. */
SEP [ \t\n\r,\[\]\{\}]
67+
/* First character of a plain scalar: anything except white space and the
 * characters - ? : , [ ] { } ! # & * | > " ' % @ ` */
SAFE_S_BLK [^ \t\n\r\-\?:,\[\]\{\}!#&*|>\x22\x27%@\x60]
68+
/* Following characters of a plain scalar: anything except white space,
 * comma, flow brackets/braces, ':' and '!'. */
SAFE_C_BLK [^ \t\n\r,\[\]\{\}:!]
69+
70+
%%
71+
72+
<BOL>[ \t]* {
/* Start of a line: measure the leading white space (a tab advances to the
 * next multiple of 8 columns), go back to INITIAL and let process_indent()
 * decide whether this is an INDENT, a DEDENT or neither.
 * NOTE(review): flex picks the longest match, so for a line whose first
 * character is not a blank the one-character `<BOL>.` rule below appears to
 * win over an empty match here, meaning process_indent() is not called for
 * lines starting in column 0 -- confirm against the failing tests. */
73+
int indent = 0;
74+
for (int i = 0; i < yyleng; i++) indent += (yytext[i] == '\t') ? 8 - (indent % 8) : 1;
75+
BEGIN(INITIAL);
76+
int tok = process_indent(indent);
77+
/* 0 means "no indentation token"; keep scanning in that case. */
if (tok) return tok;
78+
}
79+
    /* A line break seen while still at the start of a line (an empty line);
     * the scanner stays in BOL. */
<BOL>{NEWLINE} { return NEWLINE; }
80+
    /* End of input at the start of a line: queue one DEDENT per stack entry
     * still open (indent_level is driven down to -1) and return the first. */
<BOL><<EOF>> { while (indent_level >= 0) { indent_level--; enqueue(DEDENT); } return dequeue(); }
81+
    /* Any other character: push it back and rescan it in INITIAL. */
<BOL>. { yyless(0); BEGIN(INITIAL); }
82+
83+
    /* Document markers and block indicators, recognised only when followed
     * by a separator.
     * NOTE(review): per the flex manual a '$' that is not at the end of the
     * rule is an ordinary character, so `|$` inside the trailing context
     * probably matches a literal dollar sign rather than end of line --
     * confirm. */
<INITIAL,FLOW>"---"/({SEP}|$) { return DOC_START; }
84+
<INITIAL,FLOW>"..."/({SEP}|$) { return DOC_END; }
85+
<INITIAL>"-"/({SEP}|$) { return SEQ_ENTRY; }
86+
<INITIAL>":"/({SEP}|$) { return COLON; }
87+
<INITIAL,FLOW>"," { return COMMA; }
88+
    /* Flow collections: flow_level counts open brackets/braces; the scanner
     * is in FLOW while it is non-zero.  Closers are only matched in FLOW. */
<INITIAL,FLOW>"[" { flow_level++; BEGIN(FLOW); return LBRACKET; }
89+
<FLOW>"]" { if (--flow_level == 0) BEGIN(INITIAL); return RBRACKET; }
90+
<INITIAL,FLOW>"{" { flow_level++; BEGIN(FLOW); return LBRACE; }
91+
<FLOW>"}" { if (--flow_level == 0) BEGIN(INITIAL); return RBRACE; }
92+
    /* Inside a flow collection ':' is a COLON regardless of what follows. */
<FLOW>":" { return COLON; }
93+
94+
    /* Node properties and aliases.  yylval.str receives a heap copy of the
     * text without the leading '&', '*' or '!'. */
<INITIAL,FLOW>"&"[a-zA-Z0-9_\-]+ { yylval.str = strdup(yytext+1); return ANCHOR; }
95+
<INITIAL,FLOW>"*"[a-zA-Z0-9_\-]+ { yylval.str = strdup(yytext+1); return ALIAS; }
96+
<INITIAL,FLOW>"!"[a-zA-Z0-9_\-./!@#$%&()=+~\\<>]* { yylval.str = strdup(yytext+1); return TAG; }
97+
98+
    /* Quoted scalars: the surrounding quotes are stripped and the content is
     * copied verbatim (backslash escapes are not decoded; the single-quoted
     * pattern has no case for a doubled '' inside the string). */
<INITIAL,FLOW>\"([^\"\\]|\\.)*\" { yylval.str = strndup(yytext+1, yyleng-2); return DQUOTE_STRING; }
99+
<INITIAL,FLOW>\'[^\']*\' { yylval.str = strndup(yytext+1, yyleng-2); return SQUOTE_STRING; }
100+
101+
    /* Plain scalars, one white-space-free chunk per token.  The first form
     * starts with a safe character and may contain ':' when it is not
     * followed by a separator; the second form covers chunks that start
     * with '-', '?' or ':' directly followed by a non-separator. */
<INITIAL,FLOW>{SAFE_S_BLK}({SAFE_C_BLK}|":"[^ \t\n\r,\[\]\{\}])*[!#&*|%>]* { yylval.str = strdup(yytext); return PLAIN_SCALAR; }
102+
<INITIAL,FLOW>[\-\?:][^ \t\n\r,\[\]\{\}]+ { yylval.str = strdup(yytext); return PLAIN_SCALAR; }
103+
104+
    /* '#' up to (not including) the line break is discarded. */
<INITIAL,FLOW>"#"[^\n\r]* { /* ignore */ }
    /* In block context a line break is a token and switches to BOL so the
     * next line's indentation is measured; inside flow collections line
     * breaks are skipped. */
105+
<INITIAL>{NEWLINE} { BEGIN(BOL); return NEWLINE; }
106+
<FLOW>{NEWLINE} { /* ignore */ }
107+
<INITIAL,FLOW>[ \t]+ { /* skip inter-token whitespace; FLOW is an exclusive start condition, so it has to be named here or blanks inside [...] / {...} fall through to flex's default rule and are echoed to stdout */ }
108+
. { return yytext[0]; }
    /* The rule above has no start-condition prefix and BOL/FLOW are
     * exclusive, so it is active in INITIAL only: any character not matched
     * by an earlier rule is returned to the parser as its own token code. */
109+
    /* End of input outside BOL: queue one DEDENT per stack entry still open
     * (indent_level ends at -1), hand them out one per call, then return 0
     * to signal end of input. */
<<EOF>> { while (indent_level >= 0) { indent_level--; enqueue(DEDENT); } if (q_head < q_tail) return dequeue(); return 0; }
110+
111+
%%
112+
#undef yylex

lib/yaml/yaml.y

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
%{
2+
/* YAML Parser - LALR-Safe with Cleaned Flow Entries */
3+
#include <stdio.h>
4+
#include <stdlib.h>
5+
#include <string.h>
6+
7+
/* Current input line, maintained by the scanner (%option yylineno). */
extern int yylineno;
8+
/* Text of the most recently matched token.
 * NOTE(review): declared as a pointer, which matches flex's default; a lex
 * that defines yytext as an array would need a different declaration. */
extern char *yytext;
9+
/* Printable name of a token code; defined in yaml.l. */
const char* tok_name(int tok);
10+
/* Error callback invoked by the generated parser. */
void yyerror(const char *s);
11+
/* Token source; the wrapper defined in yaml.l. */
int yylex(void);
12+
13+
/*
 * One node of the parsed document tree.
 *
 * Siblings are chained through `next`; a container points at its first
 * child through `children`.  The grammar stores a mapping as a flat chain
 * key, value, key, value, ...
 */
typedef struct Node {
    enum {
        N_SCALAR,   /* plain or quoted scalar; text in `value`          */
        N_SEQ,      /* sequence; items in `children`                    */
        N_MAP,      /* mapping; alternating keys/values in `children`   */
        N_ALIAS,    /* alias reference; anchor name in `value`          */
        N_STREAM,   /* root; documents in `children`                    */
        N_NULL      /* missing/empty node                               */
    } type;
    char *tag;              /* tag text without the leading '!', or NULL   */
    char *anchor;           /* anchor name without the leading '&', or NULL */
    char *value;            /* scalar text / alias name, or NULL           */
    struct Node *children;  /* first child of a container, or NULL         */
    struct Node *next;      /* next sibling, or NULL                       */
} Node;

/* Root of the tree; set when the parser reduces the empty `stream` rule. */
Node *root = NULL;
20+
21+
Node* nnew(int type) {
22+
Node *n = calloc(1, sizeof(Node)); n->type = type; return n;
23+
}
24+
Node* nscalar(char *v) { Node *n = nnew(N_SCALAR); n->value = v; return n; }
25+
Node* nseq(Node *c) { Node *n = nnew(N_SEQ); n->children = c; return n; }
26+
Node* nmap(Node *c) { Node *n = nnew(N_MAP); n->children = c; return n; }
27+
Node* nalias(char *v) { Node *n = nnew(N_ALIAS); n->value = v; return n; }
28+
Node* nnull() { Node *n = nnew(N_NULL); n->value = strdup("null"); return n; }
29+
30+
/*
 * Attach node properties to a node.
 *
 *   n  the node to decorate, or NULL to decorate a fresh null node
 *   p  property holder carrying anchor/tag, or NULL for "no properties"
 *
 * The holder's anchor and tag pointers are moved onto the node (overwriting
 * whatever it had) and the holder itself is freed.  Returns the decorated node.
 */
Node* napply(Node *n, Node *p) {
    Node *target = n ? n : nnull();

    if (!p) {
        return target;
    }

    target->anchor = p->anchor;
    target->tag = p->tag;
    free(p);
    return target;
}
35+
/*
 * Append chain i to the end of sibling chain l.
 * Returns the head of the combined chain: l, or i when l is empty.
 * The tail is found by walking l, so each call is linear in its length.
 */
Node* nappend(Node *l, Node *i) {
    if (l == NULL) {
        return i;
    }

    Node *tail;
    for (tail = l; tail->next != NULL; tail = tail->next) {
        /* advance to the last sibling */
    }
    tail->next = i;
    return l;
}
40+
/* Emit two spaces per nesting level. */
static void nprint_pad(int d) {
    int spaces = d * 2;
    while (spaces-- > 0) {
        putchar(' ');
    }
}

/*
 * Print the tree rooted at n to stdout, one line per node.
 *
 *   n   node to print (NULL prints nothing)
 *   d   nesting depth, two spaces of indentation per level
 *   pa  non-zero: print an "ANCHOR:" line first if the node has an anchor
 *   pt  non-zero: print a "TAG:" line first if the node has a tag
 *
 * An anchor or tag line is followed by the same node one level deeper with
 * the corresponding flag cleared, so each property is printed once.
 */
void nprint(Node *n, int d, int pa, int pt) {
    if (n == NULL) {
        return;
    }

    if (pa && n->anchor) {
        nprint_pad(d);
        printf("ANCHOR: &%s\n", n->anchor);
        nprint(n, d + 1, 0, 1);
        return;
    }

    if (pt && n->tag) {
        nprint_pad(d);
        printf("TAG: %s\n", n->tag);
        nprint(n, d + 1, 0, 0);
        return;
    }

    /* The stream header is never indented and its documents stay at depth d. */
    if (n->type == N_STREAM) {
        printf("STREAM:\n");
        for (Node *doc = n->children; doc != NULL; doc = doc->next) {
            nprint(doc, d, 1, 1);
        }
        return;
    }

    nprint_pad(d);
    switch (n->type) {
    case N_SCALAR:
        printf("SCALAR: %s\n", n->value);
        break;
    case N_ALIAS:
        printf("ALIAS: *%s\n", n->value);
        break;
    case N_NULL:
        printf("SCALAR: null\n");
        break;
    case N_SEQ:
    case N_MAP:
        fputs(n->type == N_SEQ ? "SEQUENCE:\n" : "MAPPING:\n", stdout);
        for (Node *child = n->children; child != NULL; child = child->next) {
            nprint(child, d + 1, 1, 1);
        }
        break;
    default:
        break;
    }
}
54+
/*
 * Join two heap-allocated strings with a single space between them.
 *
 *   s1, s2  malloc-family strings; both are freed by this call
 *
 * Returns a newly allocated "s1 s2"; the caller owns it.  Allocation
 * failure is fatal: a diagnostic is printed and the process exits, instead
 * of writing through a null pointer as the unchecked version did.
 */
char* jscalar(char *s1, char *s2) {
    size_t len1 = strlen(s1);
    size_t len2 = strlen(s2);

    /* +1 for the separating space, +1 for the terminating NUL. */
    char *joined = malloc(len1 + len2 + 2);
    if (!joined) {
        fprintf(stderr, "yaml: out of memory\n");
        exit(EXIT_FAILURE);
    }

    memcpy(joined, s1, len1);
    joined[len1] = ' ';
    memcpy(joined + len1 + 1, s2, len2 + 1);   /* copies the NUL as well */

    free(s1);
    free(s2);
    return joined;
}
57+
%}
58+
59+
/* Semantic values: `str` carries token text from the lexer, `node` carries
 * tree nodes built by the rule actions. */
%union { char *str; struct Node *node; }
60+
/* Structural tokens without a value.
 * NOTE(review): no rule in the yaml.l of this commit returns MAP_KEY, and
 * DOC_END is returned by the lexer but used by no grammar rule -- confirm
 * whether that is intended. */
%token DOC_START LBRACKET RBRACKET LBRACE RBRACE COMMA SEQ_ENTRY MAP_KEY COLON NEWLINE INDENT DEDENT NEWLINE_DEDENT NEWLINE_INDENT DOC_END
61+
/* Tokens carrying heap-allocated text. */
%token <str> ANCHOR ALIAS TAG PLAIN_SCALAR DQUOTE_STRING SQUOTE_STRING LITERAL_CONTENT
62+
/* Block-scalar tokens.
 * NOTE(review): no rule in the yaml.l of this commit returns LITERAL,
 * FOLDED or LITERAL_CONTENT. */
%token LITERAL FOLDED
63+
64+
/* Precedence, lowest first.  LOW_PREC is a pseudo-token used only with
 * %prec to give a rule the lowest precedence, so that conflicts against
 * the tokens listed below it are resolved in favour of shifting. */
%nonassoc LOW_PREC
65+
%nonassoc TAG ANCHOR
66+
%nonassoc DEDENT NEWLINE_DEDENT NEWLINE_INDENT
67+
%nonassoc NEWLINE
68+
%right COLON
69+
70+
/* Non-terminals and the union member each one uses. */
%type <node> stream document node pair atom map_list seq_list seq_entry properties property indented_node flow_seq_items flow_map_entries flow_entry flow_seq_item flow_node
71+
%type <str> plain
72+
73+
%start stream
74+
75+
%%
76+
77+
/* A stream is a possibly empty run of documents.  The empty rule creates
 * the N_STREAM root; each document is appended to its children.  Stray
 * NEWLINE / DEDENT / NEWLINE_DEDENT tokens between documents are skipped. */
stream : /* empty */ { root = nnew(N_STREAM); $$ = root; }
78+
| stream document { if($2) $1->children = nappend($1->children, $2); $$=$1; }
79+
| stream NEWLINE { $$=$1; } | stream DEDENT { $$=$1; } | stream NEWLINE_DEDENT { $$=$1; } ;
80+
81+
/* A document is a node, optionally introduced by "---"; a bare "---"
 * yields a null node. */
document : node | DOC_START node { $$ = $2; } | DOC_START { $$ = nnull(); } ;
82+
83+
/* A node is a single atom, a block mapping, a block sequence, an indented
 * node, or a block scalar. */
node : atom %prec LOW_PREC
84+
| map_list { $$ = nmap($1); }
85+
| seq_list { $$ = nseq($1); }
86+
| indented_node { $$ = $1; }
87+
| LITERAL LITERAL_CONTENT { $$ = nscalar($2); }
88+
| FOLDED LITERAL_CONTENT { $$ = nscalar($2); }
89+
;
90+
91+
/* Block mapping: pairs separated by NEWLINE, kept as one flat chain
 * key, value, key, value, ... */
map_list : pair { $$ = $1; }
92+
| map_list NEWLINE pair { $$ = nappend($1, $3); } ;
93+
94+
/* One key/value pair; the value is chained after the key.  A missing
 * value becomes a null node.  The MAP_KEY forms take an explicit key. */
pair : atom COLON node { $$ = nappend($1, $3); }
95+
| atom COLON { $$ = nappend($1, nnull()); } %prec LOW_PREC
96+
| MAP_KEY node COLON node { $$ = nappend($2, $4); }
97+
| MAP_KEY node { $$ = nappend($2, nnull()); } %prec LOW_PREC ;
98+
99+
/* A flow node with optional properties; properties on their own decorate
 * a fresh null node (napply creates it when given NULL). */
atom : flow_node | properties flow_node { $$ = napply($2, $1); }
100+
| properties %prec LOW_PREC { $$ = napply(NULL, $1); } ;
101+
102+
/* Block sequence: entries separated by NEWLINE. */
seq_list : seq_entry { $$ = $1; }
103+
| seq_list NEWLINE seq_entry { $$ = nappend($1, $3); } ;
104+
105+
/* "- node", or a bare "-" which yields a null node. */
seq_entry : SEQ_ENTRY node { $$ = $2; } | SEQ_ENTRY { $$ = nnull(); } ;
106+
107+
/* A node wrapped in an indentation level; opener and closer may each be
 * the plain or the newline-fused token. */
indented_node : INDENT node DEDENT { $$ = $2; } | INDENT node NEWLINE_DEDENT { $$ = $2; }
108+
| NEWLINE_INDENT node DEDENT { $$ = $2; } | NEWLINE_INDENT node NEWLINE_DEDENT { $$ = $2; } ;
109+
110+
/* Scalars, aliases and flow collections; empty collections get a NULL
 * child chain. */
flow_node : plain { $$ = nscalar($1); } | DQUOTE_STRING { $$ = nscalar($1); } | SQUOTE_STRING { $$ = nscalar($1); }
111+
| ALIAS { $$ = nalias($1); }
112+
| LBRACE flow_map_entries RBRACE { $$ = nmap($2); } | LBRACE RBRACE { $$ = nmap(NULL); }
113+
| LBRACKET flow_seq_items RBRACKET { $$ = nseq($2); } | LBRACKET RBRACKET { $$ = nseq(NULL); }
114+
;
115+
116+
/* Consecutive PLAIN_SCALAR chunks are joined with single spaces; jscalar
 * frees both inputs and returns the joined string. */
plain : PLAIN_SCALAR | plain PLAIN_SCALAR { $$ = jscalar($1, $2); } ;
117+
118+
/* Flow sequence items separated by commas.  A trailing comma appends a
 * null item here, whereas in flow mappings below it is ignored. */
flow_seq_items : flow_seq_item { $$ = $1; } | flow_seq_items COMMA flow_seq_item { $$ = nappend($1, $3); } | flow_seq_items COMMA { $$ = nappend($1, nnull()); } ;
119+
flow_seq_item : node %prec LOW_PREC { $$ = $1; } ;
120+
121+
/* Flow mapping entries separated by commas; a lone atom is a key whose
 * value is a null node. */
flow_map_entries : flow_entry { $$ = $1; } | flow_map_entries COMMA flow_entry { $$ = nappend($1, $3); } | flow_map_entries COMMA { $$ = $1; } ;
122+
flow_entry : pair | atom { $$ = nappend($1, nnull()); } %prec LOW_PREC ;
123+
124+
/* Node properties are collected in a holder node that only carries
 * anchor/tag (created with nnew(0)).  Later properties overwrite earlier
 * ones of the same kind and the merged-in holder is freed. */
properties : property | properties property { if($2->anchor) $1->anchor = $2->anchor; if($2->tag) $1->tag = $2->tag; free($2); $$ = $1; } ;
125+
property : ANCHOR { $$ = nnew(0); $$->anchor = $1; } | TAG { $$ = nnew(0); $$->tag = $1; } ;
126+
127+
%%
128+
/*
 * Parser error callback: report the message together with the current
 * line, the name of the lookahead token and the text last matched by the
 * scanner, on stderr.
 */
void yyerror(const char *s) {
    const char *token_label = tok_name(yychar);

    fprintf(stderr, "Error line %d: %s (tok: %s, text: '%s')\n",
            yylineno, s, token_label, yytext);
}
129+
/*
 * Parse YAML from standard input and print the resulting tree.
 * Exit status is 0 when parsing succeeded and a root node exists,
 * 1 otherwise.
 */
int main() {
    int failed = yyparse();

    if (failed || !root) {
        return 1;
    }

    nprint(root, 0, 1, 1);
    return 0;
}

0 commit comments

Comments
 (0)