Skip to content

Commit 0f72364

Browse files
committed
feat: add line numbers support
1 parent c103b51 commit 0f72364

4 files changed

Lines changed: 143 additions & 34 deletions

File tree

README.md

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ A fast C++ lexer for extracting named exports from CommonJS modules. This librar
66

77
- **Fast**: Zero-copy parsing for most exports using `std::string_view`
88
- **Accurate**: Handles complex CommonJS patterns including re-exports, Object.defineProperty, and transpiler output
9+
- **Source Locations**: Each export includes a 1-based line number for tooling integration
910
- **Unicode Support**: Properly unescapes JavaScript string literals including `\u{XXXX}` and surrogate pairs
1011
- **Optional SIMD Acceleration**: Can use [simdutf](https://github.com/simdutf/simdutf) for faster string operations
1112
- **No Dependencies**: Single-header distribution available (simdutf is optional)
@@ -49,20 +50,21 @@ int main() {
4950
if (result) {
5051
std::cout << "Exports found:" << std::endl;
5152
for (const auto& exp : result->exports) {
52-
std::cout << " - " << lexer::get_string_view(exp) << std::endl;
53+
std::cout << " - " << lexer::get_string_view(exp)
54+
<< " (line " << exp.line << ")" << std::endl;
5355
}
5456
}
55-
57+
5658
return 0;
5759
}
5860
```
5961

6062
Output:
6163
```
6264
Exports found:
63-
- foo
64-
- bar
65-
- baz
65+
- foo (line 2)
66+
- bar (line 3)
67+
- baz (line 4)
6668
```
6769

6870
## API Reference
@@ -85,11 +87,22 @@ Parses CommonJS source code and extracts export information.
8587
8688
```cpp
8789
struct lexer_analysis {
88-
std::vector<export_string> exports; // Named exports
89-
std::vector<export_string> re_exports; // Re-exported module specifiers
90+
std::vector<export_entry> exports; // Named exports
91+
std::vector<export_entry> re_exports; // Re-exported module specifiers
92+
};
93+
```
94+
95+
### `lexer::export_entry`
96+
97+
```cpp
98+
struct export_entry {
99+
export_string name;
100+
uint32_t line; // 1-based line number
90101
};
91102
```
92103
104+
Each entry in `exports` and `re_exports` carries its name (the export name, or the module specifier for a re-export) together with the 1-based line number where it was found in the source.
105+
93106
### `lexer::export_string`
94107
95108
```cpp
@@ -104,9 +117,10 @@ Export names are stored as a variant to avoid unnecessary copies:
104117

105118
```cpp
106119
inline std::string_view get_string_view(const export_string& s);
120+
inline std::string_view get_string_view(const export_entry& e);
107121
```
108122
109-
Helper function to get a `string_view` from either variant type.
123+
Helper functions to get a `string_view` from an `export_string`, or from the `name` field of an `export_entry`.
110124
111125
### `lexer::get_last_error`
112126

include/merve/parser.h

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,14 @@ enum lexer_error {
4747
*/
4848
using export_string = std::variant<std::string, std::string_view>;
4949

50+
/**
51+
* @brief An export name together with its 1-based source line number.
52+
*/
53+
struct export_entry {
54+
export_string name;
55+
uint32_t line; // 1-based line number
56+
};
57+
5058
/**
5159
* @brief Result of parsing a CommonJS module.
5260
*/
@@ -61,7 +69,7 @@ struct lexer_analysis {
6169
* - module.exports = { a, b, c }
6270
* - Object.defineProperty(exports, 'name', {...})
6371
*/
64-
std::vector<export_string> exports{};
72+
std::vector<export_entry> exports{};
6573

6674
/**
6775
* @brief Module specifiers from re-export patterns.
@@ -72,7 +80,7 @@ struct lexer_analysis {
7280
* - __export(require('other'))
7381
* - Object.keys(require('other')).forEach(...)
7482
*/
75-
std::vector<export_string> re_exports{};
83+
std::vector<export_entry> re_exports{};
7684
};
7785

7886
/**
@@ -89,6 +97,13 @@ inline std::string_view get_string_view(const export_string& s) {
8997
return std::visit([](const auto& v) -> std::string_view { return v; }, s);
9098
}
9199

100+
/**
101+
* @brief Get a string_view from an export_entry (delegates to the name field).
102+
*/
103+
inline std::string_view get_string_view(const export_entry& e) {
104+
return get_string_view(e.name);
105+
}
106+
92107
/**
93108
* @brief Parse CommonJS source code and extract export information.
94109
*

src/parser.cpp

Lines changed: 49 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -325,6 +325,8 @@ class CJSLexer {
325325
uint16_t openTokenDepth;
326326
uint16_t templateDepth;
327327

328+
uint32_t line_;
329+
328330
bool lastSlashWasDivision;
329331
bool nextBraceIsClass;
330332

@@ -335,8 +337,13 @@ class CJSLexer {
335337
StarExportBinding* starExportStack;
336338
const StarExportBinding* STAR_EXPORT_STACK_END;
337339

338-
std::vector<export_string>& exports;
339-
std::vector<export_string>& re_exports;
340+
std::vector<export_entry>& exports;
341+
std::vector<export_entry>& re_exports;
342+
343+
void countNewline(char ch) {
344+
if (ch == '\n') ++line_;
345+
else if (ch == '\r' && (pos + 1 >= end || *(pos + 1) != '\n')) ++line_;
346+
}
340347

341348
// Character classification helpers using lookup tables
342349
static bool isBr(char c) {
@@ -495,6 +502,8 @@ class CJSLexer {
495502
return ch;
496503
} else if (!isBrOrWs(ch)) {
497504
return ch;
505+
} else {
506+
countNewline(ch);
498507
}
499508
} while (pos++ < end);
500509
return ch;
@@ -503,8 +512,10 @@ class CJSLexer {
503512
void lineComment() {
504513
while (pos++ < end) {
505514
char ch = *pos;
506-
if (ch == '\n' || ch == '\r')
515+
if (ch == '\n' || ch == '\r') {
516+
countNewline(ch);
507517
return;
518+
}
508519
}
509520
}
510521

@@ -516,6 +527,7 @@ class CJSLexer {
516527
pos++;
517528
return;
518529
}
530+
countNewline(ch);
519531
}
520532
}
521533

@@ -527,8 +539,13 @@ class CJSLexer {
527539
if (ch == '\\') {
528540
if (pos + 1 >= end) break;
529541
ch = *++pos;
530-
if (ch == '\r' && *(pos + 1) == '\n')
531-
pos++;
542+
if (ch == '\r') {
543+
++line_;
544+
if (*(pos + 1) == '\n')
545+
pos++;
546+
} else if (ch == '\n') {
547+
++line_;
548+
}
532549
} else if (isBr(ch))
533550
break;
534551
}
@@ -580,8 +597,12 @@ class CJSLexer {
580597
}
581598
if (ch == '`')
582599
return;
583-
if (ch == '\\' && pos + 1 < end)
600+
if (ch == '\\' && pos + 1 < end) {
584601
pos++;
602+
countNewline(*pos);
603+
} else {
604+
countNewline(ch);
605+
}
585606
}
586607
syntaxError(lexer_error::UNTERMINATED_TEMPLATE_STRING);
587608
}
@@ -614,7 +635,7 @@ class CJSLexer {
614635
#endif
615636
}
616637

617-
void addExport(std::string_view export_name) {
638+
void addExport(std::string_view export_name, uint32_t at_line) {
618639
// Skip surrounding quotes if present
619640
if (!export_name.empty() && (export_name.front() == '\'' || export_name.front() == '"')) {
620641
export_name.remove_prefix(1);
@@ -625,11 +646,11 @@ class CJSLexer {
625646
if (!needsUnescaping(export_name)) {
626647
// Check if this export already exists (avoid duplicates)
627648
for (const auto& existing : exports) {
628-
if (get_string_view(existing) == export_name) {
649+
if (get_string_view(existing.name) == export_name) {
629650
return; // Already exists, skip
630651
}
631652
}
632-
exports.push_back(export_name);
653+
exports.push_back(export_entry{export_name, at_line});
633654
return;
634655
}
635656

@@ -644,14 +665,14 @@ class CJSLexer {
644665

645666
// Check if this export already exists (avoid duplicates)
646667
for (const auto& existing : exports) {
647-
if (get_string_view(existing) == name) {
668+
if (get_string_view(existing.name) == name) {
648669
return; // Already exists, skip
649670
}
650671
}
651-
exports.push_back(std::move(unescaped.value()));
672+
exports.push_back(export_entry{std::move(unescaped.value()), at_line});
652673
}
653674

654-
void addReexport(std::string_view reexport_name) {
675+
void addReexport(std::string_view reexport_name, uint32_t at_line) {
655676
// Skip surrounding quotes if present
656677
if (!reexport_name.empty() && (reexport_name.front() == '\'' || reexport_name.front() == '"')) {
657678
reexport_name.remove_prefix(1);
@@ -660,7 +681,7 @@ class CJSLexer {
660681

661682
// Fast path: no escaping needed, use string_view directly
662683
if (!needsUnescaping(reexport_name)) {
663-
re_exports.push_back(reexport_name);
684+
re_exports.push_back(export_entry{reexport_name, at_line});
664685
return;
665686
}
666687

@@ -670,7 +691,7 @@ class CJSLexer {
670691
return; // Skip invalid escape sequences
671692
}
672693

673-
re_exports.push_back(std::move(unescaped.value()));
694+
re_exports.push_back(export_entry{std::move(unescaped.value()), at_line});
674695
}
675696

676697
bool readExportsOrModuleDotExports(char ch) {
@@ -712,7 +733,7 @@ class CJSLexer {
712733
switch (requireType) {
713734
case RequireType::ExportStar:
714735
case RequireType::ExportAssign:
715-
addReexport(std::string_view(reexportStart, reexportEnd - reexportStart));
736+
addReexport(std::string_view(reexportStart, reexportEnd - reexportStart), line_);
716737
return true;
717738
default:
718739
if (starExportStack < STAR_EXPORT_STACK_END) {
@@ -773,7 +794,7 @@ class CJSLexer {
773794
return;
774795
}
775796
}
776-
addExport(std::string_view(startPos, endPos - startPos));
797+
addExport(std::string_view(startPos, endPos - startPos), line_);
777798
} else if (ch == '\'' || ch == '"') {
778799
const char* start = pos;
779800
stringLiteral(ch);
@@ -786,7 +807,7 @@ class CJSLexer {
786807
pos = revertPos;
787808
return;
788809
}
789-
addExport(std::string_view(start, end_pos - start));
810+
addExport(std::string_view(start, end_pos - start), line_);
790811
}
791812
} else if (ch == '.' && matchesAt(pos + 1, end, "..")) {
792813
pos += 3;
@@ -825,7 +846,7 @@ class CJSLexer {
825846
const char* endPos = pos;
826847
ch = commentWhitespace();
827848
if (ch == '=') {
828-
addExport(std::string_view(startPos, endPos - startPos));
849+
addExport(std::string_view(startPos, endPos - startPos), line_);
829850
return;
830851
}
831852
}
@@ -843,7 +864,7 @@ class CJSLexer {
843864
pos++;
844865
ch = commentWhitespace();
845866
if (ch != '=') break;
846-
addExport(std::string_view(startPos, endPos - startPos));
867+
addExport(std::string_view(startPos, endPos - startPos), line_);
847868
}
848869
break;
849870
}
@@ -974,7 +995,7 @@ class CJSLexer {
974995
ch = commentWhitespace();
975996
if (ch != ':') break;
976997
if (exportStart && exportEnd)
977-
addExport(std::string_view(exportStart, exportEnd - exportStart));
998+
addExport(std::string_view(exportStart, exportEnd - exportStart), line_);
978999
pos = revertPos;
9791000
return;
9801001
} else if (ch == 'g') {
@@ -1042,7 +1063,7 @@ class CJSLexer {
10421063
ch = commentWhitespace();
10431064
if (ch != ')') break;
10441065
if (exportStart && exportEnd)
1045-
addExport(std::string_view(exportStart, exportEnd - exportStart));
1066+
addExport(std::string_view(exportStart, exportEnd - exportStart), line_);
10461067
return;
10471068
}
10481069
break;
@@ -1406,7 +1427,7 @@ class CJSLexer {
14061427
StarExportBinding* curCheckBinding = &starExportStack_[0];
14071428
while (curCheckBinding != starExportStack) {
14081429
if (curCheckBinding->id == id) {
1409-
addReexport(curCheckBinding->specifier);
1430+
addReexport(curCheckBinding->specifier, line_);
14101431
pos = revertPos;
14111432
return;
14121433
}
@@ -1506,9 +1527,10 @@ class CJSLexer {
15061527
}
15071528

15081529
public:
1509-
CJSLexer(std::vector<export_string>& out_exports, std::vector<export_string>& out_re_exports)
1530+
CJSLexer(std::vector<export_entry>& out_exports, std::vector<export_entry>& out_re_exports)
15101531
: source(nullptr), pos(nullptr), end(nullptr), lastTokenPos(nullptr),
15111532
templateStackDepth(0), openTokenDepth(0), templateDepth(0),
1533+
line_(1),
15121534
lastSlashWasDivision(false), nextBraceIsClass(false),
15131535
templateStack_{}, openTokenPosStack_{}, openClassPosStack{},
15141536
starExportStack_{}, starExportStack(nullptr), STAR_EXPORT_STACK_END(nullptr),
@@ -1525,6 +1547,7 @@ class CJSLexer {
15251547
templateStackDepth = 0;
15261548
openTokenDepth = 0;
15271549
templateDepth = std::numeric_limits<uint16_t>::max();
1550+
line_ = 1;
15281551
lastSlashWasDivision = false;
15291552
starExportStack = &starExportStack_[0];
15301553
STAR_EXPORT_STACK_END = &starExportStack_[MAX_STAR_EXPORTS - 1];
@@ -1549,8 +1572,10 @@ class CJSLexer {
15491572
while (pos++ < end) {
15501573
ch = *pos;
15511574

1552-
if (ch == ' ' || (ch < 14 && ch > 8))
1575+
if (ch == ' ' || (ch < 14 && ch > 8)) {
1576+
countNewline(ch);
15531577
continue;
1578+
}
15541579

15551580
if (openTokenDepth == 0) {
15561581
switch (ch) {

0 commit comments

Comments
 (0)