diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..30faa0c --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,11 @@ +version: 2 +updates: + - package-ecosystem: cargo + directory: /rust + schedule: + interval: monthly + + - package-ecosystem: github-actions + directory: / + schedule: + interval: monthly diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 616a358..8ee41aa 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -130,3 +130,44 @@ jobs: - name: Test (Windows) if: matrix.os == 'windows-latest' run: ctest -C ${{ matrix.cmake_config }} --output-on-failure --test-dir build + + rust: + strategy: + fail-fast: false + matrix: + include: + - os: ubuntu-22.04 + toolchain: stable + - os: ubuntu-22.04 + toolchain: nightly + - os: ubuntu-22.04-arm + toolchain: stable + - os: macos-14 + toolchain: stable + - os: macos-15 + toolchain: stable + - os: windows-latest + toolchain: stable + + name: Rust ${{ matrix.toolchain }} / ${{ matrix.os }} + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + + - name: Install Rust + uses: dtolnay/rust-toolchain@master + with: + toolchain: ${{ matrix.toolchain }} + components: clippy + + - name: Clippy + working-directory: rust + run: cargo clippy -- -D warnings + + - name: Test + working-directory: rust + run: cargo test + + - name: Test (no default features) + working-directory: rust + run: cargo test --no-default-features diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index a1b4cbc..296727b 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -54,3 +54,41 @@ jobs: env: GH_TOKEN: ${{ github.token }} RELEASE_TAG: ${{ needs.release-please.outputs.release_tag }} + + publish-rust: + needs: [release-please, create-release] + if: ${{ needs.release-please.outputs.release_created }} + runs-on: ubuntu-latest + permissions: + contents: write + + steps: + - name: Checkout + uses: 
actions/checkout@v4 + + - name: Install Rust + uses: dtolnay/rust-toolchain@master + with: + toolchain: stable + + - name: Build and test (populates rust/deps from source) + working-directory: rust + run: cargo test + + - name: Commit rust/deps if changed + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + git add rust/deps/ + if git diff --cached --quiet; then + echo "rust/deps is up to date" + else + git commit -m "chore: update rust/deps vendored sources" + git push + fi + + - name: Publish to crates.io + working-directory: rust + run: cargo publish + env: + CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }} diff --git a/release-please-config.json b/release-please-config.json index 218677b..4a418f8 100644 --- a/release-please-config.json +++ b/release-please-config.json @@ -4,7 +4,8 @@ "release-type": "simple", "extra-files": [ "CMakeLists.txt", - "include/merve/version.h" + "include/merve/version.h", + "rust/Cargo.toml" ] } } diff --git a/rust/.gitignore b/rust/.gitignore new file mode 100644 index 0000000..ea8c4bf --- /dev/null +++ b/rust/.gitignore @@ -0,0 +1 @@ +/target diff --git a/rust/Cargo.lock b/rust/Cargo.lock new file mode 100644 index 0000000..1e4260b --- /dev/null +++ b/rust/Cargo.lock @@ -0,0 +1,141 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 4 + +[[package]] +name = "aho-corasick" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" +dependencies = [ + "memchr", +] + +[[package]] +name = "cc" +version = "1.2.56" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aebf35691d1bfb0ac386a69bac2fde4dd276fb618cf8bf4f5318fe285e821bb2" +dependencies = [ + "find-msvc-tools", + "jobserver", + "libc", + "shlex", +] + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + +[[package]] +name = "find-msvc-tools" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" + +[[package]] +name = "getrandom" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" +dependencies = [ + "cfg-if", + "libc", + "r-efi", + "wasip2", +] + +[[package]] +name = "jobserver" +version = "0.1.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" +dependencies = [ + "getrandom", + "libc", +] + +[[package]] +name = "libc" +version = "0.2.182" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6800badb6cb2082ffd7b6a67e6125bb39f18782f793520caee8cb8846be06112" + +[[package]] +name = "link_args" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c7721e472624c9aaad27a5eb6b7c9c6045c7a396f2efb6dabaec1b640d5e89b" + +[[package]] +name = "memchr" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" + +[[package]] +name = "merve" +version = "1.0.1" +dependencies = [ + "cc", + "link_args", + "regex", +] + +[[package]] +name = "r-efi" +version = "5.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" + +[[package]] +name = "regex" +version = "1.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "wasip2" +version = "1.0.2+wasi-0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9517f9239f02c069db75e65f174b3da828fe5f5b945c4dd26bd25d89c03ebcf5" +dependencies = [ + "wit-bindgen", +] + +[[package]] +name = "wit-bindgen" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" diff --git a/rust/Cargo.toml b/rust/Cargo.toml new file mode 100644 index 0000000..2a21b19 --- /dev/null +++ b/rust/Cargo.toml @@ -0,0 +1,36 @@ +[package] +name = "merve" +version = "1.0.1" # x-release-please-version +edition = 
"2024" +rust-version = "1.85" +authors = ["Yagiz Nizipli "] +license = "Apache-2.0 OR MIT" +description = "A fast C++ lexer for extracting named exports from CommonJS modules" +repository = "https://github.com/nodejs/merve" +categories = ["parsing", "development-tools"] +keywords = ["commonjs", "cjs", "exports", "lexer", "javascript"] +include = [ + "src/**/*.rs", + "deps/**/*.cpp", + "deps/**/*.h", + "wasi_to_unknown.cpp", + "build.rs", + "Cargo.toml", + "LICENSE-*", + "README.md", +] + +[features] +default = ["std"] +# pass `cpp_set_stdlib("c++")` to `cc` +libcpp = [] +# enable allocations +std = [] + +[package.metadata.docs.rs] +rustdoc-args = ["--cfg", "docsrs"] + +[build-dependencies] +cc = { version = "1.1", features = ["parallel"] } +link_args = "0.6" +regex = { version = "1.11", features = [] } diff --git a/rust/README.md b/rust/README.md new file mode 100644 index 0000000..5c34152 --- /dev/null +++ b/rust/README.md @@ -0,0 +1,138 @@ +# merve (Rust) + +Fast CommonJS export lexer for Rust. Extracts named exports and re-exports from +CommonJS modules via static analysis, without executing the code. + +This crate provides safe Rust bindings to the [merve](https://github.com/nodejs/merve) C++ library. + +## Usage + +Add to your `Cargo.toml`: + +```toml +[dependencies] +merve = "1.0" +``` + +Parse CommonJS source and iterate over exports: + +```rust +use merve::parse_commonjs; + +let source = r#" + exports.foo = 1; + exports.bar = function() {}; + module.exports.baz = 'hello'; +"#; + +let analysis = parse_commonjs(source).expect("parse failed"); + +for export in analysis.exports() { + println!("{} (line {})", export.name, export.line); +} +``` + +## Features + +**std** (default): Enables `std::error::Error` impl for `LexerError`. Disable for `no_std`: + +```toml +merve = { version = "1.0", default-features = false } +``` + +**libcpp**: Build the underlying C++ code with `libc++` instead of `libstdc++`.
+Requires `libc++` to be installed: + +```toml +merve = { version = "1.0", features = ["libcpp"] } +``` + +## API + +### `parse_commonjs` + +```rust +pub fn parse_commonjs(source: &str) -> Result<Analysis<'_>, LexerError> +``` + +Parse CommonJS source code and extract export information. The returned +`Analysis` borrows from `source` because export names may point directly into +the source buffer (zero-copy). + +### `Analysis<'a>` + +| Method | Returns | Description | +| -------- | --------- | ------------- | +| `exports_count()` | `usize` | Number of named exports | +| `reexports_count()` | `usize` | Number of re-export specifiers | +| `export_name(index)` | `Option<&'a str>` | Export name at index | +| `export_line(index)` | `Option<u32>` | 1-based line number of export | +| `reexport_name(index)` | `Option<&'a str>` | Re-export specifier at index | +| `reexport_line(index)` | `Option<u32>` | 1-based line number of re-export | +| `exports()` | `ExportIter` | Iterator over all exports | +| `reexports()` | `ExportIter` | Iterator over all re-exports | + +### `Export<'a>` + +```rust +pub struct Export<'a> { + pub name: &'a str, + pub line: u32, +} +``` + +### `LexerError` + +Returned when the input contains ESM syntax or malformed constructs: + +| Variant | Description | +| --------- | ------------- | +| `UnexpectedEsmImport` | Found `import` declaration | +| `UnexpectedEsmExport` | Found `export` declaration | +| `UnexpectedEsmImportMeta` | Found `import.meta` | +| `UnterminatedStringLiteral` | Unclosed string literal | +| `UnterminatedTemplateString` | Unclosed template literal | +| `UnterminatedRegex` | Unclosed regular expression | +| `UnexpectedParen` | Unexpected `)` | +| `UnexpectedBrace` | Unexpected `}` | +| `UnterminatedParen` | Unclosed `(` | +| `UnterminatedBrace` | Unclosed `{` | +| `TemplateNestOverflow` | Template literal nesting too deep | + +`LexerError` implements `Display` and, with the `std` feature, `std::error::Error`.
+ +### Versioning helpers + +```rust +pub fn version() -> &'static str +pub fn version_components() -> (i32, i32, i32) +``` + +## Lifetime semantics + +`Analysis<'a>` ties its lifetime to the source `&str` passed to `parse_commonjs`. +Export names returned by `export_name()` / the iterator borrow from the original +source buffer (the C++ library uses `std::string_view` for zero-copy export names). +Keep the source string alive as long as you access export names. + +```rust +let source = String::from("exports.hello = 1;"); +let analysis = merve::parse_commonjs(&source).unwrap(); +// `analysis` borrows `source` -- both must stay alive +assert_eq!(analysis.export_name(0), Some("hello")); +``` + +## Thread safety + +`Analysis` implements `Send` and `Sync`. + +## License + +Licensed under either of + +* Apache License, Version 2.0 + ([LICENSE-APACHE](../LICENSE-APACHE) or <https://www.apache.org/licenses/LICENSE-2.0>) +* MIT license + ([LICENSE-MIT](../LICENSE-MIT) or <https://opensource.org/licenses/MIT>) + +at your option. diff --git a/rust/build.rs b/rust/build.rs new file mode 100644 index 0000000..74db0a9 --- /dev/null +++ b/rust/build.rs @@ -0,0 +1,308 @@ +use regex::Regex; +use std::fmt::{Display, Formatter}; +use std::fs::{self, File}; +use std::io::Read; +use std::path::{Path, PathBuf}; +use std::{env, fmt}; + +/// A target triple split on `-` into its components. +/// +/// `system` and `abi` are `None` when the triple has fewer than three or four parts. +#[derive(Clone, Debug)] +pub struct Target { + pub architecture: String, + pub vendor: String, + pub system: Option<String>, + pub abi: Option<String>, +} + +impl Target { + /// Borrows all four components as string slices. + pub fn as_strs(&self) -> (&str, &str, Option<&str>, Option<&str>) { + ( + self.architecture.as_str(), + self.vendor.as_str(), + self.system.as_deref(), + self.abi.as_deref(), + ) + } +} + +/// Writes the components joined by `-`, leaving out the ones that are `None`. +impl Display for Target { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + write!(f, "{}-{}", self.architecture, self.vendor)?; + + if let Some(system) = &self.system { + write!(f, "-{system}")?; + } + + if let Some(abi) = &self.abi { + write!(f, "-{abi}")?; + } + + Ok(()) + } +} + +/// Returns the value of the `ANDROID_NDK` environment variable. +/// +/// # Panics +/// +/// Panics when the variable cannot be read. +pub fn ndk() -> String { + 
env::var("ANDROID_NDK").expect("ANDROID_NDK variable not set") +} + +/// Maps `armv7`, `aarch64` and `i686` to `arm`, `arm64` and `x86`. +/// +/// Any other architecture name is returned unchanged. +pub fn target_arch(arch: &str) -> &str { + match arch { + "armv7" => "arm", + "aarch64" => "arm64", + "i686" => "x86", + arch => arch, + } +} + +/// Returns the `<os>-x86_64` directory tag for the OS this build script was compiled for. +/// +/// # Panics +/// +/// Panics on any OS other than Windows, Linux or macOS. +fn host_tag() -> &'static str { + if cfg!(target_os = "windows") { + "windows-x86_64" + } else if cfg!(target_os = "linux") { + "linux-x86_64" + } else if cfg!(target_os = "macos") { + "darwin-x86_64" + } else { + panic!("host os is not supported") + } +} + +/// Get NDK major version from source.properties +/// +/// # Panics +/// +/// Panics when `source.properties` cannot be opened or read, contains no +/// `Pkg.Revision = X.Y.Z` entry, or the major component does not fit in a `u32`. +fn ndk_major_version(ndk_dir: &Path) -> u32 { + // The dot in `Pkg.Revision` is escaped so that it only matches a literal `.`. + let re = Regex::new(r"Pkg\.Revision = (\d+)\.(\d+)\.(\d+)").unwrap(); + let mut source_properties = + File::open(ndk_dir.join("source.properties")).expect("Couldn't open source.properties"); + let mut buf = String::new(); + source_properties + .read_to_string(&mut buf) + .expect("Could not read source.properties"); + let captures = re + .captures(&buf) + .expect("source.properties did not match the regex"); + captures[1].parse().expect("could not parse major version") +} + +/// Recursively inline `#include "..."` directives, deduplicating by file name.
+fn amalgamate_file( + include_path: &Path, + source_path: &Path, + base_path: &Path, + filename: &str, + out: &mut String, + included: &mut Vec, +) { + let file_path = base_path.join(filename); + let content = fs::read_to_string(&file_path) + .unwrap_or_else(|e| panic!("failed to read {}: {e}", file_path.display())); + + let include_re = Regex::new(r#"^\s*#\s*include\s+"([^"]+)""#).unwrap(); + + out.push_str(&format!("/* begin file {filename} */\n")); + + for line in content.lines() { + if let Some(caps) = include_re.captures(line) { + let inc_file = caps[1].to_string(); + + if included.contains(&inc_file) { + continue; + } + + let resolved = if include_path.join(&inc_file).exists() { + included.push(inc_file.clone()); + Some((include_path.to_path_buf(), inc_file)) + } else if source_path.join(&inc_file).exists() { + included.push(inc_file.clone()); + Some((source_path.to_path_buf(), inc_file)) + } else { + // System or unrecognized include — keep as-is. + None + }; + + if let Some((base, name)) = resolved { + amalgamate_file(include_path, source_path, &base, &name, out, included); + } else { + out.push_str(line); + out.push('\n'); + } + } else { + out.push_str(line); + out.push('\n'); + } + } + + out.push_str(&format!("/* end file {filename} */\n")); +} + +/// When building inside the merve repository, produce the three amalgamated +/// files in `deps/`: merve.h, merve.cpp, merve_c.h. +fn amalgamate_from_repo(project_root: &Path, deps: &Path) { + let include_path = project_root.join("include"); + let source_path = project_root.join("src"); + + // Remove stale files / subdirectories from a previous layout. + if deps.exists() { + fs::remove_dir_all(deps).ok(); + } + fs::create_dir_all(deps).expect("failed to create deps/"); + + let mut included: Vec = Vec::new(); + + // 1. Amalgamate merve.h (inlines merve/parser.h -> merve/version.h). 
+ let mut header = String::new(); + amalgamate_file( + &include_path, + &source_path, + &include_path, + "merve.h", + &mut header, + &mut included, + ); + fs::write(deps.join("merve.h"), &header).expect("failed to write deps/merve.h"); + + // 2. Amalgamate merve.cpp (parser.cpp + merve_c.cpp with includes resolved). + let mut source = String::from("#include \"merve.h\"\n\n"); + for cpp in &["parser.cpp", "merve_c.cpp"] { + amalgamate_file( + &include_path, + &source_path, + &source_path, + cpp, + &mut source, + &mut included, + ); + } + fs::write(deps.join("merve.cpp"), &source).expect("failed to write deps/merve.cpp"); + + // 3. Copy merve_c.h verbatim (standalone C header). + fs::copy(include_path.join("merve_c.h"), deps.join("merve_c.h")) + .expect("failed to copy merve_c.h"); +} + +fn main() { + let manifest_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap()); + let deps = manifest_dir.join("deps"); + + // Detect in-repo build by checking for the parent C++ sources. + let project_root = manifest_dir.parent().unwrap(); + let in_repo = project_root.join("src/parser.cpp").exists() + && project_root.join("src/merve_c.cpp").exists() + && project_root.join("include/merve.h").exists(); + + if in_repo { + amalgamate_from_repo(project_root, &deps); + + // Rebuild when upstream C++ sources change. + for src in &[ + "src/parser.cpp", + "src/merve_c.cpp", + "include/merve.h", + "include/merve_c.h", + "include/merve/parser.h", + "include/merve/version.h", + ] { + println!( + "cargo:rerun-if-changed={}", + project_root.join(src).display() + ); + } + } + + // Both in-repo and published crate use the same layout: merve.cpp + merve.h + merve_c.h + assert!( + deps.join("merve.cpp").exists(), + "No C++ sources found in deps/. \ + When building outside the repository, deps/ must contain the amalgamated sources." 
+ ); + + let mut build = cc::Build::new(); + build.file(deps.join("merve.cpp")); + build.include(&deps); + build.cpp(true).std("c++20").warnings(false); + + // Target handling + let target_str = env::var("TARGET").unwrap(); + let target: Vec = target_str.split('-').map(|s| s.into()).collect(); + assert!(target.len() >= 2, "Failed to parse TARGET {}", target_str); + + let abi = if target.len() > 3 { + Some(target[3].clone()) + } else { + None + }; + let system = if target.len() > 2 { + Some(target[2].clone()) + } else { + None + }; + let target = Target { + architecture: target[0].clone(), + vendor: target[1].clone(), + system, + abi, + }; + + let compile_target_arch = env::var("CARGO_CFG_TARGET_ARCH").expect("CARGO_CFG_TARGET_ARCH"); + let compile_target_os = env::var("CARGO_CFG_TARGET_OS").expect("CARGO_CFG_TARGET_OS"); + let compile_target_feature = env::var("CARGO_CFG_TARGET_FEATURE"); + + match target.system.as_deref() { + Some("android" | "androideabi") => { + let ndk = ndk(); + let major = ndk_major_version(Path::new(&ndk)); + if major < 22 { + build + .flag(format!("--sysroot={}/sysroot", ndk)) + .flag(format!( + "-isystem{}/sources/cxx-stl/llvm-libc++/include", + ndk + )); + } else { + let host_toolchain = format!("{}/toolchains/llvm/prebuilt/{}", ndk, host_tag()); + build.flag(format!("--sysroot={}/sysroot", host_toolchain)); + } + } + _ => { + if compile_target_arch.starts_with("wasm") && compile_target_os != "emscripten" { + let wasi_sdk = env::var("WASI_SDK").unwrap_or_else(|_| "/opt/wasi-sdk".to_owned()); + assert!( + Path::new(&wasi_sdk).exists(), + "WASI SDK not found at {wasi_sdk}" + ); + build.compiler(format!("{wasi_sdk}/bin/clang++")); + let wasi_sysroot_lib = match compile_target_feature { + Ok(compile_target_feature) if compile_target_feature.contains("atomics") => { + "wasm32-wasip1-threads" + } + _ => "wasm32-wasip1", + }; + println!( + "cargo:rustc-link-search={wasi_sdk}/share/wasi-sysroot/lib/{wasi_sysroot_lib}" + ); + 
build.flag("-fno-exceptions"); + build.cpp_set_stdlib("c++"); + println!("cargo:rustc-link-lib=c++abi"); + if compile_target_os == "unknown" { + build.target("wasm32-wasip1"); + println!("cargo:rustc-link-lib=c"); + build.file(manifest_dir.join("wasi_to_unknown.cpp")); + } + } + + let compiler = build.get_compiler(); + if compiler.is_like_msvc() { + build.static_crt(true); + link_args::windows! { + unsafe { + no_default_lib( + "libcmt.lib", + ); + } + } + } else if compiler.is_like_clang() && cfg!(feature = "libcpp") { + build.cpp_set_stdlib("c++"); + } + } + } + + build.compile("merve"); +} diff --git a/rust/deps/merve.cpp b/rust/deps/merve.cpp new file mode 100644 index 0000000..dbb39b3 --- /dev/null +++ b/rust/deps/merve.cpp @@ -0,0 +1,2036 @@ +#include "merve.h" + +/* begin file parser.cpp */ +#include +#include +#include + +#ifdef MERVE_USE_SIMDUTF +#include +#endif + +namespace lexer { + +// ============================================================================ +// Compile-time lookup tables for character classification +// ============================================================================ + +// Hex digit lookup table: maps char -> hex value (0-15), or 255 if invalid +static constexpr std::array kHexTable = []() consteval { + std::array table{}; + for (int i = 0; i < 256; ++i) table[i] = 255; + for (int i = '0'; i <= '9'; ++i) table[i] = static_cast(i - '0'); + for (int i = 'a'; i <= 'f'; ++i) table[i] = static_cast(i - 'a' + 10); + for (int i = 'A'; i <= 'F'; ++i) table[i] = static_cast(i - 'A' + 10); + return table; +}(); + +// Simple escape lookup table: maps escape char -> result char +// Uses 0xFF as "not a simple escape" marker since '\0' is a valid escape result +static constexpr std::array kSimpleEscapeTable = []() consteval { + std::array table{}; + for (int i = 0; i < 256; ++i) table[i] = 0xFF; + table['n'] = '\n'; + table['r'] = '\r'; + table['t'] = '\t'; + table['b'] = '\b'; + table['f'] = '\f'; + table['v'] = '\v'; + table['0'] = 
'\0'; + table['\\'] = '\\'; + table['\''] = '\''; + table['"'] = '"'; + return table; +}(); + +// Punctuator lookup table +static constexpr std::array kPunctuatorTable = []() consteval { + std::array table{}; + table['!'] = true; + table['%'] = true; + table['&'] = true; + // ch > 39 && ch < 48: '(' ')' '*' '+' ',' '-' '.' '/' + for (int i = 40; i < 48; ++i) table[i] = true; + // ch > 57 && ch < 64: ':' ';' '<' '=' '>' '?' + for (int i = 58; i < 64; ++i) table[i] = true; + table['['] = true; + table[']'] = true; + table['^'] = true; + // ch > 122 && ch < 127: '{' '|' '}' '~' + for (int i = 123; i < 127; ++i) table[i] = true; + return table; +}(); + +// Expression punctuator lookup table (similar but excludes ')' and '}') +// Compared with kPunctuatorTable, the code below also leaves '/' and ']' unset. +static constexpr std::array kExpressionPunctuatorTable = []() consteval { + std::array table{}; + table['!'] = true; + table['%'] = true; + table['&'] = true; + // ch > 39 && ch < 47 && ch != 41: '(' '*' '+' ',' '-' '.' + for (int i = 40; i < 47; ++i) { + if (i != 41) table[i] = true; // Skip ')' + } + // ch > 57 && ch < 64: ':' ';' '<' '=' '>' '?' 
+ for (int i = 58; i < 64; ++i) table[i] = true; + table['['] = true; + table['^'] = true; + // ch > 122 && ch < 127 && ch != '}': '{' '|' '~' + for (int i = 123; i < 127; ++i) { + if (i != 125) table[i] = true; // Skip '}' + } + return table; +}(); + +// Identifier start lookup table (a-z, A-Z, _, $, >= 0x80) +// Digits are not set here; they are only added in kIdentifierCharTable. +static constexpr std::array kIdentifierStartTable = []() consteval { + std::array table{}; + for (int i = 'a'; i <= 'z'; ++i) table[i] = true; + for (int i = 'A'; i <= 'Z'; ++i) table[i] = true; + table['_'] = true; + table['$'] = true; + // UTF-8 continuation bytes and lead bytes (>= 0x80) + for (int i = 0x80; i < 256; ++i) table[i] = true; + return table; +}(); + +// Identifier char lookup table (identifier start + digits) +static constexpr std::array kIdentifierCharTable = []() consteval { + std::array table{}; + for (int i = 'a'; i <= 'z'; ++i) table[i] = true; + for (int i = 'A'; i <= 'Z'; ++i) table[i] = true; + table['_'] = true; + table['$'] = true; + for (int i = 0x80; i < 256; ++i) table[i] = true; + for (int i = '0'; i <= '9'; ++i) table[i] = true; + return table; +}(); + +// Whitespace/line break lookup table +// Only byte values 9-13 and 32 are set. +static constexpr std::array kBrOrWsTable = []() consteval { + std::array table{}; + // c > 8 && c < 14: \t \n \v \f \r + for (int i = 9; i < 14; ++i) table[i] = true; + table[32] = true; // space + return table; +}(); + +// ============================================================================ +// Inline functions using lookup tables +// ============================================================================ + +// Parse a hex digit, returns -1 if invalid +inline int hexDigit(unsigned char c) { + uint8_t val = kHexTable[c]; + return val == 255 ? 
-1 : static_cast(val); +} + +// Encode a Unicode code point as UTF-8 into the output string +// Without MERVE_USE_SIMDUTF, a code point above 0x10FFFF appends nothing. +inline void encodeUtf8(std::string& out, uint32_t codepoint) { +#ifdef MERVE_USE_SIMDUTF + // Use simdutf for optimized UTF-32 to UTF-8 conversion + char buf[4]; + size_t len = simdutf::convert_utf32_to_utf8( + reinterpret_cast(&codepoint), 1, buf); + out.append(buf, len); +#else + if (codepoint <= 0x7F) { + out.push_back(static_cast(codepoint)); + } else if (codepoint <= 0x7FF) { + out.push_back(static_cast(0xC0 | (codepoint >> 6))); + out.push_back(static_cast(0x80 | (codepoint & 0x3F))); + } else if (codepoint <= 0xFFFF) { + out.push_back(static_cast(0xE0 | (codepoint >> 12))); + out.push_back(static_cast(0x80 | ((codepoint >> 6) & 0x3F))); + out.push_back(static_cast(0x80 | (codepoint & 0x3F))); + } else if (codepoint <= 0x10FFFF) { + out.push_back(static_cast(0xF0 | (codepoint >> 18))); + out.push_back(static_cast(0x80 | ((codepoint >> 12) & 0x3F))); + out.push_back(static_cast(0x80 | ((codepoint >> 6) & 0x3F))); + out.push_back(static_cast(0x80 | (codepoint & 0x3F))); + } +#endif +} + +// Unescape JavaScript string escape sequences +// Returns empty optional on invalid escape sequences (like lone surrogates) +// Also returns empty on a trailing backslash and on truncated or non-hex \x / \u escapes. +// A high surrogate must be followed by a low-surrogate escape written in the same form +// (\uHHHH or \u{...}); the pair is combined into one code point before UTF-8 encoding. +// Any escaped character that is not recognized is copied through unchanged. +std::optional unescapeJsString(std::string_view str) { + std::string result; + result.reserve(str.size()); + + for (size_t i = 0; i < str.size(); ++i) { + if (str[i] != '\\') { + result.push_back(str[i]); + continue; + } + + if (++i >= str.size()) { + return std::nullopt; // Trailing backslash + } + + // Check simple escape table first (single character escapes) + uint8_t simple = kSimpleEscapeTable[static_cast(str[i])]; + if (simple != 0xFF) { + result.push_back(static_cast(simple)); + continue; + } + + // Handle complex escapes + switch (str[i]) { + case 'x': { + // \xHH - two hex digits + if (i + 2 >= str.size()) return std::nullopt; + int h1 = hexDigit(static_cast(str[i + 1])); + int h2 = hexDigit(static_cast(str[i + 2])); + if (h1 < 0 || h2 < 0) return 
std::nullopt; + result.push_back(static_cast((h1 << 4) | h2)); + i += 2; + break; + } + case 'u': { + if (i + 1 >= str.size()) return std::nullopt; + if (str[i + 1] == '{') { + // \u{XXXX} - variable length hex + // A missing '}' or an empty \u{} is rejected. + size_t start = i + 2; + size_t end_brace = str.find('}', start); + if (end_brace == std::string_view::npos || end_brace == start) return std::nullopt; + uint32_t codepoint = 0; + for (size_t j = start; j < end_brace; ++j) { + int digit = hexDigit(static_cast(str[j])); + if (digit < 0) return std::nullopt; + codepoint = (codepoint << 4) | static_cast(digit); + if (codepoint > 0x10FFFF) return std::nullopt; // Invalid codepoint + } + // Handle surrogate pairs in \u{XXXX} format + if (codepoint >= 0xD800 && codepoint <= 0xDBFF) { + // High surrogate - check for low surrogate \u{XXXX} + if (end_brace + 3 < str.size() && str[end_brace + 1] == '\\' && + str[end_brace + 2] == 'u' && str[end_brace + 3] == '{') { + size_t low_start = end_brace + 4; + size_t low_end = str.find('}', low_start); + if (low_end != std::string_view::npos && low_end > low_start) { + uint32_t low = 0; + bool valid_low = true; + for (size_t j = low_start; j < low_end; ++j) { + int digit = hexDigit(static_cast(str[j])); + if (digit < 0) { + valid_low = false; + break; + } + low = (low << 4) | static_cast(digit); + } + if (valid_low && low >= 0xDC00 && low <= 0xDFFF) { + // Valid surrogate pair - combine into single codepoint + codepoint = 0x10000 + ((codepoint - 0xD800) << 10) + (low - 0xDC00); + end_brace = low_end; // Skip past the low surrogate + } else { + // Lone high surrogate + return std::nullopt; + } + } else { + // Lone high surrogate + return std::nullopt; + } + } else { + // Lone high surrogate + return std::nullopt; + } + } else if (codepoint >= 0xDC00 && codepoint <= 0xDFFF) { + // Lone low surrogate + return std::nullopt; + } + encodeUtf8(result, codepoint); + i = end_brace; + } else { + // \uHHHH - exactly four hex digits + if (i + 4 >= str.size()) return std::nullopt; + 
uint32_t codepoint = 0; + for (int j = 1; j <= 4; ++j) { + int digit = hexDigit(static_cast(str[i + static_cast(j)])); + if (digit < 0) return std::nullopt; + codepoint = (codepoint << 4) | static_cast(digit); + } + // Handle surrogate pairs + if (codepoint >= 0xD800 && codepoint <= 0xDBFF) { + // High surrogate - check for low surrogate + if (i + 10 < str.size() && str[i + 5] == '\\' && str[i + 6] == 'u') { + uint32_t low = 0; + bool valid_low = true; + for (int j = 7; j <= 10; ++j) { + int digit = hexDigit(static_cast(str[i + static_cast(j)])); + if (digit < 0) { + valid_low = false; + break; + } + low = (low << 4) | static_cast(digit); + } + if (valid_low && low >= 0xDC00 && low <= 0xDFFF) { + // Valid surrogate pair - combine into single codepoint + codepoint = 0x10000 + ((codepoint - 0xD800) << 10) + (low - 0xDC00); + i += 6; // Skip the low surrogate + // i now rests on the second 'u'; the shared `i += 4` below steps over its four hex digits. + } else { + // Lone high surrogate + return std::nullopt; + } + } else { + // Lone high surrogate + return std::nullopt; + } + } else if (codepoint >= 0xDC00 && codepoint <= 0xDFFF) { + // Lone low surrogate + return std::nullopt; + } + encodeUtf8(result, codepoint); + i += 4; + } + break; + } + default: + // Unknown escape - just include the character as-is + result.push_back(str[i]); + break; + } + } + + return result; +} + +// Stack depth limits +constexpr size_t STACK_DEPTH = 2048; +constexpr size_t MAX_STAR_EXPORTS = 256; + +// RequireType enum for parsing require statements +enum class RequireType { + Import, + ExportAssign, + ExportStar +}; + +// StarExportBinding structure for tracking star export bindings +struct StarExportBinding { + std::string_view specifier; + std::string_view id; +}; + +// Thread-local state for error tracking (safe for concurrent parse calls). 
+thread_local std::optional last_error; + +// Lexer state class +class CJSLexer { +private: + const char* source; + const char* pos; + const char* end; + const char* lastTokenPos; + + uint16_t templateStackDepth; + uint16_t openTokenDepth; + uint16_t templateDepth; + + uint32_t line; + + bool lastSlashWasDivision; + bool nextBraceIsClass; + + std::array templateStack_; + std::array openTokenPosStack_; + std::array openClassPosStack; + std::array starExportStack_; + StarExportBinding* starExportStack; + const StarExportBinding* STAR_EXPORT_STACK_END; + + std::vector& exports; + std::vector& re_exports; + + // Increments `line` when consuming a line terminator. + // - Counts '\n' as a newline. + // - Counts '\r' as a newline only when it is not part of a CRLF sequence. + // (i.e., the next character is not '\n' or we're at end-of-input.) + void countNewline(char ch) { + line += (ch == '\n') || (ch == '\r' && (pos + 1 >= end || *(pos + 1) != '\n')); + } + + // Character classification helpers using lookup tables + static bool isBr(char c) { + return c == '\r' || c == '\n'; + } + + static bool isBrOrWs(unsigned char c) { + return kBrOrWsTable[c]; + } + + static bool isBrOrWsOrPunctuatorNotDot(unsigned char c) { + return kBrOrWsTable[c] || (kPunctuatorTable[c] && c != '.'); + } + + static bool isPunctuator(unsigned char ch) { + return kPunctuatorTable[ch]; + } + + static bool isExpressionPunctuator(unsigned char ch) { + return kExpressionPunctuatorTable[ch]; + } + + // String comparison helpers using string_view for cleaner, more maintainable code + static constexpr bool matchesAt(const char* p, const char* end_pos, std::string_view expected) { + size_t available = static_cast(end_pos - p); + if (available < expected.size()) return false; + for (size_t i = 0; i < expected.size(); ++i) { + if (p[i] != expected[i]) return false; + } + return true; + } + + // Character type detection using lookup tables + static bool isIdentifierStart(uint8_t ch) { + return 
kIdentifierStartTable[ch]; + } + + static bool isIdentifierChar(uint8_t ch) { + return kIdentifierCharTable[ch]; + } + + constexpr bool keywordStart(const char* p) const { + return p == source || isBrOrWsOrPunctuatorNotDot(*(p - 1)); + } + + constexpr bool readPrecedingKeyword(const char* p, std::string_view keyword) const { + if (p - static_cast(keyword.size()) + 1 < source) return false; + const char* start = p - keyword.size() + 1; + return matchesAt(start, end, keyword) && (start == source || isBrOrWsOrPunctuatorNotDot(*(start - 1))); + } + + // Keyword detection + constexpr bool isExpressionKeyword(const char* p) const { + switch (*p) { + case 'd': + switch (*(p - 1)) { + case 'i': + return readPrecedingKeyword(p - 2, "vo"); + case 'l': + return readPrecedingKeyword(p - 2, "yie"); + default: + return false; + } + case 'e': + switch (*(p - 1)) { + case 's': + switch (*(p - 2)) { + case 'l': + return p - 3 >= source && *(p - 3) == 'e' && keywordStart(p - 3); + case 'a': + return p - 3 >= source && *(p - 3) == 'c' && keywordStart(p - 3); + default: + return false; + } + case 't': + return readPrecedingKeyword(p - 2, "dele"); + default: + return false; + } + case 'f': + if (*(p - 1) != 'o' || *(p - 2) != 'e') + return false; + switch (*(p - 3)) { + case 'c': + return readPrecedingKeyword(p - 4, "instan"); + case 'p': + return readPrecedingKeyword(p - 4, "ty"); + default: + return false; + } + case 'n': + return (p - 1 >= source && *(p - 1) == 'i' && keywordStart(p - 1)) || + readPrecedingKeyword(p - 1, "retur"); + case 'o': + return p - 1 >= source && *(p - 1) == 'd' && keywordStart(p - 1); + case 'r': + return readPrecedingKeyword(p - 1, "debugge"); + case 't': + return readPrecedingKeyword(p - 1, "awai"); + case 'w': + switch (*(p - 1)) { + case 'e': + return p - 2 >= source && *(p - 2) == 'n' && keywordStart(p - 2); + case 'o': + return readPrecedingKeyword(p - 2, "thr"); + default: + return false; + } + } + return false; + } + + constexpr bool 
isParenKeyword(const char* curPos) const { + return readPrecedingKeyword(curPos, "while") || + readPrecedingKeyword(curPos, "for") || + readPrecedingKeyword(curPos, "if"); + } + + constexpr bool isExpressionTerminator(const char* curPos) const { + switch (*curPos) { + case '>': + return *(curPos - 1) == '='; + case ';': + case ')': + return true; + case 'h': + return readPrecedingKeyword(curPos - 1, "catc"); + case 'y': + return readPrecedingKeyword(curPos - 1, "finall"); + case 'e': + return readPrecedingKeyword(curPos - 1, "els"); + } + return false; + } + + // Parsing utilities + void syntaxError(lexer_error code) { + if (!last_error) { + last_error = code; + } + pos = end + 1; + } + + char commentWhitespace() { + char ch; + do { + if (pos >= end) return '\0'; + ch = *pos; + if (ch == '/') { + char next_ch = pos + 1 < end ? *(pos + 1) : '\0'; + if (next_ch == '/') + lineComment(); + else if (next_ch == '*') + blockComment(); + else + return ch; + } else if (!isBrOrWs(ch)) { + return ch; + } else { + countNewline(ch); + } + } while (pos++ < end); + return ch; + } + + void lineComment() { + while (pos++ < end) { + char ch = *pos; + if (ch == '\n' || ch == '\r') { + countNewline(ch); + return; + } + } + } + + void blockComment() { + pos++; + while (pos++ < end) { + char ch = *pos; + if (ch == '*' && *(pos + 1) == '/') { + pos++; + return; + } + countNewline(ch); + } + } + + void stringLiteral(char quote) { + while (pos++ < end) { + char ch = *pos; + if (ch == quote) + return; + if (ch == '\\') { + if (pos + 1 >= end) break; + ch = *++pos; + if (ch == '\r') { + ++line; + if (*(pos + 1) == '\n') + pos++; + } else if (ch == '\n') { + ++line; + } + } else if (isBr(ch)) + break; + } + syntaxError(lexer_error::UNTERMINATED_STRING_LITERAL); + } + + void regularExpression() { + while (pos++ < end) { + char ch = *pos; + if (ch == '/') + return; + if (ch == '[') { + regexCharacterClass(); + } else if (ch == '\\') { + if (pos + 1 < end) + pos++; + } else if (ch == '\n' || ch 
== '\r') + break; + } + syntaxError(lexer_error::UNTERMINATED_REGEX); + } + + void regexCharacterClass() { + while (pos++ < end) { + char ch = *pos; + if (ch == ']') + return; + if (ch == '\\') { + if (pos + 1 < end) + pos++; + } else if (ch == '\n' || ch == '\r') + break; + } + syntaxError(lexer_error::UNTERMINATED_REGEX_CHARACTER_CLASS); + } + + void templateString() { + while (pos++ < end) { + char ch = *pos; + if (ch == '$' && *(pos + 1) == '{') { + pos++; + if (templateStackDepth >= STACK_DEPTH) { + syntaxError(lexer_error::TEMPLATE_NEST_OVERFLOW); + return; + } + templateStack_[templateStackDepth++] = templateDepth; + templateDepth = ++openTokenDepth; + return; + } + if (ch == '`') + return; + if (ch == '\\' && pos + 1 < end) { + pos++; + countNewline(*pos); + } else { + countNewline(ch); + } + } + syntaxError(lexer_error::UNTERMINATED_TEMPLATE_STRING); + } + + bool identifier(char startCh) { + if (!isIdentifierStart(static_cast(startCh))) + return false; + pos++; + while (pos < end) { + char ch = *pos; + if (isIdentifierChar(static_cast(ch))) { + pos++; + } else { + break; + } + } + return true; + } + + // Check if string contains escape sequences + static bool needsUnescaping(std::string_view str) { +#ifdef MERVE_USE_SIMDUTF + // simdutf provides fast SIMD-based ASCII validation + // If the string is valid ASCII without high bytes, we can use a faster path + // But we still need to check for backslash + const char* ptr = simdutf::find(str.data(), str.data() + str.size(), '\\'); + return ptr != str.data() + str.size(); +#else + return str.find('\\') != std::string_view::npos; +#endif + } + + void addExport(std::string_view export_name, uint32_t at_line) { + // Skip surrounding quotes if present + if (!export_name.empty() && (export_name.front() == '\'' || export_name.front() == '"')) { + export_name.remove_prefix(1); + export_name.remove_suffix(1); + } + + // Fast path: no escaping needed, use string_view directly + if (!needsUnescaping(export_name)) { + // 
Check if this export already exists (avoid duplicates) + for (const auto& existing : exports) { + if (get_string_view(existing.name) == export_name) { + return; // Already exists, skip + } + } + exports.push_back(export_entry{export_name, at_line}); + return; + } + + // Slow path: unescape the export name (handles \u{XXXX}, \uHHHH, etc.) + // Returns nullopt for invalid sequences like lone surrogates + auto unescaped = unescapeJsString(export_name); + if (!unescaped.has_value()) { + return; // Skip invalid escape sequences + } + + const std::string& name = unescaped.value(); + + // Check if this export already exists (avoid duplicates) + for (const auto& existing : exports) { + if (get_string_view(existing.name) == name) { + return; // Already exists, skip + } + } + exports.push_back(export_entry{std::move(unescaped.value()), at_line}); + } + + void addReexport(std::string_view reexport_name, uint32_t at_line) { + // Skip surrounding quotes if present + if (!reexport_name.empty() && (reexport_name.front() == '\'' || reexport_name.front() == '"')) { + reexport_name.remove_prefix(1); + reexport_name.remove_suffix(1); + } + + // Fast path: no escaping needed, use string_view directly + if (!needsUnescaping(reexport_name)) { + re_exports.push_back(export_entry{reexport_name, at_line}); + return; + } + + // Slow path: unescape the reexport name + auto unescaped = unescapeJsString(reexport_name); + if (!unescaped.has_value()) { + return; // Skip invalid escape sequences + } + + re_exports.push_back(export_entry{std::move(unescaped.value()), at_line}); + } + + bool readExportsOrModuleDotExports(char ch) { + const char* revertPos = pos; + if (ch == 'm' && matchesAt(pos + 1, end, "odule")) { + pos += 6; + ch = commentWhitespace(); + if (ch != '.') { + pos = revertPos; + return false; + } + pos++; + ch = commentWhitespace(); + } + if (ch == 'e' && matchesAt(pos + 1, end, "xports")) { + pos += 7; + return true; + } + pos = revertPos; + return false; + } + + bool 
tryParseRequire(RequireType requireType) { + const char* revertPos = pos; + if (!matchesAt(pos + 1, end, "equire")) { + return false; + } + pos += 7; + char ch = commentWhitespace(); + if (ch == '(') { + pos++; + ch = commentWhitespace(); + const char* reexportStart = pos; + if (ch == '\'' || ch == '"') { + stringLiteral(ch); + const char* reexportEnd = ++pos; + ch = commentWhitespace(); + if (ch == ')') { + switch (requireType) { + case RequireType::ExportStar: + case RequireType::ExportAssign: + addReexport(std::string_view(reexportStart, reexportEnd - reexportStart), line); + return true; + default: + if (starExportStack < STAR_EXPORT_STACK_END) { + starExportStack->specifier = std::string_view(reexportStart, reexportEnd - reexportStart); + } + return true; + } + } + } + } + pos = revertPos; + return false; + } + + // Helper to parse property value in object literal (identifier or require()) + bool tryParsePropertyValue(char& ch) { + if (ch == 'r' && tryParseRequire(RequireType::ExportAssign)) { + ch = *pos; + return true; + } + if (identifier(ch)) { + ch = *pos; + return true; + } + return false; + } + + void tryParseLiteralExports() { + const char* revertPos = pos - 1; + while (pos++ < end) { + char ch = commentWhitespace(); + const char* startPos = pos; + if (identifier(ch)) { + const char* endPos = pos; + ch = commentWhitespace(); + + // Check if this is a getter syntax: get identifier() + if (ch != ':' && endPos - startPos == 3 && matchesAt(startPos, end, "get")) { + // Skip getter: get identifier() { ... 
} + if (identifier(ch)) { + ch = commentWhitespace(); + if (ch == '(') { + // This is a getter, stop parsing here (early termination) + pos = revertPos; + return; + } + } + // Not a getter, revert and fail + pos = revertPos; + return; + } + + if (ch == ':') { + pos++; + ch = commentWhitespace(); + if (!tryParsePropertyValue(ch)) { + pos = revertPos; + return; + } + } + addExport(std::string_view(startPos, endPos - startPos), line); + } else if (ch == '\'' || ch == '"') { + const char* start = pos; + stringLiteral(ch); + const char* end_pos = ++pos; + ch = commentWhitespace(); + if (ch == ':') { + pos++; + ch = commentWhitespace(); + if (!tryParsePropertyValue(ch)) { + pos = revertPos; + return; + } + addExport(std::string_view(start, end_pos - start), line); + } + } else if (ch == '.' && matchesAt(pos + 1, end, "..")) { + pos += 3; + if (pos < end && *pos == 'r' && tryParseRequire(RequireType::ExportAssign)) { + pos++; + } else if (pos < end && !identifier(*pos)) { + pos = revertPos; + return; + } + ch = commentWhitespace(); + } else { + pos = revertPos; + return; + } + + if (ch == '}') + return; + + if (ch != ',') { + pos = revertPos; + return; + } + } + } + + void tryParseExportsDotAssign(bool assign) { + pos += 7; + const char* revertPos = pos - 1; + char ch = commentWhitespace(); + switch (ch) { + case '.': { + pos++; + ch = commentWhitespace(); + const char* startPos = pos; + if (identifier(ch)) { + const char* endPos = pos; + ch = commentWhitespace(); + if (ch == '=') { + addExport(std::string_view(startPos, endPos - startPos), line); + return; + } + } + break; + } + case '[': { + pos++; + ch = commentWhitespace(); + if (ch == '\'' || ch == '"') { + const char* startPos = pos; + stringLiteral(ch); + const char* endPos = ++pos; + ch = commentWhitespace(); + if (ch != ']') break; + pos++; + ch = commentWhitespace(); + if (ch != '=') break; + addExport(std::string_view(startPos, endPos - startPos), line); + } + break; + } + case '=': { + if (assign) { + 
re_exports.clear(); + pos++; + ch = commentWhitespace(); + if (ch == '{') { + tryParseLiteralExports(); + return; + } + if (ch == 'r') + tryParseRequire(RequireType::ExportAssign); + } + break; + } + } + pos = revertPos; + } + + void tryParseModuleExportsDotAssign() { + pos += 6; + const char* revertPos = pos - 1; + char ch = commentWhitespace(); + if (ch == '.') { + pos++; + ch = commentWhitespace(); + if (ch == 'e' && matchesAt(pos + 1, end, "xports")) { + tryParseExportsDotAssign(true); + return; + } + } + pos = revertPos; + } + + bool tryParseObjectHasOwnProperty(std::string_view it_id) { + char ch = commentWhitespace(); + if (ch != 'O' || !matchesAt(pos + 1, end, "bject")) return false; + pos += 6; + ch = commentWhitespace(); + if (ch != '.') return false; + pos++; + ch = commentWhitespace(); + if (ch == 'p') { + if (!matchesAt(pos + 1, end, "rototype")) return false; + pos += 9; + ch = commentWhitespace(); + if (ch != '.') return false; + pos++; + ch = commentWhitespace(); + } + if (ch != 'h' || !matchesAt(pos + 1, end, "asOwnProperty")) return false; + pos += 14; + ch = commentWhitespace(); + if (ch != '.') return false; + pos++; + ch = commentWhitespace(); + if (ch != 'c' || !matchesAt(pos + 1, end, "all")) return false; + pos += 4; + ch = commentWhitespace(); + if (ch != '(') return false; + pos++; + ch = commentWhitespace(); + if (!identifier(ch)) return false; + ch = commentWhitespace(); + if (ch != ',') return false; + pos++; + ch = commentWhitespace(); + if (!matchesAt(pos, end, it_id)) return false; + pos += it_id.size(); + ch = commentWhitespace(); + if (ch != ')') return false; + pos++; + return true; + } + + void tryParseObjectDefineOrKeys(bool keys) { + pos += 6; + const char* revertPos = pos - 1; + char ch = commentWhitespace(); + if (ch == '.') { + pos++; + ch = commentWhitespace(); + if (ch == 'd' && matchesAt(pos + 1, end, "efineProperty")) { + const char* exportStart = nullptr; + const char* exportEnd = nullptr; + while (true) { + pos += 14; 
+ revertPos = pos - 1; + ch = commentWhitespace(); + if (ch != '(') break; + pos++; + ch = commentWhitespace(); + if (!readExportsOrModuleDotExports(ch)) break; + ch = commentWhitespace(); + if (ch != ',') break; + pos++; + ch = commentWhitespace(); + if (ch != '\'' && ch != '"') break; + exportStart = pos; + stringLiteral(ch); + exportEnd = ++pos; + ch = commentWhitespace(); + if (ch != ',') break; + pos++; + ch = commentWhitespace(); + if (ch != '{') break; + pos++; + ch = commentWhitespace(); + if (ch == 'e') { + if (!matchesAt(pos + 1, end, "numerable")) break; + pos += 10; + ch = commentWhitespace(); + if (ch != ':') break; + pos++; + ch = commentWhitespace(); + if (ch != 't' || !matchesAt(pos + 1, end, "rue")) break; + pos += 4; + ch = commentWhitespace(); + if (ch != ',') break; + pos++; + ch = commentWhitespace(); + } + if (ch == 'v') { + if (!matchesAt(pos + 1, end, "alue")) break; + pos += 5; + ch = commentWhitespace(); + if (ch != ':') break; + if (exportStart && exportEnd) + addExport(std::string_view(exportStart, exportEnd - exportStart), line); + pos = revertPos; + return; + } else if (ch == 'g') { + if (!matchesAt(pos + 1, end, "et")) break; + pos += 3; + ch = commentWhitespace(); + if (ch == ':') { + pos++; + ch = commentWhitespace(); + if (ch != 'f') break; + if (!matchesAt(pos + 1, end, "unction")) break; + pos += 8; + const char* lastPos = pos; + ch = commentWhitespace(); + if (ch != '(' && (lastPos == pos || !identifier(ch))) break; + ch = commentWhitespace(); + } + if (ch != '(') break; + pos++; + ch = commentWhitespace(); + if (ch != ')') break; + pos++; + ch = commentWhitespace(); + if (ch != '{') break; + pos++; + ch = commentWhitespace(); + if (ch != 'r') break; + if (!matchesAt(pos + 1, end, "eturn")) break; + pos += 6; + ch = commentWhitespace(); + if (!identifier(ch)) break; + ch = commentWhitespace(); + if (ch == '.') { + pos++; + ch = commentWhitespace(); + if (!identifier(ch)) break; + ch = commentWhitespace(); + } else if (ch == '[') 
{ + pos++; + ch = commentWhitespace(); + if (ch == '\'' || ch == '"') { + stringLiteral(ch); + } else { + break; + } + pos++; + ch = commentWhitespace(); + if (ch != ']') break; + pos++; + ch = commentWhitespace(); + } + if (ch == ';') { + pos++; + ch = commentWhitespace(); + } + if (ch != '}') break; + pos++; + ch = commentWhitespace(); + if (ch == ',') { + pos++; + ch = commentWhitespace(); + } + if (ch != '}') break; + pos++; + ch = commentWhitespace(); + if (ch != ')') break; + if (exportStart && exportEnd) + addExport(std::string_view(exportStart, exportEnd - exportStart), line); + return; + } + break; + } + } else if (keys && ch == 'k' && matchesAt(pos + 1, end, "eys")) { + while (true) { + pos += 4; + revertPos = pos - 1; + ch = commentWhitespace(); + if (ch != '(') break; + pos++; + ch = commentWhitespace(); + const char* id_pos = pos; + if (!identifier(ch)) break; + std::string_view id(id_pos, static_cast(pos - id_pos)); + ch = commentWhitespace(); + if (ch != ')') break; + + revertPos = pos++; + ch = commentWhitespace(); + if (ch != '.') break; + pos++; + ch = commentWhitespace(); + if (ch != 'f' || !matchesAt(pos + 1, end, "orEach")) break; + pos += 7; + ch = commentWhitespace(); + revertPos = pos - 1; + if (ch != '(') break; + pos++; + ch = commentWhitespace(); + if (ch != 'f' || !matchesAt(pos + 1, end, "unction")) break; + pos += 8; + ch = commentWhitespace(); + if (ch != '(') break; + pos++; + ch = commentWhitespace(); + const char* it_id_pos = pos; + if (!identifier(ch)) break; + std::string_view it_id(it_id_pos, static_cast(pos - it_id_pos)); + ch = commentWhitespace(); + if (ch != ')') break; + pos++; + ch = commentWhitespace(); + if (ch != '{') break; + pos++; + ch = commentWhitespace(); + if (ch != 'i' || *(pos + 1) != 'f') break; + pos += 2; + ch = commentWhitespace(); + if (ch != '(') break; + pos++; + ch = commentWhitespace(); + if (!matchesAt(pos, end, it_id)) break; + pos += it_id.size(); + ch = commentWhitespace(); + + if (ch == '=') { + 
if (!matchesAt(pos + 1, end, "==")) break; + pos += 3; + ch = commentWhitespace(); + if (ch != '"' && ch != '\'') break; + char quot = ch; + if (!matchesAt(pos + 1, end, "default")) break; + pos += 8; + ch = commentWhitespace(); + if (ch != quot) break; + pos++; + ch = commentWhitespace(); + if (ch != '|' || *(pos + 1) != '|') break; + pos += 2; + ch = commentWhitespace(); + if (!matchesAt(pos, end, it_id)) break; + pos += it_id.size(); + ch = commentWhitespace(); + if (ch != '=' || !matchesAt(pos + 1, end, "==")) break; + pos += 3; + ch = commentWhitespace(); + if (ch != '"' && ch != '\'') break; + quot = ch; + if (!matchesAt(pos + 1, end, "__esModule")) break; + pos += 11; + ch = commentWhitespace(); + if (ch != quot) break; + pos++; + ch = commentWhitespace(); + if (ch != ')') break; + pos++; + ch = commentWhitespace(); + if (ch != 'r' || !matchesAt(pos + 1, end, "eturn")) break; + pos += 6; + ch = commentWhitespace(); + if (ch == ';') + pos++; + ch = commentWhitespace(); + + if (ch == 'i' && *(pos + 1) == 'f') { + bool inIf = true; + pos += 2; + ch = commentWhitespace(); + if (ch != '(') break; + pos++; + const char* ifInnerPos = pos; + + if (tryParseObjectHasOwnProperty(it_id)) { + ch = commentWhitespace(); + if (ch != ')') break; + pos++; + ch = commentWhitespace(); + if (ch != 'r' || !matchesAt(pos + 1, end, "eturn")) break; + pos += 6; + ch = commentWhitespace(); + if (ch == ';') + pos++; + ch = commentWhitespace(); + if (ch == 'i' && *(pos + 1) == 'f') { + pos += 2; + ch = commentWhitespace(); + if (ch != '(') break; + pos++; + } else { + inIf = false; + } + } else { + pos = ifInnerPos; + } + + if (inIf) { + if (!matchesAt(pos, end, it_id)) break; + pos += it_id.size(); + ch = commentWhitespace(); + if (ch != 'i' || !matchesAt(pos + 1, end, "n ")) break; + pos += 3; + ch = commentWhitespace(); + if (!readExportsOrModuleDotExports(ch)) break; + ch = commentWhitespace(); + if (ch != '&' || *(pos + 1) != '&') break; + pos += 2; + ch = commentWhitespace(); + 
if (!readExportsOrModuleDotExports(ch)) break; + ch = commentWhitespace(); + if (ch != '[') break; + pos++; + ch = commentWhitespace(); + if (!matchesAt(pos, end, it_id)) break; + pos += it_id.size(); + ch = commentWhitespace(); + if (ch != ']') break; + pos++; + ch = commentWhitespace(); + if (ch != '=' || !matchesAt(pos + 1, end, "==")) break; + pos += 3; + ch = commentWhitespace(); + if (!matchesAt(pos, end, id)) break; + pos += id.size(); + ch = commentWhitespace(); + if (ch != '[') break; + pos++; + ch = commentWhitespace(); + if (!matchesAt(pos, end, it_id)) break; + pos += it_id.size(); + ch = commentWhitespace(); + if (ch != ']') break; + pos++; + ch = commentWhitespace(); + if (ch != ')') break; + pos++; + ch = commentWhitespace(); + if (ch != 'r' || !matchesAt(pos + 1, end, "eturn")) break; + pos += 6; + ch = commentWhitespace(); + if (ch == ';') + pos++; + ch = commentWhitespace(); + } + } + } else if (ch == '!') { + if (!matchesAt(pos + 1, end, "==")) break; + pos += 3; + ch = commentWhitespace(); + if (ch != '"' && ch != '\'') break; + char quot = ch; + if (!matchesAt(pos + 1, end, "default")) break; + pos += 8; + ch = commentWhitespace(); + if (ch != quot) break; + pos++; + ch = commentWhitespace(); + if (ch == '&') { + if (*(pos + 1) != '&') break; + pos += 2; + ch = commentWhitespace(); + if (ch != '!') break; + pos++; + ch = commentWhitespace(); + if (ch == 'O' && matchesAt(pos + 1, end, "bject.")) { + if (!tryParseObjectHasOwnProperty(it_id)) break; + } else if (identifier(ch)) { + ch = commentWhitespace(); + if (ch != '.') break; + pos++; + ch = commentWhitespace(); + if (ch != 'h' || !matchesAt(pos + 1, end, "asOwnProperty")) break; + pos += 14; + ch = commentWhitespace(); + if (ch != '(') break; + pos++; + ch = commentWhitespace(); + if (!matchesAt(pos, end, it_id)) break; + pos += it_id.size(); + ch = commentWhitespace(); + if (ch != ')') break; + pos++; + } + ch = commentWhitespace(); + } + if (ch != ')') break; + pos++; + ch = 
commentWhitespace(); + } else { + break; + } + + if (readExportsOrModuleDotExports(ch)) { + ch = commentWhitespace(); + if (ch != '[') break; + pos++; + ch = commentWhitespace(); + if (!matchesAt(pos, end, it_id)) break; + pos += it_id.size(); + ch = commentWhitespace(); + if (ch != ']') break; + pos++; + ch = commentWhitespace(); + if (ch != '=') break; + pos++; + ch = commentWhitespace(); + if (!matchesAt(pos, end, id)) break; + pos += id.size(); + ch = commentWhitespace(); + if (ch != '[') break; + pos++; + ch = commentWhitespace(); + if (!matchesAt(pos, end, it_id)) break; + pos += it_id.size(); + ch = commentWhitespace(); + if (ch != ']') break; + pos++; + ch = commentWhitespace(); + if (ch == ';') { + pos++; + ch = commentWhitespace(); + } + } else if (ch == 'O') { + if (!matchesAt(pos + 1, end, "bject")) break; + pos += 6; + ch = commentWhitespace(); + if (ch != '.') break; + pos++; + ch = commentWhitespace(); + if (ch != 'd' || !matchesAt(pos + 1, end, "efineProperty")) break; + pos += 14; + ch = commentWhitespace(); + if (ch != '(') break; + pos++; + ch = commentWhitespace(); + if (!readExportsOrModuleDotExports(ch)) break; + ch = commentWhitespace(); + if (ch != ',') break; + pos++; + ch = commentWhitespace(); + if (!matchesAt(pos, end, it_id)) break; + pos += it_id.size(); + ch = commentWhitespace(); + if (ch != ',') break; + pos++; + ch = commentWhitespace(); + if (ch != '{') break; + pos++; + ch = commentWhitespace(); + if (ch != 'e' || !matchesAt(pos + 1, end, "numerable")) break; + pos += 10; + ch = commentWhitespace(); + if (ch != ':') break; + pos++; + ch = commentWhitespace(); + if (ch != 't' || !matchesAt(pos + 1, end, "rue")) break; + pos += 4; + ch = commentWhitespace(); + if (ch != ',') break; + pos++; + ch = commentWhitespace(); + if (ch != 'g' || !matchesAt(pos + 1, end, "et")) break; + pos += 3; + ch = commentWhitespace(); + if (ch == ':') { + pos++; + ch = commentWhitespace(); + if (ch != 'f') break; + if (!matchesAt(pos + 1, end, 
"unction")) break; + pos += 8; + const char* lastPos = pos; + ch = commentWhitespace(); + if (ch != '(' && (lastPos == pos || !identifier(ch))) break; + ch = commentWhitespace(); + } + if (ch != '(') break; + pos++; + ch = commentWhitespace(); + if (ch != ')') break; + pos++; + ch = commentWhitespace(); + if (ch != '{') break; + pos++; + ch = commentWhitespace(); + if (ch != 'r' || !matchesAt(pos + 1, end, "eturn")) break; + pos += 6; + ch = commentWhitespace(); + if (!matchesAt(pos, end, id)) break; + pos += id.size(); + ch = commentWhitespace(); + if (ch != '[') break; + pos++; + ch = commentWhitespace(); + if (!matchesAt(pos, end, it_id)) break; + pos += it_id.size(); + ch = commentWhitespace(); + if (ch != ']') break; + pos++; + ch = commentWhitespace(); + if (ch == ';') { + pos++; + ch = commentWhitespace(); + } + if (ch != '}') break; + pos++; + ch = commentWhitespace(); + if (ch == ',') { + pos++; + ch = commentWhitespace(); + } + if (ch != '}') break; + pos++; + ch = commentWhitespace(); + if (ch != ')') break; + pos++; + ch = commentWhitespace(); + if (ch == ';') { + pos++; + ch = commentWhitespace(); + } + } else { + break; + } + + if (ch != '}') break; + pos++; + ch = commentWhitespace(); + if (ch != ')') break; + + // Search through export bindings to see if this is a star export + StarExportBinding* curCheckBinding = &starExportStack_[0]; + while (curCheckBinding != starExportStack) { + if (curCheckBinding->id == id) { + addReexport(curCheckBinding->specifier, line); + pos = revertPos; + return; + } + curCheckBinding++; + } + return; + } + } + } + pos = revertPos; + } + + void tryBacktrackAddStarExportBinding(const char* bPos) { + while (*bPos == ' ' && bPos > source) + bPos--; + if (*bPos == '=') { + bPos--; + while (*bPos == ' ' && bPos > source) + bPos--; + const char* id_end = bPos; + bool identifierStart = false; + while (bPos > source) { + char ch = *bPos; + if (!isIdentifierChar(static_cast(ch))) + break; + identifierStart = 
isIdentifierStart(static_cast(ch)); + bPos--; + } + if (identifierStart && *bPos == ' ') { + if (starExportStack == STAR_EXPORT_STACK_END) + return; + starExportStack->id = std::string_view(bPos + 1, static_cast(id_end - bPos)); + while (*bPos == ' ' && bPos > source) + bPos--; + switch (*bPos) { + case 'r': + if (!readPrecedingKeyword(bPos - 1, "va")) + return; + break; + case 't': + if (!readPrecedingKeyword(bPos - 1, "le") && !readPrecedingKeyword(bPos - 1, "cons")) + return; + break; + default: + return; + } + starExportStack++; + } + } + } + + void throwIfImportStatement() { + const char* startPos = pos; + pos += 6; + char ch = commentWhitespace(); + switch (ch) { + case '(': + openTokenPosStack_[openTokenDepth++] = startPos; + return; + case '.': + // Check if followed by 'meta' (possibly with whitespace) + pos++; + ch = commentWhitespace(); + // Use str_eq4 for more efficient comparison + if (ch == 'm' && pos + 4 <= end && matchesAt(pos + 1, end, "eta")) { + // Check that 'meta' is not followed by an identifier character + if (pos + 4 < end && isIdentifierChar(static_cast(pos[4]))) { + // It's something like import.metaData, not import.meta + return; + } + syntaxError(lexer_error::UNEXPECTED_ESM_IMPORT_META); + } + return; + default: + if (pos == startPos + 6) + break; + [[fallthrough]]; + case '"': + case '\'': + case '{': + case '*': + if (openTokenDepth != 0) { + pos--; + return; + } + syntaxError(lexer_error::UNEXPECTED_ESM_IMPORT); + } + } + + void throwIfExportStatement() { + pos += 6; + const char* curPos = pos; + char ch = commentWhitespace(); + if (pos == curPos && !isPunctuator(ch)) + return; + syntaxError(lexer_error::UNEXPECTED_ESM_EXPORT); + } + +public: + CJSLexer(std::vector& out_exports, std::vector& out_re_exports) + : source(nullptr), pos(nullptr), end(nullptr), lastTokenPos(nullptr), + templateStackDepth(0), openTokenDepth(0), templateDepth(0), + line(1), + lastSlashWasDivision(false), nextBraceIsClass(false), + templateStack_{}, 
openTokenPosStack_{}, openClassPosStack{}, + starExportStack_{}, starExportStack(nullptr), STAR_EXPORT_STACK_END(nullptr), + exports(out_exports), re_exports(out_re_exports) {} + + bool parse(std::string_view file_contents) { + source = file_contents.data(); + pos = source - 1; + end = source + file_contents.size(); + // Initialize lastTokenPos to before source to detect start-of-input condition + // when checking if '/' should be treated as regex vs division operator + lastTokenPos = source - 1; + + templateStackDepth = 0; + openTokenDepth = 0; + templateDepth = std::numeric_limits::max(); + line = 1; + lastSlashWasDivision = false; + starExportStack = &starExportStack_[0]; + STAR_EXPORT_STACK_END = &starExportStack_[MAX_STAR_EXPORTS - 1]; + nextBraceIsClass = false; + + char ch = '\0'; + + // Handle shebang + if (file_contents.size() >= 2 && source[0] == '#' && source[1] == '!') { + if (file_contents.size() == 2) + return true; + pos += 2; + while (pos < end) { + ch = *pos; + if (ch == '\n' || ch == '\r') + break; + pos++; + } + lastTokenPos = pos; // Update lastTokenPos after shebang + } + + while (pos++ < end) { + ch = *pos; + + if (ch == ' ' || (ch < 14 && ch > 8)) { + countNewline(ch); + continue; + } + + if (openTokenDepth == 0) { + switch (ch) { + case 'i': + if (pos + 6 < end && matchesAt(pos + 1, end, "mport") && keywordStart(pos)) + throwIfImportStatement(); + lastTokenPos = pos; + continue; + case 'r': { + const char* startPos = pos; + if (tryParseRequire(RequireType::Import) && keywordStart(startPos)) + tryBacktrackAddStarExportBinding(startPos - 1); + lastTokenPos = pos; + continue; + } + case '_': + if (pos + 23 < end && matchesAt(pos + 1, end, "interopRequireWildcard") && (keywordStart(pos) || *(pos - 1) == '.')) { + const char* startPos = pos; + pos += 23; + if (*pos == '(') { + pos++; + openTokenPosStack_[openTokenDepth++] = lastTokenPos; + if (tryParseRequire(RequireType::Import) && keywordStart(startPos)) + 
tryBacktrackAddStarExportBinding(startPos - 1); + } + } else if (pos + 8 < end && matchesAt(pos + 1, end, "_export") && (keywordStart(pos) || *(pos - 1) == '.')) { + pos += 8; + if (pos + 4 < end && matchesAt(pos, end, "Star")) + pos += 4; + if (*pos == '(') { + openTokenPosStack_[openTokenDepth++] = lastTokenPos; + if (*(pos + 1) == 'r') { + pos++; + tryParseRequire(RequireType::ExportStar); + } + } + } + lastTokenPos = pos; + continue; + } + } + + switch (ch) { + case 'e': + if (pos + 6 < end && matchesAt(pos + 1, end, "xport") && keywordStart(pos)) { + if (pos + 7 < end && *(pos + 6) == 's') + tryParseExportsDotAssign(false); + else if (openTokenDepth == 0) + throwIfExportStatement(); + } + break; + case 'c': + if (keywordStart(pos) && matchesAt(pos + 1, end, "lass") && isBrOrWs(*(pos + 5))) + nextBraceIsClass = true; + break; + case 'm': + if (pos + 6 < end && matchesAt(pos + 1, end, "odule") && keywordStart(pos)) + tryParseModuleExportsDotAssign(); + break; + case 'O': + if (pos + 6 < end && matchesAt(pos + 1, end, "bject") && keywordStart(pos)) + tryParseObjectDefineOrKeys(openTokenDepth == 0); + break; + case '(': + openTokenPosStack_[openTokenDepth++] = lastTokenPos; + break; + case ')': + if (openTokenDepth == 0) { + syntaxError(lexer_error::UNEXPECTED_PAREN); + return false; + } + openTokenDepth--; + break; + case '{': + openClassPosStack[openTokenDepth] = nextBraceIsClass; + nextBraceIsClass = false; + openTokenPosStack_[openTokenDepth++] = lastTokenPos; + break; + case '}': + if (openTokenDepth == 0) { + syntaxError(lexer_error::UNEXPECTED_BRACE); + return false; + } + if (openTokenDepth-- == templateDepth) { + templateDepth = templateStack_[--templateStackDepth]; + templateString(); + } else { + if (templateDepth != std::numeric_limits::max() && openTokenDepth < templateDepth) { + syntaxError(lexer_error::UNTERMINATED_TEMPLATE_STRING); + return false; + } + } + break; + case '\'': + case '"': + stringLiteral(ch); + break; + case '/': { + char next_ch = 
pos + 1 < end ? *(pos + 1) : '\0'; + if (next_ch == '/') { + lineComment(); + continue; + } else if (next_ch == '*') { + blockComment(); + continue; + } else { + // Check if lastTokenPos is before the source (start of input) + bool isStartOfInput = lastTokenPos < source; + char lastToken = isStartOfInput ? '\0' : *lastTokenPos; + + if ((isExpressionPunctuator(lastToken) && + !(lastToken == '.' && lastTokenPos > source && *(lastTokenPos - 1) >= '0' && *(lastTokenPos - 1) <= '9') && + !(lastToken == '+' && lastTokenPos > source && *(lastTokenPos - 1) == '+') && + !(lastToken == '-' && lastTokenPos > source && *(lastTokenPos - 1) == '-')) || + (lastToken == ')' && isParenKeyword(openTokenPosStack_[openTokenDepth])) || + (lastToken == '}' && (openTokenPosStack_[openTokenDepth] < source || isExpressionTerminator(openTokenPosStack_[openTokenDepth]) || openClassPosStack[openTokenDepth])) || + (lastToken == '/' && lastSlashWasDivision) || + (!isStartOfInput && isExpressionKeyword(lastTokenPos)) || + !lastToken || isStartOfInput) { + regularExpression(); + lastSlashWasDivision = false; + } else { + lastSlashWasDivision = true; + } + } + break; + } + case '`': + if (templateDepth == std::numeric_limits::max() - 1) { + syntaxError(lexer_error::TEMPLATE_NEST_OVERFLOW); + return false; + } + templateString(); + break; + } + lastTokenPos = pos; + } + + if (templateDepth != std::numeric_limits::max() || openTokenDepth || last_error) { + return false; + } + + return true; + } +}; + +std::optional parse_commonjs(std::string_view file_contents) { + last_error.reset(); + + lexer_analysis result; + CJSLexer lexer(result.exports, result.re_exports); + + if (lexer.parse(file_contents)) { + return result; // NRVO or implicit move applies + } + + return std::nullopt; +} + +const std::optional& get_last_error() { + return last_error; +} + +} // namespace lexer +/* end file parser.cpp */ +/* begin file merve_c.cpp */ +/* begin file merve.h */ +#ifndef MERVE_H +#define MERVE_H + + +#endif // 
MERVE_H +/* end file merve.h */ +/* begin file merve_c.h */ +/** + * @file merve_c.h + * @brief Includes the C definitions for merve. This is a C file, not C++. + */ +#ifndef MERVE_C_H +#define MERVE_C_H + +#include +#include +#include + +/** + * @brief Non-owning string reference. + * + * The data pointer is NOT null-terminated. Always use the length field. + * + * The data is valid as long as: + * - The merve_analysis handle that produced it has not been freed. + * - For string_view-backed exports: the original source buffer is alive. + */ +typedef struct { + const char* data; + size_t length; +} merve_string; + +/** + * @brief Opaque handle to a CommonJS parse result. + * + * Created by merve_parse_commonjs(). Must be freed with merve_free(). + */ +typedef void* merve_analysis; + +/** + * @brief Version number components. + */ +typedef struct { + int major; + int minor; + int revision; +} merve_version_components; + +/* Error codes corresponding to lexer::lexer_error values. */ +#define MERVE_ERROR_TODO 0 +#define MERVE_ERROR_UNEXPECTED_PAREN 1 +#define MERVE_ERROR_UNEXPECTED_BRACE 2 +#define MERVE_ERROR_UNTERMINATED_PAREN 3 +#define MERVE_ERROR_UNTERMINATED_BRACE 4 +#define MERVE_ERROR_UNTERMINATED_TEMPLATE_STRING 5 +#define MERVE_ERROR_UNTERMINATED_STRING_LITERAL 6 +#define MERVE_ERROR_UNTERMINATED_REGEX_CHARACTER_CLASS 7 +#define MERVE_ERROR_UNTERMINATED_REGEX 8 +#define MERVE_ERROR_UNEXPECTED_ESM_IMPORT_META 9 +#define MERVE_ERROR_UNEXPECTED_ESM_IMPORT 10 +#define MERVE_ERROR_UNEXPECTED_ESM_EXPORT 11 +#define MERVE_ERROR_TEMPLATE_NEST_OVERFLOW 12 + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Parse CommonJS source code and extract export information. + * + * The source buffer must remain valid while accessing string_view-backed + * export names from the returned handle. + * + * You must call merve_free() on the returned handle when done. + * + * @param input Pointer to the JavaScript source (need not be null-terminated). 
+ * NULL is treated as an empty string. + * @param length Length of the input in bytes. + * @return A handle to the parse result, or NULL on out-of-memory. + * Use merve_is_valid() to check if parsing succeeded. + */ +merve_analysis merve_parse_commonjs(const char* input, size_t length); + +/** + * Check whether the parse result is valid (parsing succeeded). + * + * @param result Handle returned by merve_parse_commonjs(). NULL returns false. + * @return true if parsing succeeded, false otherwise. + */ +bool merve_is_valid(merve_analysis result); + +/** + * Free a parse result and all associated memory. + * + * @param result Handle returned by merve_parse_commonjs(). NULL is a no-op. + */ +void merve_free(merve_analysis result); + +/** + * Get the number of named exports found. + * + * @param result A parse result handle. NULL returns 0. + * @return Number of exports, or 0 if result is NULL or invalid. + */ +size_t merve_get_exports_count(merve_analysis result); + +/** + * Get the number of re-export module specifiers found. + * + * @param result A parse result handle. NULL returns 0. + * @return Number of re-exports, or 0 if result is NULL or invalid. + */ +size_t merve_get_reexports_count(merve_analysis result); + +/** + * Get the name of an export at the given index. + * + * @param result A valid parse result handle. + * @param index Zero-based index (must be < merve_get_exports_count()). + * @return Non-owning string reference. Returns {NULL, 0} on error. + */ +merve_string merve_get_export_name(merve_analysis result, size_t index); + +/** + * Get the 1-based source line number of an export. + * + * @param result A valid parse result handle. + * @param index Zero-based index (must be < merve_get_exports_count()). + * @return 1-based line number, or 0 on error. + */ +uint32_t merve_get_export_line(merve_analysis result, size_t index); + +/** + * Get the module specifier of a re-export at the given index. + * + * @param result A valid parse result handle. 
+ * @param index Zero-based index (must be < merve_get_reexports_count()). + * @return Non-owning string reference. Returns {NULL, 0} on error. + */ +merve_string merve_get_reexport_name(merve_analysis result, size_t index); + +/** + * Get the 1-based source line number of a re-export. + * + * @param result A valid parse result handle. + * @param index Zero-based index (must be < merve_get_reexports_count()). + * @return 1-based line number, or 0 on error. + */ +uint32_t merve_get_reexport_line(merve_analysis result, size_t index); + +/** + * Get the error code from the last merve_parse_commonjs() call. + * + * @return One of the MERVE_ERROR_* constants, or -1 if the last parse + * succeeded. + * @note This is global state, overwritten by each merve_parse_commonjs() call. + */ +int merve_get_last_error(void); + +/** + * Get the merve library version string. + * + * @return Null-terminated version string (e.g. "1.0.1"). Never NULL. + */ +const char* merve_get_version(void); + +/** + * Get the merve library version as individual components. + * + * @return Struct with major, minor, and revision fields. 
+ */ +merve_version_components merve_get_version_components(void); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* MERVE_C_H */ +/* end file merve_c.h */ + +#include + +struct merve_analysis_impl { + std::optional result{}; +}; + +static merve_string merve_string_create(const char* data, size_t length) { + merve_string out{}; + out.data = data; + out.length = length; + return out; +} + +extern "C" { + +merve_analysis merve_parse_commonjs(const char* input, size_t length) { + merve_analysis_impl* impl = new (std::nothrow) merve_analysis_impl(); + if (!impl) return nullptr; + if (input != nullptr) { + impl->result = lexer::parse_commonjs(std::string_view(input, length)); + } else { + impl->result = lexer::parse_commonjs(std::string_view("", 0)); + } + return static_cast(impl); +} + +bool merve_is_valid(merve_analysis result) { + if (!result) return false; + return static_cast(result)->result.has_value(); +} + +void merve_free(merve_analysis result) { + if (!result) return; + delete static_cast(result); +} + +size_t merve_get_exports_count(merve_analysis result) { + if (!result) return 0; + merve_analysis_impl* impl = static_cast(result); + if (!impl->result.has_value()) return 0; + return impl->result->exports.size(); +} + +size_t merve_get_reexports_count(merve_analysis result) { + if (!result) return 0; + merve_analysis_impl* impl = static_cast(result); + if (!impl->result.has_value()) return 0; + return impl->result->re_exports.size(); +} + +merve_string merve_get_export_name(merve_analysis result, size_t index) { + if (!result) return merve_string_create(nullptr, 0); + merve_analysis_impl* impl = static_cast(result); + if (!impl->result.has_value()) return merve_string_create(nullptr, 0); + if (index >= impl->result->exports.size()) + return merve_string_create(nullptr, 0); + std::string_view sv = + lexer::get_string_view(impl->result->exports[index]); + return merve_string_create(sv.data(), sv.size()); +} + +uint32_t 
merve_get_export_line(merve_analysis result, size_t index) { + if (!result) return 0; + merve_analysis_impl* impl = static_cast(result); + if (!impl->result.has_value()) return 0; + if (index >= impl->result->exports.size()) return 0; + return impl->result->exports[index].line; +} + +merve_string merve_get_reexport_name(merve_analysis result, size_t index) { + if (!result) return merve_string_create(nullptr, 0); + merve_analysis_impl* impl = static_cast(result); + if (!impl->result.has_value()) return merve_string_create(nullptr, 0); + if (index >= impl->result->re_exports.size()) + return merve_string_create(nullptr, 0); + std::string_view sv = + lexer::get_string_view(impl->result->re_exports[index]); + return merve_string_create(sv.data(), sv.size()); +} + +uint32_t merve_get_reexport_line(merve_analysis result, size_t index) { + if (!result) return 0; + merve_analysis_impl* impl = static_cast(result); + if (!impl->result.has_value()) return 0; + if (index >= impl->result->re_exports.size()) return 0; + return impl->result->re_exports[index].line; +} + +int merve_get_last_error(void) { + const std::optional& err = lexer::get_last_error(); + if (!err.has_value()) return -1; + return static_cast(err.value()); +} + +const char* merve_get_version(void) { return MERVE_VERSION; } + +merve_version_components merve_get_version_components(void) { + merve_version_components vc{}; + vc.major = lexer::MERVE_VERSION_MAJOR; + vc.minor = lexer::MERVE_VERSION_MINOR; + vc.revision = lexer::MERVE_VERSION_REVISION; + return vc; +} + +} /* extern "C" */ +/* end file merve_c.cpp */ diff --git a/rust/deps/merve.h b/rust/deps/merve.h new file mode 100644 index 0000000..6dd34d7 --- /dev/null +++ b/rust/deps/merve.h @@ -0,0 +1,181 @@ +/* begin file merve.h */ +#ifndef MERVE_H +#define MERVE_H + +/* begin file merve/parser.h */ +#ifndef MERVE_PARSER_H +#define MERVE_PARSER_H + +/* begin file merve/version.h */ +/** + * @file version.h + * @brief Definitions for merve's version number. 
+ */ +#ifndef MERVE_VERSION_H +#define MERVE_VERSION_H + +#define MERVE_VERSION "1.0.1" // x-release-please-version + +namespace lexer { + +enum { + MERVE_VERSION_MAJOR = 1, // x-release-please-major + MERVE_VERSION_MINOR = 0, // x-release-please-minor + MERVE_VERSION_REVISION = 1, // x-release-please-patch +}; + +} // namespace lexer + +#endif // MERVE_VERSION_H +/* end file merve/version.h */ + +#include +#include +#include +#include +#include +#include + +namespace lexer { + +/** + * @brief Error codes returned by the lexer when parsing fails. + */ +enum lexer_error { + TODO, // Reserved for future use + + // Syntax errors - indicate malformed JavaScript + UNEXPECTED_PAREN, ///< Unexpected closing parenthesis + UNEXPECTED_BRACE, ///< Unexpected closing brace + UNTERMINATED_PAREN, ///< Unclosed parenthesis + UNTERMINATED_BRACE, ///< Unclosed brace + UNTERMINATED_TEMPLATE_STRING, ///< Unclosed template literal + UNTERMINATED_STRING_LITERAL, ///< Unclosed string literal + UNTERMINATED_REGEX_CHARACTER_CLASS, ///< Unclosed regex character class + UNTERMINATED_REGEX, ///< Unclosed regular expression + + // ESM syntax errors - indicate the file should be parsed as ESM instead + UNEXPECTED_ESM_IMPORT_META, ///< Found import.meta (ESM only) + UNEXPECTED_ESM_IMPORT, ///< Found import declaration (ESM only) + UNEXPECTED_ESM_EXPORT, ///< Found export declaration (ESM only) + + // Resource limit errors + TEMPLATE_NEST_OVERFLOW, ///< Template literal nesting too deep +}; + +/** + * @brief Type alias for export names. + * + * Uses std::variant to optimize memory: + * - std::string_view: For simple identifiers (zero-copy, points to source) + * - std::string: For exports requiring unescaping (e.g., Unicode escapes) + * + * Use get_string_view() to access the value uniformly. + */ +using export_string = std::variant; + +/** + * @brief An export name together with its 1-based source line number. 
+ */ +struct export_entry { + export_string name; + uint32_t line; // 1-based line number +}; + +/** + * @brief Result of parsing a CommonJS module. + */ +struct lexer_analysis { + /** + * @brief Named exports found in the module. + * + * Includes exports from patterns like: + * - exports.foo = value + * - exports['bar'] = value + * - module.exports.baz = value + * - module.exports = { a, b, c } + * - Object.defineProperty(exports, 'name', {...}) + */ + std::vector exports{}; + + /** + * @brief Module specifiers from re-export patterns. + * + * Includes specifiers from patterns like: + * - module.exports = require('other') + * - module.exports = { ...require('other') } + * - __export(require('other')) + * - Object.keys(require('other')).forEach(...) + */ + std::vector re_exports{}; +}; + +/** + * @brief Get a string_view from an export_string variant. + * + * @param s The export_string to convert + * @return std::string_view A view into the string data + * + * @note The returned string_view is valid as long as: + * - For string_view variant: the original source is valid + * - For string variant: the export_string is valid + */ +inline std::string_view get_string_view(const export_string& s) { + return std::visit([](const auto& v) -> std::string_view { return v; }, s); +} + +/** + * @brief Get a string_view from an export_entry (delegates to the name field). + */ +inline std::string_view get_string_view(const export_entry& e) { + return get_string_view(e.name); +} + +/** + * @brief Parse CommonJS source code and extract export information. + * + * Performs static analysis to detect CommonJS export patterns without + * executing the code. 
Handles various patterns including: + * - Direct exports (exports.x, module.exports.x) + * - Bracket notation (exports['x']) + * - Object literal assignment (module.exports = {...}) + * - Object.defineProperty patterns + * - Re-export patterns from transpilers (Babel, TypeScript) + * + * @param file_contents The JavaScript source code to analyze + * @return std::optional The analysis result, or std::nullopt + * if parsing failed. Use get_last_error() to get error details. + * + * @note The source must remain valid while using string_view exports. + * @note ESM syntax (import/export declarations) will cause an error. + * + * Example: + * @code + * auto result = lexer::parse_commonjs("exports.foo = 1;"); + * if (result) { + * for (const auto& exp : result->exports) { + * std::cout << lexer::get_string_view(exp) << std::endl; + * } + * } + * @endcode + */ +std::optional parse_commonjs(std::string_view file_contents); + +/** + * @brief Get the error from the last failed parse operation. + * + * @return const std::optional& The last error, or std::nullopt + * if the last parse succeeded. + * + * @note This is a global state and may be overwritten by subsequent calls + * to parse_commonjs(). + */ +const std::optional& get_last_error(); + +} // namespace lexer + +#endif // MERVE_PARSER_H +/* end file merve/parser.h */ + +#endif // MERVE_H +/* end file merve.h */ diff --git a/rust/deps/merve_c.h b/rust/deps/merve_c.h new file mode 100644 index 0000000..af4a9d7 --- /dev/null +++ b/rust/deps/merve_c.h @@ -0,0 +1,171 @@ +/** + * @file merve_c.h + * @brief Includes the C definitions for merve. This is a C file, not C++. + */ +#ifndef MERVE_C_H +#define MERVE_C_H + +#include +#include +#include + +/** + * @brief Non-owning string reference. + * + * The data pointer is NOT null-terminated. Always use the length field. + * + * The data is valid as long as: + * - The merve_analysis handle that produced it has not been freed. 
+ * - For string_view-backed exports: the original source buffer is alive. + */ +typedef struct { + const char* data; + size_t length; +} merve_string; + +/** + * @brief Opaque handle to a CommonJS parse result. + * + * Created by merve_parse_commonjs(). Must be freed with merve_free(). + */ +typedef void* merve_analysis; + +/** + * @brief Version number components. + */ +typedef struct { + int major; + int minor; + int revision; +} merve_version_components; + +/* Error codes corresponding to lexer::lexer_error values. */ +#define MERVE_ERROR_TODO 0 +#define MERVE_ERROR_UNEXPECTED_PAREN 1 +#define MERVE_ERROR_UNEXPECTED_BRACE 2 +#define MERVE_ERROR_UNTERMINATED_PAREN 3 +#define MERVE_ERROR_UNTERMINATED_BRACE 4 +#define MERVE_ERROR_UNTERMINATED_TEMPLATE_STRING 5 +#define MERVE_ERROR_UNTERMINATED_STRING_LITERAL 6 +#define MERVE_ERROR_UNTERMINATED_REGEX_CHARACTER_CLASS 7 +#define MERVE_ERROR_UNTERMINATED_REGEX 8 +#define MERVE_ERROR_UNEXPECTED_ESM_IMPORT_META 9 +#define MERVE_ERROR_UNEXPECTED_ESM_IMPORT 10 +#define MERVE_ERROR_UNEXPECTED_ESM_EXPORT 11 +#define MERVE_ERROR_TEMPLATE_NEST_OVERFLOW 12 + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Parse CommonJS source code and extract export information. + * + * The source buffer must remain valid while accessing string_view-backed + * export names from the returned handle. + * + * You must call merve_free() on the returned handle when done. + * + * @param input Pointer to the JavaScript source (need not be null-terminated). + * NULL is treated as an empty string. + * @param length Length of the input in bytes. + * @return A handle to the parse result, or NULL on out-of-memory. + * Use merve_is_valid() to check if parsing succeeded. + */ +merve_analysis merve_parse_commonjs(const char* input, size_t length); + +/** + * Check whether the parse result is valid (parsing succeeded). + * + * @param result Handle returned by merve_parse_commonjs(). NULL returns false. 
+ * @return true if parsing succeeded, false otherwise. + */ +bool merve_is_valid(merve_analysis result); + +/** + * Free a parse result and all associated memory. + * + * @param result Handle returned by merve_parse_commonjs(). NULL is a no-op. + */ +void merve_free(merve_analysis result); + +/** + * Get the number of named exports found. + * + * @param result A parse result handle. NULL returns 0. + * @return Number of exports, or 0 if result is NULL or invalid. + */ +size_t merve_get_exports_count(merve_analysis result); + +/** + * Get the number of re-export module specifiers found. + * + * @param result A parse result handle. NULL returns 0. + * @return Number of re-exports, or 0 if result is NULL or invalid. + */ +size_t merve_get_reexports_count(merve_analysis result); + +/** + * Get the name of an export at the given index. + * + * @param result A valid parse result handle. + * @param index Zero-based index (must be < merve_get_exports_count()). + * @return Non-owning string reference. Returns {NULL, 0} on error. + */ +merve_string merve_get_export_name(merve_analysis result, size_t index); + +/** + * Get the 1-based source line number of an export. + * + * @param result A valid parse result handle. + * @param index Zero-based index (must be < merve_get_exports_count()). + * @return 1-based line number, or 0 on error. + */ +uint32_t merve_get_export_line(merve_analysis result, size_t index); + +/** + * Get the module specifier of a re-export at the given index. + * + * @param result A valid parse result handle. + * @param index Zero-based index (must be < merve_get_reexports_count()). + * @return Non-owning string reference. Returns {NULL, 0} on error. + */ +merve_string merve_get_reexport_name(merve_analysis result, size_t index); + +/** + * Get the 1-based source line number of a re-export. + * + * @param result A valid parse result handle. + * @param index Zero-based index (must be < merve_get_reexports_count()). 
+ * @return 1-based line number, or 0 on error. + */ +uint32_t merve_get_reexport_line(merve_analysis result, size_t index); + +/** + * Get the error code from the last merve_parse_commonjs() call. + * + * @return One of the MERVE_ERROR_* constants, or -1 if the last parse + * succeeded. + * @note This is global state, overwritten by each merve_parse_commonjs() call. + */ +int merve_get_last_error(void); + +/** + * Get the merve library version string. + * + * @return Null-terminated version string (e.g. "1.0.1"). Never NULL. + */ +const char* merve_get_version(void); + +/** + * Get the merve library version as individual components. + * + * @return Struct with major, minor, and revision fields. + */ +merve_version_components merve_get_version_components(void); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* MERVE_C_H */ diff --git a/rust/src/ffi.rs b/rust/src/ffi.rs new file mode 100644 index 0000000..8ac5552 --- /dev/null +++ b/rust/src/ffi.rs @@ -0,0 +1,62 @@ +#![allow(non_camel_case_types)] +use core::ffi::{c_char, c_int, c_void}; + +/// Non-owning string reference returned by the C API. +/// +/// The `data` pointer is NOT null-terminated. Always use `length`. +/// +/// Valid as long as: +/// - The `merve_analysis` handle that produced it has not been freed. +/// - For `string_view`-backed exports: the original source buffer is alive. +#[repr(C)] +pub struct merve_string { + pub data: *const c_char, + pub length: usize, +} + +impl merve_string { + /// Convert to a Rust `&str` with an arbitrary lifetime. + /// + /// Returns `""` when `length` is 0 (which includes the case where `data` is null). + /// + /// # Safety + /// The caller must ensure that the backing data outlives `'a` and is valid UTF-8. + /// The `merve_string` itself is a temporary POD value; the data it points to + /// lives in the original source buffer or the analysis handle. 
+ #[must_use] + pub unsafe fn as_str<'a>(&self) -> &'a str { + if self.length == 0 { + return ""; + } + unsafe { + let slice = core::slice::from_raw_parts(self.data.cast(), self.length); + core::str::from_utf8_unchecked(slice) + } + } +} + +/// Opaque handle to a CommonJS parse result. +pub type merve_analysis = *mut c_void; + +/// Version number components. +#[repr(C)] +pub struct merve_version_components { + pub major: c_int, + pub minor: c_int, + pub revision: c_int, +} + +unsafe extern "C" { + pub fn merve_parse_commonjs(input: *const c_char, length: usize) -> merve_analysis; + pub fn merve_is_valid(result: merve_analysis) -> bool; + pub fn merve_free(result: merve_analysis); + pub fn merve_get_exports_count(result: merve_analysis) -> usize; + pub fn merve_get_reexports_count(result: merve_analysis) -> usize; + pub fn merve_get_export_name(result: merve_analysis, index: usize) -> merve_string; + pub fn merve_get_export_line(result: merve_analysis, index: usize) -> u32; + pub fn merve_get_reexport_name(result: merve_analysis, index: usize) -> merve_string; + pub fn merve_get_reexport_line(result: merve_analysis, index: usize) -> u32; + pub fn merve_get_last_error() -> c_int; + pub fn merve_get_version() -> *const c_char; + pub fn merve_get_version_components() -> merve_version_components; +} diff --git a/rust/src/lib.rs b/rust/src/lib.rs new file mode 100644 index 0000000..4e319a7 --- /dev/null +++ b/rust/src/lib.rs @@ -0,0 +1,565 @@ +//! # Merve +//! +//! Merve is a fast CommonJS export lexer written in C++. +//! This crate provides safe Rust bindings via the C API. +//! +//! ## Usage +//! +//! ``` +//! use merve::parse_commonjs; +//! +//! let source = "exports.foo = 1; exports.bar = 2;"; +//! let analysis = parse_commonjs(source).expect("parse failed"); +//! +//! assert_eq!(analysis.exports_count(), 2); +//! for export in analysis.exports() { +//! println!("{} (line {})", export.name, export.line); +//! } +//! ``` +//! +//! ## no-std +//! +//! 
This crate supports `no_std` environments. Disable default features: +//! +//! ```toml +//! merve = { version = "0.1", default-features = false } +//! ``` + +#![cfg_attr(not(feature = "std"), no_std)] + +mod ffi; + +#[cfg(feature = "std")] +extern crate std; + +use core::fmt; +use core::marker::PhantomData; + +/// Error codes returned by the merve lexer. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum LexerError { + EmptySource, + UnexpectedParen, + UnexpectedBrace, + UnterminatedParen, + UnterminatedBrace, + UnterminatedTemplateString, + UnterminatedStringLiteral, + UnterminatedRegexCharacterClass, + UnterminatedRegex, + UnexpectedEsmImportMeta, + UnexpectedEsmImport, + UnexpectedEsmExport, + TemplateNestOverflow, + /// An error code not recognized by these bindings. + Unknown(i32), +} + +impl LexerError { + /// Convert a C API error code to a `LexerError`. + #[must_use] + pub fn from_code(code: i32) -> Self { + match code { + 0 => Self::EmptySource, + 1 => Self::UnexpectedParen, + 2 => Self::UnexpectedBrace, + 3 => Self::UnterminatedParen, + 4 => Self::UnterminatedBrace, + 5 => Self::UnterminatedTemplateString, + 6 => Self::UnterminatedStringLiteral, + 7 => Self::UnterminatedRegexCharacterClass, + 8 => Self::UnterminatedRegex, + 9 => Self::UnexpectedEsmImportMeta, + 10 => Self::UnexpectedEsmImport, + 11 => Self::UnexpectedEsmExport, + 12 => Self::TemplateNestOverflow, + other => Self::Unknown(other), + } + } + + /// Return the short name of this error variant. 
+ #[must_use] + pub fn as_str(&self) -> &'static str { + match self { + Self::EmptySource => "empty source", + Self::UnexpectedParen => "unexpected parenthesis", + Self::UnexpectedBrace => "unexpected brace", + Self::UnterminatedParen => "unterminated parenthesis", + Self::UnterminatedBrace => "unterminated brace", + Self::UnterminatedTemplateString => "unterminated template string", + Self::UnterminatedStringLiteral => "unterminated string literal", + Self::UnterminatedRegexCharacterClass => "unterminated regex character class", + Self::UnterminatedRegex => "unterminated regex", + Self::UnexpectedEsmImportMeta => "unexpected ESM import.meta", + Self::UnexpectedEsmImport => "unexpected ESM import", + Self::UnexpectedEsmExport => "unexpected ESM export", + Self::TemplateNestOverflow => "template nesting overflow", + Self::Unknown(_) => "unknown error", + } + } +} + +impl fmt::Display for LexerError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Unknown(code) => write!(f, "merve lexer error: unknown (code {})", code), + _ => write!(f, "merve lexer error: {}", self.as_str()), + } + } +} + +#[cfg(feature = "std")] +impl std::error::Error for LexerError {} + +/// A parsed CommonJS analysis result. +/// +/// The lifetime `'a` is tied to the source string passed to [`parse_commonjs`], +/// because export names may reference slices of the original source buffer +/// (zero-copy `string_view` exports from the C++ side). +/// +/// The handle is freed on drop. +pub struct Analysis<'a> { + handle: ffi::merve_analysis, + _source: PhantomData<&'a [u8]>, +} + +impl<'a> Drop for Analysis<'a> { + fn drop(&mut self) { + unsafe { ffi::merve_free(self.handle) } + } +} + +// Safety: The C++ implementation does not use thread-local state in the +// analysis struct itself (`merve_get_last_error` is global, but `Analysis` +// does not rely on it after construction). 
+unsafe impl Send for Analysis<'_> {} +unsafe impl Sync for Analysis<'_> {} + +impl<'a> Analysis<'a> { + /// Number of named exports found. + #[must_use] + pub fn exports_count(&self) -> usize { + unsafe { ffi::merve_get_exports_count(self.handle) } + } + + /// Number of re-export module specifiers found. + #[must_use] + pub fn reexports_count(&self) -> usize { + unsafe { ffi::merve_get_reexports_count(self.handle) } + } + + /// Get the name of the export at `index`. + /// + /// Returns `None` if `index` is out of bounds. + #[must_use] + pub fn export_name(&self, index: usize) -> Option<&'a str> { + if index >= self.exports_count() { + return None; + } + let s = unsafe { ffi::merve_get_export_name(self.handle, index) }; + Some(unsafe { s.as_str() }) + } + + /// Get the 1-based source line number of the export at `index`. + /// + /// Returns `None` if `index` is out of bounds. + #[must_use] + pub fn export_line(&self, index: usize) -> Option { + if index >= self.exports_count() { + return None; + } + let line = unsafe { ffi::merve_get_export_line(self.handle, index) }; + if line == 0 { None } else { Some(line) } + } + + /// Get the module specifier of the re-export at `index`. + /// + /// Returns `None` if `index` is out of bounds. + #[must_use] + pub fn reexport_name(&self, index: usize) -> Option<&'a str> { + if index >= self.reexports_count() { + return None; + } + let s = unsafe { ffi::merve_get_reexport_name(self.handle, index) }; + Some(unsafe { s.as_str() }) + } + + /// Get the 1-based source line number of the re-export at `index`. + /// + /// Returns `None` if `index` is out of bounds. + #[must_use] + pub fn reexport_line(&self, index: usize) -> Option { + if index >= self.reexports_count() { + return None; + } + let line = unsafe { ffi::merve_get_reexport_line(self.handle, index) }; + if line == 0 { None } else { Some(line) } + } + + /// Iterate over all named exports. 
+ #[must_use] + pub fn exports(&self) -> ExportIter<'a, '_> { + ExportIter { + analysis: self, + kind: ExportKind::Export, + index: 0, + count: self.exports_count(), + } + } + + /// Iterate over all re-exports. + #[must_use] + pub fn reexports(&self) -> ExportIter<'a, '_> { + ExportIter { + analysis: self, + kind: ExportKind::ReExport, + index: 0, + count: self.reexports_count(), + } + } +} + +impl fmt::Debug for Analysis<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("Analysis") + .field("exports_count", &self.exports_count()) + .field("reexports_count", &self.reexports_count()) + .finish() + } +} + +/// A single export entry: a name and its source line number. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct Export<'a> { + /// The export name (or module specifier for re-exports). + pub name: &'a str, + /// 1-based source line number. + pub line: u32, +} + +impl fmt::Display for Export<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{} (line {})", self.name, self.line) + } +} + +/// Distinguishes between exports and re-exports in [`ExportIter`]. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum ExportKind { + Export, + ReExport, +} + +/// Iterator over exports or re-exports. +/// +/// Created by [`Analysis::exports`] or [`Analysis::reexports`]. 
+pub struct ExportIter<'a, 'b> { + analysis: &'b Analysis<'a>, + kind: ExportKind, + index: usize, + count: usize, +} + +impl<'a> Iterator for ExportIter<'a, '_> { + type Item = Export<'a>; + + fn next(&mut self) -> Option { + if self.index >= self.count { + return None; + } + let i = self.index; + self.index += 1; + let (name, line) = match self.kind { + ExportKind::Export => ( + self.analysis.export_name(i).unwrap_or(""), + self.analysis.export_line(i).unwrap_or(0), + ), + ExportKind::ReExport => ( + self.analysis.reexport_name(i).unwrap_or(""), + self.analysis.reexport_line(i).unwrap_or(0), + ), + }; + Some(Export { name, line }) + } + + fn size_hint(&self) -> (usize, Option) { + let remaining = self.count - self.index; + (remaining, Some(remaining)) + } +} + +impl ExactSizeIterator for ExportIter<'_, '_> {} + +/// Parse CommonJS source code and extract export information. +/// +/// The returned [`Analysis`] borrows from `source` because some export names +/// may point directly into the source buffer (zero-copy `string_view` exports). +/// +/// # Errors +/// +/// Returns a [`LexerError`] if the input contains ESM syntax or other +/// unsupported constructs. 
+/// +/// # Examples +/// +/// ``` +/// use merve::parse_commonjs; +/// +/// let source = "exports.hello = 1;"; +/// let analysis = parse_commonjs(source).unwrap(); +/// assert_eq!(analysis.exports_count(), 1); +/// assert_eq!(analysis.export_name(0), Some("hello")); +/// ``` +pub fn parse_commonjs(source: &str) -> Result, LexerError> { + if source.is_empty() { + return Err(LexerError::EmptySource); + } + let handle = unsafe { ffi::merve_parse_commonjs(source.as_ptr().cast(), source.len()) }; + if handle.is_null() { + // NULL means allocation failure; map to a generic error + let code = unsafe { ffi::merve_get_last_error() }; + return Err(if code >= 0 { + LexerError::from_code(code) + } else { + LexerError::Unknown(code) + }); + } + if !unsafe { ffi::merve_is_valid(handle) } { + let code = unsafe { ffi::merve_get_last_error() }; + let err = if code >= 0 { + LexerError::from_code(code) + } else { + LexerError::Unknown(code) + }; + unsafe { ffi::merve_free(handle) }; + return Err(err); + } + Ok(Analysis { + handle, + _source: PhantomData, + }) +} + +/// Get the merve library version string (e.g. `"1.0.1"`). +#[must_use] +pub fn version() -> &'static str { + unsafe { + let ptr = ffi::merve_get_version(); + let len = { + let mut n = 0usize; + while *ptr.add(n) != 0 { + n += 1; + } + n + }; + let slice = core::slice::from_raw_parts(ptr.cast(), len); + core::str::from_utf8_unchecked(slice) + } +} + +/// Get the merve library version as `(major, minor, revision)`. 
+#[must_use] +pub fn version_components() -> (i32, i32, i32) { + let v = unsafe { ffi::merve_get_version_components() }; + (v.major, v.minor, v.revision) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn version_is_not_empty() { + let v = version(); + assert!(!v.is_empty()); + assert!(v.contains('.'), "version should contain a dot: {v}"); + } + + #[test] + fn version_components_are_nonnegative() { + let (major, minor, rev) = version_components(); + assert!(major >= 0); + assert!(minor >= 0); + assert!(rev >= 0); + } + + #[test] + fn parse_simple_exports() { + let source = "exports.foo = 1; exports.bar = 2;"; + let analysis = parse_commonjs(source).expect("should parse"); + assert_eq!(analysis.exports_count(), 2); + assert_eq!(analysis.export_name(0), Some("foo")); + assert_eq!(analysis.export_name(1), Some("bar")); + assert_eq!(analysis.reexports_count(), 0); + } + + #[cfg(feature = "std")] + #[test] + fn parse_module_exports() { + let source = "module.exports = { a, b, c };"; + let analysis = parse_commonjs(source).expect("should parse"); + assert_eq!(analysis.exports_count(), 3); + assert_eq!(analysis.export_name(0), Some("a")); + assert_eq!(analysis.export_name(1), Some("b")); + assert_eq!(analysis.export_name(2), Some("c")); + } + + #[test] + fn parse_reexports() { + let source = r#"module.exports = require("./other");"#; + let analysis = parse_commonjs(source).expect("should parse"); + assert_eq!(analysis.reexports_count(), 1); + assert_eq!(analysis.reexport_name(0), Some("./other")); + } + + #[test] + fn esm_import_returns_error() { + let source = "import { foo } from 'bar';"; + let result = parse_commonjs(source); + assert!(result.is_err()); + let err = result.unwrap_err(); + assert_eq!(err, LexerError::UnexpectedEsmImport); + } + + #[test] + fn esm_export_returns_error() { + let source = "export const x = 1;"; + let result = parse_commonjs(source); + assert!(result.is_err()); + let err = result.unwrap_err(); + assert_eq!(err, 
LexerError::UnexpectedEsmExport); + } + + #[test] + fn empty_input() { + let result = parse_commonjs(""); + assert!(result.is_err()); + assert_eq!(result.unwrap_err(), LexerError::EmptySource); + } + + #[test] + fn out_of_bounds_returns_none() { + let source = "exports.x = 1;"; + let analysis = parse_commonjs(source).expect("should parse"); + assert_eq!(analysis.export_name(999), None); + assert_eq!(analysis.export_line(999), None); + assert_eq!(analysis.reexport_name(0), None); + assert_eq!(analysis.reexport_line(0), None); + } + + #[test] + fn export_lines() { + let source = "exports.a = 1;\nexports.b = 2;\nexports.c = 3;"; + let analysis = parse_commonjs(source).expect("should parse"); + assert_eq!(analysis.export_line(0), Some(1)); + assert_eq!(analysis.export_line(1), Some(2)); + assert_eq!(analysis.export_line(2), Some(3)); + } + + #[cfg(feature = "std")] + #[test] + fn exports_iterator() { + let source = "exports.x = 1; exports.y = 2;"; + let analysis = parse_commonjs(source).expect("should parse"); + let exports: Vec> = analysis.exports().collect(); + assert_eq!(exports.len(), 2); + assert_eq!(exports[0].name, "x"); + assert_eq!(exports[1].name, "y"); + } + + #[test] + fn exports_iterator_exact_size() { + let source = "exports.a = 1; exports.b = 2; exports.c = 3;"; + let analysis = parse_commonjs(source).expect("should parse"); + let iter = analysis.exports(); + assert_eq!(iter.len(), 3); + } + + #[cfg(feature = "std")] + #[test] + fn reexports_iterator() { + let source = r#"module.exports = require("./a");"#; + let analysis = parse_commonjs(source).expect("should parse"); + let reexports: Vec> = analysis.reexports().collect(); + assert_eq!(reexports.len(), 1); + assert_eq!(reexports[0].name, "./a"); + } + + #[cfg(feature = "std")] + #[test] + fn debug_impl() { + let source = "exports.z = 1;"; + let analysis = parse_commonjs(source).expect("should parse"); + let dbg = format!("{:?}", analysis); + assert!(dbg.contains("Analysis")); + 
assert!(dbg.contains("exports_count: 1")); + } + + #[cfg(feature = "std")] + #[test] + fn export_display_impl() { + let e = Export { + name: "foo", + line: 42, + }; + assert_eq!(format!("{e}"), "foo (line 42)"); + } + + #[cfg(feature = "std")] + #[test] + fn error_display() { + let err = LexerError::UnexpectedEsmImport; + let s = format!("{err}"); + assert!(s.contains("unexpected ESM import"), "got: {s}"); + } + + #[cfg(feature = "std")] + #[test] + fn error_display_unknown() { + let err = LexerError::Unknown(99); + let s = format!("{err}"); + assert!(s.contains("99"), "got: {s}"); + } + + #[test] + fn error_from_code_roundtrip() { + for code in 0..=12 { + let err = LexerError::from_code(code); + assert_ne!(err, LexerError::Unknown(code)); + } + assert_eq!(LexerError::from_code(999), LexerError::Unknown(999)); + } + + #[cfg(feature = "std")] + #[test] + fn error_is_std_error() { + fn assert_error() {} + assert_error::(); + } + + #[test] + fn bracket_notation_exports() { + let source = r#"exports["hello-world"] = 1;"#; + let analysis = parse_commonjs(source).expect("should parse"); + assert_eq!(analysis.exports_count(), 1); + assert_eq!(analysis.export_name(0), Some("hello-world")); + } + + #[test] + fn multiple_independent_parses() { + let src1 = "exports.a = 1;"; + let src2 = "exports.b = 2;"; + let a1 = parse_commonjs(src1).expect("should parse"); + let a2 = parse_commonjs(src2).expect("should parse"); + assert_eq!(a1.export_name(0), Some("a")); + assert_eq!(a2.export_name(0), Some("b")); + } + + #[test] + fn send_and_sync() { + fn assert_send() {} + fn assert_sync() {} + assert_send::>(); + assert_sync::>(); + } +} diff --git a/rust/wasi_to_unknown.cpp b/rust/wasi_to_unknown.cpp new file mode 100644 index 0000000..06b06ed --- /dev/null +++ b/rust/wasi_to_unknown.cpp @@ -0,0 +1,54 @@ +// Some shims for WASI symbols used by the WASI libc environment initializer, +// but not actually required by merve.
This allows compiling the merve Rust crate to +// wasm32-unknown-unknown with the WASI SDK. + +#include + +extern "C" { + +int32_t __imported_wasi_snapshot_preview1_environ_get(int32_t, int32_t) { + __builtin_unreachable(); +} + +int32_t __imported_wasi_snapshot_preview1_environ_sizes_get(int32_t, int32_t) { + __builtin_unreachable(); +} + +int32_t __imported_wasi_snapshot_preview1_fd_close(int32_t) { + __builtin_unreachable(); +} + +int32_t __imported_wasi_snapshot_preview1_fd_fdstat_get(int32_t, int32_t) { + __builtin_unreachable(); +} + +int32_t __imported_wasi_snapshot_preview1_fd_read(int32_t, + int32_t, + int32_t, + int32_t) { + __builtin_unreachable(); +} + +int32_t __imported_wasi_snapshot_preview1_fd_seek(int32_t, + int64_t, + int32_t, + int32_t) { + __builtin_unreachable(); +} + +int32_t __imported_wasi_snapshot_preview1_fd_write(int32_t, + int32_t, + int32_t, + int32_t) { + __builtin_unreachable(); +} + +int32_t __imported_wasi_snapshot_preview1_sched_yield() { + return 0; +} + +_Noreturn void __imported_wasi_snapshot_preview1_proc_exit(int32_t) { + __builtin_unreachable(); +} + +} // extern "C" diff --git a/src/parser.cpp b/src/parser.cpp index a899b53..c977db0 100644 --- a/src/parser.cpp +++ b/src/parser.cpp @@ -310,8 +310,8 @@ struct StarExportBinding { std::string_view id; }; -// Global state for error tracking -std::optional last_error; +// Thread-local state for error tracking (safe for concurrent parse calls). +thread_local std::optional last_error; // Lexer state class class CJSLexer {