export const usage = `Usage: _all_docs view enrich [options]

Enrich package specs with fields from cached packuments.

Options:
  -i, --input <file>       Input NDJSON file ('-' for stdin)
  --add <expr>             Add field from packument (repeatable)
  --origin <origin>        Packument origin (default: npm)
  --name-field <field>     Input field for name (default: name)
  --version-field <field>  Input field for version (default: version)
  --on-missing <mode>      skip, null, or error (default: null)
  --progress               Show progress

Add Expression Syntax:
  <selector> as <alias>

  Use .field to reference input record fields:
    time[.version] as addedAt
    versions[.version].dist.integrity as integrity

Examples:
  # Add publish dates
  _all_docs view enrich -i specs.ndjson --add 'time[.version] as addedAt'

  # Add multiple fields
  _all_docs view enrich -i specs.ndjson \\
    --add 'time[.version] as publishedAt' \\
    --add 'versions[.version].dist.integrity as integrity'
`;

/** Values accepted by --on-missing. */
const ON_MISSING_MODES = ['skip', 'null', 'error'];

/** Registry URL substituted when --origin is the shorthand "npm". */
const NPM_REGISTRY_URL = 'https://registry.npmjs.org';

/**
 * Merge flags parsed from the command's own argv (cli._) with flags that were
 * already parsed into cli.values, then apply the documented defaults.
 *
 * No defaults are handed to parseArgs on purpose: a parseArgs default is
 * always truthy, so it would win over the cli.values fallback and make it
 * unreachable.
 *
 * @param {{ _: string[], values: object }} cli - CLI context
 * @returns {{ input: (string|undefined), addExprs: string[], origin: string,
 *   nameField: string, onMissing: string, showProgress: boolean }}
 */
function resolveOptions(cli) {
  const { values } = parseArgs({
    args: cli._,
    options: {
      input: { type: 'string', short: 'i' },
      add: { type: 'string', multiple: true },
      origin: { type: 'string' },
      'name-field': { type: 'string' },
      // NOTE(review): --version-field is accepted so existing invocations keep
      // working, but nothing in this module reads it; selectors name the
      // record field explicitly (e.g. time[.version]).
      'version-field': { type: 'string' },
      'on-missing': { type: 'string' },
      progress: { type: 'boolean' }
    },
    allowPositionals: true
  });
  const globals = cli.values;

  // A globally parsed --add may be a single string rather than an array.
  const add = values.add || globals.add || [];

  return {
    input: values.input || globals.input,
    addExprs: Array.isArray(add) ? add : [add],
    origin: values.origin || globals.origin || 'npm',
    nameField: values['name-field'] || globals['name-field'] || 'name',
    onMissing: values['on-missing'] || globals['on-missing'] || 'null',
    showProgress: Boolean(values.progress || globals.progress)
  };
}

/**
 * Look up the packument for a package, memoizing the result (including
 * misses, stored as null) so each package is fetched at most once per run.
 *
 * @param {object} cache - Cache instance exposing an async fetch(key)
 * @param {Map<string, (object|null)>} memo - Per-run memo keyed by package name
 * @param {string} name - Package name
 * @param {string} origin - "npm" or a registry origin passed to createPackumentKey
 * @returns {Promise<object|null>} The packument, or null when unavailable
 */
async function loadPackument(cache, memo, name, origin) {
  if (memo.has(name)) return memo.get(name);

  const key = createPackumentKey(name, origin === 'npm' ? NPM_REGISTRY_URL : origin);
  let packument;
  try {
    const entry = await cache.fetch(key);
    // Entries may wrap the packument in `body`; otherwise use the entry itself.
    packument = entry?.body || entry || null;
  } catch {
    // Best effort: a failed fetch is handled exactly like a cache miss and is
    // then subject to the --on-missing policy.
    packument = null;
  }
  memo.set(name, packument);
  return packument;
}

/**
 * `view enrich` command: read NDJSON records, look up each record's packument
 * in the local cache and add the fields described by the --add expressions.
 * Enriched records are written to stdout as NDJSON; diagnostics and progress
 * go to stderr so they never mix with the data stream.
 *
 * @param {{ _: string[], values: object, dir: Function }} cli - CLI context
 * @returns {Promise<void>}
 * @throws {Error} In `--on-missing error` mode, on the first record that
 *   cannot be parsed or enriched
 */
export const command = async (cli) => {
  if (cli.values.help) {
    console.log(usage);
    return;
  }

  const { input, addExprs, origin, nameField, onMissing, showProgress } = resolveOptions(cli);

  if (!input) {
    console.error('Error: --input required');
    // stderr, not stdout: stdout is reserved for NDJSON output.
    console.error(usage);
    process.exit(1);
  }

  if (addExprs.length === 0) {
    console.error('Error: at least one --add expression required');
    process.exit(1);
  }

  if (!ON_MISSING_MODES.includes(onMissing)) {
    // Reject typos instead of silently behaving like "null".
    console.error(`Error: --on-missing must be one of: ${ON_MISSING_MODES.join(', ')}`);
    process.exit(1);
  }

  // Parse add expressions up front so a bad expression fails before any I/O.
  let enrichments;
  try {
    enrichments = addExprs.map((expr) => parseAddExpression(expr));
  } catch (err) {
    console.error(`Error: ${err.message}`);
    process.exit(1);
  }

  // Setup cache
  const driver = await createStorageDriver({ CACHE_DIR: cli.dir('packuments') });
  const cache = new Cache({ path: cli.dir('packuments'), driver });

  // Per-run packument memo (avoid re-fetching for the same package)
  const packumentCache = new Map();

  // Setup input stream
  const inputStream = input === '-' ? process.stdin : createReadStream(input);
  const rl = createInterface({ input: inputStream, crlfDelay: Infinity });

  let processed = 0;
  let enriched = 0;
  let skipped = 0;

  for await (const line of rl) {
    if (!line.trim()) continue;

    processed++;
    if (showProgress && processed % 1000 === 0) {
      process.stderr.write(`\rProcessed ${processed}, enriched ${enriched}, skipped ${skipped}...`);
    }

    try {
      const record = JSON.parse(line);
      // Non-object JSON values (null, numbers, strings) cannot carry a name;
      // route them through the same "missing name" policy as nameless records.
      const isObject = record !== null && typeof record === 'object';
      const name = isObject ? record[nameField] : undefined;

      if (!name) {
        if (onMissing === 'skip') { skipped++; continue; }
        if (onMissing === 'error') throw new Error('Missing name field');
        console.log(line); // Pass through unchanged
        continue;
      }

      const packument = await loadPackument(cache, packumentCache, name, origin);

      if (!packument) {
        if (onMissing === 'skip') { skipped++; continue; }
        if (onMissing === 'error') {
          throw new Error(`Packument not found: ${name}`);
        }
        // null mode: output with null values
        for (const e of enrichments) {
          record[e.alias] = null;
        }
        console.log(JSON.stringify(record));
        continue;
      }

      // Apply enrichments in order; a later selector may reference an alias
      // written by an earlier one.
      for (const enrichment of enrichments) {
        record[enrichment.alias] = extractValue(packument, enrichment.selector, record);
      }

      console.log(JSON.stringify(record));
      enriched++;
    } catch (err) {
      if (onMissing === 'error') {
        throw err;
      }
      console.error(`Error processing line ${processed}: ${err.message}`);
    }
  }

  if (showProgress) {
    process.stderr.write(`\rCompleted: ${processed} processed, ${enriched} enriched, ${skipped} skipped\n`);
  }
};

/**
 * Parse "selector as alias" expression
 * @param {string} expr - Expression like "time[.version] as addedAt"
 * @returns {{ selector: string, alias: string }}
 * @throws {Error} When the expression is not of the form "selector as alias"
 */
export function parseAddExpression(expr) {
  // Tolerate surrounding whitespace (e.g. from shell quoting).
  const match = String(expr).trim().match(/^(.+?)\s+as\s+(\w+)$/);
  if (!match) {
    // Message text is kept verbatim: the test suite matches on it.
    throw new Error(`Invalid --add expression: ${expr}\nExpected: as `);
  }
  return {
    selector: match[1].trim(),
    alias: match[2]
  };
}

/**
 * Read one bracket segment whose content starts at `start` (just after "[").
 *
 * A quoted key runs to the matching quote that is immediately followed by
 * "]", so it may itself contain "]". Anything else runs to the first "]".
 *
 * @param {string} selector - Full selector text
 * @param {number} start - Index of the first character after "["
 * @returns {{ segment: { type: string, value: string, quoted: boolean },
 *   next: number } | null} The segment and the index after "]", or null when
 *   the bracket is never closed
 */
function readBracketSegment(selector, start) {
  const quote = selector[start];
  if (quote === '"' || quote === "'") {
    const close = selector.indexOf(`${quote}]`, start + 1);
    if (close !== -1) {
      return {
        segment: { type: 'bracket', value: selector.slice(start + 1, close), quoted: true },
        next: close + 2
      };
    }
  }

  const end = selector.indexOf(']', start);
  if (end === -1) return null;

  let key = selector.slice(start, end);
  let quoted = false;
  if ((key.startsWith('"') && key.endsWith('"')) ||
      (key.startsWith("'") && key.endsWith("'"))) {
    key = key.slice(1, -1);
    quoted = true;
  }
  return { segment: { type: 'bracket', value: key, quoted }, next: end + 1 };
}

/**
 * Split a selector such as `versions["1.0.0"].dist.integrity` into path
 * segments. "." separates field names; "[...]" holds a key or index.
 *
 * @param {string} selector - Selector path
 * @returns {Array<{ type: string, value: string, quoted?: boolean }>}
 */
function tokenizeSelector(selector) {
  const segments = [];
  let field = '';
  const flushField = () => {
    if (field) {
      segments.push({ type: 'field', value: field });
      field = '';
    }
  };

  let i = 0;
  while (i < selector.length) {
    const char = selector[i];
    if (char === '.') {
      flushField();
      i++;
    } else if (char === '[') {
      flushField();
      const parsed = readBracketSegment(selector, i + 1);
      // An unterminated bracket drops the remainder of the selector.
      if (parsed === null) break;
      segments.push(parsed.segment);
      i = parsed.next;
    } else {
      field += char;
      i++;
    }
  }
  flushField();
  return segments;
}

/**
 * Follow a list of keys through nested data. Only own properties are
 * followed, so keys like "constructor" or "__proto__" never resolve to
 * inherited members.
 *
 * @param {*} root - Value to start from
 * @param {string[]} keys - Keys to follow, in order
 * @returns {*} The value reached, or null when any step is missing
 */
function walkPath(root, keys) {
  let current = root;
  for (const key of keys) {
    if (current === null || current === undefined) return null;
    if (!Object.hasOwn(current, key)) return null;
    current = current[key];
  }
  return current ?? null;
}

/**
 * Extract value from packument using selector with record field references
 *
 * An unquoted bracket of the form `[.field]` is replaced by the value of
 * `record.field`. The value is used directly as a key and is never spliced
 * back into selector text, so quotes, brackets and dots inside it are safe.
 *
 * @param {object} packument - The packument data
 * @param {string} selector - Selector with optional .field references
 * @param {object} record - Input record for .field resolution
 * @returns {*} The extracted value, or null when the path or a referenced
 *   record field is missing
 */
export function extractValue(packument, selector, record) {
  const keys = [];
  for (const segment of tokenizeSelector(selector)) {
    const ref = segment.type === 'bracket' && !segment.quoted
      ? /^\.(\w+)$/.exec(segment.value)
      : null;

    if (!ref) {
      keys.push(segment.value);
      continue;
    }

    const hasField = record !== null && record !== undefined && Object.hasOwn(record, ref[1]);
    const val = hasField ? record[ref[1]] : undefined;
    // A missing reference cannot address anything; do not fall back to a
    // literal "null" key.
    if (val === undefined || val === null) return null;
    keys.push(String(val));
  }

  return walkPath(packument, keys);
}

/**
 * Simple selector evaluation
 * Handles: field.nested, field["key"], field[0]
 * @param {object} obj - Object to evaluate against
 * @param {string} selector - Selector path
 * @returns {*} The value at the path, or null when it does not exist
 */
export function evaluateSelector(obj, selector) {
  return walkPath(obj, tokenizeSelector(selector).map((segment) => segment.value));
}
'2021-02-20' } }; + assert.equal(evaluateSelector(obj, 'time["4.17.21"]'), '2021-02-20'); + }); + + it('evaluates mixed dot and bracket notation', () => { + const obj = { + versions: { + '1.0.0': { + dist: { integrity: 'sha512-abc' } + } + } + }; + assert.equal( + evaluateSelector(obj, 'versions["1.0.0"].dist.integrity'), + 'sha512-abc' + ); + }); + + it('evaluates numeric bracket notation', () => { + const obj = { items: ['first', 'second', 'third'] }; + assert.equal(evaluateSelector(obj, 'items[0]'), 'first'); + }); + + it('returns null for missing field', () => { + const obj = { name: 'lodash' }; + assert.equal(evaluateSelector(obj, 'missing'), null); + }); + + it('returns null for nested missing field', () => { + const obj = { time: {} }; + assert.equal(evaluateSelector(obj, 'time.modified'), null); + }); + + it('returns null when traversing through null', () => { + const obj = { parent: null }; + assert.equal(evaluateSelector(obj, 'parent.child'), null); + }); +}); + +describe('extractValue', () => { + it('resolves .field reference from record', () => { + const packument = { + time: { '4.17.21': '2021-02-20T15:42:16.891Z' } + }; + const record = { version: '4.17.21' }; + + const result = extractValue(packument, 'time[.version]', record); + assert.equal(result, '2021-02-20T15:42:16.891Z'); + }); + + it('resolves multiple .field references', () => { + const packument = { + data: { + npm: { count: 100 } + } + }; + const record = { registry: 'npm' }; + + const result = extractValue(packument, 'data[.registry].count', record); + assert.equal(result, 100); + }); + + it('resolves nested .field access', () => { + const packument = { + versions: { + '1.0.0': { dist: { integrity: 'sha512-abc' } } + } + }; + const record = { version: '1.0.0' }; + + const result = extractValue( + packument, + 'versions[.version].dist.integrity', + record + ); + assert.equal(result, 'sha512-abc'); + }); + + it('handles missing .field value', () => { + const packument = { time: {} }; + 
const record = {}; // No version field + + const result = extractValue(packument, 'time[.version]', record); + assert.equal(result, null); + }); + + it('handles special characters in .field value', () => { + const packument = { + time: { '@scope/pkg@1.0.0': '2024-01-01' } + }; + const record = { spec: '@scope/pkg@1.0.0' }; + + const result = extractValue(packument, 'time[.spec]', record); + assert.equal(result, '2024-01-01'); + }); + + it('evaluates static selector without .field', () => { + const packument = { name: 'lodash' }; + const record = {}; + + const result = extractValue(packument, 'name', record); + assert.equal(result, 'lodash'); + }); +}); + +describe('enrichment flow', () => { + it('enriches record with packument data', () => { + const packument = { + name: 'lodash', + time: { '4.17.21': '2021-02-20T15:42:16.891Z' }, + versions: { + '4.17.21': { dist: { integrity: 'sha512-abc' } } + } + }; + + const record = { name: 'lodash', version: '4.17.21' }; + + // Simulate enrichment + const enrichments = [ + parseAddExpression('time[.version] as addedAt'), + parseAddExpression('versions[.version].dist.integrity as integrity') + ]; + + for (const enrichment of enrichments) { + record[enrichment.alias] = extractValue( + packument, + enrichment.selector, + record + ); + } + + assert.equal(record.addedAt, '2021-02-20T15:42:16.891Z'); + assert.equal(record.integrity, 'sha512-abc'); + assert.equal(record.name, 'lodash'); + assert.equal(record.version, '4.17.21'); + }); + + it('handles missing packument with null values', () => { + const record = { name: 'unknown', version: '1.0.0' }; + + const enrichments = [ + parseAddExpression('time[.version] as addedAt') + ]; + + // When packument is null, add null values + for (const enrichment of enrichments) { + record[enrichment.alias] = null; + } + + assert.equal(record.addedAt, null); + assert.equal(record.name, 'unknown'); + }); +}); diff --git a/cli/cli/src/cmd/view/index.js b/cli/cli/src/cmd/view/index.js index 
c357733..5dfe306 100644 --- a/cli/cli/src/cmd/view/index.js +++ b/cli/cli/src/cmd/view/index.js @@ -7,6 +7,7 @@ export { command as show, usage as showUsage } from './show.js'; export { command as deleteView, usage as deleteUsage } from './delete.js'; export { command as query, usage as queryUsage } from './query.js'; export { command as join, usage as joinUsage } from './join.js'; +export { command as enrich, usage as enrichUsage } from './enrich.js'; export const usage = `Usage: _all_docs view [options] @@ -19,6 +20,7 @@ Commands: delete Delete a view query Query a view (output ndjson) join Join two views + enrich Enrich NDJSON with packument data A view is a predicate (origin filter) plus a projection (field selection). Views enable efficient queries and joins across different registry caches. diff --git a/doc/cli-reference.md b/doc/cli-reference.md index 68b397b..b76e05b 100644 --- a/doc/cli-reference.md +++ b/doc/cli-reference.md @@ -745,6 +745,60 @@ npx _all_docs view join npm-pkgs cgr-pkgs --diff --select 'name' npx _all_docs view join npm-pkgs cgr-pkgs --inner ``` +### view enrich + +Enrich NDJSON records with fields from cached packuments. 
+ +```bash +npx _all_docs view enrich [options] +``` + +**Options:** +- `-i, --input <file>` - Input NDJSON file ('-' for stdin) +- `--add <expr>` - Field to add from packument (repeatable) +- `--origin <origin>` - Packument origin (default: npm) +- `--name-field <field>` - Input field for name (default: name) +- `--version-field <field>` - Input field for version (default: version) +- `--on-missing <mode>` - How to handle missing packuments: skip, null, error (default: null) +- `--progress` - Show progress on stderr + +**Add Expression Syntax:** + +``` +<selector> as <alias> +``` + +Use `.field` to reference input record fields: + +| Expression | Meaning | +|------------|---------| +| `time[.version]` | `packument.time[record.version]` | +| `versions[.version].dist.integrity` | `packument.versions[record.version].dist.integrity` | + +**Examples:** + +```bash +# Add publish dates to package specs +npx _all_docs view enrich \ + --input specs.ndjson \ + --add 'time[.version] as addedAt' \ + > specs-with-dates.ndjson + +# Add multiple fields +npx _all_docs view enrich \ + --input build-order.ndjson \ + --add 'time[.version] as publishedAt' \ + --add 'versions[.version].dist.integrity as integrity' \ + > build-manifest.ndjson + +# Build manifest generation +cat build-order.ndjson | npx _all_docs view enrich \ + --input - \ + --add 'versions[.version].dist.integrity as expectedIntegrity' \ + --add 'time[.version] as publishedAt' \ + > build-manifest.ndjson +``` + ### view list List all defined views.