From 106aca1d4f854583a4bc80fc3c2c2b737099914c Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Thu, 4 Dec 2025 16:56:33 -0800 Subject: [PATCH 1/3] Add narrow vs wide parquet performance benchmark tutorial MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Interactive browser-based benchmarks comparing narrow (691MB, 11.6M rows) vs wide (275MB, 2.5M rows) parquet schemas for iSamples OpenContext data. Features: - Three benchmarks: entity counts, site aggregation, material distribution - Multiple runs with median timing for reliability - Environment info display (browser, connection type) - Data validity checks - Technical notes on pitfalls (caching, cold starts, memory limits) This page answers Eric's question about whether the 2-3x speedup seen locally also holds "over the wire" with HTTP range requests via DuckDB-WASM. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- tutorials/narrow_vs_wide_performance.qmd | 647 +++++++++++++++++++++++ 1 file changed, 647 insertions(+) create mode 100644 tutorials/narrow_vs_wide_performance.qmd diff --git a/tutorials/narrow_vs_wide_performance.qmd b/tutorials/narrow_vs_wide_performance.qmd new file mode 100644 index 0000000..5d02c25 --- /dev/null +++ b/tutorials/narrow_vs_wide_performance.qmd @@ -0,0 +1,647 @@ +--- +title: "Narrow vs Wide Schema Performance Comparison" +categories: [parquet, performance, benchmarking] +format: + html: + code-fold: true + toc: true + toc-depth: 3 +--- + +This page benchmarks the performance difference between **narrow** and **wide** parquet schema formats when accessing data "over the wire" via HTTP range requests in DuckDB-WASM. + +## Introduction + +### What are Narrow vs Wide Schemas? 
+ +The iSamples property graph data can be serialized in two different parquet formats: + +| Format | Description | File Size | Row Count | +|--------|-------------|-----------|-----------| +| **Narrow** | Stores relationships as separate edge rows (`otype='_edge_'`) | 691 MB | ~11.6M rows | +| **Wide** | Stores relationships as `p__*` columns on entity rows | 275 MB | ~2.5M rows | + +Both formats represent the **same underlying data** with identical semantics, but the wide format is optimized for analytical queries by eliminating edge rows. + +### Why Performance Matters + +When using DuckDB-WASM in the browser: + +- Data is fetched via **HTTP range requests** (206 Partial Content) +- Only the columns and row groups needed for a query are downloaded +- Smaller files with fewer rows = fewer bytes to transfer, faster queries + +**Expected speedup**: Wide format should be **2-3x faster** based on local benchmarks. + +## Methodology + +::: {.callout-note} +### Benchmarking Approach + +- **Cold run**: First query after page load (includes metadata fetch, JIT compilation) +- **Warm runs**: Subsequent queries (metadata cached, JIT warmed up) +- **Multiple runs**: Each benchmark runs 3 times, we report the median +- **Network variability**: Results will vary based on your network connection and hardware + +Results are shown in real-time as benchmarks complete. +::: + +## Setup + +```{ojs} +//| output: false +// Import DuckDB for browser-based SQL analysis +import { DuckDBClient } from "https://cdn.jsdelivr.net/npm/@observablehq/duckdb@latest/+esm" +``` + +```{ojs} +//| echo: false +// Define parquet URLs +narrowUrl = "https://storage.googleapis.com/opencontext-parquet/oc_isamples_pqg.parquet" +wideUrl = "https://storage.googleapis.com/opencontext-parquet/oc_isamples_pqg_wide.parquet" +``` + +### Environment Info + +```{ojs} +//| echo: false +envInfo = { + const ua = navigator.userAgent; + const browser = ua.includes('Chrome') ? 'Chrome' : ua.includes('Firefox') ? 
'Firefox' : ua.includes('Safari') ? 'Safari' : 'Unknown'; + const connection = navigator.connection || {}; + + return { + browser: browser, + userAgent: ua.substring(0, 80) + '...', + downlink: connection.downlink ? `${connection.downlink} Mbps` : 'N/A', + effectiveType: connection.effectiveType || 'N/A', + rtt: connection.rtt ? `${connection.rtt} ms` : 'N/A' + }; +} + +html`
+Environment:
+Browser: ${envInfo.browser}
+Connection: ${envInfo.effectiveType} (${envInfo.downlink}, RTT: ${envInfo.rtt})
+Note: Results will vary by network/hardware +
` +``` + +### Initialize Databases + +```{ojs} +//| echo: false +viewof runBenchmarks = Inputs.button("Run All Benchmarks", { + style: "padding: 12px 24px; background: #2E86AB; color: white; border: none; border-radius: 4px; cursor: pointer; font-size: 16px;" +}) +``` + + + +```{ojs} +// Create separate DuckDB instances for narrow and wide schemas +dbNarrow = { + const loadingDiv = document.getElementById('loading_init'); + if (loadingDiv) loadingDiv.hidden = false; + + try { + const instance = await DuckDBClient.of(); + await instance.query(`CREATE VIEW narrow AS SELECT * FROM read_parquet('${narrowUrl}')`); + return instance; + } finally { + if (loadingDiv) loadingDiv.hidden = true; + } +} + +dbWide = { + const instance = await DuckDBClient.of(); + await instance.query(`CREATE VIEW wide AS SELECT * FROM read_parquet('${wideUrl}')`); + return instance; +} +``` + +## Data Validity Check + +Before benchmarking, let's confirm both schemas represent the same underlying data. + + + +```{ojs} +validityCheck = { + // Only run when button clicked + if (runBenchmarks < 1) return null; + + const loadingDiv = document.getElementById('loading_validity'); + if (loadingDiv) loadingDiv.hidden = false; + + try { + // Count rows in narrow + const narrowCount = await dbNarrow.query(`SELECT COUNT(*) as cnt FROM narrow`); + const narrowTotal = narrowCount[0].cnt; + + // Count rows in wide + const wideCount = await dbWide.query(`SELECT COUNT(*) as cnt FROM wide`); + const wideTotal = wideCount[0].cnt; + + // Count entity types in narrow (excluding edges) + const narrowEntities = await dbNarrow.query(` + SELECT COUNT(*) as cnt FROM narrow + WHERE otype != '_edge_' + `); + const narrowEntityCount = narrowEntities[0].cnt; + + // Count samples in both + const narrowSamples = await dbNarrow.query(` + SELECT COUNT(*) as cnt FROM narrow + WHERE otype = 'MaterialSampleRecord' + `); + + const wideSamples = await dbWide.query(` + SELECT COUNT(*) as cnt FROM wide + WHERE otype = 
'MaterialSampleRecord' + `); + + return { + narrowTotal: narrowTotal, + wideTotal: wideTotal, + narrowEntities: narrowEntityCount, + narrowSamples: narrowSamples[0].cnt, + wideSamples: wideSamples[0].cnt, + sampleMatch: narrowSamples[0].cnt === wideSamples[0].cnt + }; + } finally { + if (loadingDiv) loadingDiv.hidden = true; + } +} +``` + +```{ojs} +//| echo: false +validityCheck ? html` +
+

Data Validity Results

+ + + + + + + +
Narrow total rows:${validityCheck.narrowTotal.toLocaleString()}
Wide total rows:${validityCheck.wideTotal.toLocaleString()}
Narrow entities (non-edge):${validityCheck.narrowEntities.toLocaleString()}
Narrow samples:${validityCheck.narrowSamples.toLocaleString()}
Wide samples:${validityCheck.wideSamples.toLocaleString()}
Sample count match:${validityCheck.sampleMatch ? '✅ Yes' : '❌ No'}
+

The wide schema has ~79% fewer rows because relationships are stored as p__* columns on entity rows instead of as separate edge rows.

+
+` : html`

Click "Run All Benchmarks" to check data validity

` +``` + +## Benchmark 1: Entity Count Query + +This benchmark tests a simple `COUNT(*) GROUP BY otype` query, which requires scanning row metadata. + + + +```{ojs} +benchmark1 = { + if (runBenchmarks < 1) return null; + + const loadingDiv = document.getElementById('loading_b1'); + if (loadingDiv) loadingDiv.hidden = false; + + const query = `SELECT otype, COUNT(*) as cnt FROM {table} GROUP BY otype ORDER BY cnt DESC`; + const runs = 3; + + try { + // Narrow benchmark + const narrowTimes = []; + for (let i = 0; i < runs; i++) { + const start = performance.now(); + await dbNarrow.query(query.replace('{table}', 'narrow')); + narrowTimes.push(performance.now() - start); + } + + // Wide benchmark + const wideTimes = []; + for (let i = 0; i < runs; i++) { + const start = performance.now(); + await dbWide.query(query.replace('{table}', 'wide')); + wideTimes.push(performance.now() - start); + } + + // Calculate medians + const median = arr => { + const sorted = [...arr].sort((a, b) => a - b); + return sorted[Math.floor(sorted.length / 2)]; + }; + + const narrowMedian = median(narrowTimes); + const wideMedian = median(wideTimes); + + return { + name: "Entity Count (GROUP BY otype)", + narrowCold: narrowTimes[0], + narrowMedian: narrowMedian, + narrowAll: narrowTimes, + wideCold: wideTimes[0], + wideMedian: wideMedian, + wideAll: wideTimes, + speedup: narrowMedian / wideMedian + }; + } finally { + if (loadingDiv) loadingDiv.hidden = true; + } +} +``` + +```{ojs} +//| echo: false +benchmark1 ? html` +
+

${benchmark1.name}

+ + + + + + + + + + + + + + + + + + + +
SchemaCold (1st run)Warm (median)All runs
Narrow${benchmark1.narrowCold.toFixed(0)} ms${benchmark1.narrowMedian.toFixed(0)} ms${benchmark1.narrowAll.map(t => t.toFixed(0)).join(', ')} ms
Wide${benchmark1.wideCold.toFixed(0)} ms${benchmark1.wideMedian.toFixed(0)} ms${benchmark1.wideAll.map(t => t.toFixed(0)).join(', ')} ms
+

Speedup: ${benchmark1.speedup.toFixed(2)}x (wide is ${benchmark1.speedup > 1 ? 'faster' : 'slower'})

+
+` : html`

Waiting for benchmark...

` +``` + +## Benchmark 2: Sample Count by Site + +This benchmark counts samples per sampling site, requiring a join between samples and sites. + +**Query complexity:** +- Narrow: Requires joining through edge rows +- Wide: Direct join via `p__*` columns + + + +```{ojs} +benchmark2 = { + if (runBenchmarks < 1) return null; + + const loadingDiv = document.getElementById('loading_b2'); + if (loadingDiv) loadingDiv.hidden = false; + + // Narrow query: traverse edges to get from sample -> event -> site + const narrowQuery = ` + WITH sample_events AS ( + SELECT + e.o[1] as event_id, + s.row_id as sample_id + FROM narrow s + JOIN narrow e ON s.row_id = e.s AND e.p = 'produced_by' + WHERE s.otype = 'MaterialSampleRecord' + ), + event_sites AS ( + SELECT + se.sample_id, + e2.o[1] as site_id + FROM sample_events se + JOIN narrow e2 ON se.event_id = e2.s AND e2.p = 'sampling_site' + ) + SELECT + site.label, + COUNT(*) as sample_count + FROM event_sites es + JOIN narrow site ON es.site_id = site.row_id + GROUP BY site.label + ORDER BY sample_count DESC + LIMIT 10 + `; + + // Wide query: direct column access + const wideQuery = ` + WITH sample_sites AS ( + SELECT + s.row_id as sample_id, + e.p__sampling_site[1] as site_id + FROM wide s + JOIN wide e ON s.p__produced_by[1] = e.row_id + WHERE s.otype = 'MaterialSampleRecord' + AND e.otype = 'SamplingEvent' + ) + SELECT + site.label, + COUNT(*) as sample_count + FROM sample_sites ss + JOIN wide site ON ss.site_id = site.row_id + WHERE site.otype = 'SamplingSite' + GROUP BY site.label + ORDER BY sample_count DESC + LIMIT 10 + `; + + const runs = 3; + + try { + // Narrow benchmark + const narrowTimes = []; + for (let i = 0; i < runs; i++) { + const start = performance.now(); + await dbNarrow.query(narrowQuery); + narrowTimes.push(performance.now() - start); + } + + // Wide benchmark + const wideTimes = []; + for (let i = 0; i < runs; i++) { + const start = performance.now(); + await dbWide.query(wideQuery); + 
wideTimes.push(performance.now() - start); + } + + const median = arr => { + const sorted = [...arr].sort((a, b) => a - b); + return sorted[Math.floor(sorted.length / 2)]; + }; + + const narrowMedian = median(narrowTimes); + const wideMedian = median(wideTimes); + + return { + name: "Sample Count by Site (multi-join)", + narrowCold: narrowTimes[0], + narrowMedian: narrowMedian, + narrowAll: narrowTimes, + wideCold: wideTimes[0], + wideMedian: wideMedian, + wideAll: wideTimes, + speedup: narrowMedian / wideMedian + }; + } finally { + if (loadingDiv) loadingDiv.hidden = true; + } +} +``` + +```{ojs} +//| echo: false +benchmark2 ? html` +
+

${benchmark2.name}

+ + + + + + + + + + + + + + + + + + + +
SchemaCold (1st run)Warm (median)All runs
Narrow${benchmark2.narrowCold.toFixed(0)} ms${benchmark2.narrowMedian.toFixed(0)} ms${benchmark2.narrowAll.map(t => t.toFixed(0)).join(', ')} ms
Wide${benchmark2.wideCold.toFixed(0)} ms${benchmark2.wideMedian.toFixed(0)} ms${benchmark2.wideAll.map(t => t.toFixed(0)).join(', ')} ms
+

Speedup: ${benchmark2.speedup.toFixed(2)}x (wide is ${benchmark2.speedup > 1 ? 'faster' : 'slower'})

+
+` : html`

Waiting for benchmark...

` +``` + +## Benchmark 3: Material Type Distribution + +This benchmark aggregates sample counts by material category. + + + +```{ojs} +benchmark3 = { + if (runBenchmarks < 1) return null; + + const loadingDiv = document.getElementById('loading_b3'); + if (loadingDiv) loadingDiv.hidden = false; + + // Narrow query: join through edges to material concepts + const narrowQuery = ` + SELECT + c.label as material, + COUNT(*) as sample_count + FROM narrow s + JOIN narrow e ON s.row_id = e.s AND e.p = 'has_material_category' + JOIN narrow c ON e.o[1] = c.row_id + WHERE s.otype = 'MaterialSampleRecord' + GROUP BY c.label + ORDER BY sample_count DESC + LIMIT 10 + `; + + // Wide query: direct column access to material category + const wideQuery = ` + SELECT + c.label as material, + COUNT(*) as sample_count + FROM wide s + JOIN wide c ON s.p__has_material_category[1] = c.row_id + WHERE s.otype = 'MaterialSampleRecord' + AND c.otype = 'IdentifiedConcept' + GROUP BY c.label + ORDER BY sample_count DESC + LIMIT 10 + `; + + const runs = 3; + + try { + // Narrow benchmark + const narrowTimes = []; + for (let i = 0; i < runs; i++) { + const start = performance.now(); + await dbNarrow.query(narrowQuery); + narrowTimes.push(performance.now() - start); + } + + // Wide benchmark + const wideTimes = []; + for (let i = 0; i < runs; i++) { + const start = performance.now(); + await dbWide.query(wideQuery); + wideTimes.push(performance.now() - start); + } + + const median = arr => { + const sorted = [...arr].sort((a, b) => a - b); + return sorted[Math.floor(sorted.length / 2)]; + }; + + const narrowMedian = median(narrowTimes); + const wideMedian = median(wideTimes); + + return { + name: "Material Type Distribution", + narrowCold: narrowTimes[0], + narrowMedian: narrowMedian, + narrowAll: narrowTimes, + wideCold: wideTimes[0], + wideMedian: wideMedian, + wideAll: wideTimes, + speedup: narrowMedian / wideMedian + }; + } finally { + if (loadingDiv) loadingDiv.hidden = true; + } +} +``` + 
+```{ojs} +//| echo: false +benchmark3 ? html` +
+

${benchmark3.name}

+ + + + + + + + + + + + + + + + + + + +
SchemaCold (1st run)Warm (median)All runs
Narrow${benchmark3.narrowCold.toFixed(0)} ms${benchmark3.narrowMedian.toFixed(0)} ms${benchmark3.narrowAll.map(t => t.toFixed(0)).join(', ')} ms
Wide${benchmark3.wideCold.toFixed(0)} ms${benchmark3.wideMedian.toFixed(0)} ms${benchmark3.wideAll.map(t => t.toFixed(0)).join(', ')} ms
+

Speedup: ${benchmark3.speedup.toFixed(2)}x (wide is ${benchmark3.speedup > 1 ? 'faster' : 'slower'})

+
+` : html`

Waiting for benchmark...

` +``` + +## Results Summary + +```{ojs} +//| echo: false +allResults = { + if (!benchmark1 || !benchmark2 || !benchmark3) return null; + + const results = [benchmark1, benchmark2, benchmark3]; + const avgSpeedup = results.reduce((sum, r) => sum + r.speedup, 0) / results.length; + + return { + benchmarks: results, + avgSpeedup: avgSpeedup + }; +} + +allResults ? html` +
+

Summary Results

+ + + + + + + + + + + +${allResults.benchmarks.map(b => html` + + + + + + +`)} + + + + + + + +
BenchmarkNarrow (ms)Wide (ms)Speedup
${b.name}${b.narrowMedian.toFixed(0)}${b.wideMedian.toFixed(0)}${b.speedup.toFixed(2)}x
Average--${allResults.avgSpeedup.toFixed(2)}x
+ +

Key Findings

+
    +
  • File size reduction: Wide format is 60% smaller (275 MB vs 691 MB)
  • +
Row count reduction: Wide format has ~79% fewer rows (~2.5M vs ~11.6M)
  • +
  • Query speedup: Average ${allResults.avgSpeedup.toFixed(1)}x faster with wide format
  • +
+ +

Recommendation

+

For browser-based analysis with DuckDB-WASM, the wide format is recommended for: +

    +
  • Faster query execution
  • +
  • Reduced network transfer (fewer HTTP range requests)
  • +
  • Lower memory usage in the browser
  • +
+

+
+` : html` +
+

Click "Run All Benchmarks" above to see results

+
+` +``` + +## Technical Notes + +### Pitfalls and Considerations + +| Consideration | How We Address It | +|---------------|-------------------| +| **Browser caching** | First run is "cold" (metadata not cached), subsequent runs are "warm" | +| **Network variability** | We run 3 iterations and report the median | +| **JIT compilation** | First run includes JIT overhead; warm runs are more representative | +| **Memory limits** | 691 MB narrow file may stress browser memory; wide format is safer | + +### Schema Differences + +**Narrow schema** stores relationships as edge rows: +```sql +-- Edge row example +{otype: '_edge_', s: 123, p: 'produced_by', o: [456]} +``` + +**Wide schema** stores relationships as columns: +```sql +-- Entity row with relationship columns +{otype: 'MaterialSampleRecord', p__produced_by: [456], p__has_material_category: [789]} +``` + +This eliminates ~9M edge rows, resulting in the 60% file size reduction. + +## See Also + +- [OpenContext Parquet Analysis](oc_parquet_enhanced.qmd) - Deep dive into the property graph structure +- [Cesium Visualization](parquet_cesium.qmd) - Interactive 3D visualization of sample locations From 9f7ebc626d36ef0bfe69e1f1c5dca75ded745306 Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Thu, 4 Dec 2025 17:10:02 -0800 Subject: [PATCH 2/3] Address Codex review feedback for benchmark methodology MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes based on Codex code review: - Lazy data loading: DB initialization now gated behind button click - Sequential execution: Benchmarks wait for previous ones to complete - Improved cold/warm methodology: Warm median excludes cold run - Error handling: Try/catch with user-visible error display - Pinned DuckDB version: @0.7.1 instead of @latest - Updated methodology docs to reflect changes 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- tutorials/narrow_vs_wide_performance.qmd | 151 
++++++++++++++++++----- 1 file changed, 118 insertions(+), 33 deletions(-) diff --git a/tutorials/narrow_vs_wide_performance.qmd b/tutorials/narrow_vs_wide_performance.qmd index 5d02c25..c81e37f 100644 --- a/tutorials/narrow_vs_wide_performance.qmd +++ b/tutorials/narrow_vs_wide_performance.qmd @@ -38,20 +38,21 @@ When using DuckDB-WASM in the browser: ::: {.callout-note} ### Benchmarking Approach -- **Cold run**: First query after page load (includes metadata fetch, JIT compilation) -- **Warm runs**: Subsequent queries (metadata cached, JIT warmed up) -- **Multiple runs**: Each benchmark runs 3 times, we report the median +- **Cold run**: First query (includes metadata fetch, JIT compilation) - reported separately +- **Warm runs**: Runs 2-3 (metadata cached, JIT warmed up) +- **Warm median**: Median of warm runs only (excludes cold run for fair comparison) +- **Sequential execution**: Benchmarks run one after another, not concurrently - **Network variability**: Results will vary based on your network connection and hardware -Results are shown in real-time as benchmarks complete. +Results are shown in real-time as benchmarks complete. Data loading only begins when you click the button. ::: ## Setup ```{ojs} //| output: false -// Import DuckDB for browser-based SQL analysis -import { DuckDBClient } from "https://cdn.jsdelivr.net/npm/@observablehq/duckdb@latest/+esm" +// Import DuckDB for browser-based SQL analysis (pinned version for reproducibility) +import { DuckDBClient } from "https://cdn.jsdelivr.net/npm/@observablehq/duckdb@0.7.1/+esm" ``` ```{ojs} @@ -87,7 +88,7 @@ Connection: ${envInfo.effectiveType} (${envInfo.downlink}, RTT: ${envInfo.rtt})< ` ``` -### Initialize Databases +### Run Benchmarks ```{ojs} //| echo: false @@ -96,30 +97,68 @@ viewof runBenchmarks = Inputs.button("Run All Benchmarks", { }) ``` -