From 16c3e9eb844c1c17a732435753b10196b35e35f6 Mon Sep 17 00:00:00 2001 From: Joana Maia Date: Mon, 8 Jun 2026 17:38:42 +0100 Subject: [PATCH 01/19] feat: add Tinybird datasources for packages-db tables Signed-off-by: Joana Maia --- .../datasources/advisories.datasource | 40 +++++++++++ .../advisoryAffectedRanges.datasource | 28 ++++++++ .../datasources/advisoryPackages.datasource | 24 +++++++ .../datasources/maintainers.datasource | 28 ++++++++ .../packageDependencies.datasource | 31 +++++++++ .../datasources/packageMaintainers.datasource | 22 ++++++ .../datasources/packageRepos.datasource | 25 +++++++ .../tinybird/datasources/packages.datasource | 69 +++++++++++++++++++ .../repoScorecardChecks.datasource | 24 +++++++ .../tinybird/datasources/repos.datasource | 59 ++++++++++++++++ .../tinybird/datasources/versions.datasource | 39 +++++++++++ 11 files changed, 389 insertions(+) create mode 100644 services/libs/tinybird/datasources/advisories.datasource create mode 100644 services/libs/tinybird/datasources/advisoryAffectedRanges.datasource create mode 100644 services/libs/tinybird/datasources/advisoryPackages.datasource create mode 100644 services/libs/tinybird/datasources/maintainers.datasource create mode 100644 services/libs/tinybird/datasources/packageDependencies.datasource create mode 100644 services/libs/tinybird/datasources/packageMaintainers.datasource create mode 100644 services/libs/tinybird/datasources/packageRepos.datasource create mode 100644 services/libs/tinybird/datasources/packages.datasource create mode 100644 services/libs/tinybird/datasources/repoScorecardChecks.datasource create mode 100644 services/libs/tinybird/datasources/repos.datasource create mode 100644 services/libs/tinybird/datasources/versions.datasource diff --git a/services/libs/tinybird/datasources/advisories.datasource b/services/libs/tinybird/datasources/advisories.datasource new file mode 100644 index 0000000000..b6fa7f0f3a --- /dev/null +++ 
b/services/libs/tinybird/datasources/advisories.datasource @@ -0,0 +1,40 @@ +DESCRIPTION > + - `advisories` contains OSV-shaped security advisories ingested from deps.dev BigQuery. + - Replicated from Postgres packages-db — each row is one advisory (CVE, GHSA, MAL-*, etc.). + - Used to surface critical vulnerabilities, track CVSS scores, and power security overlays in LFX Insights. + - `id` is the internal primary key. + - `osvId` is the canonical advisory identifier (e.g. GHSA-xxx, CVE-xxx, OSV-xxx) — globally unique. + - `source` is the originating database: 'GHSA', 'OSV', 'NVD', 'NSWG', etc. (empty string if unknown). + - `sourceUrl` is the upstream advisory URL (empty string if not provided). + - `aliases` is an array of alternate identifiers for the same advisory (e.g. a GHSA may alias a CVE). + - `severity` is the qualitative risk level: 'LOW', 'MEDIUM', 'HIGH', or 'CRITICAL' (empty string if unknown). + - `cvss` is the numeric CVSS score (0 if not available). + - `cvssSource` documents the provenance of the CVSS value: 'osv_cvss_v3', 'osv_qualitative_fallback', 'osv_malicious_package', etc. + - `isCritical` is 1 when cvss >= 7.0 (HIGH or CRITICAL), 0 otherwise — computed from the score. + - `summary` is a short human-readable description of the vulnerability (empty string if not provided). + - `details` is the full advisory text (empty string if not provided). + - `publishedAt` is when the advisory was first published upstream (1970-01-01 epoch sentinel if unknown). + - `modifiedAt` is when the advisory was last modified upstream; NULL in Postgres for BQ-sourced rows, stored here as the 1970-01-01 epoch sentinel. + - `createdAt` and `updatedAt` are row-level audit timestamps for Tinybird watermark-based sync. 
+ +SCHEMA > + `id` UInt64 `json:$.record.id`, + `osvId` String `json:$.record.osv_id`, + `source` String `json:$.record.source` DEFAULT '', + `sourceUrl` String `json:$.record.source_url` DEFAULT '', + `aliases` Array(String) `json:$.record.aliases[:]` DEFAULT [], + `severity` String `json:$.record.severity` DEFAULT '', + `cvss` Float32 `json:$.record.cvss` DEFAULT 0, + `cvssSource` String `json:$.record.cvss_source` DEFAULT '', + `isCritical` UInt8 `json:$.record.is_critical` DEFAULT 0, + `summary` String `json:$.record.summary` DEFAULT '', + `details` String `json:$.record.details` DEFAULT '', + `publishedAt` DateTime64(3) `json:$.record.published_at` DEFAULT '1970-01-01 00:00:00.000', + `modifiedAt` DateTime64(3) `json:$.record.modified_at` DEFAULT '1970-01-01 00:00:00.000', + `createdAt` DateTime64(3) `json:$.record.created_at`, + `updatedAt` DateTime64(3) `json:$.record.updated_at` + +ENGINE ReplacingMergeTree +ENGINE_PARTITION_KEY toYear(createdAt) +ENGINE_SORTING_KEY osvId +ENGINE_VER updatedAt diff --git a/services/libs/tinybird/datasources/advisoryAffectedRanges.datasource b/services/libs/tinybird/datasources/advisoryAffectedRanges.datasource new file mode 100644 index 0000000000..3bfaa5ec84 --- /dev/null +++ b/services/libs/tinybird/datasources/advisoryAffectedRanges.datasource @@ -0,0 +1,28 @@ +DESCRIPTION > + - `advisoryAffectedRanges` stores the version ranges affected by a security advisory per package. + - Replicated from Postgres packages-db — each row defines one vulnerable window (introduced → fixed/last_affected). + - Used to determine whether a specific package version falls within a known vulnerable range. + - `id` is the internal primary key. + - `advisoryPackageId` links to the parent advisory_packages row. + - `introducedVersion` is the version where the vulnerability was introduced (empty string if unknown start). + - `fixedVersion` is the version where the vulnerability was patched (empty string if no fix exists yet). 
+ - `lastAffected` is the last known affected version when there is no fixed version (empty string if not applicable). + - `rangeRaw` is the raw AffectedVersions string from the deps.dev BigQuery source (empty string if OSV-sourced). + - `unaffectedRaw` is the raw UnaffectedVersions string from deps.dev BigQuery (empty string if OSV-sourced). + - `createdAt` and `updatedAt` are row-level audit timestamps for Tinybird watermark-based sync. + +SCHEMA > + `id` UInt64 `json:$.record.id`, + `advisoryPackageId` UInt64 `json:$.record.advisory_package_id`, + `introducedVersion` String `json:$.record.introduced_version` DEFAULT '', + `fixedVersion` String `json:$.record.fixed_version` DEFAULT '', + `lastAffected` String `json:$.record.last_affected` DEFAULT '', + `rangeRaw` String `json:$.record.range_raw` DEFAULT '', + `unaffectedRaw` String `json:$.record.unaffected_raw` DEFAULT '', + `createdAt` DateTime64(3) `json:$.record.created_at`, + `updatedAt` DateTime64(3) `json:$.record.updated_at` + +ENGINE ReplacingMergeTree +ENGINE_PARTITION_KEY toYear(createdAt) +ENGINE_SORTING_KEY advisoryPackageId, id +ENGINE_VER updatedAt diff --git a/services/libs/tinybird/datasources/advisoryPackages.datasource b/services/libs/tinybird/datasources/advisoryPackages.datasource new file mode 100644 index 0000000000..25213d50ae --- /dev/null +++ b/services/libs/tinybird/datasources/advisoryPackages.datasource @@ -0,0 +1,24 @@ +DESCRIPTION > + - `advisoryPackages` maps security advisories to the packages they affect. + - Replicated from Postgres packages-db — one advisory can affect multiple packages across different ecosystems. + - Used to determine which packages are impacted by a given advisory, and to backfill `has_critical_vulnerability` on packages. + - `id` is the internal primary key. + - `advisoryId` links to the parent advisory record. + - `packageId` links to the matching packages row (0 if the package exists in OSV but not yet in our DB). 
+ - `ecosystem` is the package ecosystem (npm, go, maven, pypi, etc.). + - `packageName` is the package name within its ecosystem as reported by OSV. + - `createdAt` and `updatedAt` are row-level audit timestamps for Tinybird watermark-based sync. + +SCHEMA > + `id` UInt64 `json:$.record.id`, + `advisoryId` UInt64 `json:$.record.advisory_id`, + `packageId` UInt64 `json:$.record.package_id` DEFAULT 0, + `ecosystem` String `json:$.record.ecosystem`, + `packageName` String `json:$.record.package_name`, + `createdAt` DateTime64(3) `json:$.record.created_at`, + `updatedAt` DateTime64(3) `json:$.record.updated_at` + +ENGINE ReplacingMergeTree +ENGINE_PARTITION_KEY toYear(createdAt) +ENGINE_SORTING_KEY ecosystem, packageName, id +ENGINE_VER updatedAt diff --git a/services/libs/tinybird/datasources/maintainers.datasource b/services/libs/tinybird/datasources/maintainers.datasource new file mode 100644 index 0000000000..4c1adf06e1 --- /dev/null +++ b/services/libs/tinybird/datasources/maintainers.datasource @@ -0,0 +1,28 @@ +DESCRIPTION > + - `maintainers` contains package maintainer profiles sourced from package registries (npm, PyPI, etc.). + - Replicated from Postgres packages-db — one row per unique (ecosystem, username) identity. + - Used to identify who maintains critical packages, correlate maintainers across ecosystems, and support contributor analytics. + - `id` is the internal primary key. + - `ecosystem` is the package registry this identity belongs to: 'npm', 'pypi', 'cargo', 'maven', etc. + - `username` is the maintainer's registry username — unique within an ecosystem. + - `displayName` is the maintainer's human-readable name as published in the registry (empty string if not provided). + - `url` is the maintainer's profile URL on the registry (empty string if not provided). + - `emailHash` is a SHA-256 hash of the maintainer's email address — never the raw email (GDPR compliance). 
+ - `githubLogin` is the maintainer's GitHub username if resolved (empty string if not linked). + - `createdAt` and `updatedAt` are row-level audit timestamps for Tinybird watermark-based sync. + +SCHEMA > + `id` UInt64 `json:$.record.id`, + `ecosystem` String `json:$.record.ecosystem`, + `username` String `json:$.record.username`, + `displayName` String `json:$.record.display_name` DEFAULT '', + `url` String `json:$.record.url` DEFAULT '', + `emailHash` String `json:$.record.email_hash` DEFAULT '', + `githubLogin` String `json:$.record.github_login` DEFAULT '', + `createdAt` DateTime64(3) `json:$.record.created_at`, + `updatedAt` DateTime64(3) `json:$.record.updated_at` + +ENGINE ReplacingMergeTree +ENGINE_PARTITION_KEY toYear(createdAt) +ENGINE_SORTING_KEY ecosystem, username +ENGINE_VER updatedAt diff --git a/services/libs/tinybird/datasources/packageDependencies.datasource b/services/libs/tinybird/datasources/packageDependencies.datasource new file mode 100644 index 0000000000..deadb50a6a --- /dev/null +++ b/services/libs/tinybird/datasources/packageDependencies.datasource @@ -0,0 +1,31 @@ +DESCRIPTION > + - `packageDependencies` stores the dependency graph between package versions. + - Replicated from Postgres packages-db — one row per (version, dependency) edge in the dependency graph. + - Partitioned by hash(depends_on_id) in Postgres for fast downstream lookups; sorted here for analytical queries. + - Used to answer "what depends on package X?" (downstream consumers) and to compute dependent_repos_count / dependent_packages_count. + - `id` is the internal primary key. + - `packageId` is the ID of the package that contains the depending version. + - `versionId` is the specific version that declares the dependency. + - `dependsOnId` is the package being depended upon — the hot lookup key for vulnerability blast-radius queries. + - `dependsOnVersionId` is the resolved specific version of the dependency (0 if the exact version is unknown). 
+ - `versionConstraint` is the declared version constraint string (e.g. '^1.2.3', '>=2.0.0'); empty string if not specified. + - `dependencyKind` is the dependency type: 'direct', 'dev', or 'peer'. + - `isOptional` is 1 if the dependency is marked optional, 0 otherwise. + - `createdAt` and `updatedAt` are row-level audit timestamps for Tinybird watermark-based sync. + +SCHEMA > + `id` UInt64 `json:$.record.id`, + `packageId` UInt64 `json:$.record.package_id`, + `versionId` UInt64 `json:$.record.version_id`, + `dependsOnId` UInt64 `json:$.record.depends_on_id`, + `dependsOnVersionId` UInt64 `json:$.record.depends_on_version_id` DEFAULT 0, + `versionConstraint` String `json:$.record.version_constraint` DEFAULT '', + `dependencyKind` String `json:$.record.dependency_kind`, + `isOptional` UInt8 `json:$.record.is_optional` DEFAULT 0, + `createdAt` DateTime64(3) `json:$.record.created_at`, + `updatedAt` DateTime64(3) `json:$.record.updated_at` + +ENGINE ReplacingMergeTree +ENGINE_PARTITION_KEY toYear(createdAt) +ENGINE_SORTING_KEY dependsOnId, versionId, id +ENGINE_VER updatedAt diff --git a/services/libs/tinybird/datasources/packageMaintainers.datasource b/services/libs/tinybird/datasources/packageMaintainers.datasource new file mode 100644 index 0000000000..c51ed3ee37 --- /dev/null +++ b/services/libs/tinybird/datasources/packageMaintainers.datasource @@ -0,0 +1,22 @@ +DESCRIPTION > + - `packageMaintainers` maps packages to their maintainers with role information. + - Replicated from Postgres packages-db — junction table linking packages and maintainers. + - Used to answer "who maintains this package?" and "what packages does this maintainer own?". + - `id` is the internal primary key. + - `packageId` links to the parent packages row. + - `maintainerId` links to the parent maintainers row. + - `role` is the maintainer's role on the package: 'author', 'maintainer', or empty string if not specified. 
+ - `createdAt` and `updatedAt` are row-level audit timestamps for Tinybird watermark-based sync. + +SCHEMA > + `id` UInt64 `json:$.record.id`, + `packageId` UInt64 `json:$.record.package_id`, + `maintainerId` UInt64 `json:$.record.maintainer_id`, + `role` String `json:$.record.role` DEFAULT '', + `createdAt` DateTime64(3) `json:$.record.created_at`, + `updatedAt` DateTime64(3) `json:$.record.updated_at` + +ENGINE ReplacingMergeTree +ENGINE_PARTITION_KEY toYear(createdAt) +ENGINE_SORTING_KEY packageId, maintainerId +ENGINE_VER updatedAt diff --git a/services/libs/tinybird/datasources/packageRepos.datasource b/services/libs/tinybird/datasources/packageRepos.datasource new file mode 100644 index 0000000000..80316c3c6b --- /dev/null +++ b/services/libs/tinybird/datasources/packageRepos.datasource @@ -0,0 +1,25 @@ +DESCRIPTION > + - `packageRepos` stores the provenance mapping between packages and their source repositories. + - Replicated from Postgres packages-db — one row per (package, repo) pair; a package may link to multiple repos (monorepos). + - Used to join package data with repository health signals (stars, scorecard, activity) without a cross-table join in Postgres. + - `id` is the internal primary key. + - `packageId` links to the parent packages row. + - `repoId` links to the associated repos row. + - `source` identifies how the link was established: 'declared', 'deps_dev', 'heuristic', or 'manual'. + - `confidence` is a 0.00–1.00 score representing how certain the link is (1.00 = verified/declared). + - `verifiedAt` is when the link was last confirmed or upserted — serves as the updated_at watermark for sync. + - `createdAt` is the row-insert timestamp — set once on first insert, never updated. 
+ +SCHEMA > + `id` UInt64 `json:$.record.id`, + `packageId` UInt64 `json:$.record.package_id`, + `repoId` UInt64 `json:$.record.repo_id`, + `source` String `json:$.record.source`, + `confidence` Float32 `json:$.record.confidence`, + `verifiedAt` DateTime64(3) `json:$.record.verified_at`, + `createdAt` DateTime64(3) `json:$.record.created_at` + +ENGINE ReplacingMergeTree +ENGINE_PARTITION_KEY toYear(createdAt) +ENGINE_SORTING_KEY packageId, repoId +ENGINE_VER verifiedAt diff --git a/services/libs/tinybird/datasources/packages.datasource b/services/libs/tinybird/datasources/packages.datasource new file mode 100644 index 0000000000..db43b0bf80 --- /dev/null +++ b/services/libs/tinybird/datasources/packages.datasource @@ -0,0 +1,69 @@ +DESCRIPTION > + - `packages` contains Tier-2 package metadata for tracked OSS packages across multiple ecosystems. + - Replicated from Postgres packages-db — seeded from deps.dev BigQuery, enriched by registry-specific workers (npm, Maven, etc.). + - Used to power package search, vulnerability overlays, criticality scoring, and ecosystem health analytics. + - `id` is the internal primary key. + - `purl` is the Package URL (PURL) — the globally unique identifier following the PURL standard. + - `ecosystem` is the package registry: 'npm', 'go', 'maven', 'pypi', 'nuget', 'cargo', etc. + - `namespace` is the package scope or group (e.g. '@angular' for npm scoped packages, 'org.apache' for Maven); empty string if not applicable. + - `name` is the package name within its ecosystem. + - `registryUrl` is the package's canonical registry page URL (empty string if not resolved). + - `status` is the lifecycle status: 'active', 'deprecated', 'unpublished', or 'yanked' (empty string if unknown). + - `description` is the package description from the registry (empty string if not provided). + - `homepage` is the project homepage URL (empty string if not provided). 
+ - `declaredRepositoryUrl` is the repository URL as declared in the package manifest (empty string if not set). + - `repositoryUrl` is the resolved canonical repository URL (empty string if not resolved). + - `licenses` is an array of SPDX-normalized license identifiers (empty array if unknown). + - `licensesRaw` is the raw license string before SPDX normalization (empty string if not provided). + - `keywords` is an array of package keywords from the registry manifest (empty array if none). + - `distTagsLatest`, `distTagsNext`, `distTagsBeta` are npm dist-tag values; empty string for non-npm packages. + - `versionsCount` is the total number of published versions (0 if unknown). + - `latestVersion` is the most recent stable version string (empty string if unknown). + - `firstReleaseAt` is the timestamp of the earliest published version (1970-01-01 epoch sentinel if unknown). + - `latestReleaseAt` is the timestamp of the most recent published version (1970-01-01 epoch sentinel if unknown). + - `dependentPackagesCount` is the number of other packages that directly depend on this package (0 if unknown). + - `dependentReposCount` is the number of repositories that depend on this package (0 if unknown). + - `hasCriticalVulnerability` is 1 if the latest version has an active critical advisory (CVSS >= 7.0), 0 otherwise. + - `criticalityScore` is the composite criticality score from the ranking function (0 if not yet ranked). + - `isCritical` is 1 if this package ranks in the top N by ecosystem in the most recent ranking pass, 0 otherwise. + - `lastRankPassAt` is when the criticality ranking worker last processed this package (1970-01-01 epoch sentinel if never ranked). + - `ingestionSource` identifies which worker last wrote the row: 'npm-registry', 'deps_dev', etc. (empty string if unknown). + - `lastSyncedAt` is when the row was last written by any worker — serves as the updated_at watermark for sync. + - `createdAt` is the row-insert timestamp — set once on first insert, never updated. 
+ +SCHEMA > + `id` UInt64 `json:$.record.id`, + `purl` String `json:$.record.purl`, + `ecosystem` String `json:$.record.ecosystem`, + `namespace` String `json:$.record.namespace` DEFAULT '', + `name` String `json:$.record.name`, + `registryUrl` String `json:$.record.registry_url` DEFAULT '', + `status` String `json:$.record.status` DEFAULT '', + `description` String `json:$.record.description` DEFAULT '', + `homepage` String `json:$.record.homepage` DEFAULT '', + `declaredRepositoryUrl` String `json:$.record.declared_repository_url` DEFAULT '', + `repositoryUrl` String `json:$.record.repository_url` DEFAULT '', + `licenses` Array(String) `json:$.record.licenses[:]` DEFAULT [], + `licensesRaw` String `json:$.record.licenses_raw` DEFAULT '', + `keywords` Array(String) `json:$.record.keywords[:]` DEFAULT [], + `distTagsLatest` String `json:$.record.dist_tags_latest` DEFAULT '', + `distTagsNext` String `json:$.record.dist_tags_next` DEFAULT '', + `distTagsBeta` String `json:$.record.dist_tags_beta` DEFAULT '', + `versionsCount` UInt32 `json:$.record.versions_count` DEFAULT 0, + `latestVersion` String `json:$.record.latest_version` DEFAULT '', + `firstReleaseAt` DateTime64(3) `json:$.record.first_release_at` DEFAULT '1970-01-01 00:00:00.000', + `latestReleaseAt` DateTime64(3) `json:$.record.latest_release_at` DEFAULT '1970-01-01 00:00:00.000', + `dependentPackagesCount` UInt32 `json:$.record.dependent_packages_count` DEFAULT 0, + `dependentReposCount` UInt32 `json:$.record.dependent_repos_count` DEFAULT 0, + `hasCriticalVulnerability` UInt8 `json:$.record.has_critical_vulnerability` DEFAULT 0, + `criticalityScore` Float32 `json:$.record.criticality_score` DEFAULT 0, + `isCritical` UInt8 `json:$.record.is_critical` DEFAULT 0, + `lastRankPassAt` DateTime64(3) `json:$.record.last_rank_pass_at` DEFAULT '1970-01-01 00:00:00.000', + `ingestionSource` String `json:$.record.ingestion_source` DEFAULT '', + `lastSyncedAt` DateTime64(3) `json:$.record.last_synced_at`, + `createdAt` DateTime64(3) `json:$.record.created_at` + +ENGINE 
ReplacingMergeTree +ENGINE_PARTITION_KEY toYear(createdAt) +ENGINE_SORTING_KEY ecosystem, purl +ENGINE_VER lastSyncedAt diff --git a/services/libs/tinybird/datasources/repoScorecardChecks.datasource b/services/libs/tinybird/datasources/repoScorecardChecks.datasource new file mode 100644 index 0000000000..73b4050047 --- /dev/null +++ b/services/libs/tinybird/datasources/repoScorecardChecks.datasource @@ -0,0 +1,24 @@ +DESCRIPTION > + - `repoScorecardChecks` contains per-check OpenSSF Scorecard results for tracked repositories. + - Replicated from Postgres packages-db — one row per (repo, check_name) pair; ~18 named checks per repo. + - Used to drill into specific security posture signals beyond the aggregate scorecard score on the repos table. + - `id` is the internal primary key. + - `repoId` links to the parent repos row. + - `checkName` is the Scorecard check identifier, e.g. 'Binary-Artifacts', 'Branch-Protection', 'Code-Review'. + - `score` is the numeric check score on a 0–10 scale (0 if the check failed or was not applicable). + - `reason` is the human-readable explanation of the score (empty string if not provided). + - `createdAt` and `updatedAt` are row-level audit timestamps for Tinybird watermark-based sync. 
+ +SCHEMA > + `id` UInt64 `json:$.record.id`, + `repoId` UInt64 `json:$.record.repo_id`, + `checkName` String `json:$.record.check_name`, + `score` Float32 `json:$.record.score` DEFAULT 0, + `reason` String `json:$.record.reason` DEFAULT '', + `createdAt` DateTime64(3) `json:$.record.created_at`, + `updatedAt` DateTime64(3) `json:$.record.updated_at` + +ENGINE ReplacingMergeTree +ENGINE_PARTITION_KEY toYear(createdAt) +ENGINE_SORTING_KEY repoId, checkName +ENGINE_VER updatedAt diff --git a/services/libs/tinybird/datasources/repos.datasource b/services/libs/tinybird/datasources/repos.datasource new file mode 100644 index 0000000000..6a753dd3b3 --- /dev/null +++ b/services/libs/tinybird/datasources/repos.datasource @@ -0,0 +1,59 @@ +DESCRIPTION > + - `repos` contains GitHub/GitLab/Bitbucket repository metadata for repositories linked to tracked packages. + - Replicated from Postgres packages-db — initially seeded from deps.dev BigQuery, enriched by the GitHub API worker. + - Used to surface repository health signals: stars, forks, activity, scorecard score, and criticality indicators. + - `id` is the internal primary key. + - `url` is the canonical repository URL — the unique identifier across all sources. + - `host` is the hosting platform: 'github', 'gitlab', 'bitbucket', or 'other' (empty string if not yet resolved). + - `owner` is the repository owner or organization name (empty string if not yet enriched). + - `name` is the repository name (empty string if not yet enriched). + - `description` is the repository description (empty string if not provided). + - `primaryLanguage` is the dominant programming language (empty string if unknown). + - `topics` is an array of repository topics or tags (empty array if none). + - `stars` is the GitHub star count (0 if not yet enriched). + - `forks` is the fork count (0 if not yet enriched). + - `watchers` is the watcher count (0 if not yet enriched). + - `openIssues` is the number of open issues (0 if not yet enriched). 
+ - `lastCommitAt` is the timestamp of the most recent commit (1970-01-01 epoch sentinel if unknown). + - `archived` is 1 if the repository is archived, 0 otherwise (0 default until enriched). + - `disabled` is 1 if the repository is disabled, 0 otherwise (0 default until enriched). + - `isFork` is 1 if this repository is a fork of another, 0 otherwise (0 default until enriched). + - `createdAt` is the repository creation date on GitHub/GitLab — a domain timestamp, not a row-insert timestamp (1970-01-01 epoch sentinel if unknown). + - `homepage` is the project homepage URL (empty string if not provided). + - `rawProjectType` is the deps.dev project type string (e.g. 'GITHUB', 'GITLAB') for identity resolution. + - `rawProjectName` is the deps.dev project name (e.g. 'github.com/owner/repo') for identity resolution. + - `scorecardScore` is the OpenSSF Scorecard aggregate score (0 if not yet scored). + - `scorecardLastRunAt` is when the Scorecard was last evaluated (1970-01-01 epoch sentinel if never run). + - `skipEnrichment` is 1 if the GitHub enricher should skip this repo (e.g. invalid/transient URLs), 0 otherwise. + - `lastSyncedAt` is when the row was last written by any worker — serves as the updated_at watermark for sync. 
+ +SCHEMA > + `id` UInt64 `json:$.record.id`, + `url` String `json:$.record.url`, + `host` String `json:$.record.host` DEFAULT '', + `owner` String `json:$.record.owner` DEFAULT '', + `name` String `json:$.record.name` DEFAULT '', + `description` String `json:$.record.description` DEFAULT '', + `primaryLanguage` String `json:$.record.primary_language` DEFAULT '', + `topics` Array(String) `json:$.record.topics[:]` DEFAULT [], + `stars` Int32 `json:$.record.stars` DEFAULT 0, + `forks` Int32 `json:$.record.forks` DEFAULT 0, + `watchers` Int32 `json:$.record.watchers` DEFAULT 0, + `openIssues` Int32 `json:$.record.open_issues` DEFAULT 0, + `lastCommitAt` DateTime64(3) `json:$.record.last_commit_at` DEFAULT '1970-01-01 00:00:00.000', + `archived` UInt8 `json:$.record.archived` DEFAULT 0, + `disabled` UInt8 `json:$.record.disabled` DEFAULT 0, + `isFork` UInt8 `json:$.record.is_fork` DEFAULT 0, + `createdAt` DateTime64(3) `json:$.record.created_at` DEFAULT '1970-01-01 00:00:00.000', + `homepage` String `json:$.record.homepage` DEFAULT '', + `rawProjectType` String `json:$.record.raw_project_type` DEFAULT '', + `rawProjectName` String `json:$.record.raw_project_name` DEFAULT '', + `scorecardScore` Float32 `json:$.record.scorecard_score` DEFAULT 0, + `scorecardLastRunAt` DateTime64(3) `json:$.record.scorecard_last_run_at` DEFAULT '1970-01-01 00:00:00.000', + `skipEnrichment` UInt8 `json:$.record.skip_enrichment` DEFAULT 0, + `lastSyncedAt` DateTime64(3) `json:$.record.last_synced_at` + +ENGINE ReplacingMergeTree +ENGINE_PARTITION_KEY toYear(lastSyncedAt) +ENGINE_SORTING_KEY id +ENGINE_VER lastSyncedAt diff --git a/services/libs/tinybird/datasources/versions.datasource b/services/libs/tinybird/datasources/versions.datasource new file mode 100644 index 0000000000..e1954fbca5 --- /dev/null +++ b/services/libs/tinybird/datasources/versions.datasource @@ -0,0 +1,39 @@ +DESCRIPTION > + - `versions` contains published version metadata for tracked packages across all ecosystems. 
+ - Replicated from Postgres packages-db — one row per (package, version number) pair. + - Used to identify the latest version, detect pre-releases, track publish timelines, and resolve dependency constraints. + - `id` is the internal primary key (included in the composite primary key with package_id for partitioning in Postgres). + - `packageId` links to the parent packages row. + - `ecosystem` is the package registry: 'npm', 'go', 'maven', 'pypi', 'nuget', 'cargo', etc. + - `namespace` is the package scope or group; empty string if not applicable. + - `name` is the package name — denormalized from packages for fast dependency resolution without a join. + - `number` is the version string (e.g. '1.2.3', '2.0.0-beta.1'). + - `publishedAt` is when this version was published to the registry (1970-01-01 epoch sentinel if unknown). + - `isLatest` is 1 if this is the current latest stable version according to the registry, 0 otherwise (0 default until enriched). + - `isYanked` is 1 if this version was retracted/yanked from the registry, 0 otherwise (0 default until enriched). + - `isPrerelease` is 1 if the version string contains a pre-release identifier (alpha, beta, rc, etc.), 0 otherwise. + - `licenses` is an array of SPDX-normalized license identifiers for this specific version (may differ from the package-level value). + - `downloadCount` is the per-version download count where available (npm, crates); 0 if not tracked. + - `lastSyncedAt` is when the row was last written by any worker — serves as the updated_at watermark for sync. + - `createdAt` is the row-insert timestamp — set once on first insert, never updated. 
+ +SCHEMA > + `id` UInt64 `json:$.record.id`, + `packageId` UInt64 `json:$.record.package_id`, + `ecosystem` String `json:$.record.ecosystem`, + `namespace` String `json:$.record.namespace` DEFAULT '', + `name` String `json:$.record.name`, + `number` String `json:$.record.number`, + `publishedAt` DateTime64(3) `json:$.record.published_at` DEFAULT '1970-01-01 00:00:00.000', + `isLatest` UInt8 `json:$.record.is_latest` DEFAULT 0, + `isYanked` UInt8 `json:$.record.is_yanked` DEFAULT 0, + `isPrerelease` UInt8 `json:$.record.is_prerelease` DEFAULT 0, + `licenses` Array(String) `json:$.record.licenses[:]` DEFAULT [], + `downloadCount` UInt64 `json:$.record.download_count` DEFAULT 0, + `lastSyncedAt` DateTime64(3) `json:$.record.last_synced_at`, + `createdAt` DateTime64(3) `json:$.record.created_at` + +ENGINE ReplacingMergeTree +ENGINE_PARTITION_KEY toYear(createdAt) +ENGINE_SORTING_KEY packageId, number +ENGINE_VER lastSyncedAt From 4a91e1bc63d71c8293781f26c30b36a6052e5a45 Mon Sep 17 00:00:00 2001 From: Joana Maia Date: Mon, 8 Jun 2026 17:48:56 +0100 Subject: [PATCH 02/19] chore: align partition key with ENGINE_VER in packages-db datasources Signed-off-by: Joana Maia --- services/libs/tinybird/datasources/advisories.datasource | 2 +- .../libs/tinybird/datasources/advisoryAffectedRanges.datasource | 2 +- services/libs/tinybird/datasources/advisoryPackages.datasource | 2 +- services/libs/tinybird/datasources/maintainers.datasource | 2 +- .../libs/tinybird/datasources/packageDependencies.datasource | 2 +- .../libs/tinybird/datasources/packageMaintainers.datasource | 2 +- services/libs/tinybird/datasources/packageRepos.datasource | 2 +- services/libs/tinybird/datasources/packages.datasource | 2 +- .../libs/tinybird/datasources/repoScorecardChecks.datasource | 2 +- services/libs/tinybird/datasources/versions.datasource | 2 +- 10 files changed, 10 insertions(+), 10 deletions(-) diff --git a/services/libs/tinybird/datasources/advisories.datasource 
b/services/libs/tinybird/datasources/advisories.datasource index b6fa7f0f3a..6fd114176a 100644 --- a/services/libs/tinybird/datasources/advisories.datasource +++ b/services/libs/tinybird/datasources/advisories.datasource @@ -35,6 +35,6 @@ SCHEMA > `updatedAt` DateTime64(3) `json:$.record.updated_at` ENGINE ReplacingMergeTree -ENGINE_PARTITION_KEY toYear(createdAt) +ENGINE_PARTITION_KEY toYear(updatedAt) ENGINE_SORTING_KEY osvId ENGINE_VER updatedAt diff --git a/services/libs/tinybird/datasources/advisoryAffectedRanges.datasource b/services/libs/tinybird/datasources/advisoryAffectedRanges.datasource index 3bfaa5ec84..4a6710e524 100644 --- a/services/libs/tinybird/datasources/advisoryAffectedRanges.datasource +++ b/services/libs/tinybird/datasources/advisoryAffectedRanges.datasource @@ -23,6 +23,6 @@ SCHEMA > `updatedAt` DateTime64(3) `json:$.record.updated_at` ENGINE ReplacingMergeTree -ENGINE_PARTITION_KEY toYear(createdAt) +ENGINE_PARTITION_KEY toYear(updatedAt) ENGINE_SORTING_KEY advisoryPackageId, id ENGINE_VER updatedAt diff --git a/services/libs/tinybird/datasources/advisoryPackages.datasource b/services/libs/tinybird/datasources/advisoryPackages.datasource index 25213d50ae..e71415a932 100644 --- a/services/libs/tinybird/datasources/advisoryPackages.datasource +++ b/services/libs/tinybird/datasources/advisoryPackages.datasource @@ -19,6 +19,6 @@ SCHEMA > `updatedAt` DateTime64(3) `json:$.record.updated_at` ENGINE ReplacingMergeTree -ENGINE_PARTITION_KEY toYear(createdAt) +ENGINE_PARTITION_KEY toYear(updatedAt) ENGINE_SORTING_KEY ecosystem, packageName, id ENGINE_VER updatedAt diff --git a/services/libs/tinybird/datasources/maintainers.datasource b/services/libs/tinybird/datasources/maintainers.datasource index 4c1adf06e1..d86adc506f 100644 --- a/services/libs/tinybird/datasources/maintainers.datasource +++ b/services/libs/tinybird/datasources/maintainers.datasource @@ -23,6 +23,6 @@ SCHEMA > `updatedAt` DateTime64(3) `json:$.record.updated_at` ENGINE 
ReplacingMergeTree -ENGINE_PARTITION_KEY toYear(createdAt) +ENGINE_PARTITION_KEY toYear(updatedAt) ENGINE_SORTING_KEY ecosystem, username ENGINE_VER updatedAt diff --git a/services/libs/tinybird/datasources/packageDependencies.datasource b/services/libs/tinybird/datasources/packageDependencies.datasource index deadb50a6a..de922a2bf8 100644 --- a/services/libs/tinybird/datasources/packageDependencies.datasource +++ b/services/libs/tinybird/datasources/packageDependencies.datasource @@ -26,6 +26,6 @@ SCHEMA > `updatedAt` DateTime64(3) `json:$.record.updated_at` ENGINE ReplacingMergeTree -ENGINE_PARTITION_KEY toYear(createdAt) +ENGINE_PARTITION_KEY toYear(updatedAt) ENGINE_SORTING_KEY dependsOnId, versionId, id ENGINE_VER updatedAt diff --git a/services/libs/tinybird/datasources/packageMaintainers.datasource b/services/libs/tinybird/datasources/packageMaintainers.datasource index c51ed3ee37..e4c187c7ed 100644 --- a/services/libs/tinybird/datasources/packageMaintainers.datasource +++ b/services/libs/tinybird/datasources/packageMaintainers.datasource @@ -17,6 +17,6 @@ SCHEMA > `updatedAt` DateTime64(3) `json:$.record.updated_at` ENGINE ReplacingMergeTree -ENGINE_PARTITION_KEY toYear(createdAt) +ENGINE_PARTITION_KEY toYear(updatedAt) ENGINE_SORTING_KEY packageId, maintainerId ENGINE_VER updatedAt diff --git a/services/libs/tinybird/datasources/packageRepos.datasource b/services/libs/tinybird/datasources/packageRepos.datasource index 80316c3c6b..8a462da4c0 100644 --- a/services/libs/tinybird/datasources/packageRepos.datasource +++ b/services/libs/tinybird/datasources/packageRepos.datasource @@ -20,6 +20,6 @@ SCHEMA > `createdAt` DateTime64(3) `json:$.record.created_at` ENGINE ReplacingMergeTree -ENGINE_PARTITION_KEY toYear(createdAt) +ENGINE_PARTITION_KEY toYear(verifiedAt) ENGINE_SORTING_KEY packageId, repoId ENGINE_VER verifiedAt diff --git a/services/libs/tinybird/datasources/packages.datasource b/services/libs/tinybird/datasources/packages.datasource index 
db43b0bf80..c947855ed4 100644 --- a/services/libs/tinybird/datasources/packages.datasource +++ b/services/libs/tinybird/datasources/packages.datasource @@ -64,6 +64,6 @@ SCHEMA > `createdAt` DateTime64(3) `json:$.record.created_at` ENGINE ReplacingMergeTree -ENGINE_PARTITION_KEY toYear(createdAt) +ENGINE_PARTITION_KEY toYear(lastSyncedAt) ENGINE_SORTING_KEY ecosystem, purl ENGINE_VER lastSyncedAt diff --git a/services/libs/tinybird/datasources/repoScorecardChecks.datasource b/services/libs/tinybird/datasources/repoScorecardChecks.datasource index 73b4050047..2451732930 100644 --- a/services/libs/tinybird/datasources/repoScorecardChecks.datasource +++ b/services/libs/tinybird/datasources/repoScorecardChecks.datasource @@ -19,6 +19,6 @@ SCHEMA > `updatedAt` DateTime64(3) `json:$.record.updated_at` ENGINE ReplacingMergeTree -ENGINE_PARTITION_KEY toYear(createdAt) +ENGINE_PARTITION_KEY toYear(updatedAt) ENGINE_SORTING_KEY repoId, checkName ENGINE_VER updatedAt diff --git a/services/libs/tinybird/datasources/versions.datasource b/services/libs/tinybird/datasources/versions.datasource index e1954fbca5..32b668b0da 100644 --- a/services/libs/tinybird/datasources/versions.datasource +++ b/services/libs/tinybird/datasources/versions.datasource @@ -34,6 +34,6 @@ SCHEMA > `createdAt` DateTime64(3) `json:$.record.created_at` ENGINE ReplacingMergeTree -ENGINE_PARTITION_KEY toYear(createdAt) +ENGINE_PARTITION_KEY toYear(lastSyncedAt) ENGINE_SORTING_KEY packageId, number ENGINE_VER lastSyncedAt From 21995757fd1d6c73a54b84c027fda4eedd127769 Mon Sep 17 00:00:00 2001 From: Joana Maia Date: Mon, 8 Jun 2026 17:52:11 +0100 Subject: [PATCH 03/19] chore: remove email_hash from maintainers datasource Signed-off-by: Joana Maia --- services/libs/tinybird/datasources/maintainers.datasource | 2 -- 1 file changed, 2 deletions(-) diff --git a/services/libs/tinybird/datasources/maintainers.datasource b/services/libs/tinybird/datasources/maintainers.datasource index d86adc506f..c49a248369 100644 
--- a/services/libs/tinybird/datasources/maintainers.datasource +++ b/services/libs/tinybird/datasources/maintainers.datasource @@ -7,7 +7,6 @@ DESCRIPTION > - `username` is the maintainer's registry username — unique within an ecosystem. - `displayName` is the maintainer's human-readable name as published in the registry (empty string if not provided). - `url` is the maintainer's profile URL on the registry (empty string if not provided). - - `emailHash` is a SHA-256 hash of the maintainer's email address — never the raw email (GDPR compliance). - `githubLogin` is the maintainer's GitHub username if resolved (empty string if not linked). - `createdAt` and `updatedAt` are row-level audit timestamps for Tinybird watermark-based sync. @@ -17,7 +16,6 @@ SCHEMA > `username` String `json:$.record.username`, `displayName` String `json:$.record.display_name` DEFAULT '', `url` String `json:$.record.url` DEFAULT '', - `emailHash` String `json:$.record.email_hash` DEFAULT '', `githubLogin` String `json:$.record.github_login` DEFAULT '', `createdAt` DateTime64(3) `json:$.record.created_at`, `updatedAt` DateTime64(3) `json:$.record.updated_at` From 453fbac9738de9f1b6f576ce6cae9bc2fe87e879 Mon Sep 17 00:00:00 2001 From: Joana Maia Date: Mon, 8 Jun 2026 18:00:42 +0100 Subject: [PATCH 04/19] chore: fix packages-db datasources to match latest migrations Signed-off-by: Joana Maia --- .../libs/tinybird/datasources/maintainers.datasource | 2 ++ services/libs/tinybird/datasources/packages.datasource | 10 ++++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/services/libs/tinybird/datasources/maintainers.datasource b/services/libs/tinybird/datasources/maintainers.datasource index c49a248369..07c5c79a3d 100644 --- a/services/libs/tinybird/datasources/maintainers.datasource +++ b/services/libs/tinybird/datasources/maintainers.datasource @@ -7,6 +7,7 @@ DESCRIPTION > - `username` is the maintainer's registry username — unique within an ecosystem. 
- `displayName` is the maintainer's human-readable name as published in the registry (empty string if not provided). - `url` is the maintainer's profile URL on the registry (empty string if not provided). + - `email` is the maintainer's email address as published in the registry (empty string if not provided). - `githubLogin` is the maintainer's GitHub username if resolved (empty string if not linked). - `createdAt` and `updatedAt` are row-level audit timestamps for Tinybird watermark-based sync. @@ -16,6 +17,7 @@ SCHEMA > `username` String `json:$.record.username`, `displayName` String `json:$.record.display_name` DEFAULT '', `url` String `json:$.record.url` DEFAULT '', + `email` String `json:$.record.email` DEFAULT '', `githubLogin` String `json:$.record.github_login` DEFAULT '', `createdAt` DateTime64(3) `json:$.record.created_at`, `updatedAt` DateTime64(3) `json:$.record.updated_at` diff --git a/services/libs/tinybird/datasources/packages.datasource b/services/libs/tinybird/datasources/packages.datasource index c947855ed4..4831082020 100644 --- a/services/libs/tinybird/datasources/packages.datasource +++ b/services/libs/tinybird/datasources/packages.datasource @@ -21,10 +21,11 @@ DESCRIPTION > - `latestVersion` is the most recent stable version string (empty string if unknown). - `firstReleaseAt` is the timestamp of the earliest published version (empty string if unknown). - `latestReleaseAt` is the timestamp of the most recent published version (empty string if unknown). - - `dependentPackagesCount` is the number of other packages that directly depend on this package (0 if unknown). + - `dependentCount` is the number of packages that directly depend on this package (0 if unknown). + - `transitiveDependentCount` is the number of packages that transitively depend on this package (0 if unknown). - `dependentReposCount` is the number of repositories that depend on this package (0 if unknown). 
- `hasCriticalVulnerability` is 1 if the latest version has an active critical advisory (CVSS >= 7.0), 0 otherwise. - - `criticalityScore` is the composite criticality score from the ranking function (0 if not yet ranked). + - `impact` is the composite criticality impact score from the ranking function (0 if not yet ranked). - `isCritical` is 1 if this package ranks in the top N by ecosystem in the most recent ranking pass, 0 otherwise. - `lastRankPassAt` is when the criticality ranking worker last processed this package (empty string if never ranked). - `ingestionSource` identifies which worker last wrote the row: 'npm-registry', 'deps_dev', etc. (empty string if unknown). @@ -53,10 +54,11 @@ SCHEMA > `latestVersion` String `json:$.record.latest_version` DEFAULT '', `firstReleaseAt` DateTime64(3) `json:$.record.first_release_at` DEFAULT '', `latestReleaseAt` DateTime64(3) `json:$.record.latest_release_at` DEFAULT '', - `dependentPackagesCount` UInt32 `json:$.record.dependent_packages_count` DEFAULT 0, + `dependentCount` UInt32 `json:$.record.dependent_count` DEFAULT 0, + `transitiveDependentCount` UInt64 `json:$.record.transitive_dependent_count` DEFAULT 0, `dependentReposCount` UInt32 `json:$.record.dependent_repos_count` DEFAULT 0, `hasCriticalVulnerability` UInt8 `json:$.record.has_critical_vulnerability` DEFAULT 0, - `criticalityScore` Float32 `json:$.record.criticality_score` DEFAULT 0, + `impact` Float32 `json:$.record.impact` DEFAULT 0, `isCritical` UInt8 `json:$.record.is_critical` DEFAULT 0, `lastRankPassAt` DateTime64(3) `json:$.record.last_rank_pass_at` DEFAULT '', `ingestionSource` String `json:$.record.ingestion_source` DEFAULT '', From 6f7601d6e29d762022d2c74f05f78c1c3e4cfbb0 Mon Sep 17 00:00:00 2001 From: Joana Maia Date: Tue, 9 Jun 2026 11:48:10 +0100 Subject: [PATCH 05/19] fix: schema Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> Signed-off-by: Joana Maia --- 
services/libs/tinybird/datasources/versions.datasource | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/libs/tinybird/datasources/versions.datasource b/services/libs/tinybird/datasources/versions.datasource index 32b668b0da..08f750e1be 100644 --- a/services/libs/tinybird/datasources/versions.datasource +++ b/services/libs/tinybird/datasources/versions.datasource @@ -24,7 +24,7 @@ SCHEMA > `namespace` String `json:$.record.namespace` DEFAULT '', `name` String `json:$.record.name`, `number` String `json:$.record.number`, - `publishedAt` DateTime64(3) `json:$.record.published_at` DEFAULT '', + `publishedAt` Nullable(DateTime64(3)) `json:$.record.published_at`, `isLatest` UInt8 `json:$.record.is_latest` DEFAULT 0, `isYanked` UInt8 `json:$.record.is_yanked` DEFAULT 0, `isPrerelease` UInt8 `json:$.record.is_prerelease` DEFAULT 0, From 9de69dbda423d083f2806850984c0b75e2256b8b Mon Sep 17 00:00:00 2001 From: Joana Maia Date: Tue, 9 Jun 2026 11:48:20 +0100 Subject: [PATCH 06/19] fix: schema Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> Signed-off-by: Joana Maia --- services/libs/tinybird/datasources/packages.datasource | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/libs/tinybird/datasources/packages.datasource b/services/libs/tinybird/datasources/packages.datasource index 4831082020..4ccbcefa18 100644 --- a/services/libs/tinybird/datasources/packages.datasource +++ b/services/libs/tinybird/datasources/packages.datasource @@ -60,7 +60,7 @@ SCHEMA > `hasCriticalVulnerability` UInt8 `json:$.record.has_critical_vulnerability` DEFAULT 0, `impact` Float32 `json:$.record.impact` DEFAULT 0, `isCritical` UInt8 `json:$.record.is_critical` DEFAULT 0, - `lastRankPassAt` DateTime64(3) `json:$.record.last_rank_pass_at` DEFAULT '', + `lastRankPassAt` Nullable(DateTime64(3)) `json:$.record.last_rank_pass_at`, `ingestionSource` String `json:$.record.ingestion_source` DEFAULT '', `lastSyncedAt` 
DateTime64(3) `json:$.record.last_synced_at`, `createdAt` DateTime64(3) `json:$.record.created_at` From 853eba2e197c4590d0cc97d1b42ba9fd5d53a1ea Mon Sep 17 00:00:00 2001 From: Joana Maia Date: Tue, 9 Jun 2026 11:48:36 +0100 Subject: [PATCH 07/19] fix: schema Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> Signed-off-by: Joana Maia --- services/libs/tinybird/datasources/packages.datasource | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/services/libs/tinybird/datasources/packages.datasource b/services/libs/tinybird/datasources/packages.datasource index 4ccbcefa18..3937f3b57e 100644 --- a/services/libs/tinybird/datasources/packages.datasource +++ b/services/libs/tinybird/datasources/packages.datasource @@ -52,8 +52,8 @@ SCHEMA > `distTagsBeta` String `json:$.record.dist_tags_beta` DEFAULT '', `versionsCount` UInt32 `json:$.record.versions_count` DEFAULT 0, `latestVersion` String `json:$.record.latest_version` DEFAULT '', - `firstReleaseAt` DateTime64(3) `json:$.record.first_release_at` DEFAULT '', - `latestReleaseAt` DateTime64(3) `json:$.record.latest_release_at` DEFAULT '', + `firstReleaseAt` Nullable(DateTime64(3)) `json:$.record.first_release_at`, + `latestReleaseAt` Nullable(DateTime64(3)) `json:$.record.latest_release_at`, `dependentCount` UInt32 `json:$.record.dependent_count` DEFAULT 0, `transitiveDependentCount` UInt64 `json:$.record.transitive_dependent_count` DEFAULT 0, `dependentReposCount` UInt32 `json:$.record.dependent_repos_count` DEFAULT 0, From bec85152e1553b9117e6f3bae01b5923d29227b8 Mon Sep 17 00:00:00 2001 From: Joana Maia Date: Tue, 9 Jun 2026 11:51:39 +0100 Subject: [PATCH 08/19] fix: schema Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> Signed-off-by: Joana Maia --- services/libs/tinybird/datasources/advisories.datasource | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/services/libs/tinybird/datasources/advisories.datasource b/services/libs/tinybird/datasources/advisories.datasource index 6fd114176a..931b804025 100644 --- a/services/libs/tinybird/datasources/advisories.datasource +++ b/services/libs/tinybird/datasources/advisories.datasource @@ -29,8 +29,8 @@ SCHEMA > `isCritical` UInt8 `json:$.record.is_critical` DEFAULT 0, `summary` String `json:$.record.summary` DEFAULT '', `details` String `json:$.record.details` DEFAULT '', - `publishedAt` DateTime64(3) `json:$.record.published_at` DEFAULT '', - `modifiedAt` DateTime64(3) `json:$.record.modified_at` DEFAULT '', + `publishedAt` Nullable(DateTime64(3)) `json:$.record.published_at`, + `modifiedAt` Nullable(DateTime64(3)) `json:$.record.modified_at`, `createdAt` DateTime64(3) `json:$.record.created_at`, `updatedAt` DateTime64(3) `json:$.record.updated_at` From ea9a7b7540df892bc735ccd0631207910857a110 Mon Sep 17 00:00:00 2001 From: Joana Maia Date: Tue, 9 Jun 2026 11:52:46 +0100 Subject: [PATCH 09/19] fix: description Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> Signed-off-by: Joana Maia --- services/libs/tinybird/datasources/packages.datasource | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/libs/tinybird/datasources/packages.datasource b/services/libs/tinybird/datasources/packages.datasource index 3937f3b57e..4c8451a5b7 100644 --- a/services/libs/tinybird/datasources/packages.datasource +++ b/services/libs/tinybird/datasources/packages.datasource @@ -27,7 +27,7 @@ DESCRIPTION > - `hasCriticalVulnerability` is 1 if the latest version has an active critical advisory (CVSS >= 7.0), 0 otherwise. - `impact` is the composite criticality impact score from the ranking function (0 if not yet ranked). - `isCritical` is 1 if this package ranks in the top N by ecosystem in the most recent ranking pass, 0 otherwise. 
- - `lastRankPassAt` is when the criticality ranking worker last processed this package (empty string if never ranked). + - `lastRankPassAt` is when the criticality ranking worker last processed this package (NULL if never ranked). - `ingestionSource` identifies which worker last wrote the row: 'npm-registry', 'deps_dev', etc. (empty string if unknown). - `lastSyncedAt` is when the row was last written by any worker — serves as the updated_at watermark for sync. - `createdAt` is the row-insert timestamp — set once on first insert, never updated. From 896b17417f6a30c35c28abbcce0a523210fa5c96 Mon Sep 17 00:00:00 2001 From: Joana Maia Date: Tue, 9 Jun 2026 11:53:14 +0100 Subject: [PATCH 10/19] fix: schema Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> Signed-off-by: Joana Maia --- services/libs/tinybird/datasources/repos.datasource | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/services/libs/tinybird/datasources/repos.datasource b/services/libs/tinybird/datasources/repos.datasource index 6a753dd3b3..707cceb689 100644 --- a/services/libs/tinybird/datasources/repos.datasource +++ b/services/libs/tinybird/datasources/repos.datasource @@ -49,11 +49,12 @@ SCHEMA > `rawProjectType` String `json:$.record.raw_project_type` DEFAULT '', `rawProjectName` String `json:$.record.raw_project_name` DEFAULT '', `scorecardScore` Float32 `json:$.record.scorecard_score` DEFAULT 0, - `scorecardLastRunAt` DateTime64(3) `json:$.record.scorecard_last_run_at` DEFAULT '', + `scorecardLastRunAt` Nullable(DateTime64(3)) `json:$.record.scorecard_last_run_at`, `skipEnrichment` UInt8 `json:$.record.skip_enrichment` DEFAULT 0, - `lastSyncedAt` DateTime64(3) `json:$.record.last_synced_at` + `lastSyncedAt` Nullable(DateTime64(3)) `json:$.record.last_synced_at`, + `updatedAt` DateTime64(3) `json:$.record.updated_at` ENGINE ReplacingMergeTree -ENGINE_PARTITION_KEY toYear(lastSyncedAt) +ENGINE_PARTITION_KEY toYear(updatedAt) 
ENGINE_SORTING_KEY id -ENGINE_VER lastSyncedAt +ENGINE_VER updatedAt From 050ba768bf4397c7327fed4242d704f5791e286a Mon Sep 17 00:00:00 2001 From: Joana Maia Date: Tue, 9 Jun 2026 11:53:25 +0100 Subject: [PATCH 11/19] fix: schema Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> Signed-off-by: Joana Maia --- services/libs/tinybird/datasources/repos.datasource | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/libs/tinybird/datasources/repos.datasource b/services/libs/tinybird/datasources/repos.datasource index 707cceb689..4e5ce49c56 100644 --- a/services/libs/tinybird/datasources/repos.datasource +++ b/services/libs/tinybird/datasources/repos.datasource @@ -40,7 +40,7 @@ SCHEMA > `forks` Int32 `json:$.record.forks` DEFAULT 0, `watchers` Int32 `json:$.record.watchers` DEFAULT 0, `openIssues` Int32 `json:$.record.open_issues` DEFAULT 0, - `lastCommitAt` DateTime64(3) `json:$.record.last_commit_at` DEFAULT '', + `lastCommitAt` Nullable(DateTime64(3)) `json:$.record.last_commit_at`, `archived` UInt8 `json:$.record.archived` DEFAULT 0, `disabled` UInt8 `json:$.record.disabled` DEFAULT 0, `isFork` UInt8 `json:$.record.is_fork` DEFAULT 0, From c8299b6145c1c347b23bf5909ab36fe0e0ed9161 Mon Sep 17 00:00:00 2001 From: Joana Maia Date: Tue, 9 Jun 2026 11:53:41 +0100 Subject: [PATCH 12/19] fix: description Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> Signed-off-by: Joana Maia --- services/libs/tinybird/datasources/advisories.datasource | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/services/libs/tinybird/datasources/advisories.datasource b/services/libs/tinybird/datasources/advisories.datasource index 931b804025..c30d1467af 100644 --- a/services/libs/tinybird/datasources/advisories.datasource +++ b/services/libs/tinybird/datasources/advisories.datasource @@ -13,8 +13,8 @@ DESCRIPTION > - `isCritical` is 1 when cvss >= 7.0 (HIGH or CRITICAL), 0 
otherwise — computed from the score. - `summary` is a short human-readable description of the vulnerability (empty string if not provided). - `details` is the full advisory text (empty string if not provided). - - `publishedAt` is when the advisory was first published upstream (empty string if unknown). - - `modifiedAt` is when the advisory was last modified upstream; NULL for BQ-sourced rows (empty string if unknown). + - `publishedAt` is when the advisory was first published upstream (NULL if unknown). + - `modifiedAt` is when the advisory was last modified upstream; NULL for BQ-sourced rows. - `createdAt` and `updatedAt` are row-level audit timestamps for Tinybird watermark-based sync. SCHEMA > From b5aa57945506611a4d13a2d86baa908fceccafe7 Mon Sep 17 00:00:00 2001 From: Joana Maia Date: Tue, 9 Jun 2026 11:53:59 +0100 Subject: [PATCH 13/19] fix: schema Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> Signed-off-by: Joana Maia --- services/libs/tinybird/datasources/repos.datasource | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/libs/tinybird/datasources/repos.datasource b/services/libs/tinybird/datasources/repos.datasource index 4e5ce49c56..8218767d43 100644 --- a/services/libs/tinybird/datasources/repos.datasource +++ b/services/libs/tinybird/datasources/repos.datasource @@ -44,7 +44,7 @@ SCHEMA > `archived` UInt8 `json:$.record.archived` DEFAULT 0, `disabled` UInt8 `json:$.record.disabled` DEFAULT 0, `isFork` UInt8 `json:$.record.is_fork` DEFAULT 0, - `createdAt` DateTime64(3) `json:$.record.created_at` DEFAULT '', + `createdAt` Nullable(DateTime64(3)) `json:$.record.created_at`, `homepage` String `json:$.record.homepage` DEFAULT '', `rawProjectType` String `json:$.record.raw_project_type` DEFAULT '', `rawProjectName` String `json:$.record.raw_project_name` DEFAULT '', From cc1be8ba2f6c5fe302fff1694abbb4bfe79ab199 Mon Sep 17 00:00:00 2001 From: Joana Maia Date: Tue, 9 Jun 2026 11:54:12 +0100 
Subject: [PATCH 14/19] fix: description Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> Signed-off-by: Joana Maia --- services/libs/tinybird/datasources/repos.datasource | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/services/libs/tinybird/datasources/repos.datasource b/services/libs/tinybird/datasources/repos.datasource index 8218767d43..6a41b4c2d6 100644 --- a/services/libs/tinybird/datasources/repos.datasource +++ b/services/libs/tinybird/datasources/repos.datasource @@ -14,7 +14,7 @@ DESCRIPTION > - `forks` is the fork count (0 if not yet enriched). - `watchers` is the watcher count (0 if not yet enriched). - `openIssues` is the number of open issues (0 if not yet enriched). - - `lastCommitAt` is the timestamp of the most recent commit (empty string if unknown). + - `lastCommitAt` is the timestamp of the most recent commit (NULL if unknown). - `archived` is 1 if the repository is archived, 0 otherwise (0 default until enriched). - `disabled` is 1 if the repository is disabled, 0 otherwise (0 default until enriched). - `isFork` is 1 if this repository is a fork of another, 0 otherwise (0 default until enriched). @@ -23,9 +23,10 @@ DESCRIPTION > - `rawProjectType` is the deps.dev project type string (e.g. 'GITHUB', 'GITLAB') for identity resolution. - `rawProjectName` is the deps.dev project name (e.g. 'github.com/owner/repo') for identity resolution. - `scorecardScore` is the OpenSSF Scorecard aggregate score (0 if not yet scored). - - `scorecardLastRunAt` is when the Scorecard was last evaluated (empty string if never run). + - `scorecardLastRunAt` is when the Scorecard was last evaluated (NULL if never run). - `skipEnrichment` is 1 if the GitHub enricher should skip this repo (e.g. invalid/transient URLs), 0 otherwise. - - `lastSyncedAt` is when the row was last written by any worker — serves as the updated_at watermark for sync. 
+ - `lastSyncedAt` is the timestamp of the last GitHub enrichment attempt (NULL if never enriched or transient failure). + - `updatedAt` is the row-level audit timestamp used as the Tinybird watermark for sync. SCHEMA > `id` UInt64 `json:$.record.id`, From bd550edfd64d51ac10bdfef3bb7f71c6bf4fef25 Mon Sep 17 00:00:00 2001 From: Joana Maia Date: Tue, 9 Jun 2026 11:54:23 +0100 Subject: [PATCH 15/19] fix: description Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> Signed-off-by: Joana Maia --- services/libs/tinybird/datasources/packages.datasource | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/services/libs/tinybird/datasources/packages.datasource b/services/libs/tinybird/datasources/packages.datasource index 4c8451a5b7..a23db40c97 100644 --- a/services/libs/tinybird/datasources/packages.datasource +++ b/services/libs/tinybird/datasources/packages.datasource @@ -19,8 +19,8 @@ DESCRIPTION > - `distTagsLatest`, `distTagsNext`, `distTagsBeta` are npm dist-tag values; empty string for non-npm packages. - `versionsCount` is the total number of published versions (0 if unknown). - `latestVersion` is the most recent stable version string (empty string if unknown). - - `firstReleaseAt` is the timestamp of the earliest published version (empty string if unknown). - - `latestReleaseAt` is the timestamp of the most recent published version (empty string if unknown). + - `firstReleaseAt` is the timestamp of the earliest published version (NULL if unknown). + - `latestReleaseAt` is the timestamp of the most recent published version (NULL if unknown). - `dependentCount` is the number of packages that directly depend on this package (0 if unknown). - `transitiveDependentCount` is the number of packages that transitively depend on this package (0 if unknown). - `dependentReposCount` is the number of repositories that depend on this package (0 if unknown). 
From bc0f92e1b73ea59d2c001aceaa1a28b194ffd20b Mon Sep 17 00:00:00 2001 From: Joana Maia Date: Tue, 9 Jun 2026 11:54:31 +0100 Subject: [PATCH 16/19] fix: description Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> Signed-off-by: Joana Maia --- services/libs/tinybird/datasources/versions.datasource | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/libs/tinybird/datasources/versions.datasource b/services/libs/tinybird/datasources/versions.datasource index 08f750e1be..f2a533a4ff 100644 --- a/services/libs/tinybird/datasources/versions.datasource +++ b/services/libs/tinybird/datasources/versions.datasource @@ -8,7 +8,7 @@ DESCRIPTION > - `namespace` is the package scope or group; empty string if not applicable. - `name` is the package name — denormalized from packages for fast dependency resolution without a join. - `number` is the version string (e.g. '1.2.3', '2.0.0-beta.1'). - - `publishedAt` is when this version was published to the registry (empty string if unknown). + - `publishedAt` is when this version was published to the registry (NULL if unknown). - `isLatest` is 1 if this is the current latest stable version according to the registry, 0 otherwise (0 default until enriched). - `isYanked` is 1 if this version was retracted/yanked from the registry, 0 otherwise (0 default until enriched). - `isPrerelease` is 1 if the version string contains a pre-release identifier (alpha, beta, rc, etc.), 0 otherwise. 
From 5ed5eab9232144671757399005b96aeb5234dc48 Mon Sep 17 00:00:00 2001 From: Joana Maia Date: Tue, 9 Jun 2026 14:10:44 +0100 Subject: [PATCH 17/19] chore: add sequin publication for packages-db tables Signed-off-by: Joana Maia --- ...1781009234__createPublicationForSequin.sql | 52 +++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 backend/src/osspckgs/migrations/V1781009234__createPublicationForSequin.sql diff --git a/backend/src/osspckgs/migrations/V1781009234__createPublicationForSequin.sql b/backend/src/osspckgs/migrations/V1781009234__createPublicationForSequin.sql new file mode 100644 index 0000000000..2e9e86de23 --- /dev/null +++ b/backend/src/osspckgs/migrations/V1781009234__createPublicationForSequin.sql @@ -0,0 +1,52 @@ +DO $$ +BEGIN + IF NOT EXISTS ( + SELECT 1 FROM pg_publication WHERE pubname = 'sequin_pub' + ) THEN + CREATE PUBLICATION sequin_pub + FOR TABLE + packages, + versions, + package_dependencies, + package_maintainers, + package_repos, + maintainers, + repos, + repo_scorecard_checks, + advisories, + advisory_packages, + advisory_affected_ranges + WITH (publish_via_partition_root = true); + END IF; +END$$; + +ALTER TABLE public.packages REPLICA IDENTITY FULL; +ALTER TABLE public.versions REPLICA IDENTITY FULL; +ALTER TABLE public.package_dependencies REPLICA IDENTITY FULL; +ALTER TABLE public.package_maintainers REPLICA IDENTITY FULL; +ALTER TABLE public.package_repos REPLICA IDENTITY FULL; +ALTER TABLE public.maintainers REPLICA IDENTITY FULL; +ALTER TABLE public.repos REPLICA IDENTITY FULL; +ALTER TABLE public.repo_scorecard_checks REPLICA IDENTITY FULL; +ALTER TABLE public.advisories REPLICA IDENTITY FULL; +ALTER TABLE public.advisory_packages REPLICA IDENTITY FULL; +ALTER TABLE public.advisory_affected_ranges REPLICA IDENTITY FULL; + +-- versions (32) and package_dependencies (64) are hash-partitioned. REPLICA +-- IDENTITY on the partitioned root does not cascade; set it on every leaf. 
+DO $$ +DECLARE + parent_table text; + partition_oid regclass; +BEGIN + FOREACH parent_table IN ARRAY ARRAY['public.versions', 'public.package_dependencies'] + LOOP + FOR partition_oid IN + SELECT inhrelid::regclass + FROM pg_inherits + WHERE inhparent = parent_table::regclass + LOOP + EXECUTE format('ALTER TABLE %s REPLICA IDENTITY FULL', partition_oid); + END LOOP; + END LOOP; +END$$; From e4bbce7a8d3986372fed0ef82c852ac1073632a5 Mon Sep 17 00:00:00 2001 From: Joana Maia Date: Tue, 9 Jun 2026 14:38:40 +0100 Subject: [PATCH 18/19] fix: align packages-db writers with Tinybird ENGINE_VER semantics Signed-off-by: Joana Maia --- ...00__rank_packages_bumps_last_synced_at.sql | 97 +++++++++++++++++++ .../src/deps-dev/workflows/ingestRepos.ts | 6 +- .../data-access-layer/src/osspckgs/repos.ts | 15 +-- .../data-access-layer/src/packages/osv.ts | 4 +- .../tinybird/datasources/repos.datasource | 12 +-- .../tinybird/datasources/versions.datasource | 8 +- 6 files changed, 121 insertions(+), 21 deletions(-) create mode 100644 backend/src/osspckgs/migrations/V1781100000__rank_packages_bumps_last_synced_at.sql diff --git a/backend/src/osspckgs/migrations/V1781100000__rank_packages_bumps_last_synced_at.sql b/backend/src/osspckgs/migrations/V1781100000__rank_packages_bumps_last_synced_at.sql new file mode 100644 index 0000000000..33673e12a5 --- /dev/null +++ b/backend/src/osspckgs/migrations/V1781100000__rank_packages_bumps_last_synced_at.sql @@ -0,0 +1,97 @@ +-- rank_packages() now bumps last_synced_at on every UPDATE that touches a +-- DS-exported field (impact, is_critical, last_rank_pass_at). last_synced_at +-- is the Tinybird ENGINE_VER for the packages datasource; without this bump, +-- ReplacingMergeTree may keep an older row when criticality changes without +-- any other write path touching the package row. 
+ +CREATE OR REPLACE FUNCTION rank_packages( + weight_downloads numeric DEFAULT 0.25, + weight_dependent_packages numeric DEFAULT 0.25, + weight_transitive numeric DEFAULT 0.50, + critical_top_n_by_ecosystem jsonb DEFAULT '{"npm":400000,"go":100000,"maven":200000,"pypi":100000,"nuget":50000,"cargo":75000}'::jsonb +) +RETURNS TABLE(scored_rows int, ranked_rows int) +LANGUAGE plpgsql AS $$ +DECLARE + n_scored int; + n_ranked int; +BEGIN + -- Step 1: score + WITH percentile_scores AS ( + SELECT + id, + ( + weight_downloads * PERCENT_RANK() OVER ( + PARTITION BY ecosystem ORDER BY LOG(1 + COALESCE(downloads_last_30d, 0))) + + + weight_dependent_packages * PERCENT_RANK() OVER ( + PARTITION BY ecosystem ORDER BY LOG(1 + COALESCE(dependent_count, 0))) + + + weight_transitive * PERCENT_RANK() OVER ( + PARTITION BY ecosystem ORDER BY LOG(1 + COALESCE(transitive_dependent_count, 0))) + )::numeric(10, 4) AS new_impact + FROM packages + WHERE ecosystem IN (SELECT jsonb_object_keys(critical_top_n_by_ecosystem)) + ) + UPDATE packages p + SET impact = ps.new_impact, + last_synced_at = NOW() + FROM percentile_scores ps + WHERE p.id = ps.id + AND p.impact IS DISTINCT FROM ps.new_impact; + + GET DIAGNOSTICS n_scored = ROW_COUNT; + + -- Step 2: rank + flag + WITH ranked AS ( + SELECT + id, ecosystem, + ROW_NUMBER() OVER ( + PARTITION BY ecosystem + ORDER BY impact DESC NULLS LAST, id + ) AS r + FROM packages + WHERE purl IS NOT NULL + AND ecosystem IN (SELECT jsonb_object_keys(critical_top_n_by_ecosystem)) + ), + flagged AS ( + SELECT + id, r, + COALESCE( + r <= (critical_top_n_by_ecosystem ->> ecosystem)::int, + FALSE + ) AS new_is_critical + FROM ranked + ) + UPDATE packages p + SET rank_in_ecosystem = f.r, + is_critical = f.new_is_critical, + last_synced_at = NOW() + FROM flagged f + WHERE p.id = f.id + AND ( + p.rank_in_ecosystem IS DISTINCT FROM f.r + OR p.is_critical IS DISTINCT FROM f.new_is_critical + ); + + GET DIAGNOSTICS n_ranked = ROW_COUNT; + + -- Step 2.5: spotlight 
overrides + UPDATE packages p + SET is_critical = TRUE, + last_synced_at = NOW() + FROM package_criticality_spotlight s + WHERE p.ecosystem = s.ecosystem + AND (p.namespace IS NOT DISTINCT FROM s.namespace) + AND p.name = s.name + AND p.is_critical = FALSE; + + -- Step 3: stamp last_rank_pass_at unconditionally + UPDATE packages + SET last_rank_pass_at = NOW(), + last_synced_at = NOW() + WHERE ecosystem IN (SELECT jsonb_object_keys(critical_top_n_by_ecosystem)); + + RETURN QUERY SELECT n_scored, n_ranked; +END; +$$; diff --git a/services/apps/packages_worker/src/deps-dev/workflows/ingestRepos.ts b/services/apps/packages_worker/src/deps-dev/workflows/ingestRepos.ts index b6cc6b2063..c5a8cb7a6e 100644 --- a/services/apps/packages_worker/src/deps-dev/workflows/ingestRepos.ts +++ b/services/apps/packages_worker/src/deps-dev/workflows/ingestRepos.ts @@ -58,11 +58,13 @@ const REPOS_PG_COLUMNS = [ 'open_issues', ] +// last_synced_at is intentionally left NULL on seed — it is owned by the GitHub +// enricher as its freshness signal. created_at / updated_at use their column defaults. const REPOS_MERGE_SQL = ` INSERT INTO repos (url, raw_project_type, raw_project_name, host, owner, name, - description, homepage, stars, forks, open_issues, last_synced_at) + description, homepage, stars, forks, open_issues) SELECT s.canonical_url, s.raw_project_type, s.raw_project_name, s.host, s.owner, s.name, - s.description, s.homepage, s.stars, s.forks, s.open_issues, NOW() + s.description, s.homepage, s.stars, s.forks, s.open_issues FROM staging.osspckgs_repos_raw s ON CONFLICT (url) DO NOTHING ` diff --git a/services/libs/data-access-layer/src/osspckgs/repos.ts b/services/libs/data-access-layer/src/osspckgs/repos.ts index ab58036f72..6c67f0e569 100644 --- a/services/libs/data-access-layer/src/osspckgs/repos.ts +++ b/services/libs/data-access-layer/src/osspckgs/repos.ts @@ -15,19 +15,20 @@ export async function findRepoIdsByUrl( * Inserts or updates a repo row keyed on url. 
* Uses COALESCE so richer data from other enrichers (GitHub, deps.dev) is never * overwritten with nulls from a partial write. + * `last_synced_at` is intentionally NOT touched here — that column is owned by the + * GitHub enricher as its freshness signal. Maven discovery only stamps updated_at. * Returns the repo id. */ export async function upsertRepo(qx: QueryExecutor, item: IDbRepoUpsert): Promise { const row = await qx.selectOne( ` - INSERT INTO repos (url, host, owner, name, last_synced_at, updated_at) - VALUES ($(url), $(host), $(owner), $(name), NOW(), NOW()) + INSERT INTO repos (url, host, owner, name, updated_at) + VALUES ($(url), $(host), $(owner), $(name), NOW()) ON CONFLICT (url) DO UPDATE SET - host = COALESCE(EXCLUDED.host, repos.host), - owner = COALESCE(EXCLUDED.owner, repos.owner), - name = COALESCE(EXCLUDED.name, repos.name), - last_synced_at = NOW(), - updated_at = NOW() + host = COALESCE(EXCLUDED.host, repos.host), + owner = COALESCE(EXCLUDED.owner, repos.owner), + name = COALESCE(EXCLUDED.name, repos.name), + updated_at = NOW() RETURNING id `, item, diff --git a/services/libs/data-access-layer/src/packages/osv.ts b/services/libs/data-access-layer/src/packages/osv.ts index 9f7c3af33f..8ce9c45b84 100644 --- a/services/libs/data-access-layer/src/packages/osv.ts +++ b/services/libs/data-access-layer/src/packages/osv.ts @@ -243,7 +243,7 @@ export async function getRangesForPackages(qx: QueryExecutor, ids: number[]): Pr export async function flipVulnerableFlags(qx: QueryExecutor, ids: number[]): Promise { if (ids.length === 0) return 0 return qx.result( - `UPDATE packages SET has_critical_vulnerability = TRUE + `UPDATE packages SET has_critical_vulnerability = TRUE, last_synced_at = NOW() WHERE id IN ($(ids:csv)) AND has_critical_vulnerability = FALSE`, { ids }, ) @@ -252,7 +252,7 @@ export async function flipVulnerableFlags(qx: QueryExecutor, ids: number[]): Pro export async function clearSafeFlags(qx: QueryExecutor, ids: number[]): Promise { if 
(ids.length === 0) return 0 return qx.result( - `UPDATE packages SET has_critical_vulnerability = FALSE + `UPDATE packages SET has_critical_vulnerability = FALSE, last_synced_at = NOW() WHERE id IN ($(ids:csv)) AND has_critical_vulnerability = TRUE`, { ids }, ) diff --git a/services/libs/tinybird/datasources/repos.datasource b/services/libs/tinybird/datasources/repos.datasource index 6a41b4c2d6..0d1d1419e8 100644 --- a/services/libs/tinybird/datasources/repos.datasource +++ b/services/libs/tinybird/datasources/repos.datasource @@ -15,9 +15,9 @@ DESCRIPTION > - `watchers` is the watcher count (0 if not yet enriched). - `openIssues` is the number of open issues (0 if not yet enriched). - `lastCommitAt` is the timestamp of the most recent commit (NULL if unknown). - - `archived` is 1 if the repository is archived, 0 otherwise (0 default until enriched). - - `disabled` is 1 if the repository is disabled, 0 otherwise (0 default until enriched). - - `isFork` is 1 if this repository is a fork of another, 0 otherwise (0 default until enriched). + - `archived` is 1 if the repository is archived, 0 if not, NULL until the GitHub enricher runs (deps.dev seed does not expose this). + - `disabled` is 1 if the repository is disabled, 0 if not, NULL until the GitHub enricher runs. + - `isFork` is 1 if this repository is a fork of another, 0 if not, NULL until the GitHub enricher runs. - `createdAt` is the repository creation date on GitHub/GitLab — a domain timestamp, not a row-insert timestamp. - `homepage` is the project homepage URL (empty string if not provided). - `rawProjectType` is the deps.dev project type string (e.g. 'GITHUB', 'GITLAB') for identity resolution. 
@@ -42,9 +42,9 @@ SCHEMA > `watchers` Int32 `json:$.record.watchers` DEFAULT 0, `openIssues` Int32 `json:$.record.open_issues` DEFAULT 0, `lastCommitAt` Nullable(DateTime64(3)) `json:$.record.last_commit_at`, - `archived` UInt8 `json:$.record.archived` DEFAULT 0, - `disabled` UInt8 `json:$.record.disabled` DEFAULT 0, - `isFork` UInt8 `json:$.record.is_fork` DEFAULT 0, + `archived` Nullable(UInt8) `json:$.record.archived`, + `disabled` Nullable(UInt8) `json:$.record.disabled`, + `isFork` Nullable(UInt8) `json:$.record.is_fork`, `createdAt` Nullable(DateTime64(3)) `json:$.record.created_at`, `homepage` String `json:$.record.homepage` DEFAULT '', `rawProjectType` String `json:$.record.raw_project_type` DEFAULT '', diff --git a/services/libs/tinybird/datasources/versions.datasource b/services/libs/tinybird/datasources/versions.datasource index f2a533a4ff..4b3b620791 100644 --- a/services/libs/tinybird/datasources/versions.datasource +++ b/services/libs/tinybird/datasources/versions.datasource @@ -9,8 +9,8 @@ DESCRIPTION > - `name` is the package name — denormalized from packages for fast dependency resolution without a join. - `number` is the version string (e.g. '1.2.3', '2.0.0-beta.1'). - `publishedAt` is when this version was published to the registry (NULL if unknown). - - `isLatest` is 1 if this is the current latest stable version according to the registry, 0 otherwise (0 default until enriched). - - `isYanked` is 1 if this version was retracted/yanked from the registry, 0 otherwise (0 default until enriched). + - `isLatest` is 1 if this is the current latest stable version, 0 if not, NULL until registry workers enrich (deps.dev seed does not expose this). + - `isYanked` is 1 if this version was retracted/yanked from the registry, 0 if not, NULL until registry workers enrich. - `isPrerelease` is 1 if the version string contains a pre-release identifier (alpha, beta, rc, etc.), 0 otherwise. 
- `licenses` is an array of SPDX-normalized license identifiers for this specific version (may differ from the package-level value). - `downloadCount` is the per-version download count where available (npm, crates); 0 if not tracked. @@ -25,8 +25,8 @@ SCHEMA > `name` String `json:$.record.name`, `number` String `json:$.record.number`, `publishedAt` Nullable(DateTime64(3)) `json:$.record.published_at`, - `isLatest` UInt8 `json:$.record.is_latest` DEFAULT 0, - `isYanked` UInt8 `json:$.record.is_yanked` DEFAULT 0, + `isLatest` Nullable(UInt8) `json:$.record.is_latest`, + `isYanked` Nullable(UInt8) `json:$.record.is_yanked`, `isPrerelease` UInt8 `json:$.record.is_prerelease` DEFAULT 0, `licenses` Array(String) `json:$.record.licenses[:]` DEFAULT [], `downloadCount` UInt64 `json:$.record.download_count` DEFAULT 0, From 49aa2ff53c8ba859124eb46b493c65c63db108ab Mon Sep 17 00:00:00 2001 From: Joana Maia Date: Tue, 9 Jun 2026 14:56:29 +0100 Subject: [PATCH 19/19] chore: combine sequin publication and rank_packages migrations Signed-off-by: Joana Maia --- ...1781009234__createPublicationForSequin.sql | 52 ------ ...uin_publication_and_rank_packages_sync.sql | 165 ++++++++++++++++++ ...00__rank_packages_bumps_last_synced_at.sql | 97 ---------- 3 files changed, 165 insertions(+), 149 deletions(-) delete mode 100644 backend/src/osspckgs/migrations/V1781009234__createPublicationForSequin.sql create mode 100644 backend/src/osspckgs/migrations/V1781009234__sequin_publication_and_rank_packages_sync.sql delete mode 100644 backend/src/osspckgs/migrations/V1781100000__rank_packages_bumps_last_synced_at.sql diff --git a/backend/src/osspckgs/migrations/V1781009234__createPublicationForSequin.sql b/backend/src/osspckgs/migrations/V1781009234__createPublicationForSequin.sql deleted file mode 100644 index 2e9e86de23..0000000000 --- a/backend/src/osspckgs/migrations/V1781009234__createPublicationForSequin.sql +++ /dev/null @@ -1,52 +0,0 @@ -DO $$ -BEGIN - IF NOT EXISTS ( - SELECT 1 FROM 
pg_publication WHERE pubname = 'sequin_pub' - ) THEN - CREATE PUBLICATION sequin_pub - FOR TABLE - packages, - versions, - package_dependencies, - package_maintainers, - package_repos, - maintainers, - repos, - repo_scorecard_checks, - advisories, - advisory_packages, - advisory_affected_ranges - WITH (publish_via_partition_root = true); - END IF; -END$$; - -ALTER TABLE public.packages REPLICA IDENTITY FULL; -ALTER TABLE public.versions REPLICA IDENTITY FULL; -ALTER TABLE public.package_dependencies REPLICA IDENTITY FULL; -ALTER TABLE public.package_maintainers REPLICA IDENTITY FULL; -ALTER TABLE public.package_repos REPLICA IDENTITY FULL; -ALTER TABLE public.maintainers REPLICA IDENTITY FULL; -ALTER TABLE public.repos REPLICA IDENTITY FULL; -ALTER TABLE public.repo_scorecard_checks REPLICA IDENTITY FULL; -ALTER TABLE public.advisories REPLICA IDENTITY FULL; -ALTER TABLE public.advisory_packages REPLICA IDENTITY FULL; -ALTER TABLE public.advisory_affected_ranges REPLICA IDENTITY FULL; - --- versions (32) and package_dependencies (64) are hash-partitioned. REPLICA --- IDENTITY on the partitioned root does not cascade; set it on every leaf. -DO $$ -DECLARE - parent_table text; - partition_oid regclass; -BEGIN - FOREACH parent_table IN ARRAY ARRAY['public.versions', 'public.package_dependencies'] - LOOP - FOR partition_oid IN - SELECT inhrelid::regclass - FROM pg_inherits - WHERE inhparent = parent_table::regclass - LOOP - EXECUTE format('ALTER TABLE %s REPLICA IDENTITY FULL', partition_oid); - END LOOP; - END LOOP; -END$$; diff --git a/backend/src/osspckgs/migrations/V1781009234__sequin_publication_and_rank_packages_sync.sql b/backend/src/osspckgs/migrations/V1781009234__sequin_publication_and_rank_packages_sync.sql new file mode 100644 index 0000000000..c6bb0c0c91 --- /dev/null +++ b/backend/src/osspckgs/migrations/V1781009234__sequin_publication_and_rank_packages_sync.sql @@ -0,0 +1,165 @@ +-- Wire packages-db into the Sequin → Kafka → Tinybird pipeline. 
+-- +-- Two related changes bundled here because both serve the same goal — making +-- packages-db row changes replicate cleanly into Tinybird: +-- +-- 1. Publication + REPLICA IDENTITY FULL on the 11 tables the Tinybird +-- datasources read from. publish_via_partition_root collapses the +-- versions (32) / package_dependencies (64) partition leaves into a +-- single logical topic each. REPLICA IDENTITY on a partitioned root +-- does not cascade, so every leaf is set explicitly via pg_inherits. +-- +-- 2. rank_packages() bumps last_synced_at on every UPDATE that touches a +-- DS-exported field (impact, is_critical, last_rank_pass_at). +-- last_synced_at is the Tinybird ENGINE_VER for the packages datasource; +-- without this bump, ReplacingMergeTree may keep an older row when +-- criticality changes without any other write path touching the row. + +-- ─── 1. Sequin publication ────────────────────────────────────────────────── + +DO $$ +BEGIN + IF NOT EXISTS ( + SELECT 1 FROM pg_publication WHERE pubname = 'sequin_pub' + ) THEN + CREATE PUBLICATION sequin_pub + FOR TABLE + packages, + versions, + package_dependencies, + package_maintainers, + package_repos, + maintainers, + repos, + repo_scorecard_checks, + advisories, + advisory_packages, + advisory_affected_ranges + WITH (publish_via_partition_root = true); + END IF; +END$$; + +ALTER TABLE public.packages REPLICA IDENTITY FULL; +ALTER TABLE public.versions REPLICA IDENTITY FULL; +ALTER TABLE public.package_dependencies REPLICA IDENTITY FULL; +ALTER TABLE public.package_maintainers REPLICA IDENTITY FULL; +ALTER TABLE public.package_repos REPLICA IDENTITY FULL; +ALTER TABLE public.maintainers REPLICA IDENTITY FULL; +ALTER TABLE public.repos REPLICA IDENTITY FULL; +ALTER TABLE public.repo_scorecard_checks REPLICA IDENTITY FULL; +ALTER TABLE public.advisories REPLICA IDENTITY FULL; +ALTER TABLE public.advisory_packages REPLICA IDENTITY FULL; +ALTER TABLE public.advisory_affected_ranges REPLICA IDENTITY FULL; + +-- 
versions (32) and package_dependencies (64) are hash-partitioned. REPLICA
+-- IDENTITY on the partitioned root does not cascade; set it on every leaf.
+DO $$
+DECLARE
+  parent_table text;
+  partition_oid regclass;
+BEGIN
+  FOREACH parent_table IN ARRAY ARRAY['public.versions', 'public.package_dependencies']
+  LOOP
+    FOR partition_oid IN
+      SELECT inhrelid::regclass
+      FROM pg_inherits
+      WHERE inhparent = parent_table::regclass
+    LOOP
+      EXECUTE format('ALTER TABLE %s REPLICA IDENTITY FULL', partition_oid);
+    END LOOP;
+  END LOOP;
+END$$;
+
+-- ─── 2. rank_packages() bumps last_synced_at ────────────────────────────────
+
+-- rank_packages: recompute impact, per-ecosystem rank and the is_critical flag
+-- for every package whose ecosystem is a key of critical_top_n_by_ecosystem.
+--
+-- Parameters:
+--   weight_downloads, weight_dependent_packages, weight_transitive
+--       weights applied to the per-ecosystem PERCENT_RANK of the log-scaled
+--       downloads_last_30d, dependent_count and transitive_dependent_count.
+--   critical_top_n_by_ecosystem
+--       jsonb object mapping ecosystem -> N; ranks 1..N are flagged critical.
+--
+-- Returns a single row:
+--   scored_rows  rows whose impact changed in step 1.
+--   ranked_rows  rows whose rank_in_ecosystem or is_critical changed in step 2.
+--
+-- Every UPDATE also stamps last_synced_at. NOW() is the transaction start
+-- time, so all row versions written by one call carry the same value; the
+-- steps below therefore avoid writing intermediate states that disagree with
+-- the final one.
+CREATE OR REPLACE FUNCTION rank_packages(
+  weight_downloads numeric DEFAULT 0.25,
+  weight_dependent_packages numeric DEFAULT 0.25,
+  weight_transitive numeric DEFAULT 0.50,
+  critical_top_n_by_ecosystem jsonb DEFAULT '{"npm":400000,"go":100000,"maven":200000,"pypi":100000,"nuget":50000,"cargo":75000}'::jsonb
+)
+RETURNS TABLE(scored_rows int, ranked_rows int)
+LANGUAGE plpgsql AS $$
+DECLARE
+  n_scored int;
+  n_ranked int;
+BEGIN
+  -- Step 1: score. Only rows whose impact actually changes are rewritten.
+  WITH percentile_scores AS (
+    SELECT
+      id,
+      (
+        weight_downloads * PERCENT_RANK() OVER (
+          PARTITION BY ecosystem ORDER BY LOG(1 + COALESCE(downloads_last_30d, 0)))
+
+        + weight_dependent_packages * PERCENT_RANK() OVER (
+          PARTITION BY ecosystem ORDER BY LOG(1 + COALESCE(dependent_count, 0)))
+
+        + weight_transitive * PERCENT_RANK() OVER (
+          PARTITION BY ecosystem ORDER BY LOG(1 + COALESCE(transitive_dependent_count, 0)))
+      )::numeric(10, 4) AS new_impact
+    FROM packages
+    WHERE ecosystem IN (SELECT jsonb_object_keys(critical_top_n_by_ecosystem))
+  )
+  UPDATE packages p
+  SET impact = ps.new_impact,
+      last_synced_at = NOW()
+  FROM percentile_scores ps
+  WHERE p.id = ps.id
+    AND p.impact IS DISTINCT FROM ps.new_impact;
+
+  GET DIAGNOSTICS n_scored = ROW_COUNT;
+
+  -- Step 2: rank + flag.
+  -- The spotlight override is folded into new_is_critical here. Computing the
+  -- flag from rank alone would reset every spotlighted package outside the
+  -- top-N to FALSE on each pass, only for step 2.5 to flip it back to TRUE:
+  -- two extra row versions per pass, one of them with the wrong flag and the
+  -- same last_synced_at as the correct one.
+  WITH ranked AS (
+    SELECT
+      id, ecosystem, namespace, name,
+      ROW_NUMBER() OVER (
+        PARTITION BY ecosystem
+        ORDER BY impact DESC NULLS LAST, id
+      ) AS r
+    FROM packages
+    WHERE purl IS NOT NULL
+      AND ecosystem IN (SELECT jsonb_object_keys(critical_top_n_by_ecosystem))
+  ),
+  -- De-duplicated so the LEFT JOIN below can never multiply ranked rows.
+  spotlight AS (
+    SELECT DISTINCT ecosystem, namespace, name
+    FROM package_criticality_spotlight
+  ),
+  flagged AS (
+    SELECT
+      rk.id, rk.r,
+      (
+        COALESCE(
+          rk.r <= (critical_top_n_by_ecosystem ->> rk.ecosystem)::int,
+          FALSE
+        )
+        -- sp.name is non-NULL exactly when a spotlight row matched.
+        OR sp.name IS NOT NULL
+      ) AS new_is_critical
+    FROM ranked rk
+    LEFT JOIN spotlight sp
+      ON sp.ecosystem = rk.ecosystem
+     AND (sp.namespace IS NOT DISTINCT FROM rk.namespace)
+     AND sp.name = rk.name
+  )
+  UPDATE packages p
+  SET rank_in_ecosystem = f.r,
+      is_critical = f.new_is_critical,
+      last_synced_at = NOW()
+  FROM flagged f
+  WHERE p.id = f.id
+    AND (
+      p.rank_in_ecosystem IS DISTINCT FROM f.r
+      OR p.is_critical IS DISTINCT FROM f.new_is_critical
+    );
+
+  GET DIAGNOSTICS n_ranked = ROW_COUNT;
+
+  -- Step 2.5: spotlight overrides for packages step 2 does not rank
+  -- (purl IS NULL, or ecosystem missing from critical_top_n_by_ecosystem).
+  -- Ranked packages were already flagged above, so this is a no-op for them.
+  UPDATE packages p
+  SET is_critical = TRUE,
+      last_synced_at = NOW()
+  FROM package_criticality_spotlight s
+  WHERE p.ecosystem = s.ecosystem
+    AND (p.namespace IS NOT DISTINCT FROM s.namespace)
+    AND p.name = s.name
+    AND p.is_critical = FALSE;
+
+  -- Step 3: stamp last_rank_pass_at unconditionally
+  UPDATE packages
+  SET last_rank_pass_at = NOW(),
+      last_synced_at = NOW()
+  WHERE ecosystem IN (SELECT jsonb_object_keys(critical_top_n_by_ecosystem));
+
+  RETURN QUERY SELECT n_scored, n_ranked;
+END;
+$$;
diff --git a/backend/src/osspckgs/migrations/V1781100000__rank_packages_bumps_last_synced_at.sql b/backend/src/osspckgs/migrations/V1781100000__rank_packages_bumps_last_synced_at.sql
deleted file mode 100644
index 33673e12a5..0000000000
--- a/backend/src/osspckgs/migrations/V1781100000__rank_packages_bumps_last_synced_at.sql
+++ /dev/null
@@ -1,97 +0,0 @@
--- rank_packages() now bumps last_synced_at on every UPDATE that touches a
--- DS-exported field (impact, is_critical, last_rank_pass_at). last_synced_at
--- is the Tinybird ENGINE_VER for the packages datasource; without this bump,
--- ReplacingMergeTree may keep an older row when criticality changes without
--- any other write path touching the package row.
- -CREATE OR REPLACE FUNCTION rank_packages( - weight_downloads numeric DEFAULT 0.25, - weight_dependent_packages numeric DEFAULT 0.25, - weight_transitive numeric DEFAULT 0.50, - critical_top_n_by_ecosystem jsonb DEFAULT '{"npm":400000,"go":100000,"maven":200000,"pypi":100000,"nuget":50000,"cargo":75000}'::jsonb -) -RETURNS TABLE(scored_rows int, ranked_rows int) -LANGUAGE plpgsql AS $$ -DECLARE - n_scored int; - n_ranked int; -BEGIN - -- Step 1: score - WITH percentile_scores AS ( - SELECT - id, - ( - weight_downloads * PERCENT_RANK() OVER ( - PARTITION BY ecosystem ORDER BY LOG(1 + COALESCE(downloads_last_30d, 0))) - - + weight_dependent_packages * PERCENT_RANK() OVER ( - PARTITION BY ecosystem ORDER BY LOG(1 + COALESCE(dependent_count, 0))) - - + weight_transitive * PERCENT_RANK() OVER ( - PARTITION BY ecosystem ORDER BY LOG(1 + COALESCE(transitive_dependent_count, 0))) - )::numeric(10, 4) AS new_impact - FROM packages - WHERE ecosystem IN (SELECT jsonb_object_keys(critical_top_n_by_ecosystem)) - ) - UPDATE packages p - SET impact = ps.new_impact, - last_synced_at = NOW() - FROM percentile_scores ps - WHERE p.id = ps.id - AND p.impact IS DISTINCT FROM ps.new_impact; - - GET DIAGNOSTICS n_scored = ROW_COUNT; - - -- Step 2: rank + flag - WITH ranked AS ( - SELECT - id, ecosystem, - ROW_NUMBER() OVER ( - PARTITION BY ecosystem - ORDER BY impact DESC NULLS LAST, id - ) AS r - FROM packages - WHERE purl IS NOT NULL - AND ecosystem IN (SELECT jsonb_object_keys(critical_top_n_by_ecosystem)) - ), - flagged AS ( - SELECT - id, r, - COALESCE( - r <= (critical_top_n_by_ecosystem ->> ecosystem)::int, - FALSE - ) AS new_is_critical - FROM ranked - ) - UPDATE packages p - SET rank_in_ecosystem = f.r, - is_critical = f.new_is_critical, - last_synced_at = NOW() - FROM flagged f - WHERE p.id = f.id - AND ( - p.rank_in_ecosystem IS DISTINCT FROM f.r - OR p.is_critical IS DISTINCT FROM f.new_is_critical - ); - - GET DIAGNOSTICS n_ranked = ROW_COUNT; - - -- Step 2.5: spotlight 
overrides - UPDATE packages p - SET is_critical = TRUE, - last_synced_at = NOW() - FROM package_criticality_spotlight s - WHERE p.ecosystem = s.ecosystem - AND (p.namespace IS NOT DISTINCT FROM s.namespace) - AND p.name = s.name - AND p.is_critical = FALSE; - - -- Step 3: stamp last_rank_pass_at unconditionally - UPDATE packages - SET last_rank_pass_at = NOW(), - last_synced_at = NOW() - WHERE ecosystem IN (SELECT jsonb_object_keys(critical_top_n_by_ecosystem)); - - RETURN QUERY SELECT n_scored, n_ranked; -END; -$$;