Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions backend/.env.dist.local
Original file line number Diff line number Diff line change
Expand Up @@ -199,3 +199,10 @@ OSV_ECOSYSTEMS=npm,Maven
OSV_TMP_DIR=/tmp/osv
OSV_BATCH_SIZE=500
OSV_DERIVE_BATCH_SIZE=1000

# dockerhub-sync (see services/apps/packages_worker/src/dockerhub/)
DOCKERHUB_API_BASE_URL=https://hub.docker.com/v2
DOCKERHUB_BATCH_SIZE=100
DOCKERHUB_REFRESH_INTERVAL_HOURS=24
DOCKERHUB_DISCOVERY_INTERVAL_DAYS=14
DOCKERHUB_IDLE_SLEEP_SEC=60
61 changes: 61 additions & 0 deletions backend/src/osspckgs/migrations/V1780928852__dockerhub_sync.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
-- dockerhub-sync (CM-1213)
--
-- Adds discovery/refresh bookkeeping for the dockerhub-sync worker
-- (services/apps/packages_worker/src/dockerhub) and a daily snapshot table
-- for Docker Hub lifetime pull counts.

-- Last time dockerhub-sync probed this repo for a published Docker image
-- (Dockerfile detection + Hub candidate lookup). NULL = never checked.
-- Separate from repos.last_synced_at because discovery cadence (weeks)
-- differs from light-metadata refresh cadence (daily).
ALTER TABLE repos
ADD COLUMN IF NOT EXISTS docker_checked_at timestamptz;

-- Partial index for the discovery backlog query: pages repos that have never
-- been probed for a Docker image. Once docker_checked_at is set the row drops
-- out of the index, so this stays small even as the repos table grows.
CREATE INDEX IF NOT EXISTS repos_docker_pending_idx ON repos (id)
WHERE
host = 'github' AND docker_checked_at IS NULL;

-- Supports the refresh query (WHERE last_synced_at < NOW() - interval).
CREATE INDEX IF NOT EXISTS repo_docker_stale_idx ON repo_docker (last_synced_at);

-- ============================================================
-- REPO DOCKER PULLS DAILY
-- One row per image per day storing the *lifetime* pull_count as returned
-- by hub.docker.com/v2/repositories/<image>. Docker Hub does not expose
-- per-day download counts, so daily deltas are derived at query time:
-- pulls_total - LAG(pulls_total) OVER (PARTITION BY image_name ORDER BY date)
-- Keyed by image_name (matches repo_docker UNIQUE) so rows survive a
-- repo_docker re-discovery without an FK cascade.
--
-- Partitioned monthly via pg_partman (extension + schema already created in
-- V1780231200__npm_worker.sql).
-- ============================================================
CREATE TABLE IF NOT EXISTS repo_docker_pulls_daily (
image_name text NOT NULL,
date date NOT NULL,
pulls_total bigint NOT NULL,
PRIMARY KEY (image_name, date)
)
PARTITION BY RANGE (date);
Comment thread
cursor[bot] marked this conversation as resolved.

-- Guard so this migration is idempotent against environments where the
-- table was already registered manually (e.g. local dev that applied the
-- earlier in-place schema edit).
DO $$
BEGIN
IF NOT EXISTS (
SELECT 1 FROM partman.part_config
WHERE parent_table = 'public.repo_docker_pulls_daily'
) THEN
PERFORM partman.create_parent(
p_parent_table => 'public.repo_docker_pulls_daily',
p_control => 'date',
p_interval => '1 month',
p_premake => 3
);
END IF;
END
$$;
70 changes: 70 additions & 0 deletions scripts/services/dockerhub-sync.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
version: '3.1'

x-env-args: &env-args
DOCKER_BUILDKIT: 1
NODE_ENV: docker
SERVICE: dockerhub-sync
SHELL: /bin/sh
SUPPRESS_NO_CONFIG_WARNING: 'true'
DOCKERHUB_API_BASE_URL: 'https://hub.docker.com/v2'
DOCKERHUB_BATCH_SIZE: '100'
DOCKERHUB_REFRESH_INTERVAL_HOURS: '24'
DOCKERHUB_DISCOVERY_INTERVAL_DAYS: '14'
DOCKERHUB_IDLE_SLEEP_SEC: '60'

services:
dockerhub-sync:
build:
context: ../../
dockerfile: ./scripts/services/docker/Dockerfile.packages-worker
command: 'pnpm run start:dockerhub-sync'
working_dir: /usr/crowd/app/services/apps/packages_worker
env_file:
- ../../backend/.env.dist.local
- ../../backend/.env.dist.composed
- ../../backend/.env.override.local
- ../../backend/.env.override.composed
environment:
<<: *env-args
restart: always
networks:
- crowd-bridge

dockerhub-sync-dev:
build:
context: ../../
dockerfile: ./scripts/services/docker/Dockerfile.packages-worker
command: 'pnpm run dev:dockerhub-sync'
working_dir: /usr/crowd/app/services/apps/packages_worker
# user: '${USER_ID}:${GROUP_ID}'
env_file:
- ../../backend/.env.dist.local
- ../../backend/.env.dist.composed
- ../../backend/.env.override.local
- ../../backend/.env.override.composed
environment:
<<: *env-args
hostname: dockerhub-sync
networks:
- crowd-bridge
volumes:
- ../../services/libs/audit-logs/src:/usr/crowd/app/services/libs/audit-logs/src
- ../../services/libs/common/src:/usr/crowd/app/services/libs/common/src
- ../../services/libs/common_services/src:/usr/crowd/app/services/libs/common_services/src
- ../../services/libs/data-access-layer/src:/usr/crowd/app/services/libs/data-access-layer/src
- ../../services/libs/database/src:/usr/crowd/app/services/libs/database/src
- ../../services/libs/integrations/src:/usr/crowd/app/services/libs/integrations/src
- ../../services/libs/logging/src:/usr/crowd/app/services/libs/logging/src
- ../../services/libs/nango/src:/usr/crowd/app/services/libs/nango/src
- ../../services/libs/opensearch/src:/usr/crowd/app/services/libs/opensearch/src
- ../../services/libs/queue/src:/usr/crowd/app/services/libs/queue/src
- ../../services/libs/redis/src:/usr/crowd/app/services/libs/redis/src
- ../../services/libs/snowflake/src:/usr/crowd/app/services/libs/snowflake/src
- ../../services/libs/telemetry/src:/usr/crowd/app/services/libs/telemetry/src
- ../../services/libs/temporal/src:/usr/crowd/app/services/libs/temporal/src
- ../../services/libs/types/src:/usr/crowd/app/services/libs/types/src
- ../../services/apps/packages_worker/src:/usr/crowd/app/services/apps/packages_worker/src

networks:
crowd-bridge:
external: true
3 changes: 3 additions & 0 deletions services/apps/packages_worker/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@
"dev:deps-dev-ingest:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && CROWD_TEMPORAL_TASKQUEUE=deps-dev-ingest CROWD_TEMPORAL_NAMESPACE=$CROWD_PACKAGES_TEMPORAL_NAMESPACE SERVICE=deps-dev-ingest nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9235 src/bin/deps-dev-ingest.ts",
"dev:npm-worker:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && CROWD_TEMPORAL_TASKQUEUE=npm-worker CROWD_TEMPORAL_NAMESPACE=$CROWD_PACKAGES_TEMPORAL_NAMESPACE SERVICE=npm-worker LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9236 src/bin/npm-worker.ts",
"dev:github-repos-enricher:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && SERVICE=github-repos-enricher LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9234 src/bin/github-repos-enricher.ts",
"start:dockerhub-sync": "SERVICE=dockerhub-sync tsx src/bin/dockerhub-sync.ts",
"dev:dockerhub-sync": "SERVICE=dockerhub-sync LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9238 src/bin/dockerhub-sync.ts",
"dev:dockerhub-sync:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && SERVICE=dockerhub-sync LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9238 src/bin/dockerhub-sync.ts",
"export-to-bucket": "SERVICE=deps-dev-ingest tsx src/scripts/exportToBucket.ts",
"export-to-bucket:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && SERVICE=deps-dev-ingest tsx src/scripts/exportToBucket.ts",
"monitor:osspckgs": "SERVICE=monitor tsx src/scripts/monitorOsspckgs.ts",
Expand Down
58 changes: 58 additions & 0 deletions services/apps/packages_worker/src/bin/dockerhub-sync.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import { getServiceLogger } from '@crowd/logging'

import { getDockerhubConfig, getGithubAppConfig } from '../config'
import { getPackagesDb } from '../db'
import { runDockerhubLoop } from '../dockerhub'
import { fetchRateLimitDiagnostics, resolveInstallations } from '../enricher/githubAppAuth'

const log = getServiceLogger()

let shuttingDown = false

const shutdown = async () => {
if (shuttingDown) return
shuttingDown = true
log.info('Shutting down dockerhub-sync...')
}

process.on('SIGINT', shutdown)
process.on('SIGTERM', shutdown)

const main = async () => {
log.info('dockerhub-sync starting...')

const config = getDockerhubConfig()
const appConfig = getGithubAppConfig()

const installationIds = await resolveInstallations(appConfig)

if (installationIds.length === 0) {
log.error('No GitHub App installations found — cannot build token pool')
process.exit(1)
}

await fetchRateLimitDiagnostics(appConfig.appId, appConfig.privateKeyPem, installationIds)

const qx = await getPackagesDb()
await qx.selectOne('SELECT 1')
log.info('Connected to packages-db.')

log.info(
{
installations: installationIds.length,
batchSize: config.batchSize,
hubBaseUrl: config.hubBaseUrl,
},
'Starting dockerhub loop',
)

await runDockerhubLoop(qx, installationIds, appConfig, config, () => shuttingDown)

log.info('dockerhub-sync stopped.')
process.exit(0)
}

main().catch((err) => {
log.error({ err }, 'dockerhub-sync fatal error')
process.exit(1)
})
10 changes: 10 additions & 0 deletions services/apps/packages_worker/src/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,13 @@ export function getEnricherConfig() {
fetchTimeoutMs: parseInt(process.env.ENRICHER_FETCH_TIMEOUT_MS ?? '10000', 10),
}
}

export function getDockerhubConfig() {
return {
hubBaseUrl: requireEnv('DOCKERHUB_API_BASE_URL'),
batchSize: requireEnvInt('DOCKERHUB_BATCH_SIZE'),
refreshIntervalHours: requireEnvInt('DOCKERHUB_REFRESH_INTERVAL_HOURS'),
discoveryIntervalDays: requireEnvInt('DOCKERHUB_DISCOVERY_INTERVAL_DAYS'),
idleSleepSec: requireEnvInt('DOCKERHUB_IDLE_SLEEP_SEC'),
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import { describe, expect, it } from 'vitest'

import { buildCandidates } from '../candidates'

describe('buildCandidates', () => {
it('lowercases owner and repo into a single <owner>/<repo> candidate', () => {
expect(buildCandidates('Grafana', 'Grafana')).toEqual(['grafana/grafana'])
})

it('passes through already-valid lowercase slugs', () => {
expect(buildCandidates('prometheus', 'node_exporter')).toEqual(['prometheus/node_exporter'])
})

it('rejects components with characters Docker Hub does not accept', () => {
// GitHub allows '+' in org names via renames; Hub would 400.
expect(buildCandidates('foo+bar', 'baz')).toEqual([])
})

it('rejects components that start or end with a separator', () => {
expect(buildCandidates('-leading', 'repo')).toEqual([])
expect(buildCandidates('owner', 'trailing.')).toEqual([])
})

it('does not emit a library/<repo> candidate', () => {
// Guard against accidental reintroduction — see comment in candidates.ts.
expect(buildCandidates('nodejs', 'node')).toEqual(['nodejs/node'])
})
})
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
import { afterEach, describe, expect, it, vi } from 'vitest'

import { fetchDockerhub } from '../fetchDockerhub'
import { FetchError } from '../types'

const BASE = 'https://hub.docker.com/v2'

function mockFetch(status: number, body: unknown, headers: Record<string, string> = {}) {
return vi.spyOn(globalThis, 'fetch').mockResolvedValue(
new Response(typeof body === 'string' ? body : JSON.stringify(body), {
status,
headers: { 'Content-Type': 'application/json', ...headers },
}),
)
}

afterEach(() => {
vi.restoreAllMocks()
})

describe('fetchDockerhub', () => {
it('returns pull/star counts on 200', async () => {
mockFetch(200, {
name: 'grafana',
namespace: 'grafana',
pull_count: 12345,
star_count: 678,
last_updated: '2026-05-01T00:00:00Z',
})

const r = await fetchDockerhub(BASE, 'grafana/grafana')
expect(r).toEqual({
imageName: 'grafana/grafana',
pulls: 12345,
stars: 678,
lastUpdated: '2026-05-01T00:00:00Z',
})
})

it('appends trailing slash to the request URL', async () => {
const spy = mockFetch(200, { pull_count: 1, star_count: 0 })
await fetchDockerhub(BASE, 'a/b')
expect(spy).toHaveBeenCalledWith(`${BASE}/repositories/a/b/`, expect.anything())
})

it('normalizes a trailing slash on the configured base URL', async () => {
const spy = mockFetch(200, { pull_count: 1, star_count: 0 })
await fetchDockerhub(`${BASE}/`, 'a/b')
expect(spy).toHaveBeenCalledWith(`${BASE}/repositories/a/b/`, expect.anything())
})

it('classifies 404 as NOT_FOUND', async () => {
mockFetch(404, { message: 'object not found' })
await expect(fetchDockerhub(BASE, 'a/b')).rejects.toMatchObject({ kind: 'NOT_FOUND' })
})

it('classifies 400 as NOT_FOUND (Hub 400s on malformed slugs)', async () => {
mockFetch(400, { message: 'bad request' })
await expect(fetchDockerhub(BASE, 'a/b')).rejects.toMatchObject({ kind: 'NOT_FOUND' })
})

it('classifies 429 as RATE_LIMIT with resetAt from header', async () => {
const resetSec = Math.floor(Date.now() / 1000) + 120
mockFetch(429, { message: 'too many' }, { 'x-ratelimit-reset': String(resetSec) })

expect.assertions(3)
try {
await fetchDockerhub(BASE, 'a/b')
} catch (err) {
expect(err).toBeInstanceOf(FetchError)
const fe = err as FetchError
expect(fe.kind).toBe('RATE_LIMIT')
expect(fe.resetAt).toBeGreaterThan(Date.now())
}
})

it('does NOT discard a 200 response when x-ratelimit-remaining is 0', async () => {
// remaining=0 means this request consumed the last slot; the response itself
// is valid. The next call will 429 and park then.
mockFetch(200, { pull_count: 1, star_count: 0 }, { 'x-ratelimit-remaining': '0' })
const r = await fetchDockerhub(BASE, 'a/b')
expect(r.pulls).toBe(1)
})

it('classifies 401/403 as AUTH so misconfig surfaces instead of looking like a miss', async () => {
mockFetch(401, { message: 'unauthorized' })
await expect(fetchDockerhub(BASE, 'a/b')).rejects.toMatchObject({ kind: 'AUTH' })
mockFetch(403, { message: 'forbidden' })
await expect(fetchDockerhub(BASE, 'a/b')).rejects.toMatchObject({ kind: 'AUTH' })
})

it('classifies 5xx as TRANSIENT', async () => {
mockFetch(503, 'Service Unavailable')
await expect(fetchDockerhub(BASE, 'a/b')).rejects.toMatchObject({ kind: 'TRANSIENT' })
})

it('classifies network failure as TRANSIENT', async () => {
vi.spyOn(globalThis, 'fetch').mockRejectedValue(new Error('ECONNRESET'))
await expect(fetchDockerhub(BASE, 'a/b')).rejects.toMatchObject({ kind: 'TRANSIENT' })
})

it('classifies non-JSON 200 body as MALFORMED', async () => {
mockFetch(200, '<html>not json</html>')
await expect(fetchDockerhub(BASE, 'a/b')).rejects.toMatchObject({ kind: 'MALFORMED' })
})

it('classifies missing pull_count as MALFORMED', async () => {
mockFetch(200, { name: 'x', star_count: 0 })
await expect(fetchDockerhub(BASE, 'a/b')).rejects.toMatchObject({ kind: 'MALFORMED' })
})
})
19 changes: 19 additions & 0 deletions services/apps/packages_worker/src/dockerhub/candidates.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
// Docker Hub repository slugs are lowercase and limited to [a-z0-9._-].
// GitHub allows uppercase and a few characters Hub rejects, so we lowercase
// and validate before probing — anything that fails the regex would 400 on
// Hub anyway and isn't worth an HTTP round-trip.
const HUB_COMPONENT = /^[a-z0-9](?:[a-z0-9._-]*[a-z0-9])?$/

export function buildCandidates(owner: string, name: string): string[] {
const ns = owner.toLowerCase()
const repo = name.toLowerCase()

if (!HUB_COMPONENT.test(ns) || !HUB_COMPONENT.test(repo)) {
return []
}

// v1 deliberately omits `library/<repo>`: a random github.com/foo/node with a
// dev Dockerfile would false-positive onto the official library/node image.
// Official images (~150) to be seeded via allowlist in a follow-up.
return [`${ns}/${repo}`]
}
Loading
Loading