From ffb65c93676a1e0521be06e6640e7371bf6376e0 Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Tue, 26 May 2026 11:11:33 +0100 Subject: [PATCH 01/22] feat: add enrichment loop and github-repos-enricher entry point Signed-off-by: Mouad BANI --- .../apps/script_executor_worker/package.json | 1 + .../bin/sync-light-repos/fetchLightRepo.ts | 96 ++++++ .../src/bin/sync-light-repos/index.ts | 276 ++++++++++++++++++ .../src/bin/sync-light-repos/types.ts | 46 +++ .../bin/sync-light-repos/upsertLightRepos.ts | 49 ++++ 5 files changed, 468 insertions(+) create mode 100644 services/apps/script_executor_worker/src/bin/sync-light-repos/fetchLightRepo.ts create mode 100644 services/apps/script_executor_worker/src/bin/sync-light-repos/index.ts create mode 100644 services/apps/script_executor_worker/src/bin/sync-light-repos/types.ts create mode 100644 services/apps/script_executor_worker/src/bin/sync-light-repos/upsertLightRepos.ts diff --git a/services/apps/script_executor_worker/package.json b/services/apps/script_executor_worker/package.json index c94236ec00..482a390495 100644 --- a/services/apps/script_executor_worker/package.json +++ b/services/apps/script_executor_worker/package.json @@ -12,6 +12,7 @@ "recalculate-enrichment-affiliations": "npx tsx src/bin/recalculate-enrichment-affiliations.ts", "recalculate-all-affiliations": "npx tsx src/bin/recalculate-all-affiliations.ts", "add-lf-projects-to-collection": "npx tsx src/bin/add-lf-projects-to-collection.ts", + "sync-light-repos": "npx tsx src/bin/sync-light-repos/index.ts", "lint": "npx eslint --ext .ts src --max-warnings=0", "format": "npx prettier --write \"src/**/*.ts\"", "format-check": "npx prettier --check .", diff --git a/services/apps/script_executor_worker/src/bin/sync-light-repos/fetchLightRepo.ts b/services/apps/script_executor_worker/src/bin/sync-light-repos/fetchLightRepo.ts new file mode 100644 index 0000000000..ebd1dd87f4 --- /dev/null +++ 
b/services/apps/script_executor_worker/src/bin/sync-light-repos/fetchLightRepo.ts @@ -0,0 +1,96 @@ +import { FetchError, LightRepoResult } from './types' + +const GRAPHQL_URL = 'https://api.github.com/graphql' + +const REPO_QUERY = ` + query($owner: String!, $name: String!) { + repository(owner: $owner, name: $name) { + description + primaryLanguage { name } + repositoryTopics(first: 25) { nodes { topic { name } } } + stargazerCount + forkCount + watchers { totalCount } + issues(states: OPEN) { totalCount } + pushedAt + isArchived + isDisabled + isFork + createdAt + } + } +` + +export function parseGithubUrl(url: string): { owner: string; name: string } { + const match = url.match(/https?:\/\/github\.com\/([^/]+)\/([^/]+?)(?:\.git)?\/?$/) + if (!match) throw new FetchError('MALFORMED', `Cannot parse GitHub URL: ${url}`) + return { owner: match[1], name: match[2] } +} + +export async function fetchLightRepo(url: string, token: string): Promise { + const { owner, name } = parseGithubUrl(url) + + let response: Response + try { + response = await fetch(GRAPHQL_URL, { + method: 'POST', + headers: { + Authorization: `bearer ${token}`, + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ query: REPO_QUERY, variables: { owner, name } }), + }) + } catch (err) { + throw new FetchError('TRANSIENT', `Network error for ${url}: ${(err as Error).message}`) + } + + const resetSec = parseInt(response.headers.get('x-ratelimit-reset') ?? '0', 10) + const resetMs = resetSec ? 
resetSec * 1000 + 5_000 : Date.now() + 65_000 + + if (response.status === 401) { + throw new FetchError('AUTH', `401 Unauthorized for ${url}`) + } + + if (response.status === 403) { + const body = await response.text() + if (body.toLowerCase().includes('rate limit')) { + throw new FetchError('RATE_LIMIT', `Rate limited on ${url}`, resetMs) + } + throw new FetchError('AUTH', `403 Forbidden for ${url}`) + } + + if (response.status === 404) throw new FetchError('NOT_FOUND', `404 for ${url}`) + if (response.status >= 500) throw new FetchError('TRANSIENT', `${response.status} for ${url}`) + + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const json = (await response.json()) as any + + if (json.errors?.length) { + const err = json.errors[0] + if (err.type === 'RATE_LIMITED') throw new FetchError('RATE_LIMIT', `RATE_LIMITED for ${url}`, resetMs) + if (err.type === 'NOT_FOUND') throw new FetchError('NOT_FOUND', `NOT_FOUND for ${url}`) + throw new FetchError('TRANSIENT', `GraphQL error for ${url}: ${err.message ?? err.type}`) + } + + const repo = json.data?.repository + if (!repo) throw new FetchError('NOT_FOUND', `No repository data for ${url}`) + + return { + url, + host: 'github', + owner, + name, + description: repo.description ?? null, + primaryLanguage: repo.primaryLanguage?.name ?? null, + topics: (repo.repositoryTopics?.nodes ?? []).map((n: { topic: { name: string } }) => n.topic.name), + stars: repo.stargazerCount ?? 0, + forks: repo.forkCount ?? 0, + watchers: repo.watchers?.totalCount ?? 0, + openIssues: repo.issues?.totalCount ?? 0, + lastCommitAt: repo.pushedAt ?? null, + archived: repo.isArchived ?? false, + disabled: repo.isDisabled ?? false, + isFork: repo.isFork ?? false, + createdAt: repo.createdAt ?? 
null, + } +} diff --git a/services/apps/script_executor_worker/src/bin/sync-light-repos/index.ts b/services/apps/script_executor_worker/src/bin/sync-light-repos/index.ts new file mode 100644 index 0000000000..cb0f2198b3 --- /dev/null +++ b/services/apps/script_executor_worker/src/bin/sync-light-repos/index.ts @@ -0,0 +1,276 @@ +/** + * sync-light-repos + * + * Fetches GitHub repo metadata via GraphQL and upserts into the `repos` table. + * Runs one async worker per token — each worker claims URLs by index so no two + * requests ever share a token concurrently. + * + * Success tracking: a successful fetch updates repos.last_synced_at to NOW(). + * Failed repos keep a stale/null last_synced_at and are picked up on the next run. + * TODO: fetchPage will later filter by last_synced_at < NOW() - update_interval + * so this script becomes a continuous sync with no extra failure tracking needed. + * + * Usage: + * pnpm run sync-light-repos -- [options] + * + * Options: + * --page-size Repos fetched from source per cursor page (default: 200) + * --batch-size Upsert batch size (default: 50) + * --max-retries Per-repo transient retry cap (default: 3) + * --start-after Resume from cursor id (printed after each page) + * --limit Stop after N repos total (for testing) + * --dry-run Fetch but skip DB writes + * + * Environment: + * GITHUB_TOKENS Comma-separated GitHub PATs (required) + * CROWD_DB_WRITE_HOST/PORT/USERNAME/PASSWORD/DATABASE + * SERVICE + */ + +import { WRITE_DB_CONFIG, getDbConnection } from '@crowd/data-access-layer/src/database' +import { pgpQx } from '@crowd/data-access-layer/src/queryExecutor' +import { getServiceChildLogger } from '@crowd/logging' + +import { fetchLightRepo, parseGithubUrl } from './fetchLightRepo' +import { FetchError, LightRepoResult } from './types' +import { upsertLightRepos } from './upsertLightRepos' + +const log = getServiceChildLogger('sync-light-repos') + +function parseArgs() { + const args = process.argv.slice(2) + const getArg = 
(flag: string) => { + const idx = args.indexOf(flag) + return idx !== -1 && idx + 1 < args.length ? args[idx + 1] : undefined + } + + const pageSize = parseInt(getArg('--page-size') ?? '200', 10) + const batchSize = parseInt(getArg('--batch-size') ?? '50', 10) + const maxRetries = parseInt(getArg('--max-retries') ?? '3', 10) + const startAfter = getArg('--start-after') ?? null + const limitRaw = getArg('--limit') + const limit = limitRaw !== undefined ? parseInt(limitRaw, 10) : null + const dryRun = args.includes('--dry-run') + + if (isNaN(pageSize) || pageSize <= 0) { log.error('--page-size must be a positive integer'); process.exit(1) } + if (isNaN(batchSize) || batchSize <= 0) { log.error('--batch-size must be a positive integer'); process.exit(1) } + if (isNaN(maxRetries) || maxRetries < 0) { log.error('--max-retries must be a non-negative integer'); process.exit(1) } + if (limit !== null && (isNaN(limit) || limit <= 0)) { log.error('--limit must be a positive integer'); process.exit(1) } + + return { pageSize, batchSize, maxRetries, startAfter, limit, dryRun } +} + +// TODO: add LEFT JOIN repos r ON r.url = pr.url and filter +// WHERE (r.last_synced_at IS NULL OR r.last_synced_at < NOW() - INTERVAL '$(updateIntervalHours) hours') +// once the update interval logic is scoped in. +async function fetchPage( + qx: ReturnType, + cursor: string | null, + pageSize: number, +): Promise<{ urls: string[]; nextCursor: string | null }> { + const rows = await qx.select( + ` + SELECT id, url + FROM public.repositories + WHERE url LIKE 'https://github.com/%' + AND "deletedAt" IS NULL + ${cursor ? 'AND id > $(cursor)' : ''} + ORDER BY id + LIMIT $(pageSize) + `, + { cursor, pageSize }, + ) + return { + urls: rows.map((r: { url: string }) => r.url), + nextCursor: rows.length > 0 ? 
(rows[rows.length - 1] as { id: string }).id : null, + } +} + +async function fetchWithRetries( + url: string, + token: string, + maxRetries: number, +): Promise { + for (let attempt = 0; attempt <= maxRetries; attempt++) { + try { + return await fetchLightRepo(url, token) + } catch (err) { + if (!(err instanceof FetchError)) throw err + + if (['NOT_FOUND', 'AUTH', 'MALFORMED'].includes(err.kind)) { + log.warn({ url, kind: err.kind }, err.message) + return null + } + + if (err.kind === 'RATE_LIMIT') throw err + + if (attempt < maxRetries) { + const backoffMs = 1000 * 2 ** attempt + log.warn({ url, attempt, backoffMs }, `Transient error, retrying: ${err.message}`) + await new Promise((r) => setTimeout(r, backoffMs)) + } else { + log.error({ url }, `Gave up after ${maxRetries} retries: ${err.message}`) + return null + } + } + } + return null +} + +async function processPage( + urls: string[], + tokens: string[], + parkedUntil: Map, + opts: ReturnType, + qx: ReturnType, +): Promise<{ fetched: number; failed: number; flushed: number }> { + const validUrls: string[] = [] + let skipped = 0 + for (const url of urls) { + try { parseGithubUrl(url); validUrls.push(url) } catch { skipped++ } + } + if (skipped > 0) log.warn(`Skipped ${skipped} non-GitHub URLs`) + + const buffer: LightRepoResult[] = [] + const failures: Array<{ url: string; reason: string }> = [] + let failed = 0 + let flushed = 0 + let nextIdx = 0 + + await Promise.all( + tokens.map(async (token, tokenIdx) => { + // Respect any park set during a previous page of this run + const initialPark = (parkedUntil.get(token) ?? 
0) - Date.now() + if (initialPark > 0) { + log.warn(`token#${tokenIdx} still parked, waiting ${Math.round(initialPark / 1000)}s`) + await new Promise((r) => setTimeout(r, initialPark)) + } + + while (true) { + const idx = nextIdx++ + if (idx >= validUrls.length) break + const url = validUrls[idx] + + try { + const result = await fetchWithRetries(url, token, opts.maxRetries) + if (result) { + buffer.push(result) + if (!opts.dryRun && buffer.length >= opts.batchSize) { + const batch = buffer.splice(0) + await upsertLightRepos(qx, batch) + flushed += batch.length + } + } else { + failures.push({ url, reason: 'see warn log above' }) + failed++ + } + } catch (err) { + if (err instanceof FetchError && err.kind === 'RATE_LIMIT') { + const resetAt = err.resetAt ?? Date.now() + 60_000 + const waitMs = Math.max(1_000, resetAt - Date.now()) + parkedUntil.set(token, resetAt) + log.warn( + { tokenIdx, parkedUntil: new Date(resetAt).toISOString() }, + `token#${tokenIdx} rate limited — parking for ${Math.round(waitMs / 1000)}s`, + ) + await new Promise((r) => setTimeout(r, waitMs)) + failures.push({ url, reason: 'rate-limit' }) + failed++ + } else { + log.error({ url, err }, 'Unexpected error') + failures.push({ url, reason: (err as Error).message }) + failed++ + } + } + } + }), + ) + + if (!opts.dryRun && buffer.length > 0) { + await upsertLightRepos(qx, buffer) + flushed += buffer.length + } + + if (failures.length > 0) { + log.warn({ failures }, `${failures.length} repo(s) failed this page`) + } + + return { fetched: validUrls.length - failed, failed, flushed } +} + +async function main() { + const opts = parseArgs() + + const tokens = (process.env.GITHUB_TOKENS ?? '') + .split(',') + .map((t) => t.trim()) + .filter(Boolean) + + if (tokens.length === 0) { + log.error('GITHUB_TOKENS is required (comma-separated PATs)') + process.exit(1) + } + + // TODO: when connecting the real DB, replace with a connection pool and add keepalive / + // reconnect-on-error handling. 
A single long-lived connection will be dropped by the server + // during multi-hour runs (TCP timeout, idle reaper), crashing the script. Completed work + // is safe via last_synced_at, but the run stops and must be manually resumed. + const dbConnection = await getDbConnection(WRITE_DB_CONFIG()) + const qx = pgpQx(dbConnection) + + log.info('='.repeat(60)) + log.info('sync-light-repos') + log.info(`tokens=${tokens.length} page-size=${opts.pageSize} batch-size=${opts.batchSize}`) + log.info(`max-retries=${opts.maxRetries} dry-run=${opts.dryRun} limit=${opts.limit ?? 'none'}`) + log.info(`start-after=${opts.startAfter ?? '(beginning)'}`) + log.info('='.repeat(60)) + + const parkedUntil = new Map() + let cursor = opts.startAfter + let pageNum = 0 + let totalProcessed = 0 + let totalFailed = 0 + let totalFlushed = 0 + + while (true) { + pageNum++ + + const remaining = opts.limit !== null ? opts.limit - totalProcessed : opts.pageSize + if (remaining <= 0) break + + const { urls, nextCursor } = await fetchPage(qx, cursor, Math.min(opts.pageSize, remaining)) + + if (urls.length === 0) { + log.info('No more repos to process') + break + } + + const { fetched, failed, flushed } = await processPage(urls, tokens, parkedUntil, opts, qx) + + totalProcessed += urls.length + totalFailed += failed + totalFlushed += flushed + + log.info( + `Page ${pageNum}: read=${urls.length} fetched=${fetched} failed=${failed}${opts.dryRun ? ' [dry-run]' : ` flushed=${flushed}`}`, + ) + + if (nextCursor) { + log.info(`Resume with: --start-after ${nextCursor}`) + cursor = nextCursor + } + + if (urls.length < Math.min(opts.pageSize, remaining)) break + } + + log.info('='.repeat(60)) + log.info(`Summary: pages=${pageNum} processed=${totalProcessed} failed=${totalFailed} flushed=${totalFlushed}`) + log.info('='.repeat(60)) + + process.exit(totalFailed > 0 ? 
1 : 0) +} + +main().catch((err) => { + log.error({ err }, 'Unexpected error') + process.exit(1) +}) diff --git a/services/apps/script_executor_worker/src/bin/sync-light-repos/types.ts b/services/apps/script_executor_worker/src/bin/sync-light-repos/types.ts new file mode 100644 index 0000000000..f9b5d0fc5b --- /dev/null +++ b/services/apps/script_executor_worker/src/bin/sync-light-repos/types.ts @@ -0,0 +1,46 @@ +export interface LightRepoResult { + url: string + host: 'github' + owner: string + name: string + description: string | null + primaryLanguage: string | null + topics: string[] + stars: number + forks: number + watchers: number + openIssues: number + lastCommitAt: string | null + archived: boolean + disabled: boolean + isFork: boolean + createdAt: string | null +} + +export interface ParsedRepoUrl { + owner: string + name: string +} + +export interface Options { + pageSize: number + batchSize: number + maxRetries: number + startAfter: string | null + limit: number | null + dryRun: boolean + source: string +} + +export type FetchErrorKind = 'RATE_LIMIT' | 'TRANSIENT' | 'NOT_FOUND' | 'AUTH' | 'MALFORMED' + +export class FetchError extends Error { + constructor( + public readonly kind: FetchErrorKind, + message: string, + public readonly resetAt?: number, // epoch ms; only for RATE_LIMIT + ) { + super(message) + this.name = 'FetchError' + } +} diff --git a/services/apps/script_executor_worker/src/bin/sync-light-repos/upsertLightRepos.ts b/services/apps/script_executor_worker/src/bin/sync-light-repos/upsertLightRepos.ts new file mode 100644 index 0000000000..f13af677fe --- /dev/null +++ b/services/apps/script_executor_worker/src/bin/sync-light-repos/upsertLightRepos.ts @@ -0,0 +1,49 @@ +import { getServiceChildLogger } from '@crowd/logging' + +// import { formatQuery, QueryExecutor } from '@crowd/data-access-layer/src/queryExecutor' +import { QueryExecutor } from '@crowd/data-access-layer/src/queryExecutor' + +import { LightRepoResult } from './types' + +const 
log = getServiceChildLogger('sync-light-repos:upsert') + +export async function upsertLightRepos(_qx: QueryExecutor, rows: LightRepoResult[]): Promise { + if (rows.length === 0) return + + log.info({ count: rows.length, rows: JSON.stringify(rows, null, 2) }, 'upsert results') + + // const values = rows + // .map((r) => + // formatQuery( + // `($(url), $(host), $(owner), $(name), $(description), $(primaryLanguage), $(topics)::text[], + // $(stars), $(forks), $(watchers), $(openIssues), $(lastCommitAt)::timestamptz, + // $(archived), $(disabled), $(isFork), $(createdAt)::timestamptz)`, + // r, + // ), + // ) + // .join(',\n') + + // await _qx.result(` + // INSERT INTO repos ( + // url, host, owner, name, description, primary_language, topics, + // stars, forks, watchers, open_issues, last_commit_at, + // archived, disabled, is_fork, created_at, last_synced_at + // ) VALUES ${values} + // ON CONFLICT (url) DO UPDATE SET + // host = EXCLUDED.host, + // owner = EXCLUDED.owner, + // name = EXCLUDED.name, + // description = EXCLUDED.description, + // primary_language = EXCLUDED.primary_language, + // topics = EXCLUDED.topics, + // stars = EXCLUDED.stars, + // forks = EXCLUDED.forks, + // watchers = EXCLUDED.watchers, + // open_issues = EXCLUDED.open_issues, + // last_commit_at = EXCLUDED.last_commit_at, + // archived = EXCLUDED.archived, + // disabled = EXCLUDED.disabled, + // is_fork = EXCLUDED.is_fork, + // last_synced_at = NOW() + // `) +} From 912183f1ce13fba5e9735c44ad59b19f09e03dce Mon Sep 17 00:00:00 2001 From: Mouad BANI Date: Tue, 26 May 2026 11:15:22 +0100 Subject: [PATCH 02/22] chore: remove sync-light-repos script from script_executor_worker (moved to packages_worker) Signed-off-by: Mouad BANI --- .../apps/script_executor_worker/package.json | 1 - .../bin/sync-light-repos/fetchLightRepo.ts | 96 ------ .../src/bin/sync-light-repos/index.ts | 276 ------------------ .../src/bin/sync-light-repos/types.ts | 46 --- .../bin/sync-light-repos/upsertLightRepos.ts | 49 
---- 5 files changed, 468 deletions(-) delete mode 100644 services/apps/script_executor_worker/src/bin/sync-light-repos/fetchLightRepo.ts delete mode 100644 services/apps/script_executor_worker/src/bin/sync-light-repos/index.ts delete mode 100644 services/apps/script_executor_worker/src/bin/sync-light-repos/types.ts delete mode 100644 services/apps/script_executor_worker/src/bin/sync-light-repos/upsertLightRepos.ts diff --git a/services/apps/script_executor_worker/package.json b/services/apps/script_executor_worker/package.json index 482a390495..c94236ec00 100644 --- a/services/apps/script_executor_worker/package.json +++ b/services/apps/script_executor_worker/package.json @@ -12,7 +12,6 @@ "recalculate-enrichment-affiliations": "npx tsx src/bin/recalculate-enrichment-affiliations.ts", "recalculate-all-affiliations": "npx tsx src/bin/recalculate-all-affiliations.ts", "add-lf-projects-to-collection": "npx tsx src/bin/add-lf-projects-to-collection.ts", - "sync-light-repos": "npx tsx src/bin/sync-light-repos/index.ts", "lint": "npx eslint --ext .ts src --max-warnings=0", "format": "npx prettier --write \"src/**/*.ts\"", "format-check": "npx prettier --check .", diff --git a/services/apps/script_executor_worker/src/bin/sync-light-repos/fetchLightRepo.ts b/services/apps/script_executor_worker/src/bin/sync-light-repos/fetchLightRepo.ts deleted file mode 100644 index ebd1dd87f4..0000000000 --- a/services/apps/script_executor_worker/src/bin/sync-light-repos/fetchLightRepo.ts +++ /dev/null @@ -1,96 +0,0 @@ -import { FetchError, LightRepoResult } from './types' - -const GRAPHQL_URL = 'https://api.github.com/graphql' - -const REPO_QUERY = ` - query($owner: String!, $name: String!) 
{ - repository(owner: $owner, name: $name) { - description - primaryLanguage { name } - repositoryTopics(first: 25) { nodes { topic { name } } } - stargazerCount - forkCount - watchers { totalCount } - issues(states: OPEN) { totalCount } - pushedAt - isArchived - isDisabled - isFork - createdAt - } - } -` - -export function parseGithubUrl(url: string): { owner: string; name: string } { - const match = url.match(/https?:\/\/github\.com\/([^/]+)\/([^/]+?)(?:\.git)?\/?$/) - if (!match) throw new FetchError('MALFORMED', `Cannot parse GitHub URL: ${url}`) - return { owner: match[1], name: match[2] } -} - -export async function fetchLightRepo(url: string, token: string): Promise { - const { owner, name } = parseGithubUrl(url) - - let response: Response - try { - response = await fetch(GRAPHQL_URL, { - method: 'POST', - headers: { - Authorization: `bearer ${token}`, - 'Content-Type': 'application/json', - }, - body: JSON.stringify({ query: REPO_QUERY, variables: { owner, name } }), - }) - } catch (err) { - throw new FetchError('TRANSIENT', `Network error for ${url}: ${(err as Error).message}`) - } - - const resetSec = parseInt(response.headers.get('x-ratelimit-reset') ?? '0', 10) - const resetMs = resetSec ? 
resetSec * 1000 + 5_000 : Date.now() + 65_000 - - if (response.status === 401) { - throw new FetchError('AUTH', `401 Unauthorized for ${url}`) - } - - if (response.status === 403) { - const body = await response.text() - if (body.toLowerCase().includes('rate limit')) { - throw new FetchError('RATE_LIMIT', `Rate limited on ${url}`, resetMs) - } - throw new FetchError('AUTH', `403 Forbidden for ${url}`) - } - - if (response.status === 404) throw new FetchError('NOT_FOUND', `404 for ${url}`) - if (response.status >= 500) throw new FetchError('TRANSIENT', `${response.status} for ${url}`) - - // eslint-disable-next-line @typescript-eslint/no-explicit-any - const json = (await response.json()) as any - - if (json.errors?.length) { - const err = json.errors[0] - if (err.type === 'RATE_LIMITED') throw new FetchError('RATE_LIMIT', `RATE_LIMITED for ${url}`, resetMs) - if (err.type === 'NOT_FOUND') throw new FetchError('NOT_FOUND', `NOT_FOUND for ${url}`) - throw new FetchError('TRANSIENT', `GraphQL error for ${url}: ${err.message ?? err.type}`) - } - - const repo = json.data?.repository - if (!repo) throw new FetchError('NOT_FOUND', `No repository data for ${url}`) - - return { - url, - host: 'github', - owner, - name, - description: repo.description ?? null, - primaryLanguage: repo.primaryLanguage?.name ?? null, - topics: (repo.repositoryTopics?.nodes ?? []).map((n: { topic: { name: string } }) => n.topic.name), - stars: repo.stargazerCount ?? 0, - forks: repo.forkCount ?? 0, - watchers: repo.watchers?.totalCount ?? 0, - openIssues: repo.issues?.totalCount ?? 0, - lastCommitAt: repo.pushedAt ?? null, - archived: repo.isArchived ?? false, - disabled: repo.isDisabled ?? false, - isFork: repo.isFork ?? false, - createdAt: repo.createdAt ?? 
null, - } -} diff --git a/services/apps/script_executor_worker/src/bin/sync-light-repos/index.ts b/services/apps/script_executor_worker/src/bin/sync-light-repos/index.ts deleted file mode 100644 index cb0f2198b3..0000000000 --- a/services/apps/script_executor_worker/src/bin/sync-light-repos/index.ts +++ /dev/null @@ -1,276 +0,0 @@ -/** - * sync-light-repos - * - * Fetches GitHub repo metadata via GraphQL and upserts into the `repos` table. - * Runs one async worker per token — each worker claims URLs by index so no two - * requests ever share a token concurrently. - * - * Success tracking: a successful fetch updates repos.last_synced_at to NOW(). - * Failed repos keep a stale/null last_synced_at and are picked up on the next run. - * TODO: fetchPage will later filter by last_synced_at < NOW() - update_interval - * so this script becomes a continuous sync with no extra failure tracking needed. - * - * Usage: - * pnpm run sync-light-repos -- [options] - * - * Options: - * --page-size Repos fetched from source per cursor page (default: 200) - * --batch-size Upsert batch size (default: 50) - * --max-retries Per-repo transient retry cap (default: 3) - * --start-after Resume from cursor id (printed after each page) - * --limit Stop after N repos total (for testing) - * --dry-run Fetch but skip DB writes - * - * Environment: - * GITHUB_TOKENS Comma-separated GitHub PATs (required) - * CROWD_DB_WRITE_HOST/PORT/USERNAME/PASSWORD/DATABASE - * SERVICE - */ - -import { WRITE_DB_CONFIG, getDbConnection } from '@crowd/data-access-layer/src/database' -import { pgpQx } from '@crowd/data-access-layer/src/queryExecutor' -import { getServiceChildLogger } from '@crowd/logging' - -import { fetchLightRepo, parseGithubUrl } from './fetchLightRepo' -import { FetchError, LightRepoResult } from './types' -import { upsertLightRepos } from './upsertLightRepos' - -const log = getServiceChildLogger('sync-light-repos') - -function parseArgs() { - const args = process.argv.slice(2) - const getArg 
= (flag: string) => { - const idx = args.indexOf(flag) - return idx !== -1 && idx + 1 < args.length ? args[idx + 1] : undefined - } - - const pageSize = parseInt(getArg('--page-size') ?? '200', 10) - const batchSize = parseInt(getArg('--batch-size') ?? '50', 10) - const maxRetries = parseInt(getArg('--max-retries') ?? '3', 10) - const startAfter = getArg('--start-after') ?? null - const limitRaw = getArg('--limit') - const limit = limitRaw !== undefined ? parseInt(limitRaw, 10) : null - const dryRun = args.includes('--dry-run') - - if (isNaN(pageSize) || pageSize <= 0) { log.error('--page-size must be a positive integer'); process.exit(1) } - if (isNaN(batchSize) || batchSize <= 0) { log.error('--batch-size must be a positive integer'); process.exit(1) } - if (isNaN(maxRetries) || maxRetries < 0) { log.error('--max-retries must be a non-negative integer'); process.exit(1) } - if (limit !== null && (isNaN(limit) || limit <= 0)) { log.error('--limit must be a positive integer'); process.exit(1) } - - return { pageSize, batchSize, maxRetries, startAfter, limit, dryRun } -} - -// TODO: add LEFT JOIN repos r ON r.url = pr.url and filter -// WHERE (r.last_synced_at IS NULL OR r.last_synced_at < NOW() - INTERVAL '$(updateIntervalHours) hours') -// once the update interval logic is scoped in. -async function fetchPage( - qx: ReturnType, - cursor: string | null, - pageSize: number, -): Promise<{ urls: string[]; nextCursor: string | null }> { - const rows = await qx.select( - ` - SELECT id, url - FROM public.repositories - WHERE url LIKE 'https://github.com/%' - AND "deletedAt" IS NULL - ${cursor ? 'AND id > $(cursor)' : ''} - ORDER BY id - LIMIT $(pageSize) - `, - { cursor, pageSize }, - ) - return { - urls: rows.map((r: { url: string }) => r.url), - nextCursor: rows.length > 0 ? 
(rows[rows.length - 1] as { id: string }).id : null, - } -} - -async function fetchWithRetries( - url: string, - token: string, - maxRetries: number, -): Promise { - for (let attempt = 0; attempt <= maxRetries; attempt++) { - try { - return await fetchLightRepo(url, token) - } catch (err) { - if (!(err instanceof FetchError)) throw err - - if (['NOT_FOUND', 'AUTH', 'MALFORMED'].includes(err.kind)) { - log.warn({ url, kind: err.kind }, err.message) - return null - } - - if (err.kind === 'RATE_LIMIT') throw err - - if (attempt < maxRetries) { - const backoffMs = 1000 * 2 ** attempt - log.warn({ url, attempt, backoffMs }, `Transient error, retrying: ${err.message}`) - await new Promise((r) => setTimeout(r, backoffMs)) - } else { - log.error({ url }, `Gave up after ${maxRetries} retries: ${err.message}`) - return null - } - } - } - return null -} - -async function processPage( - urls: string[], - tokens: string[], - parkedUntil: Map, - opts: ReturnType, - qx: ReturnType, -): Promise<{ fetched: number; failed: number; flushed: number }> { - const validUrls: string[] = [] - let skipped = 0 - for (const url of urls) { - try { parseGithubUrl(url); validUrls.push(url) } catch { skipped++ } - } - if (skipped > 0) log.warn(`Skipped ${skipped} non-GitHub URLs`) - - const buffer: LightRepoResult[] = [] - const failures: Array<{ url: string; reason: string }> = [] - let failed = 0 - let flushed = 0 - let nextIdx = 0 - - await Promise.all( - tokens.map(async (token, tokenIdx) => { - // Respect any park set during a previous page of this run - const initialPark = (parkedUntil.get(token) ?? 
0) - Date.now() - if (initialPark > 0) { - log.warn(`token#${tokenIdx} still parked, waiting ${Math.round(initialPark / 1000)}s`) - await new Promise((r) => setTimeout(r, initialPark)) - } - - while (true) { - const idx = nextIdx++ - if (idx >= validUrls.length) break - const url = validUrls[idx] - - try { - const result = await fetchWithRetries(url, token, opts.maxRetries) - if (result) { - buffer.push(result) - if (!opts.dryRun && buffer.length >= opts.batchSize) { - const batch = buffer.splice(0) - await upsertLightRepos(qx, batch) - flushed += batch.length - } - } else { - failures.push({ url, reason: 'see warn log above' }) - failed++ - } - } catch (err) { - if (err instanceof FetchError && err.kind === 'RATE_LIMIT') { - const resetAt = err.resetAt ?? Date.now() + 60_000 - const waitMs = Math.max(1_000, resetAt - Date.now()) - parkedUntil.set(token, resetAt) - log.warn( - { tokenIdx, parkedUntil: new Date(resetAt).toISOString() }, - `token#${tokenIdx} rate limited — parking for ${Math.round(waitMs / 1000)}s`, - ) - await new Promise((r) => setTimeout(r, waitMs)) - failures.push({ url, reason: 'rate-limit' }) - failed++ - } else { - log.error({ url, err }, 'Unexpected error') - failures.push({ url, reason: (err as Error).message }) - failed++ - } - } - } - }), - ) - - if (!opts.dryRun && buffer.length > 0) { - await upsertLightRepos(qx, buffer) - flushed += buffer.length - } - - if (failures.length > 0) { - log.warn({ failures }, `${failures.length} repo(s) failed this page`) - } - - return { fetched: validUrls.length - failed, failed, flushed } -} - -async function main() { - const opts = parseArgs() - - const tokens = (process.env.GITHUB_TOKENS ?? '') - .split(',') - .map((t) => t.trim()) - .filter(Boolean) - - if (tokens.length === 0) { - log.error('GITHUB_TOKENS is required (comma-separated PATs)') - process.exit(1) - } - - // TODO: when connecting the real DB, replace with a connection pool and add keepalive / - // reconnect-on-error handling. 
A single long-lived connection will be dropped by the server - // during multi-hour runs (TCP timeout, idle reaper), crashing the script. Completed work - // is safe via last_synced_at, but the run stops and must be manually resumed. - const dbConnection = await getDbConnection(WRITE_DB_CONFIG()) - const qx = pgpQx(dbConnection) - - log.info('='.repeat(60)) - log.info('sync-light-repos') - log.info(`tokens=${tokens.length} page-size=${opts.pageSize} batch-size=${opts.batchSize}`) - log.info(`max-retries=${opts.maxRetries} dry-run=${opts.dryRun} limit=${opts.limit ?? 'none'}`) - log.info(`start-after=${opts.startAfter ?? '(beginning)'}`) - log.info('='.repeat(60)) - - const parkedUntil = new Map() - let cursor = opts.startAfter - let pageNum = 0 - let totalProcessed = 0 - let totalFailed = 0 - let totalFlushed = 0 - - while (true) { - pageNum++ - - const remaining = opts.limit !== null ? opts.limit - totalProcessed : opts.pageSize - if (remaining <= 0) break - - const { urls, nextCursor } = await fetchPage(qx, cursor, Math.min(opts.pageSize, remaining)) - - if (urls.length === 0) { - log.info('No more repos to process') - break - } - - const { fetched, failed, flushed } = await processPage(urls, tokens, parkedUntil, opts, qx) - - totalProcessed += urls.length - totalFailed += failed - totalFlushed += flushed - - log.info( - `Page ${pageNum}: read=${urls.length} fetched=${fetched} failed=${failed}${opts.dryRun ? ' [dry-run]' : ` flushed=${flushed}`}`, - ) - - if (nextCursor) { - log.info(`Resume with: --start-after ${nextCursor}`) - cursor = nextCursor - } - - if (urls.length < Math.min(opts.pageSize, remaining)) break - } - - log.info('='.repeat(60)) - log.info(`Summary: pages=${pageNum} processed=${totalProcessed} failed=${totalFailed} flushed=${totalFlushed}`) - log.info('='.repeat(60)) - - process.exit(totalFailed > 0 ? 
1 : 0) -} - -main().catch((err) => { - log.error({ err }, 'Unexpected error') - process.exit(1) -}) diff --git a/services/apps/script_executor_worker/src/bin/sync-light-repos/types.ts b/services/apps/script_executor_worker/src/bin/sync-light-repos/types.ts deleted file mode 100644 index f9b5d0fc5b..0000000000 --- a/services/apps/script_executor_worker/src/bin/sync-light-repos/types.ts +++ /dev/null @@ -1,46 +0,0 @@ -export interface LightRepoResult { - url: string - host: 'github' - owner: string - name: string - description: string | null - primaryLanguage: string | null - topics: string[] - stars: number - forks: number - watchers: number - openIssues: number - lastCommitAt: string | null - archived: boolean - disabled: boolean - isFork: boolean - createdAt: string | null -} - -export interface ParsedRepoUrl { - owner: string - name: string -} - -export interface Options { - pageSize: number - batchSize: number - maxRetries: number - startAfter: string | null - limit: number | null - dryRun: boolean - source: string -} - -export type FetchErrorKind = 'RATE_LIMIT' | 'TRANSIENT' | 'NOT_FOUND' | 'AUTH' | 'MALFORMED' - -export class FetchError extends Error { - constructor( - public readonly kind: FetchErrorKind, - message: string, - public readonly resetAt?: number, // epoch ms; only for RATE_LIMIT - ) { - super(message) - this.name = 'FetchError' - } -} diff --git a/services/apps/script_executor_worker/src/bin/sync-light-repos/upsertLightRepos.ts b/services/apps/script_executor_worker/src/bin/sync-light-repos/upsertLightRepos.ts deleted file mode 100644 index f13af677fe..0000000000 --- a/services/apps/script_executor_worker/src/bin/sync-light-repos/upsertLightRepos.ts +++ /dev/null @@ -1,49 +0,0 @@ -import { getServiceChildLogger } from '@crowd/logging' - -// import { formatQuery, QueryExecutor } from '@crowd/data-access-layer/src/queryExecutor' -import { QueryExecutor } from '@crowd/data-access-layer/src/queryExecutor' - -import { LightRepoResult } from './types' 
- -const log = getServiceChildLogger('sync-light-repos:upsert') - -export async function upsertLightRepos(_qx: QueryExecutor, rows: LightRepoResult[]): Promise { - if (rows.length === 0) return - - log.info({ count: rows.length, rows: JSON.stringify(rows, null, 2) }, 'upsert results') - - // const values = rows - // .map((r) => - // formatQuery( - // `($(url), $(host), $(owner), $(name), $(description), $(primaryLanguage), $(topics)::text[], - // $(stars), $(forks), $(watchers), $(openIssues), $(lastCommitAt)::timestamptz, - // $(archived), $(disabled), $(isFork), $(createdAt)::timestamptz)`, - // r, - // ), - // ) - // .join(',\n') - - // await _qx.result(` - // INSERT INTO repos ( - // url, host, owner, name, description, primary_language, topics, - // stars, forks, watchers, open_issues, last_commit_at, - // archived, disabled, is_fork, created_at, last_synced_at - // ) VALUES ${values} - // ON CONFLICT (url) DO UPDATE SET - // host = EXCLUDED.host, - // owner = EXCLUDED.owner, - // name = EXCLUDED.name, - // description = EXCLUDED.description, - // primary_language = EXCLUDED.primary_language, - // topics = EXCLUDED.topics, - // stars = EXCLUDED.stars, - // forks = EXCLUDED.forks, - // watchers = EXCLUDED.watchers, - // open_issues = EXCLUDED.open_issues, - // last_commit_at = EXCLUDED.last_commit_at, - // archived = EXCLUDED.archived, - // disabled = EXCLUDED.disabled, - // is_fork = EXCLUDED.is_fork, - // last_synced_at = NOW() - // `) -} From 779423deac869a2aafbd681df1b10b296ce46585 Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Tue, 26 May 2026 17:59:07 +0200 Subject: [PATCH 03/22] feat: pom extractor Signed-off-by: Umberto Sgueglia --- pnpm-lock.yaml | 72 ++++- scripts/services/pom-fetcher.yaml | 69 ++++ .../packages_worker/src/bin/pom-fetcher.ts | 42 +++ services/apps/packages_worker/src/config.ts | 9 + .../src/pom-fetcher/extract.ts | 295 ++++++++++++++++++ .../src/pom-fetcher/metadata.ts | 50 +++ .../src/pom-fetcher/runPomEnrichmentLoop.ts | 221 
+++++++++++++ .../data-access-layer/src/osspckgs/index.ts | 3 + .../src/osspckgs/maintainers.ts | 55 ++++ .../src/osspckgs/packages.ts | 92 ++++++ .../data-access-layer/src/osspckgs/types.ts | 44 +++ 11 files changed, 946 insertions(+), 6 deletions(-) create mode 100644 scripts/services/pom-fetcher.yaml create mode 100644 services/apps/packages_worker/src/bin/pom-fetcher.ts create mode 100644 services/apps/packages_worker/src/pom-fetcher/extract.ts create mode 100644 services/apps/packages_worker/src/pom-fetcher/metadata.ts create mode 100644 services/apps/packages_worker/src/pom-fetcher/runPomEnrichmentLoop.ts create mode 100644 services/libs/data-access-layer/src/osspckgs/index.ts create mode 100644 services/libs/data-access-layer/src/osspckgs/maintainers.ts create mode 100644 services/libs/data-access-layer/src/osspckgs/types.ts diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 78422e559e..b9e5074f5a 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -1411,6 +1411,58 @@ importers: specifier: ^3.0.1 version: 3.1.0 + services/apps/pom_fetcher_worker: + dependencies: + '@crowd/archetype-standard': + specifier: workspace:* + version: link:../../archetypes/standard + '@crowd/archetype-worker': + specifier: workspace:* + version: link:../../archetypes/worker + '@crowd/common': + specifier: workspace:* + version: link:../../libs/common + '@crowd/data-access-layer': + specifier: workspace:* + version: link:../../libs/data-access-layer + '@crowd/database': + specifier: workspace:* + version: link:../../libs/database + '@crowd/logging': + specifier: workspace:* + version: link:../../libs/logging + '@crowd/temporal': + specifier: workspace:* + version: link:../../libs/temporal + '@temporalio/activity': + specifier: ~1.11.8 + version: 1.11.8 + '@temporalio/client': + specifier: ~1.11.8 + version: 1.11.8 + '@temporalio/workflow': + specifier: ~1.11.8 + version: 1.11.8 + axios: + specifier: ^1.6.7 + version: 1.13.5 + fast-xml-parser: + specifier: ^4.4.0 + version: 4.5.6 + 
tsx: + specifier: ^4.7.1 + version: 4.7.3 + typescript: + specifier: ^5.6.3 + version: 5.6.3 + devDependencies: + '@types/node': + specifier: ^20.8.2 + version: 20.12.7 + nodemon: + specifier: ^3.0.1 + version: 3.1.0 + services/apps/profiles_worker: dependencies: '@crowd/archetype-standard': @@ -6650,6 +6702,10 @@ packages: resolution: {integrity: sha512-B9/wizE4WngqQftFPmdaMYlXoJlJOYxGQOanC77fq9k8+Z0v5dDSVh+3glErdIROP//s/jgb7ZuxKfB8nVyo0g==} hasBin: true + fast-xml-parser@4.5.6: + resolution: {integrity: sha512-Yd4vkROfJf8AuJrDIVMVmYfULKmIJszVsMv7Vo71aocsKgFxpdlpSHXSaInvyYfgw2PRuObQSW2GFpVMUjxu9A==} + hasBin: true + fast-xml-parser@5.3.4: resolution: {integrity: sha512-EFd6afGmXlCx8H8WTZHhAoDaWaGyuIBoZJ2mknrNxug+aZKjkp0a0dlars9Izl+jF+7Gu1/5f/2h68cQpe0IiA==} hasBin: true @@ -12927,7 +12983,7 @@ snapshots: '@sendgrid/client@8.1.3': dependencies: '@sendgrid/helpers': 8.0.0 - axios: 1.13.1 + axios: 1.13.5 transitivePeerDependencies: - debug @@ -12958,7 +13014,7 @@ snapshots: '@slack/types': 2.11.0 '@types/is-stream': 1.1.0 '@types/node': 20.12.7 - axios: 1.11.0 + axios: 1.13.5 eventemitter3: 3.1.2 form-data: 2.5.1 is-electron: 2.2.2 @@ -14550,7 +14606,7 @@ snapshots: axios@0.21.4: dependencies: - follow-redirects: 1.15.6 + follow-redirects: 1.15.11 transitivePeerDependencies: - debug @@ -14571,8 +14627,8 @@ snapshots: axios@1.12.0: dependencies: - follow-redirects: 1.15.6 - form-data: 4.0.4 + follow-redirects: 1.15.11 + form-data: 4.0.5 proxy-from-env: 1.1.0 transitivePeerDependencies: - debug @@ -16151,6 +16207,10 @@ snapshots: dependencies: strnum: 1.0.5 + fast-xml-parser@4.5.6: + dependencies: + strnum: 1.0.5 + fast-xml-parser@5.3.4: dependencies: strnum: 2.1.2 @@ -18209,7 +18269,7 @@ snapshots: peopledatalabs@6.1.5: dependencies: - axios: 1.11.0 + axios: 1.13.5 copy-anything: 3.0.5 transitivePeerDependencies: - debug diff --git a/scripts/services/pom-fetcher.yaml b/scripts/services/pom-fetcher.yaml new file mode 100644 index 0000000000..a7aa8a9c37 --- /dev/null 
+++ b/scripts/services/pom-fetcher.yaml @@ -0,0 +1,69 @@ +version: '3.1' + +x-env-args: &env-args + DOCKER_BUILDKIT: 1 + NODE_ENV: docker + SERVICE: pom-fetcher + SHELL: /bin/sh + SUPPRESS_NO_CONFIG_WARNING: 'true' + POM_FETCHER_BATCH_SIZE: '200' + POM_FETCHER_CONCURRENCY: '10' + POM_FETCHER_STALE_DAYS: '7' + POM_FETCHER_IDLE_SLEEP_SEC: '3600' + +services: + pom-fetcher: + build: + context: ../../ + dockerfile: ./scripts/services/docker/Dockerfile.packages-worker + command: 'pnpm run start:pom-fetcher' + working_dir: /usr/crowd/app/services/apps/packages_worker + env_file: + - ../../backend/.env.dist.local + - ../../backend/.env.dist.composed + - ../../backend/.env.override.local + - ../../backend/.env.override.composed + environment: + <<: *env-args + restart: always + networks: + - crowd-bridge + + pom-fetcher-dev: + build: + context: ../../ + dockerfile: ./scripts/services/docker/Dockerfile.packages-worker + command: 'pnpm run dev:pom-fetcher' + working_dir: /usr/crowd/app/services/apps/packages_worker + # user: '${USER_ID}:${GROUP_ID}' + env_file: + - ../../backend/.env.dist.local + - ../../backend/.env.dist.composed + - ../../backend/.env.override.local + - ../../backend/.env.override.composed + environment: + <<: *env-args + hostname: pom-fetcher + networks: + - crowd-bridge + volumes: + - ../../services/libs/audit-logs/src:/usr/crowd/app/services/libs/audit-logs/src + - ../../services/libs/common/src:/usr/crowd/app/services/libs/common/src + - ../../services/libs/common_services/src:/usr/crowd/app/services/libs/common_services/src + - ../../services/libs/data-access-layer/src:/usr/crowd/app/services/libs/data-access-layer/src + - ../../services/libs/database/src:/usr/crowd/app/services/libs/database/src + - ../../services/libs/integrations/src:/usr/crowd/app/services/libs/integrations/src + - ../../services/libs/logging/src:/usr/crowd/app/services/libs/logging/src + - ../../services/libs/nango/src:/usr/crowd/app/services/libs/nango/src + - 
../../services/libs/opensearch/src:/usr/crowd/app/services/libs/opensearch/src + - ../../services/libs/queue/src:/usr/crowd/app/services/libs/queue/src + - ../../services/libs/redis/src:/usr/crowd/app/services/libs/redis/src + - ../../services/libs/snowflake/src:/usr/crowd/app/services/libs/snowflake/src + - ../../services/libs/telemetry/src:/usr/crowd/app/services/libs/telemetry/src + - ../../services/libs/temporal/src:/usr/crowd/app/services/libs/temporal/src + - ../../services/libs/types/src:/usr/crowd/app/services/libs/types/src + - ../../services/apps/packages_worker/src:/usr/crowd/app/services/apps/packages_worker/src + +networks: + crowd-bridge: + external: true diff --git a/services/apps/packages_worker/src/bin/pom-fetcher.ts b/services/apps/packages_worker/src/bin/pom-fetcher.ts new file mode 100644 index 0000000000..d8130bb327 --- /dev/null +++ b/services/apps/packages_worker/src/bin/pom-fetcher.ts @@ -0,0 +1,42 @@ +import { getServiceLogger } from '@crowd/logging' + +import { getPomFetcherConfig } from '../config' +import { getPackagesDb } from '../db' +import { runPomEnrichmentLoop } from '../pom-fetcher/runPomEnrichmentLoop' + +const log = getServiceLogger() + +let shuttingDown = false + +const shutdown = async () => { + if (shuttingDown) return + shuttingDown = true + log.info('Shutting down pom-fetcher...') +} + +process.on('SIGINT', shutdown) +process.on('SIGTERM', shutdown) + +const main = async () => { + log.info('pom-fetcher starting...') + + const config = getPomFetcherConfig() + log.info( + { batchSize: config.batchSize, concurrency: config.concurrency, staleDays: config.staleDays }, + 'Config loaded', + ) + + const qx = await getPackagesDb() + await qx.selectOne('SELECT 1') + log.info('Connected to packages-db.') + + await runPomEnrichmentLoop(qx, config, () => shuttingDown) + + log.info('pom-fetcher stopped.') + process.exit(0) +} + +main().catch((err) => { + log.error({ err }, 'pom-fetcher fatal error') + process.exit(1) +}) diff --git 
a/services/apps/packages_worker/src/config.ts b/services/apps/packages_worker/src/config.ts index 9c7cc7829b..afd0d9ab32 100644 --- a/services/apps/packages_worker/src/config.ts +++ b/services/apps/packages_worker/src/config.ts @@ -45,3 +45,12 @@ export function getEnricherConfig() { fetchTimeoutMs: parseInt(process.env.ENRICHER_FETCH_TIMEOUT_MS ?? '10000', 10), } } + +export function getPomFetcherConfig() { + return { + batchSize: parseInt(process.env.POM_FETCHER_BATCH_SIZE ?? '200', 10), + concurrency: parseInt(process.env.POM_FETCHER_CONCURRENCY ?? '10', 10), + staleDays: parseInt(process.env.POM_FETCHER_STALE_DAYS ?? '7', 10), + idleSleepSec: parseInt(process.env.POM_FETCHER_IDLE_SLEEP_SEC ?? '3600', 10), + } +} diff --git a/services/apps/packages_worker/src/pom-fetcher/extract.ts b/services/apps/packages_worker/src/pom-fetcher/extract.ts new file mode 100644 index 0000000000..ea79257726 --- /dev/null +++ b/services/apps/packages_worker/src/pom-fetcher/extract.ts @@ -0,0 +1,295 @@ +/** + * Core POM extraction logic — pure functions (no I/O side-effects, no DB calls). + * Callers are responsible for concurrency, retries, and persistence. 
+ */ + +import axios from 'axios' +import { XMLParser } from 'fast-xml-parser' + +// ─── Types ──────────────────────────────────────────────────────────────────── + +export interface PomMaintainer { + username: string | null + displayName: string | null + /** Raw email from POM — hash with SHA-256 before storing (GDPR) */ + email: string | null + url: string | null + role: 'author' | 'maintainer' +} + +export interface PomExtractionResult { + groupId: string + artifactId: string + version: string + purl: string + description: string | null + licenses: string[] + licensesRaw: string | null + scmUrl: string | null + homepageUrl: string | null + developers: PomMaintainer[] + contributors: PomMaintainer[] + parentHops: number + error: string | null +} + +// ─── Internal POM types ─────────────────────────────────────────────────────── + +interface PomData { + description?: unknown + url?: unknown + licenses?: { license?: unknown } + scm?: { url?: unknown; connection?: unknown } + developers?: { developer?: unknown } + contributors?: { contributor?: unknown } + parent?: { groupId?: unknown; artifactId?: unknown; version?: unknown } +} + +interface PomPerson { + id?: unknown + name?: unknown + email?: unknown + url?: unknown +} + +// ─── Config ─────────────────────────────────────────────────────────────────── + +const MAVEN_REPO = 'https://repo1.maven.org/maven2' +const MAX_PARENT_HOPS = 5 +const REQUEST_TIMEOUT_MS = 15_000 + +const parser = new XMLParser({ + ignoreAttributes: false, + attributeNamePrefix: '@_', + parseTagValue: false, // keep all values as strings — prevents version "65" becoming number + parseAttributeValue: false, +}) + +// ─── POM fetch ──────────────────────────────────────────────────────────────── + +export function buildPomUrl(groupId: string, artifactId: string, version: string): string { + const groupPath = groupId.replace(/\./g, '/') + return `${MAVEN_REPO}/${groupPath}/${artifactId}/${version}/${artifactId}-${version}.pom` +} + +export 
async function fetchPom( + groupId: string, + artifactId: string, + version: string, + log?: (msg: string) => void, +): Promise<PomData | null> { + const url = buildPomUrl(groupId, artifactId, version) + try { + const res = await axios.get(url, { responseType: 'text', timeout: REQUEST_TIMEOUT_MS }) + const parsed = parser.parse(res.data) + return (parsed?.project as PomData) ?? null + } catch (err) { + if (axios.isAxiosError(err)) { + const status = err.response?.status + if (status === 404) { + log?.(`POM not found (404): ${url}`) + return null + } + log?.(`HTTP ${status ?? 'unknown'} fetching POM: ${url}`) + return null + } + throw err + } +} + +// ─── Inheritance resolution ─────────────────────────────────────────────────── + +interface ResolvedFields { + description: string | null + licenses: string[] + licensesRaw: string | null + scmUrl: string | null + homepageUrl: string | null + developers: PomMaintainer[] + contributors: PomMaintainer[] + hops: number +} + +async function resolveWithInheritance( + groupId: string, + artifactId: string, + version: string, + log: (msg: string) => void, + depth = 0, +): Promise<ResolvedFields> { + if (depth > MAX_PARENT_HOPS) { + log(`Max parent hops (${MAX_PARENT_HOPS}) reached`) + return emptyFields(depth) + } + + const pom = await fetchPom(groupId, artifactId, version, log) + if (!pom) return emptyFields(depth) + + const licenses = extractLicenses(pom) + const scmUrl = extractStr(pom.scm?.url ?? 
pom.scm?.connection) + const developers = extractPersons(pom.developers?.developer, 'author') + const contributors = extractPersons(pom.contributors?.contributor, 'maintainer') + + const missingLicense = licenses.length === 0 + const missingScm = !scmUrl + const parent = extractParent(pom) + + if (parent && (missingLicense || missingScm)) { + log(`[hop ${depth + 1}] ${parent.groupId}:${parent.artifactId}:${parent.version}`) + const parentFields = await resolveWithInheritance( + parent.groupId, + parent.artifactId, + parent.version, + log, + depth + 1, + ) + return { + description: extractStr(pom.description) ?? parentFields.description, + licenses: licenses.length > 0 ? licenses : parentFields.licenses, + licensesRaw: licenses.length > 0 ? licenses.join(', ') : parentFields.licensesRaw, + scmUrl: scmUrl ?? parentFields.scmUrl, + homepageUrl: extractStr(pom.url) ?? parentFields.homepageUrl, + developers: developers.length > 0 ? developers : parentFields.developers, + contributors: contributors.length > 0 ? contributors : parentFields.contributors, + hops: parentFields.hops, + } + } + + return { + description: extractStr(pom.description), + licenses, + licensesRaw: licenses.length > 0 ? licenses.join(', ') : null, + scmUrl, + homepageUrl: extractStr(pom.url), + developers, + contributors, + hops: depth, + } +} + +// ─── Public entry point ─────────────────────────────────────────────────────── + +/** + * Fetches and resolves POM metadata for the given Maven artifact. + * Always returns a result object; errors are captured in `result.error`. 
+ */ +export async function extractArtifact( + groupId: string, + artifactId: string, + version: string, + log: (msg: string) => void = () => undefined, +): Promise<PomExtractionResult> { + const purl = `pkg:maven/${groupId}/${artifactId}@${version}` + + const rootPom = await fetchPom(groupId, artifactId, version, log) + if (!rootPom) { + const pomUrl = buildPomUrl(groupId, artifactId, version) + return { + groupId, + artifactId, + version, + purl, + description: null, + licenses: [], + licensesRaw: null, + scmUrl: null, + homepageUrl: null, + developers: [], + contributors: [], + parentHops: 0, + error: `POM not found: ${pomUrl}`, + } + } + + try { + const resolved = await resolveWithInheritance(groupId, artifactId, version, log) + return { + groupId, + artifactId, + version, + purl, + description: resolved.description, + licenses: resolved.licenses, + licensesRaw: resolved.licensesRaw, + scmUrl: resolved.scmUrl, + homepageUrl: resolved.homepageUrl, + developers: resolved.developers, + contributors: resolved.contributors, + parentHops: resolved.hops, + error: null, + } + } catch (err) { + const message = err instanceof Error ? err.message : String(err) + log(`Error resolving POM: ${message}`) + return { + groupId, + artifactId, + version, + purl, + description: null, + licenses: [], + licensesRaw: null, + scmUrl: null, + homepageUrl: null, + developers: [], + contributors: [], + parentHops: 0, + error: message, + } + } +} + +// ─── Private helpers ────────────────────────────────────────────────────────── + +function extractStr(value: unknown): string | null { + if (typeof value === 'string' && value.trim()) return value.trim() + return null +} + +function extractLicenses(pom: PomData): string[] { + const raw = pom.licenses?.license + if (!raw) return [] + const list = Array.isArray(raw) ? 
raw : [raw] + return (list as Array<{ name?: unknown }>) + .map((l) => extractStr(l?.name)) + .filter((n): n is string => n !== null) +} + +function extractPersons(raw: unknown, role: 'author' | 'maintainer'): PomMaintainer[] { + if (!raw) return [] + const list = Array.isArray(raw) ? raw : [raw] + return (list as PomPerson[]) + .filter((p) => p.id || p.name || p.email) + .map((p) => ({ + username: extractStr(p.id), + displayName: extractStr(p.name), + email: extractStr(p.email), + url: extractStr(p.url), + role, + })) +} + +function extractParent( + pom: PomData, +): { groupId: string; artifactId: string; version: string } | null { + const p = pom.parent + if (!p) return null + const groupId = extractStr(p.groupId) + const artifactId = extractStr(p.artifactId) + const version = extractStr(p.version) + if (!groupId || !artifactId || !version) return null + return { groupId, artifactId, version } +} + +function emptyFields(hops: number): ResolvedFields { + return { + description: null, + licenses: [], + licensesRaw: null, + scmUrl: null, + homepageUrl: null, + developers: [], + contributors: [], + hops, + } +} diff --git a/services/apps/packages_worker/src/pom-fetcher/metadata.ts b/services/apps/packages_worker/src/pom-fetcher/metadata.ts new file mode 100644 index 0000000000..9192bc05a1 --- /dev/null +++ b/services/apps/packages_worker/src/pom-fetcher/metadata.ts @@ -0,0 +1,50 @@ +/** + * Resolves the latest release version of a Maven artifact using the + * maven-metadata.xml endpoint on Maven Central. + * + * URL format: + * https://repo1.maven.org/maven2/{groupPath}/{artifactId}/maven-metadata.xml + * + * Returns null when the artifact is not found (404) or the metadata is + * malformed. 
+ */ + +import axios from 'axios' +import { XMLParser } from 'fast-xml-parser' + +const MAVEN_REPO = 'https://repo1.maven.org/maven2' +const REQUEST_TIMEOUT_MS = 10_000 + +const parser = new XMLParser({ + ignoreAttributes: false, + attributeNamePrefix: '@_', + parseTagValue: false, + parseAttributeValue: false, +}) + +export async function resolveLatestVersion( + groupId: string, + artifactId: string, +): Promise<string | null> { + const groupPath = groupId.replace(/\./g, '/') + const url = `${MAVEN_REPO}/${groupPath}/${artifactId}/maven-metadata.xml` + + try { + const res = await axios.get(url, { responseType: 'text', timeout: REQUEST_TIMEOUT_MS }) + const parsed = parser.parse(res.data) + + // Prefer <release> over <latest> — release excludes snapshots/alphas + const versioning = parsed?.metadata?.versioning + const release = typeof versioning?.release === 'string' ? versioning.release.trim() : null + const latest = typeof versioning?.latest === 'string' ? versioning.latest.trim() : null + + return release || latest || null + } catch (err) { + if (axios.isAxiosError(err)) { + // Not found is expected for packages that don't exist on Maven Central + if (err.response?.status === 404) return null + } + // Rethrow unexpected errors so callers can decide whether to retry + throw err + } +} diff --git a/services/apps/packages_worker/src/pom-fetcher/runPomEnrichmentLoop.ts b/services/apps/packages_worker/src/pom-fetcher/runPomEnrichmentLoop.ts new file mode 100644 index 0000000000..b31a591309 --- /dev/null +++ b/services/apps/packages_worker/src/pom-fetcher/runPomEnrichmentLoop.ts @@ -0,0 +1,221 @@ +import crypto from 'crypto' + +import { + listMavenPackagesToEnrich, + upsertMaintainer, + upsertPackage, + upsertPackageMaintainer, +} from '@crowd/data-access-layer' +import { QueryExecutor } from '@crowd/data-access-layer/src/queryExecutor' +import { getServiceChildLogger } from '@crowd/logging' + +import { getPomFetcherConfig } from '../config' +import { extractArtifact } from './extract' +import { 
resolveLatestVersion } from './metadata' + +const log = getServiceChildLogger('pom-fetcher') + +// ─── Types ──────────────────────────────────────────────────────────────────── + +interface BatchResult { + processed: number + skipped: number + errors: number +} + +// ─── Batch processing ───────────────────────────────────────────────────────── + +async function processBatch( + qx: QueryExecutor, + offset: number, + config: ReturnType<typeof getPomFetcherConfig>, +): Promise<BatchResult> { + const packages = await listMavenPackagesToEnrich(qx, { + limit: config.batchSize, + offset, + staleDays: config.staleDays, + }) + + if (packages.length === 0) { + return { processed: 0, skipped: 0, errors: 0 } + } + + log.info({ offset, count: packages.length }, 'Processing POM batch...') + + let processed = 0 + let skipped = 0 + let errors = 0 + + // Process in small concurrent groups to be polite to Maven Central + for (let i = 0; i < packages.length; i += config.concurrency) { + const group = packages.slice(i, i + config.concurrency) + + await Promise.all( + group.map(async (pkg) => { + const groupId = pkg.namespace + const artifactId = pkg.name + + if (!groupId) { + log.warn({ purl: pkg.purl }, 'Skipping package with null namespace (groupId)') + skipped++ + return + } + + try { + log.info({ groupId, artifactId }, 'Fetching POM...') + + // Step 1: resolve latest version from maven-metadata.xml + const version = await resolveLatestVersion(groupId, artifactId) + if (!version) { + log.warn({ groupId, artifactId }, 'Could not resolve latest version, skipping') + skipped++ + return + } + log.info({ groupId, artifactId, version }, 'Version resolved, extracting POM...') + + // Step 2: fetch + resolve POM (follows parent chain) + const result = await extractArtifact(groupId, artifactId, version, (msg) => { + log.debug({ groupId, artifactId, version }, msg) + }) + + if (result.error) { + log.warn({ groupId, artifactId, version, error: result.error }, 'POM extraction error') + errors++ + return + } + log.info( + { + 
groupId, + artifactId, + version, + licenses: result.licenses, + scmUrl: result.scmUrl, + developers: result.developers.length, + contributors: result.contributors.length, + parentHops: result.parentHops, + }, + 'POM extracted, upserting...', + ) + + // Step 3: upsert into `packages` + // purl at package level has no version (package-level identifier) + const packagePurl = `pkg:maven/${groupId}/${artifactId}` + const packageId = await upsertPackage(qx, { + purl: packagePurl, + ecosystem: 'maven', + namespace: groupId, + name: artifactId, + description: result.description, + homepage: result.homepageUrl, + declaredRepositoryUrl: result.scmUrl, + licenses: result.licenses.length > 0 ? result.licenses : null, + licensesRaw: result.licensesRaw, + latestVersion: version, + ingestionSource: 'pom_fetcher', + }) + + // Step 4: upsert maintainers (developers + contributors) + const allPeople = [ + ...result.developers.map((d) => ({ ...d, role: 'author' as const })), + ...result.contributors.map((c) => ({ ...c, role: 'maintainer' as const })), + ] + + for (const person of allPeople) { + const username = person.username ?? person.email ?? person.displayName + if (!username) continue + + const emailHash = person.email + ? crypto.createHash('sha256').update(person.email.toLowerCase().trim()).digest('hex') + : null + + const maintainerId = await upsertMaintainer(qx, { + ecosystem: 'maven', + username, + displayName: person.displayName, + url: person.url, + emailHash, + }) + + await upsertPackageMaintainer(qx, { + packageId, + maintainerId, + role: person.role, + }) + } + + processed++ + } catch (err) { + const message = err instanceof Error ? 
err.message : String(err) + log.error({ groupId, artifactId, error: message }, 'Unexpected error processing package') + errors++ + } + }), + ) + } + + return { processed, skipped, errors } +} + +// ─── Main loop ──────────────────────────────────────────────────────────────── + +/** + * Loops indefinitely: pages through all Maven packages that need POM + * enrichment, sleeps when the pass is complete, then restarts from offset 0. + * + * The caller is responsible for creating the DB connection and passing + * `isShuttingDown` so the loop exits cleanly on SIGTERM/SIGINT. + */ +export async function runPomEnrichmentLoop( + qx: QueryExecutor, + config: ReturnType<typeof getPomFetcherConfig>, + isShuttingDown: () => boolean, +): Promise<void> { + let offset = 0 + let totalProcessed = 0 + let totalSkipped = 0 + let totalErrors = 0 + let passNumber = 0 + let passStartedAt = Date.now() + + while (!isShuttingDown()) { + if (offset === 0) { + passNumber++ + passStartedAt = Date.now() + log.info({ pass: passNumber }, 'Starting pass') + } + + const result = await processBatch(qx, offset, config) + + if (result.processed + result.skipped + result.errors === 0) { + // Nothing left in this pass — log summary and sleep + const durationMs = Date.now() - passStartedAt + log.info( + { + totalProcessed, + totalSkipped, + totalErrors, + durationMs, + durationSec: Math.round(durationMs / 1000), + }, + `Pass complete. 
Sleeping ${config.idleSleepSec}s before next pass.`, + ) + await new Promise((r) => setTimeout(r, config.idleSleepSec * 1000)) + offset = 0 + totalProcessed = 0 + totalSkipped = 0 + totalErrors = 0 + passStartedAt = Date.now() + continue + } + + totalProcessed += result.processed + totalSkipped += result.skipped + totalErrors += result.errors + offset += config.batchSize + + log.info( + { offset, processed: result.processed, skipped: result.skipped, errors: result.errors }, + 'Batch complete', + ) + } +} diff --git a/services/libs/data-access-layer/src/osspckgs/index.ts b/services/libs/data-access-layer/src/osspckgs/index.ts new file mode 100644 index 0000000000..49fc5f85e3 --- /dev/null +++ b/services/libs/data-access-layer/src/osspckgs/index.ts @@ -0,0 +1,3 @@ +export * from './types' +export * from './packages' +export * from './maintainers' diff --git a/services/libs/data-access-layer/src/osspckgs/maintainers.ts b/services/libs/data-access-layer/src/osspckgs/maintainers.ts new file mode 100644 index 0000000000..ebd9a77767 --- /dev/null +++ b/services/libs/data-access-layer/src/osspckgs/maintainers.ts @@ -0,0 +1,55 @@ +import { QueryExecutor } from '../queryExecutor' + +import { IDbMaintainerUpsert, IDbPackageMaintainerUpsert } from './types' + +/** + * Inserts or updates a maintainer row. + * Returns the maintainer id. 
+ */ +export async function upsertMaintainer( + qx: QueryExecutor, + item: IDbMaintainerUpsert, +): Promise<number> { + const row = await qx.selectOne( + ` + INSERT INTO maintainers ( + ecosystem, + username, + display_name, + url, + email_hash + ) VALUES ( + $(ecosystem), + $(username), + $(displayName), + $(url), + $(emailHash) + ) + ON CONFLICT (ecosystem, username) DO UPDATE SET + display_name = COALESCE(EXCLUDED.display_name, maintainers.display_name), + url = COALESCE(EXCLUDED.url, maintainers.url), + email_hash = COALESCE(EXCLUDED.email_hash, maintainers.email_hash) + RETURNING id + `, + item, + ) + return row.id as number +} + +/** + * Links a maintainer to a package with the given role. + * Does nothing on conflict. + */ +export async function upsertPackageMaintainer( + qx: QueryExecutor, + item: IDbPackageMaintainerUpsert, +): Promise<void> { + await qx.result( + ` + INSERT INTO package_maintainers (package_id, maintainer_id, role) + VALUES ($(packageId), $(maintainerId), $(role)) + ON CONFLICT (package_id, maintainer_id) DO NOTHING + `, + item, + ) +} diff --git a/services/libs/data-access-layer/src/osspckgs/packages.ts b/services/libs/data-access-layer/src/osspckgs/packages.ts index 124ef74fcc..f28dd1efea 100644 --- a/services/libs/data-access-layer/src/osspckgs/packages.ts +++ b/services/libs/data-access-layer/src/osspckgs/packages.ts @@ -1,4 +1,5 @@ import { QueryExecutor } from '../queryExecutor' +import { IDbPackageUniverse, IDbPackageUpsert } from './types' export async function findPackageIdsByPurl( qx: QueryExecutor, @@ -10,3 +11,94 @@ export async function findPackageIdsByPurl( }) return new Map(rows.map((r: { purl: string; id: number }) => [r.purl, r.id])) } +// ─── packages_universe ──────────────────────────────────────────────────────── + +/** + * Returns a page of Maven packages from packages_universe that either have no + * corresponding entry in `packages` yet, or whose `packages.last_synced_at` is + * older than the given cutoff (defaults to 7 days). 
+ * + * Ordered by rank_in_ecosystem ASC (most critical first), unranked last. + */ +export async function listMavenPackagesToEnrich( + qx: QueryExecutor, + options: { limit: number; offset: number; staleDays?: number }, +): Promise[]> { + const { limit, offset, staleDays = 7 } = options + + return qx.select( + ` + SELECT + pu.id, + pu.purl, + pu.namespace, + pu.name + FROM packages_universe pu + LEFT JOIN packages p ON p.purl = pu.purl + WHERE + pu.ecosystem = 'maven' + AND pu.namespace IS NOT NULL + AND ( + p.id IS NULL + OR p.last_synced_at < NOW() - ($(staleDays) || ' days')::interval + ) + ORDER BY + pu.rank_in_ecosystem ASC NULLS LAST, + pu.id ASC + LIMIT $(limit) OFFSET $(offset) + `, + { limit, offset, staleDays }, + ) +} + +// ─── packages upsert ────────────────────────────────────────────────────────── + +/** + * Inserts or updates a row in `packages`. + * Returns the id of the upserted row. + */ +export async function upsertPackage(qx: QueryExecutor, item: IDbPackageUpsert): Promise { + const row = await qx.selectOne( + ` + INSERT INTO packages ( + purl, + ecosystem, + namespace, + name, + description, + homepage, + declared_repository_url, + licenses, + licenses_raw, + latest_version, + ingestion_source, + last_synced_at + ) VALUES ( + $(purl), + $(ecosystem), + $(namespace), + $(name), + $(description), + $(homepage), + $(declaredRepositoryUrl), + $(licenses)::text[], + $(licensesRaw), + $(latestVersion), + $(ingestionSource), + NOW() + ) + ON CONFLICT (purl) DO UPDATE SET + description = EXCLUDED.description, + homepage = EXCLUDED.homepage, + declared_repository_url = EXCLUDED.declared_repository_url, + licenses = EXCLUDED.licenses, + licenses_raw = EXCLUDED.licenses_raw, + latest_version = COALESCE(EXCLUDED.latest_version, packages.latest_version), + ingestion_source = EXCLUDED.ingestion_source, + last_synced_at = NOW() + RETURNING id + `, + item, + ) + return row.id as number +} diff --git a/services/libs/data-access-layer/src/osspckgs/types.ts 
b/services/libs/data-access-layer/src/osspckgs/types.ts new file mode 100644 index 0000000000..7553ff00a6 --- /dev/null +++ b/services/libs/data-access-layer/src/osspckgs/types.ts @@ -0,0 +1,44 @@ +// ─── packages_universe ──────────────────────────────────────────────────────── + +export interface IDbPackageUniverse { + id: number + purl: string | null + ecosystem: string + namespace: string | null + name: string + rankInEcosystem: number | null +} + +// ─── packages ───────────────────────────────────────────────────────────────── + +export type IDbPackageUpsert = { + purl: string + ecosystem: string + namespace: string | null + name: string + description: string | null + homepage: string | null + declaredRepositoryUrl: string | null + licenses: string[] | null + licensesRaw: string | null + latestVersion: string | null + ingestionSource: string +} + +// ─── maintainers ────────────────────────────────────────────────────────────── + +export type IDbMaintainerUpsert = { + ecosystem: string + username: string + displayName: string | null + url: string | null + emailHash: string | null +} + +// ─── package_maintainers ────────────────────────────────────────────────────── + +export type IDbPackageMaintainerUpsert = { + packageId: number + maintainerId: number + role: 'author' | 'maintainer' | null +} From f284fa6cc8e1d4616f83411d72cb66a7e4ad0d85 Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Wed, 27 May 2026 10:19:25 +0200 Subject: [PATCH 04/22] feat: pom extractor Signed-off-by: Umberto Sgueglia --- scripts/cli | 2 +- scripts/services/packages.yaml | 4 ++ scripts/services/pom-fetcher.yaml | 5 +- services/apps/packages_worker/package.json | 3 + services/apps/packages_worker/src/config.ts | 4 +- .../src/pom-fetcher/runPomEnrichmentLoop.ts | 60 +++++++++++-------- 6 files changed, 47 insertions(+), 31 deletions(-) diff --git a/scripts/cli b/scripts/cli index 9f3ce75aac..6e863b5c44 100755 --- a/scripts/cli +++ b/scripts/cli @@ -1060,7 +1060,7 @@ while test $# 
-gt 0; do exit ;; clean-start-dev) - # IGNORED_SERVICES=("python-worker" "job-generator" "discord-ws" "webhook-api" "profiles-worker" "organizations-enrichment-worker" "merge-suggestions-worker" "members-enrichment-worker" "exports-worker" "entity-merging-worker") + IGNORED_SERVICES=("python-worker" "job-generator" "discord-ws" "webhook-api" "profiles-worker" "organizations-enrichment-worker" "merge-suggestions-worker" "members-enrichment-worker" "exports-worker" "entity-merging-worker") CLEAN_START=1 DEV=1 start diff --git a/scripts/services/packages.yaml b/scripts/services/packages.yaml index 459f780ebd..bd6f6499fe 100644 --- a/scripts/services/packages.yaml +++ b/scripts/services/packages.yaml @@ -8,6 +8,10 @@ x-env-args: &env-args CROWD_TEMPORAL_NAMESPACE: ${CROWD_PACKAGES_TEMPORAL_NAMESPACE} SHELL: /bin/sh SUPPRESS_NO_CONFIG_WARNING: 'true' + POM_FETCHER_BATCH_SIZE: '50' + POM_FETCHER_CONCURRENCY: '3' + POM_FETCHER_STALE_DAYS: '7' + POM_FETCHER_IDLE_SLEEP_SEC: '3600' services: packages: diff --git a/scripts/services/pom-fetcher.yaml b/scripts/services/pom-fetcher.yaml index a7aa8a9c37..4210778a6d 100644 --- a/scripts/services/pom-fetcher.yaml +++ b/scripts/services/pom-fetcher.yaml @@ -6,8 +6,9 @@ x-env-args: &env-args SERVICE: pom-fetcher SHELL: /bin/sh SUPPRESS_NO_CONFIG_WARNING: 'true' - POM_FETCHER_BATCH_SIZE: '200' - POM_FETCHER_CONCURRENCY: '10' + LOG_LEVEL: 'info' + POM_FETCHER_BATCH_SIZE: '50' + POM_FETCHER_CONCURRENCY: '3' POM_FETCHER_STALE_DAYS: '7' POM_FETCHER_IDLE_SLEEP_SEC: '3600' diff --git a/services/apps/packages_worker/package.json b/services/apps/packages_worker/package.json index 6ad988b6e4..4fdcf0d256 100644 --- a/services/apps/packages_worker/package.json +++ b/services/apps/packages_worker/package.json @@ -17,6 +17,9 @@ "monitor:osspckgs:local": "bash -c 'set -a && . ../../../backend/.env.dist.local && . 
../../../backend/.env.override.local && set +a && SERVICE=monitor tsx src/scripts/monitorOsspckgs.ts'", "trigger-bootstrap": "SERVICE=deps-dev-ingest tsx src/scripts/triggerBootstrap.ts", "trigger-bootstrap:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && SERVICE=deps-dev-ingest tsx src/scripts/triggerBootstrap.ts", + "start:pom-fetcher": "SERVICE=pom-fetcher tsx src/bin/pom-fetcher.ts", + "dev:pom-fetcher": "SERVICE=pom-fetcher LOG_LEVEL=info nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9235 src/bin/pom-fetcher.ts", + "dev:pom-fetcher:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && SERVICE=pom-fetcher LOG_LEVEL=info nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9235 src/bin/pom-fetcher.ts", "lint": "npx eslint --ext .ts src --max-warnings=0", "format": "npx prettier --write \"src/**/*.ts\"", "format-check": "npx prettier --check .", diff --git a/services/apps/packages_worker/src/config.ts b/services/apps/packages_worker/src/config.ts index afd0d9ab32..58c58fd414 100644 --- a/services/apps/packages_worker/src/config.ts +++ b/services/apps/packages_worker/src/config.ts @@ -48,8 +48,8 @@ export function getEnricherConfig() { export function getPomFetcherConfig() { return { - batchSize: parseInt(process.env.POM_FETCHER_BATCH_SIZE ?? '200', 10), - concurrency: parseInt(process.env.POM_FETCHER_CONCURRENCY ?? '10', 10), + batchSize: parseInt(process.env.POM_FETCHER_BATCH_SIZE ?? '50', 10), + concurrency: parseInt(process.env.POM_FETCHER_CONCURRENCY ?? '3', 10), staleDays: parseInt(process.env.POM_FETCHER_STALE_DAYS ?? '7', 10), idleSleepSec: parseInt(process.env.POM_FETCHER_IDLE_SLEEP_SEC ?? 
'3600', 10), } diff --git a/services/apps/packages_worker/src/pom-fetcher/runPomEnrichmentLoop.ts b/services/apps/packages_worker/src/pom-fetcher/runPomEnrichmentLoop.ts index b31a591309..a652377e4f 100644 --- a/services/apps/packages_worker/src/pom-fetcher/runPomEnrichmentLoop.ts +++ b/services/apps/packages_worker/src/pom-fetcher/runPomEnrichmentLoop.ts @@ -27,12 +27,11 @@ interface BatchResult { async function processBatch( qx: QueryExecutor, - offset: number, config: ReturnType, ): Promise { const packages = await listMavenPackagesToEnrich(qx, { limit: config.batchSize, - offset, + offset: 0, staleDays: config.staleDays, }) @@ -40,11 +39,12 @@ async function processBatch( return { processed: 0, skipped: 0, errors: 0 } } - log.info({ offset, count: packages.length }, 'Processing POM batch...') + log.info({ count: packages.length }, 'Processing POM batch...') let processed = 0 let skipped = 0 let errors = 0 + const PROGRESS_EVERY = 25 // Process in small concurrent groups to be polite to Maven Central for (let i = 0; i < packages.length; i += config.concurrency) { @@ -62,16 +62,30 @@ async function processBatch( } try { - log.info({ groupId, artifactId }, 'Fetching POM...') - // Step 1: resolve latest version from maven-metadata.xml const version = await resolveLatestVersion(groupId, artifactId) if (!version) { log.warn({ groupId, artifactId }, 'Could not resolve latest version, skipping') + // Upsert a minimal record so last_synced_at is set — prevents this package + // from re-appearing in every batch within the same pass. + // ingestionSource 'pom_fetcher_no_version' marks that it was tried but had no + // resolvable version on Maven Central (404 on maven-metadata.xml). 
+ await upsertPackage(qx, { + purl: `pkg:maven/${groupId}/${artifactId}`, + ecosystem: 'maven', + namespace: groupId, + name: artifactId, + description: null, + homepage: null, + declaredRepositoryUrl: null, + licenses: null, + licensesRaw: null, + latestVersion: null, + ingestionSource: 'pom_fetcher_no_version', + }) skipped++ return } - log.info({ groupId, artifactId, version }, 'Version resolved, extracting POM...') // Step 2: fetch + resolve POM (follows parent chain) const result = await extractArtifact(groupId, artifactId, version, (msg) => { @@ -83,19 +97,6 @@ async function processBatch( errors++ return } - log.info( - { - groupId, - artifactId, - version, - licenses: result.licenses, - scmUrl: result.scmUrl, - developers: result.developers.length, - contributors: result.contributors.length, - parentHops: result.parentHops, - }, - 'POM extracted, upserting...', - ) // Step 3: upsert into `packages` // purl at package level has no version (package-level identifier) @@ -151,6 +152,17 @@ async function processBatch( } }), ) + + // done = packages processed so far (based on loop index, always accurate) + const done = i + group.length + const prevDone = i + const crossedBoundary = Math.floor(done / PROGRESS_EVERY) > Math.floor(prevDone / PROGRESS_EVERY) + if (crossedBoundary || done === packages.length) { + log.info( + { done, total: packages.length, processed, skipped, errors }, + `Progress: ${done}/${packages.length}`, + ) + } } return { processed, skipped, errors } @@ -170,7 +182,6 @@ export async function runPomEnrichmentLoop( config: ReturnType, isShuttingDown: () => boolean, ): Promise { - let offset = 0 let totalProcessed = 0 let totalSkipped = 0 let totalErrors = 0 @@ -178,13 +189,13 @@ export async function runPomEnrichmentLoop( let passStartedAt = Date.now() while (!isShuttingDown()) { - if (offset === 0) { + if (totalProcessed + totalSkipped + totalErrors === 0) { passNumber++ passStartedAt = Date.now() log.info({ pass: passNumber }, 'Starting pass') 
} - const result = await processBatch(qx, offset, config) + const result = await processBatch(qx, config) if (result.processed + result.skipped + result.errors === 0) { // Nothing left in this pass — log summary and sleep @@ -200,21 +211,18 @@ export async function runPomEnrichmentLoop( `Pass complete. Sleeping ${config.idleSleepSec}s before next pass.`, ) await new Promise((r) => setTimeout(r, config.idleSleepSec * 1000)) - offset = 0 totalProcessed = 0 totalSkipped = 0 totalErrors = 0 - passStartedAt = Date.now() continue } totalProcessed += result.processed totalSkipped += result.skipped totalErrors += result.errors - offset += config.batchSize log.info( - { offset, processed: result.processed, skipped: result.skipped, errors: result.errors }, + { processed: result.processed, skipped: result.skipped, errors: result.errors, totalProcessed, totalSkipped, totalErrors }, 'Batch complete', ) } From ecab481df975d16a696004bd171e483252544e66 Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Thu, 28 May 2026 18:25:04 +0200 Subject: [PATCH 05/22] feat: pom extractor Signed-off-by: Umberto Sgueglia --- backend/.env.dist.local | 7 + pnpm-lock.yaml | 164 ++++---- services/apps/packages_worker/package.json | 6 +- .../apps/packages_worker/src/activities.ts | 1 + .../src/bin/packages-worker.ts | 2 + .../packages_worker/src/bin/pom-fetcher.ts | 2 +- services/apps/packages_worker/src/config.ts | 7 +- .../src/pom-fetcher/activities.ts | 23 ++ .../src/pom-fetcher/extract.ts | 68 +++- .../src/pom-fetcher/metadata.ts | 41 +- .../src/pom-fetcher/runPomEnrichmentLoop.ts | 367 +++++++++++------- .../src/pom-fetcher/schedule.ts | 41 ++ .../src/pom-fetcher/workflows.ts | 36 ++ .../packages_worker/src/workflows/index.ts | 1 + .../src/osspckgs/packages.ts | 94 ++++- .../data-access-layer/src/osspckgs/types.ts | 11 + 16 files changed, 612 insertions(+), 259 deletions(-) create mode 100644 services/apps/packages_worker/src/pom-fetcher/activities.ts create mode 100644 
services/apps/packages_worker/src/pom-fetcher/schedule.ts create mode 100644 services/apps/packages_worker/src/pom-fetcher/workflows.ts diff --git a/backend/.env.dist.local b/backend/.env.dist.local index eb67a512a7..b1a9c85f8d 100755 --- a/backend/.env.dist.local +++ b/backend/.env.dist.local @@ -199,3 +199,10 @@ OSV_ECOSYSTEMS=npm,Maven OSV_TMP_DIR=/tmp/osv OSV_BATCH_SIZE=500 OSV_DERIVE_BATCH_SIZE=1000 +# pom-fetcher non-critical (DB-only) +POM_FETCHER_NON_CRITICAL_BATCH_SIZE=500 +POM_FETCHER_NON_CRITICAL_CONCURRENCY=20 + +# pom-fetcher critical (HTTP) +POM_FETCHER_BATCH_SIZE=100 +POM_FETCHER_CONCURRENCY=5 diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index b9e5074f5a..2e5b4aff0a 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -1336,6 +1336,12 @@ importers: semver: specifier: ^7.6.0 version: 7.6.0 + axios: + specifier: ^1.16.1 + version: 1.16.1 + fast-xml-parser: + specifier: ^5.8.0 + version: 5.8.0 tsx: specifier: ^4.7.1 version: 4.7.3 @@ -1411,58 +1417,6 @@ importers: specifier: ^3.0.1 version: 3.1.0 - services/apps/pom_fetcher_worker: - dependencies: - '@crowd/archetype-standard': - specifier: workspace:* - version: link:../../archetypes/standard - '@crowd/archetype-worker': - specifier: workspace:* - version: link:../../archetypes/worker - '@crowd/common': - specifier: workspace:* - version: link:../../libs/common - '@crowd/data-access-layer': - specifier: workspace:* - version: link:../../libs/data-access-layer - '@crowd/database': - specifier: workspace:* - version: link:../../libs/database - '@crowd/logging': - specifier: workspace:* - version: link:../../libs/logging - '@crowd/temporal': - specifier: workspace:* - version: link:../../libs/temporal - '@temporalio/activity': - specifier: ~1.11.8 - version: 1.11.8 - '@temporalio/client': - specifier: ~1.11.8 - version: 1.11.8 - '@temporalio/workflow': - specifier: ~1.11.8 - version: 1.11.8 - axios: - specifier: ^1.6.7 - version: 1.13.5 - fast-xml-parser: - specifier: ^4.4.0 - version: 4.5.6 - tsx: - 
specifier: ^4.7.1 - version: 4.7.3 - typescript: - specifier: ^5.6.3 - version: 5.6.3 - devDependencies: - '@types/node': - specifier: ^20.8.2 - version: 20.12.7 - nodemon: - specifier: ^3.0.1 - version: 3.1.0 - services/apps/profiles_worker: dependencies: '@crowd/archetype-standard': @@ -3796,6 +3750,9 @@ packages: '@nangohq/types@0.69.22': resolution: {integrity: sha512-3p7KMZ3GDXrt+wo5BKn/ouEX93TPTBtHRzFWq8AIRLl9aaOi3T0CraHz94NlHye1od5N2mWeN04sCu9f4WTyxA==} + '@nodable/entities@2.1.0': + resolution: {integrity: sha512-nyT7T3nbMyBI/lvr6L5TyWbFJAI9FTgVRakNoBqCD+PmID8DzFrrNdLLtHMwMszOtqZa8PAOV24ZqDnQrhQINA==} + '@nodelib/fs.scandir@2.1.5': resolution: {integrity: sha512-vq24Bq3ym5HEQm2NKCr3yXDwjc7vTsEThRDnkp2DK9p1uqLR+DHurm/NOTo0KG7HYHU7eppKZj3MyqYuMBf62g==} engines: {node: '>= 8'} @@ -5443,6 +5400,9 @@ packages: axios@1.13.5: resolution: {integrity: sha512-cz4ur7Vb0xS4/KUN0tPWe44eqxrIu31me+fbang3ijiNscE129POzipJJA6zniq2C/Z6sJCjMimjS8Lc/GAs8Q==} + axios@1.16.1: + resolution: {integrity: sha512-caYkukvroVPO8KrzuJEb50Hm07KwfBZPEC3VeFHTsqWHvKTsy54hjJz9BS/cdaypROE2rH6xvm9mHX4fgWkr3A==} + axios@1.6.8: resolution: {integrity: sha512-v/ZHtJDU39mDpyBoFVkETcd/uNdxrWRrg3bKpOKzXFA6Bvqopts6ALSMU3y6ijYxbw2B+wPrIv46egTzJXCLGQ==} @@ -6698,14 +6658,13 @@ packages: fast-uri@3.0.6: resolution: {integrity: sha512-Atfo14OibSv5wAp4VWNsFYE1AchQRTv9cBGWET4pZWHzYshFSS9NQI6I57rdKn9croWVMbYFbLhJ+yJvmZIIHw==} + fast-xml-builder@1.2.0: + resolution: {integrity: sha512-00aAWieqff+ZJhsXA4g1g7M8k+7AYoMUUHF+/zFb5U6Uv/P0Vl4QZo84/IcufzYalLuEj9928bXN9PbbFzMF0Q==} + fast-xml-parser@4.2.5: resolution: {integrity: sha512-B9/wizE4WngqQftFPmdaMYlXoJlJOYxGQOanC77fq9k8+Z0v5dDSVh+3glErdIROP//s/jgb7ZuxKfB8nVyo0g==} hasBin: true - fast-xml-parser@4.5.6: - resolution: {integrity: sha512-Yd4vkROfJf8AuJrDIVMVmYfULKmIJszVsMv7Vo71aocsKgFxpdlpSHXSaInvyYfgw2PRuObQSW2GFpVMUjxu9A==} - hasBin: true - fast-xml-parser@5.3.4: resolution: {integrity: 
sha512-EFd6afGmXlCx8H8WTZHhAoDaWaGyuIBoZJ2mknrNxug+aZKjkp0a0dlars9Izl+jF+7Gu1/5f/2h68cQpe0IiA==} hasBin: true @@ -6714,6 +6673,10 @@ packages: resolution: {integrity: sha512-JeaA2Vm9ffQKp9VjvfzObuMCjUYAp5WDYhRYL5LrBPY/jUDlUtOvDfot0vKSkB9tuX885BDHjtw4fZadD95wnA==} hasBin: true + fast-xml-parser@5.8.0: + resolution: {integrity: sha512-6bIM7fsJxeo3uXv7OncQYsBAMPJ7V16Slahl/6M98C/i2q+vB1+4a0MtrvYwDFEUrwDSbAmeLDRXsOBwrL7yAg==} + hasBin: true + fastest-levenshtein@1.0.16: resolution: {integrity: sha512-eRnCtTTtGZFpQCwhJiUOuxPQWRXVKYDn0b2PeHfXL6/Zi53SLAzAHfVhVWK2AryC/WH05kGfxhFIPvTF0SXQzg==} engines: {node: '>= 4.9.1'} @@ -6808,6 +6771,15 @@ packages: debug: optional: true + follow-redirects@1.16.0: + resolution: {integrity: sha512-y5rN/uOsadFT/JfYwhxRS5R7Qce+g3zG97+JrtFZlC9klX/W5hD7iiLzScI4nZqUS7DNUdhPgw4xI8W2LuXlUw==} + engines: {node: '>=4.0'} + peerDependencies: + debug: '*' + peerDependenciesMeta: + debug: + optional: true + for-each@0.3.3: resolution: {integrity: sha512-jqYfLp7mo9vIyQf8ykW2v7A+2N4QjeCeI5+Dz9XraiO1ign81wjiH7Fb9vSOWvQfNtmSa4H2RoQTrrXivdUZmw==} @@ -8625,6 +8597,10 @@ packages: resolution: {integrity: sha512-RjhtfwJOxzcFmNOi6ltcbcu4Iu+FL3zEj83dk4kAS+fVpTxXLO1b38RvJgT/0QwvV/L3aY9TAnyv0EOqW4GoMQ==} engines: {node: ^12.20.0 || ^14.13.1 || >=16.0.0} + path-expression-matcher@1.5.0: + resolution: {integrity: sha512-cbrerZV+6rvdQrrD+iGMcZFEiiSrbv9Tfdkvnusy6y0x0GKBXREFg/Y65GhIfm0tnLntThhzCnfKwp1WRjeCyQ==} + engines: {node: '>=14.0.0'} + path-is-absolute@1.0.1: resolution: {integrity: sha512-AVbw3UJ2e9bq64vSaS9Am0fje1Pa8pbGqTTsmXfaIiMpnr5DlDhfJOuLj9Sf95ZPVDAUerDfEk88MPmPe7UCQg==} engines: {node: '>=0.10.0'} @@ -8854,6 +8830,10 @@ packages: proxy-from-env@1.1.0: resolution: {integrity: sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==} + proxy-from-env@2.1.0: + resolution: {integrity: sha512-cJ+oHTW1VAEa8cJslgmUZrc+sjRKgAKl3Zyse6+PV38hZe/V6Z14TbCuXcan9F9ghlz4QrFr2c92TNF82UkYHA==} + engines: {node: '>=10'} + 
pseudomap@1.0.2: resolution: {integrity: sha512-b/YwNhb8lk1Zz2+bXXpS/LK9OisiZZ1SNsSLxN1x2OXVEhW2Ckr/7mWE5vrC1ZTiJlD9g19jWszTmJsB+oEpFQ==} @@ -9553,6 +9533,9 @@ packages: strnum@2.1.2: resolution: {integrity: sha512-l63NF9y/cLROq/yqKXSLtcMeeyOfnSQlfMSlzFt/K73oIaD8DGaQWd7Z34X9GPiKqP5rbSh84Hl4bOlLcjiSrQ==} + strnum@2.3.0: + resolution: {integrity: sha512-ums3KNd42PGyx5xaoVTO1mjU1bH3NpY4vsrVlnv9PNGqQj8wd7rJ6nEypLrJ7z5vxK5RP0yMLo6J/Gsm62DI5Q==} + stubs@3.0.0: resolution: {integrity: sha512-PdHt7hHUJKxvTCgbKX9C1V/ftOcjJQgz8BZwNfV5c4B6dcGqlpelTbJ999jBGZ2jYiPAwcX5dP6oBwVlBlUbxw==} @@ -10220,6 +10203,10 @@ packages: resolution: {integrity: sha512-PSNhEJDejZYV7h50BohL09Er9VaIefr2LMAf3OEmpCkjOi34eYyQYAXUTjEQtZJTKcF0E2UKTh+osDLsgNim9Q==} engines: {node: '>=8'} + xml-naming@0.1.0: + resolution: {integrity: sha512-k8KO9hrMyNk6tUWqUfkTEZbezRRpONVOzUTnc97VnCvyj6Tf9lyUR9EDAIeiVLv56jsMcoXEwjW8Kv5yPY52lw==} + engines: {node: '>=16.0.0'} + xml2js@0.4.19: resolution: {integrity: sha512-esZnJZJOiJR9wWKMyuvSE1y6Dq5LCuJanqhxslH2bxM6duahNZ+HMpCLhBQGZkbX6xRf8x1Y2eJlgt2q3qo49Q==} @@ -10424,8 +10411,8 @@ snapshots: dependencies: '@aws-crypto/sha256-browser': 3.0.0 '@aws-crypto/sha256-js': 3.0.0 - '@aws-sdk/client-sso-oidc': 3.572.0(@aws-sdk/client-sts@3.572.0) - '@aws-sdk/client-sts': 3.572.0 + '@aws-sdk/client-sso-oidc': 3.572.0 + '@aws-sdk/client-sts': 3.572.0(@aws-sdk/client-sso-oidc@3.572.0) '@aws-sdk/core': 3.572.0 '@aws-sdk/credential-provider-node': 3.572.0(@aws-sdk/client-sso-oidc@3.572.0)(@aws-sdk/client-sts@3.572.0) '@aws-sdk/middleware-host-header': 3.567.0 @@ -10619,11 +10606,11 @@ snapshots: transitivePeerDependencies: - aws-crt - '@aws-sdk/client-sso-oidc@3.572.0(@aws-sdk/client-sts@3.572.0)': + '@aws-sdk/client-sso-oidc@3.572.0': dependencies: '@aws-crypto/sha256-browser': 3.0.0 '@aws-crypto/sha256-js': 3.0.0 - '@aws-sdk/client-sts': 3.572.0 + '@aws-sdk/client-sts': 3.572.0(@aws-sdk/client-sso-oidc@3.572.0) '@aws-sdk/core': 3.572.0 '@aws-sdk/credential-provider-node': 
3.572.0(@aws-sdk/client-sso-oidc@3.572.0)(@aws-sdk/client-sts@3.572.0) '@aws-sdk/middleware-host-header': 3.567.0 @@ -10662,7 +10649,6 @@ snapshots: '@smithy/util-utf8': 2.3.0 tslib: 2.6.2 transitivePeerDependencies: - - '@aws-sdk/client-sts' - aws-crt '@aws-sdk/client-sso@3.556.0': @@ -10838,11 +10824,11 @@ snapshots: transitivePeerDependencies: - aws-crt - '@aws-sdk/client-sts@3.572.0': + '@aws-sdk/client-sts@3.572.0(@aws-sdk/client-sso-oidc@3.572.0)': dependencies: '@aws-crypto/sha256-browser': 3.0.0 '@aws-crypto/sha256-js': 3.0.0 - '@aws-sdk/client-sso-oidc': 3.572.0(@aws-sdk/client-sts@3.572.0) + '@aws-sdk/client-sso-oidc': 3.572.0 '@aws-sdk/core': 3.572.0 '@aws-sdk/credential-provider-node': 3.572.0(@aws-sdk/client-sso-oidc@3.572.0)(@aws-sdk/client-sts@3.572.0) '@aws-sdk/middleware-host-header': 3.567.0 @@ -10881,6 +10867,7 @@ snapshots: '@smithy/util-utf8': 2.3.0 tslib: 2.6.2 transitivePeerDependencies: + - '@aws-sdk/client-sso-oidc' - aws-crt '@aws-sdk/client-sts@3.985.0': @@ -11046,7 +11033,7 @@ snapshots: '@aws-sdk/credential-provider-ini@3.572.0(@aws-sdk/client-sso-oidc@3.572.0)(@aws-sdk/client-sts@3.572.0)': dependencies: - '@aws-sdk/client-sts': 3.572.0 + '@aws-sdk/client-sts': 3.572.0(@aws-sdk/client-sso-oidc@3.572.0) '@aws-sdk/credential-provider-env': 3.568.0 '@aws-sdk/credential-provider-process': 3.572.0 '@aws-sdk/credential-provider-sso': 3.572.0(@aws-sdk/client-sso-oidc@3.572.0) @@ -11223,7 +11210,7 @@ snapshots: '@aws-sdk/credential-provider-web-identity@3.568.0(@aws-sdk/client-sts@3.572.0)': dependencies: - '@aws-sdk/client-sts': 3.572.0 + '@aws-sdk/client-sts': 3.572.0(@aws-sdk/client-sso-oidc@3.572.0) '@aws-sdk/types': 3.567.0 '@smithy/property-provider': 2.2.0 '@smithy/types': 2.12.0 @@ -11535,7 +11522,7 @@ snapshots: '@aws-sdk/token-providers@3.572.0(@aws-sdk/client-sso-oidc@3.572.0)': dependencies: - '@aws-sdk/client-sso-oidc': 3.572.0(@aws-sdk/client-sts@3.572.0) + '@aws-sdk/client-sso-oidc': 3.572.0 '@aws-sdk/types': 3.567.0 
'@smithy/property-provider': 2.2.0 '@smithy/shared-ini-file-loader': 2.4.0 @@ -12479,6 +12466,8 @@ snapshots: transitivePeerDependencies: - debug + '@nodable/entities@2.1.0': {} + '@nodelib/fs.scandir@2.1.5': dependencies: '@nodelib/fs.stat': 2.0.5 @@ -14370,7 +14359,7 @@ snapshots: agent-base@6.0.2: dependencies: - debug: 4.4.0(supports-color@5.5.0) + debug: 4.4.3 transitivePeerDependencies: - supports-color @@ -14649,6 +14638,16 @@ snapshots: transitivePeerDependencies: - debug + axios@1.16.1: + dependencies: + follow-redirects: 1.16.0 + form-data: 4.0.5 + https-proxy-agent: 5.0.1 + proxy-from-env: 2.1.0 + transitivePeerDependencies: + - debug + - supports-color + axios@1.6.8: dependencies: follow-redirects: 1.15.6 @@ -16203,11 +16202,12 @@ snapshots: fast-uri@3.0.6: {} - fast-xml-parser@4.2.5: + fast-xml-builder@1.2.0: dependencies: - strnum: 1.0.5 + path-expression-matcher: 1.5.0 + xml-naming: 0.1.0 - fast-xml-parser@4.5.6: + fast-xml-parser@4.2.5: dependencies: strnum: 1.0.5 @@ -16219,6 +16219,14 @@ snapshots: dependencies: strnum: 2.1.2 + fast-xml-parser@5.8.0: + dependencies: + '@nodable/entities': 2.1.0 + fast-xml-builder: 1.2.0 + path-expression-matcher: 1.5.0 + strnum: 2.3.0 + xml-naming: 0.1.0 + fastest-levenshtein@1.0.16: {} fastq@1.17.1: @@ -16320,6 +16328,8 @@ snapshots: follow-redirects@1.15.6: {} + follow-redirects@1.16.0: {} + for-each@0.3.3: dependencies: is-callable: 1.2.7 @@ -16850,7 +16860,7 @@ snapshots: dependencies: '@tootallnate/once': 2.0.0 agent-base: 6.0.2 - debug: 4.4.0(supports-color@5.5.0) + debug: 4.4.3 transitivePeerDependencies: - supports-color @@ -16866,7 +16876,7 @@ snapshots: https-proxy-agent@5.0.1: dependencies: agent-base: 6.0.2 - debug: 4.4.0(supports-color@5.5.0) + debug: 4.4.3 transitivePeerDependencies: - supports-color @@ -18232,6 +18242,8 @@ snapshots: path-exists@5.0.0: {} + path-expression-matcher@1.5.0: {} + path-is-absolute@1.0.1: {} path-key@2.0.1: {} @@ -18439,6 +18451,8 @@ snapshots: proxy-from-env@1.1.0: {} + 
proxy-from-env@2.1.0: {} + pseudomap@1.0.2: {} pstree.remy@1.1.8: {} @@ -18674,7 +18688,7 @@ snapshots: retry-request@4.2.2: dependencies: - debug: 4.4.0(supports-color@5.5.0) + debug: 4.4.3 extend: 3.0.2 transitivePeerDependencies: - supports-color @@ -19314,6 +19328,8 @@ snapshots: strnum@2.1.2: {} + strnum@2.3.0: {} + stubs@3.0.0: {} superagent@8.1.2: @@ -20129,6 +20145,8 @@ snapshots: xdg-basedir@4.0.0: {} + xml-naming@0.1.0: {} + xml2js@0.4.19: dependencies: sax: 1.2.1 diff --git a/services/apps/packages_worker/package.json b/services/apps/packages_worker/package.json index 4fdcf0d256..364385dce7 100644 --- a/services/apps/packages_worker/package.json +++ b/services/apps/packages_worker/package.json @@ -5,11 +5,12 @@ "start:deps-dev-ingest": "CROWD_TEMPORAL_TASKQUEUE=deps-dev-ingest CROWD_TEMPORAL_NAMESPACE=$CROWD_PACKAGES_TEMPORAL_NAMESPACE SERVICE=deps-dev-ingest tsx src/bin/deps-dev-ingest.ts", "start:github-repos-enricher": "SERVICE=github-repos-enricher tsx src/bin/github-repos-enricher.ts", "start:packages-worker": "CROWD_TEMPORAL_TASKQUEUE=packages-worker CROWD_TEMPORAL_NAMESPACE=$CROWD_PACKAGES_TEMPORAL_NAMESPACE SERVICE=packages-worker tsx src/bin/packages-worker.ts", + "start:pom-fetcher": "SERVICE=pom-fetcher tsx src/bin/pom-fetcher.ts", + "dev:packages-worker": "CROWD_TEMPORAL_TASKQUEUE=packages-worker CROWD_TEMPORAL_NAMESPACE=$CROWD_PACKAGES_TEMPORAL_NAMESPACE SERVICE=packages-worker LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9233 src/bin/packages-worker.ts", "dev:deps-dev-ingest": "CROWD_TEMPORAL_TASKQUEUE=deps-dev-ingest CROWD_TEMPORAL_NAMESPACE=$CROWD_PACKAGES_TEMPORAL_NAMESPACE SERVICE=deps-dev-ingest nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9235 src/bin/deps-dev-ingest.ts", "dev:deps-dev-ingest:local": "set -a && . ../../../backend/.env.dist.local && . 
../../../backend/.env.override.local && set +a && CROWD_TEMPORAL_TASKQUEUE=deps-dev-ingest CROWD_TEMPORAL_NAMESPACE=$CROWD_PACKAGES_TEMPORAL_NAMESPACE SERVICE=deps-dev-ingest nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9235 src/bin/deps-dev-ingest.ts", "dev:github-repos-enricher": "SERVICE=github-repos-enricher LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9234 src/bin/github-repos-enricher.ts", "dev:github-repos-enricher:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && SERVICE=github-repos-enricher LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9234 src/bin/github-repos-enricher.ts", - "dev:packages-worker": "CROWD_TEMPORAL_TASKQUEUE=packages-worker CROWD_TEMPORAL_NAMESPACE=$CROWD_PACKAGES_TEMPORAL_NAMESPACE SERVICE=packages-worker LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9233 src/bin/packages-worker.ts", "dev:packages-worker:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && CROWD_TEMPORAL_TASKQUEUE=packages-worker CROWD_TEMPORAL_NAMESPACE=$CROWD_PACKAGES_TEMPORAL_NAMESPACE SERVICE=packages-worker LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9233 src/bin/packages-worker.ts", "export-to-bucket": "SERVICE=deps-dev-ingest tsx src/scripts/exportToBucket.ts", "export-to-bucket:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && SERVICE=deps-dev-ingest tsx src/scripts/exportToBucket.ts", @@ -17,7 +18,6 @@ "monitor:osspckgs:local": "bash -c 'set -a && . ../../../backend/.env.dist.local && . 
../../../backend/.env.override.local && set +a && SERVICE=monitor tsx src/scripts/monitorOsspckgs.ts'", "trigger-bootstrap": "SERVICE=deps-dev-ingest tsx src/scripts/triggerBootstrap.ts", "trigger-bootstrap:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && SERVICE=deps-dev-ingest tsx src/scripts/triggerBootstrap.ts", - "start:pom-fetcher": "SERVICE=pom-fetcher tsx src/bin/pom-fetcher.ts", "dev:pom-fetcher": "SERVICE=pom-fetcher LOG_LEVEL=info nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9235 src/bin/pom-fetcher.ts", "dev:pom-fetcher:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && SERVICE=pom-fetcher LOG_LEVEL=info nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9235 src/bin/pom-fetcher.ts", "lint": "npx eslint --ext .ts src --max-warnings=0", @@ -41,6 +41,8 @@ "@temporalio/workflow": "~1.11.8", "jsonwebtoken": "^9.0.0", "semver": "^7.6.0", + "axios": "^1.16.1", + "fast-xml-parser": "^5.8.0", "tsx": "^4.7.1", "typescript": "^5.6.3", "unzipper": "^0.12.3" diff --git a/services/apps/packages_worker/src/activities.ts b/services/apps/packages_worker/src/activities.ts index 89f75f54c3..95dbcccfc0 100644 --- a/services/apps/packages_worker/src/activities.ts +++ b/services/apps/packages_worker/src/activities.ts @@ -1,3 +1,4 @@ export * from './deps-dev/activities' export * from './npm/activities' export { osvSyncEcosystem, osvDeriveCriticalFlag } from './osv/activities' +export { processMavenCriticalBatch, processMavenNonCriticalBatch } from './pom-fetcher/activities' diff --git a/services/apps/packages_worker/src/bin/packages-worker.ts b/services/apps/packages_worker/src/bin/packages-worker.ts index 21dee223f4..4396c6a2d1 100644 --- a/services/apps/packages_worker/src/bin/packages-worker.ts +++ b/services/apps/packages_worker/src/bin/packages-worker.ts @@ -1,10 +1,12 @@ import { 
scheduleNpmIngest } from '../npm/schedule' import { scheduleOsvSync } from '../osv/schedule' +import { schedulePomFetcher } from '../pom-fetcher/schedule' import { svc } from '../service' setImmediate(async () => { await svc.init() await scheduleNpmIngest() await scheduleOsvSync() + await schedulePomFetcher() await svc.start() }) diff --git a/services/apps/packages_worker/src/bin/pom-fetcher.ts b/services/apps/packages_worker/src/bin/pom-fetcher.ts index d8130bb327..5d2773f911 100644 --- a/services/apps/packages_worker/src/bin/pom-fetcher.ts +++ b/services/apps/packages_worker/src/bin/pom-fetcher.ts @@ -22,7 +22,7 @@ const main = async () => { const config = getPomFetcherConfig() log.info( - { batchSize: config.batchSize, concurrency: config.concurrency, staleDays: config.staleDays }, + { batchSize: config.batchSize, concurrency: config.concurrency, fullRefreshDays: config.fullRefreshDays }, 'Config loaded', ) diff --git a/services/apps/packages_worker/src/config.ts b/services/apps/packages_worker/src/config.ts index 58c58fd414..24f88d4a6c 100644 --- a/services/apps/packages_worker/src/config.ts +++ b/services/apps/packages_worker/src/config.ts @@ -48,9 +48,14 @@ export function getEnricherConfig() { export function getPomFetcherConfig() { return { + // critical packages — HTTP-bound, keep low batchSize: parseInt(process.env.POM_FETCHER_BATCH_SIZE ?? '50', 10), concurrency: parseInt(process.env.POM_FETCHER_CONCURRENCY ?? '3', 10), - staleDays: parseInt(process.env.POM_FETCHER_STALE_DAYS ?? '7', 10), + fullRefreshDays: parseInt(process.env.POM_FETCHER_FULL_REFRESH_DAYS ?? '90', 10), + // non-critical packages — DB-only, can go much higher + nonCriticalBatchSize: parseInt(process.env.POM_FETCHER_NON_CRITICAL_BATCH_SIZE ?? '500', 10), + nonCriticalConcurrency: parseInt(process.env.POM_FETCHER_NON_CRITICAL_CONCURRENCY ?? '20', 10), + nonCriticalRefreshDays: parseInt(process.env.POM_FETCHER_NON_CRITICAL_REFRESH_DAYS ?? 
'180', 10), idleSleepSec: parseInt(process.env.POM_FETCHER_IDLE_SLEEP_SEC ?? '3600', 10), } } diff --git a/services/apps/packages_worker/src/pom-fetcher/activities.ts b/services/apps/packages_worker/src/pom-fetcher/activities.ts new file mode 100644 index 0000000000..6417e02be9 --- /dev/null +++ b/services/apps/packages_worker/src/pom-fetcher/activities.ts @@ -0,0 +1,23 @@ +import { getServiceChildLogger } from '@crowd/logging' + +import { getPomFetcherConfig } from '../config' +import { getPackagesDb } from '../db' +import { BatchResult, processBatch } from './runPomEnrichmentLoop' + +const log = getServiceChildLogger('pom-fetcher-activity') + +export async function processMavenCriticalBatch(): Promise { + const config = getPomFetcherConfig() + const qx = await getPackagesDb() + const result = await processBatch(qx, config, true) + log.info({ processed: result.processed, skipped: result.skipped, errors: result.errors }, 'Maven critical batch complete') + return result +} + +export async function processMavenNonCriticalBatch(): Promise { + const config = getPomFetcherConfig() + const qx = await getPackagesDb() + const result = await processBatch(qx, config, false) + log.info({ processed: result.processed, skipped: result.skipped, errors: result.errors }, 'Maven non-critical batch complete') + return result +} diff --git a/services/apps/packages_worker/src/pom-fetcher/extract.ts b/services/apps/packages_worker/src/pom-fetcher/extract.ts index ea79257726..58c200f0f6 100644 --- a/services/apps/packages_worker/src/pom-fetcher/extract.ts +++ b/services/apps/packages_worker/src/pom-fetcher/extract.ts @@ -65,6 +65,34 @@ const parser = new XMLParser({ parseAttributeValue: false, }) +// ─── Retry with exponential backoff ────────────────────────────────────────── + +const MAX_RETRIES = 3 +const RETRY_BASE_MS = 2_000 + +async function sleep(ms: number): Promise { + return new Promise((r) => setTimeout(r, ms)) +} + +async function getWithRetry(url: string): Promise { + for 
(let attempt = 0; attempt <= MAX_RETRIES; attempt++) { + try { + const res = await axios.get(url, { responseType: 'text', timeout: REQUEST_TIMEOUT_MS }) + return res.data + } catch (err) { + if (axios.isAxiosError(err) && err.response?.status === 429) { + if (attempt < MAX_RETRIES) { + const delay = RETRY_BASE_MS * 2 ** attempt + Math.random() * 500 + await sleep(delay) + continue + } + } + throw err + } + } + throw new Error(`Max retries exceeded for ${url}`) +} + // ─── POM fetch ──────────────────────────────────────────────────────────────── export function buildPomUrl(groupId: string, artifactId: string, version: string): string { @@ -80,8 +108,8 @@ export async function fetchPom( ): Promise { const url = buildPomUrl(groupId, artifactId, version) try { - const res = await axios.get(url, { responseType: 'text', timeout: REQUEST_TIMEOUT_MS }) - const parsed = parser.parse(res.data) + const data = await getWithRetry(url) + const parsed = parser.parse(data) return (parsed?.project as PomData) ?? null } catch (err) { if (axios.isAxiosError(err)) { @@ -239,6 +267,42 @@ export async function extractArtifact( } } +// ─── SCM URL normalisation ─────────────────────────────────────────────────── + +/** + * Converts the raw SCM URL from a POM (declared_repository_url) into a clean + * HTTPS repository URL suitable for storage as repository_url. + * + * Handles common Maven SCM URL forms: + * scm:git:git@github.com:owner/repo.git → https://github.com/owner/repo + * scm:git:https://github.com/owner/repo → https://github.com/owner/repo + * git://github.com/owner/repo.git → https://github.com/owner/repo + * https://github.com/owner/repo/tree/... 
→ https://github.com/owner/repo + */ +export function normalizeScmUrl(raw: string | null): string | null { + if (!raw) return null + let url = raw.trim() + + // Strip scm:git: or scm: prefix + url = url.replace(/^scm:git:/i, '').replace(/^scm:/i, '') + + // Convert SSH git@host:owner/repo → https://host/owner/repo + url = url.replace(/^git@([^:]+):(.+)$/, 'https://$1/$2') + + // Convert git:// → https:// + url = url.replace(/^git:\/\//, 'https://') + + // Strip trailing .git + url = url.replace(/\.git$/, '') + + // Strip /tree/... or /blob/... path suffixes (keep only host + owner + repo) + url = url.replace(/\/(tree|blob)(\/.*)?$/, '') + + if (!url.startsWith('https://')) return null + + return url.replace(/\/$/, '') +} + // ─── Private helpers ────────────────────────────────────────────────────────── function extractStr(value: unknown): string | null { diff --git a/services/apps/packages_worker/src/pom-fetcher/metadata.ts b/services/apps/packages_worker/src/pom-fetcher/metadata.ts index 9192bc05a1..c6153c21ce 100644 --- a/services/apps/packages_worker/src/pom-fetcher/metadata.ts +++ b/services/apps/packages_worker/src/pom-fetcher/metadata.ts @@ -14,6 +14,8 @@ import { XMLParser } from 'fast-xml-parser' const MAVEN_REPO = 'https://repo1.maven.org/maven2' const REQUEST_TIMEOUT_MS = 10_000 +const MAX_RETRIES = 3 +const RETRY_BASE_MS = 2_000 const parser = new XMLParser({ ignoreAttributes: false, @@ -22,6 +24,10 @@ const parser = new XMLParser({ parseAttributeValue: false, }) +async function sleep(ms: number): Promise { + return new Promise((r) => setTimeout(r, ms)) +} + export async function resolveLatestVersion( groupId: string, artifactId: string, @@ -29,22 +35,29 @@ export async function resolveLatestVersion( const groupPath = groupId.replace(/\./g, '/') const url = `${MAVEN_REPO}/${groupPath}/${artifactId}/maven-metadata.xml` - try { - const res = await axios.get(url, { responseType: 'text', timeout: REQUEST_TIMEOUT_MS }) - const parsed = parser.parse(res.data) 
+ for (let attempt = 0; attempt <= MAX_RETRIES; attempt++) { + try { + const res = await axios.get(url, { responseType: 'text', timeout: REQUEST_TIMEOUT_MS }) + const parsed = parser.parse(res.data) - // Prefer over — release excludes snapshots/alphas - const versioning = parsed?.metadata?.versioning - const release = typeof versioning?.release === 'string' ? versioning.release.trim() : null - const latest = typeof versioning?.latest === 'string' ? versioning.latest.trim() : null + // Prefer over — release excludes snapshots/alphas + const versioning = parsed?.metadata?.versioning + const release = typeof versioning?.release === 'string' ? versioning.release.trim() : null + const latest = typeof versioning?.latest === 'string' ? versioning.latest.trim() : null - return release || latest || null - } catch (err) { - if (axios.isAxiosError(err)) { - // Not found is expected for packages that don't exist on Maven Central - if (err.response?.status === 404) return null + return release || latest || null + } catch (err) { + if (axios.isAxiosError(err)) { + if (err.response?.status === 404) return null + if (err.response?.status === 429 && attempt < MAX_RETRIES) { + const delay = RETRY_BASE_MS * 2 ** attempt + Math.random() * 500 + await sleep(delay) + continue + } + } + throw err } - // Rethrow unexpected errors so callers can decide whether to retry - throw err } + + return null } diff --git a/services/apps/packages_worker/src/pom-fetcher/runPomEnrichmentLoop.ts b/services/apps/packages_worker/src/pom-fetcher/runPomEnrichmentLoop.ts index a652377e4f..9b82da2515 100644 --- a/services/apps/packages_worker/src/pom-fetcher/runPomEnrichmentLoop.ts +++ b/services/apps/packages_worker/src/pom-fetcher/runPomEnrichmentLoop.ts @@ -1,7 +1,7 @@ import crypto from 'crypto' import { - listMavenPackagesToEnrich, + listMavenPackagesToSync, upsertMaintainer, upsertPackage, upsertPackageMaintainer, @@ -10,155 +10,211 @@ import { QueryExecutor } from 
'@crowd/data-access-layer/src/queryExecutor' import { getServiceChildLogger } from '@crowd/logging' import { getPomFetcherConfig } from '../config' -import { extractArtifact } from './extract' +import { extractArtifact, normalizeScmUrl } from './extract' import { resolveLatestVersion } from './metadata' const log = getServiceChildLogger('pom-fetcher') // ─── Types ──────────────────────────────────────────────────────────────────── -interface BatchResult { +export interface BatchResult { processed: number skipped: number errors: number } +type PackageToSync = Awaited>[number] + +// ─── Non-critical: copy universe stats into packages ───────────────────────── + +function mavenRegistryUrl(groupId: string, artifactId: string): string { + return `https://central.sonatype.com/artifact/${groupId}/${artifactId}` +} + +async function processNonCriticalPackage(qx: QueryExecutor, pkg: PackageToSync): Promise { + await upsertPackage(qx, { + purl: pkg.purl, + ecosystem: 'maven', + namespace: pkg.namespace, + name: pkg.name, + description: null, + homepage: null, + registryUrl: pkg.namespace ? mavenRegistryUrl(pkg.namespace, pkg.name) : null, + declaredRepositoryUrl: null, + repositoryUrl: null, + licenses: null, + licensesRaw: null, + latestVersion: null, + ingestionSource: 'packages_universe', + criticalityScore: pkg.criticalityScore, + dependentPackagesCount: pkg.dependentPackagesCount, + dependentReposCount: pkg.dependentReposCount, + downloadsLastMonth: pkg.downloads30d, + }) +} + +// ─── Critical: full POM extraction ─────────────────────────────────────────── + +async function processCriticalPackage( + qx: QueryExecutor, + pkg: PackageToSync, +): Promise<'processed' | 'skipped' | 'error'> { + const groupId = pkg.namespace + const artifactId = pkg.name + + if (!groupId) { + log.warn({ purl: pkg.purl }, 'Skipping critical package with null namespace (groupId)') + return 'skipped' + } + + let version = pkg.latestVersion ?? 
null + if (!version) { + log.debug({ groupId, artifactId }, 'No baseline version — falling back to maven-metadata.xml') + version = await resolveLatestVersion(groupId, artifactId) + } + + if (!version) { + log.warn({ groupId, artifactId }, 'Could not resolve latest version, skipping') + await upsertPackage(qx, { + purl: pkg.purl, + ecosystem: 'maven', + namespace: groupId, + name: artifactId, + description: null, + homepage: null, + registryUrl: mavenRegistryUrl(groupId, artifactId), + declaredRepositoryUrl: null, + repositoryUrl: null, + licenses: null, + licensesRaw: null, + latestVersion: null, + ingestionSource: 'pom_fetcher_no_version', + criticalityScore: pkg.criticalityScore, + dependentPackagesCount: pkg.dependentPackagesCount, + dependentReposCount: pkg.dependentReposCount, + downloadsLastMonth: pkg.downloads30d, + }) + return 'skipped' + } + + const result = await extractArtifact(groupId, artifactId, version, (msg) => { + log.debug({ groupId, artifactId, version }, msg) + }) + + if (result.error) { + log.warn({ groupId, artifactId, version, error: result.error }, 'POM extraction error') + return 'error' + } + + const packageId = await upsertPackage(qx, { + purl: pkg.purl, + ecosystem: 'maven', + namespace: groupId, + name: artifactId, + description: result.description, + homepage: result.homepageUrl, + registryUrl: mavenRegistryUrl(groupId, artifactId), + declaredRepositoryUrl: result.scmUrl, + repositoryUrl: normalizeScmUrl(result.scmUrl), + licenses: result.licenses.length > 0 ? 
result.licenses : null, + licensesRaw: result.licensesRaw, + latestVersion: version, + ingestionSource: 'pom_fetcher', + criticalityScore: pkg.criticalityScore, + dependentPackagesCount: pkg.dependentPackagesCount, + dependentReposCount: pkg.dependentReposCount, + downloadsLastMonth: pkg.downloads30d, + }) + + const allPeople = [ + ...result.developers.map((d) => ({ ...d, role: 'author' as const })), + ...result.contributors.map((c) => ({ ...c, role: 'maintainer' as const })), + ] + + for (const person of allPeople) { + const username = person.username ?? person.email ?? person.displayName + if (!username) continue + + const emailHash = person.email + ? crypto.createHash('sha256').update(person.email.toLowerCase().trim()).digest('hex') + : null + + const maintainerId = await upsertMaintainer(qx, { + ecosystem: 'maven', + username, + displayName: person.displayName, + url: person.url, + emailHash, + }) + + await upsertPackageMaintainer(qx, { + packageId, + maintainerId, + role: person.role, + }) + } + + return 'processed' +} + // ─── Batch processing ───────────────────────────────────────────────────────── -async function processBatch( +export async function processBatch( qx: QueryExecutor, config: ReturnType, + isCritical: boolean, ): Promise { - const packages = await listMavenPackagesToEnrich(qx, { - limit: config.batchSize, + const batchSize = isCritical ? config.batchSize : config.nonCriticalBatchSize + const concurrency = isCritical ? 
config.concurrency : config.nonCriticalConcurrency + + const packages = await listMavenPackagesToSync(qx, { + limit: batchSize, offset: 0, - staleDays: config.staleDays, + fullRefreshDays: config.fullRefreshDays, + nonCriticalRefreshDays: config.nonCriticalRefreshDays, + isCritical, }) if (packages.length === 0) { return { processed: 0, skipped: 0, errors: 0 } } - log.info({ count: packages.length }, 'Processing POM batch...') + log.info({ count: packages.length, isCritical }, 'Processing batch...') let processed = 0 let skipped = 0 let errors = 0 const PROGRESS_EVERY = 25 - // Process in small concurrent groups to be polite to Maven Central - for (let i = 0; i < packages.length; i += config.concurrency) { - const group = packages.slice(i, i + config.concurrency) + for (let i = 0; i < packages.length; i += concurrency) { + const group = packages.slice(i, i + concurrency) await Promise.all( group.map(async (pkg) => { - const groupId = pkg.namespace - const artifactId = pkg.name - - if (!groupId) { - log.warn({ purl: pkg.purl }, 'Skipping package with null namespace (groupId)') - skipped++ - return - } - try { - // Step 1: resolve latest version from maven-metadata.xml - const version = await resolveLatestVersion(groupId, artifactId) - if (!version) { - log.warn({ groupId, artifactId }, 'Could not resolve latest version, skipping') - // Upsert a minimal record so last_synced_at is set — prevents this package - // from re-appearing in every batch within the same pass. - // ingestionSource 'pom_fetcher_no_version' marks that it was tried but had no - // resolvable version on Maven Central (404 on maven-metadata.xml). 
- await upsertPackage(qx, { - purl: `pkg:maven/${groupId}/${artifactId}`, - ecosystem: 'maven', - namespace: groupId, - name: artifactId, - description: null, - homepage: null, - declaredRepositoryUrl: null, - licenses: null, - licensesRaw: null, - latestVersion: null, - ingestionSource: 'pom_fetcher_no_version', - }) - skipped++ + if (!isCritical) { + await processNonCriticalPackage(qx, pkg) + processed++ return } - // Step 2: fetch + resolve POM (follows parent chain) - const result = await extractArtifact(groupId, artifactId, version, (msg) => { - log.debug({ groupId, artifactId, version }, msg) - }) - - if (result.error) { - log.warn({ groupId, artifactId, version, error: result.error }, 'POM extraction error') - errors++ - return - } - - // Step 3: upsert into `packages` - // purl at package level has no version (package-level identifier) - const packagePurl = `pkg:maven/${groupId}/${artifactId}` - const packageId = await upsertPackage(qx, { - purl: packagePurl, - ecosystem: 'maven', - namespace: groupId, - name: artifactId, - description: result.description, - homepage: result.homepageUrl, - declaredRepositoryUrl: result.scmUrl, - licenses: result.licenses.length > 0 ? result.licenses : null, - licensesRaw: result.licensesRaw, - latestVersion: version, - ingestionSource: 'pom_fetcher', - }) - - // Step 4: upsert maintainers (developers + contributors) - const allPeople = [ - ...result.developers.map((d) => ({ ...d, role: 'author' as const })), - ...result.contributors.map((c) => ({ ...c, role: 'maintainer' as const })), - ] - - for (const person of allPeople) { - const username = person.username ?? person.email ?? person.displayName - if (!username) continue - - const emailHash = person.email - ? 
crypto.createHash('sha256').update(person.email.toLowerCase().trim()).digest('hex') - : null - - const maintainerId = await upsertMaintainer(qx, { - ecosystem: 'maven', - username, - displayName: person.displayName, - url: person.url, - emailHash, - }) - - await upsertPackageMaintainer(qx, { - packageId, - maintainerId, - role: person.role, - }) - } - - processed++ + const status = await processCriticalPackage(qx, pkg) + if (status === 'processed') processed++ + else if (status === 'skipped') skipped++ + else errors++ } catch (err) { const message = err instanceof Error ? err.message : String(err) - log.error({ groupId, artifactId, error: message }, 'Unexpected error processing package') + log.error({ purl: pkg.purl, error: message }, 'Unexpected error processing package') errors++ } }), ) - // done = packages processed so far (based on loop index, always accurate) const done = i + group.length const prevDone = i const crossedBoundary = Math.floor(done / PROGRESS_EVERY) > Math.floor(prevDone / PROGRESS_EVERY) if (crossedBoundary || done === packages.length) { - log.info( + log.debug( { done, total: packages.length, processed, skipped, errors }, `Progress: ${done}/${packages.length}`, ) @@ -168,62 +224,83 @@ async function processBatch( return { processed, skipped, errors } } +// ─── Phase runner ───────────────────────────────────────────────────────────── + +async function runPhase( + qx: QueryExecutor, + config: ReturnType, + isCritical: boolean, + isShuttingDown: () => boolean, +): Promise<{ processed: number; skipped: number; errors: number }> { + const label = isCritical ? 
'critical' : 'non-critical' + let total = { processed: 0, skipped: 0, errors: 0 } + let batchNum = 0 + const phaseStartedAt = Date.now() + + log.info({ phase: label }, 'Phase started') + + while (!isShuttingDown()) { + const result = await processBatch(qx, config, isCritical) + + if (result.processed + result.skipped + result.errors === 0) { + const durationSec = Math.round((Date.now() - phaseStartedAt) / 1000) + log.info({ phase: label, ...total, durationSec }, 'Phase complete') + return total + } + + batchNum++ + total.processed += result.processed + total.skipped += result.skipped + total.errors += result.errors + + log.info( + { + phase: label, + batch: batchNum, + totalProcessed: total.processed, + totalSkipped: total.skipped, + totalErrors: total.errors, + elapsedSec: Math.round((Date.now() - phaseStartedAt) / 1000), + }, + 'Batch done', + ) + } + + return total +} + // ─── Main loop ──────────────────────────────────────────────────────────────── -/** - * Loops indefinitely: pages through all Maven packages that need POM - * enrichment, sleeps when the pass is complete, then restarts from offset 0. - * - * The caller is responsible for creating the DB connection and passing - * `isShuttingDown` so the loop exits cleanly on SIGTERM/SIGINT. 
- */ export async function runPomEnrichmentLoop( qx: QueryExecutor, config: ReturnType, isShuttingDown: () => boolean, ): Promise { - let totalProcessed = 0 - let totalSkipped = 0 - let totalErrors = 0 let passNumber = 0 - let passStartedAt = Date.now() while (!isShuttingDown()) { - if (totalProcessed + totalSkipped + totalErrors === 0) { - passNumber++ - passStartedAt = Date.now() - log.info({ pass: passNumber }, 'Starting pass') - } + passNumber++ + const passStartedAt = Date.now() + log.info({ pass: passNumber }, 'Starting pass') - const result = await processBatch(qx, config) + // Phase 1: non-critical first — DB-only, high throughput + const nonCritical = await runPhase(qx, config, false, isShuttingDown) - if (result.processed + result.skipped + result.errors === 0) { - // Nothing left in this pass — log summary and sleep - const durationMs = Date.now() - passStartedAt - log.info( - { - totalProcessed, - totalSkipped, - totalErrors, - durationMs, - durationSec: Math.round(durationMs / 1000), - }, - `Pass complete. Sleeping ${config.idleSleepSec}s before next pass.`, - ) - await new Promise((r) => setTimeout(r, config.idleSleepSec * 1000)) - totalProcessed = 0 - totalSkipped = 0 - totalErrors = 0 - continue - } - - totalProcessed += result.processed - totalSkipped += result.skipped - totalErrors += result.errors + // Phase 2: critical — HTTP-bound, lower throughput + const critical = await runPhase(qx, config, true, isShuttingDown) + const durationMs = Date.now() - passStartedAt log.info( - { processed: result.processed, skipped: result.skipped, errors: result.errors, totalProcessed, totalSkipped, totalErrors }, - 'Batch complete', + { + pass: passNumber, + totalProcessed: nonCritical.processed + critical.processed, + totalSkipped: nonCritical.skipped + critical.skipped, + totalErrors: nonCritical.errors + critical.errors, + durationSec: Math.round(durationMs / 1000), + }, + `Pass complete. 
Sleeping ${config.idleSleepSec}s before next pass.`, ) + + await new Promise((r) => setTimeout(r, config.idleSleepSec * 1000)) } } diff --git a/services/apps/packages_worker/src/pom-fetcher/schedule.ts b/services/apps/packages_worker/src/pom-fetcher/schedule.ts new file mode 100644 index 0000000000..e80bee8168 --- /dev/null +++ b/services/apps/packages_worker/src/pom-fetcher/schedule.ts @@ -0,0 +1,41 @@ +import { ScheduleAlreadyRunning, ScheduleOverlapPolicy } from '@temporalio/client' + +import { svc } from '../service' +import { pomFetcherWorkflow } from '../workflows' + +export async function schedulePomFetcher(): Promise { + const { temporal } = svc + if (!temporal) throw new Error('Temporal client not initialized') + + try { + await temporal.schedule.create({ + scheduleId: 'maven-pom-fetcher', + spec: { + // Run daily at 4am UTC — off-peak, after nightly GitHub enrichment completes + cronExpressions: ['0 4 * * *'], + }, + policies: { + overlap: ScheduleOverlapPolicy.SKIP, + catchupWindow: '1 hour', + }, + action: { + type: 'startWorkflow', + workflowType: pomFetcherWorkflow, + taskQueue: 'packages-worker', + workflowExecutionTimeout: '12 hours', + retry: { + initialInterval: '30 seconds', + backoffCoefficient: 2, + maximumAttempts: 3, + }, + args: [], + }, + }) + } catch (err) { + if (err instanceof ScheduleAlreadyRunning) { + svc.log.info('Schedule maven-pom-fetcher already registered.') + } else { + throw err + } + } +} diff --git a/services/apps/packages_worker/src/pom-fetcher/workflows.ts b/services/apps/packages_worker/src/pom-fetcher/workflows.ts new file mode 100644 index 0000000000..f9d05eb03c --- /dev/null +++ b/services/apps/packages_worker/src/pom-fetcher/workflows.ts @@ -0,0 +1,36 @@ +import { proxyActivities } from '@temporalio/workflow' + +import type * as activities from './activities' + +const { processMavenCriticalBatch } = proxyActivities({ + startToCloseTimeout: '15 minutes', +}) + +const { processMavenNonCriticalBatch } = proxyActivities({ 
+ startToCloseTimeout: '5 minutes', +}) + +/** + * Temporal workflow: runs a full pass of Maven package syncing. + * + * Phase 1 — non-critical: copies universe stats into packages (DB-only, no HTTP). + * Phase 2 — critical: full POM enrichment (HTTP calls to Maven Central). + * + * Each phase loops until its batch returns empty, then the workflow exits. + * The Temporal schedule re-triggers this workflow on the configured interval. + */ +export async function pomFetcherWorkflow(): Promise { + // Phase 1: non-critical — DB-only, fast + // eslint-disable-next-line no-constant-condition + while (true) { + const result = await processMavenNonCriticalBatch() + if (result.processed + result.skipped + result.errors === 0) break + } + + // Phase 2: critical — HTTP-bound, slower + // eslint-disable-next-line no-constant-condition + while (true) { + const result = await processMavenCriticalBatch() + if (result.processed + result.skipped + result.errors === 0) break + } +} diff --git a/services/apps/packages_worker/src/workflows/index.ts b/services/apps/packages_worker/src/workflows/index.ts index 5dc712e8df..b52ba11277 100644 --- a/services/apps/packages_worker/src/workflows/index.ts +++ b/services/apps/packages_worker/src/workflows/index.ts @@ -10,3 +10,4 @@ export { } from '../deps-dev/workflows' export { npmHello } from '../npm/workflows' export { osvSync } from '../osv/workflows' +export { pomFetcherWorkflow } from '../pom-fetcher/workflows' diff --git a/services/libs/data-access-layer/src/osspckgs/packages.ts b/services/libs/data-access-layer/src/osspckgs/packages.ts index f28dd1efea..9791ea76a8 100644 --- a/services/libs/data-access-layer/src/osspckgs/packages.ts +++ b/services/libs/data-access-layer/src/osspckgs/packages.ts @@ -14,17 +14,28 @@ export async function findPackageIdsByPurl( // ─── packages_universe ──────────────────────────────────────────────────────── /** - * Returns a page of Maven packages from packages_universe that either have no - * corresponding 
entry in `packages` yet, or whose `packages.last_synced_at` is - * older than the given cutoff (defaults to 7 days). + * Returns a page of Maven packages from packages_universe that need syncing + * into the packages table. * - * Ordered by rank_in_ecosystem ASC (most critical first), unranked last. + * Eligibility rules: + * - p.purl IS NULL → never added to packages (any criticality) + * - is_critical = false → periodic refresh of universe stats (default 180d) + * - is_critical = true → not yet POM-enriched, new version released, or + * periodic full refresh (default 90d) + * + * Critical packages are returned first so POM enrichment is prioritised. */ -export async function listMavenPackagesToEnrich( +export async function listMavenPackagesToSync( qx: QueryExecutor, - options: { limit: number; offset: number; staleDays?: number }, -): Promise[]> { - const { limit, offset, staleDays = 7 } = options + options: { limit: number; offset: number; fullRefreshDays?: number; nonCriticalRefreshDays?: number; isCritical?: boolean }, +): Promise< + (Pick & { + purl: string + latestVersion: string | null + })[] +> { + const { limit, offset, fullRefreshDays = 90, nonCriticalRefreshDays = 180, isCritical } = options + const isCriticalFilter = isCritical !== undefined ? 
isCritical : null return qx.select( ` @@ -32,22 +43,37 @@ export async function listMavenPackagesToEnrich( pu.id, pu.purl, pu.namespace, - pu.name + pu.name, + pu.is_critical AS "isCritical", + pu.criticality_score AS "criticalityScore", + pu.dependent_packages_count AS "dependentPackagesCount", + pu.dependent_repos_count AS "dependentReposCount", + pu.downloads_30d AS "downloads30d", + p.latest_version AS "latestVersion" FROM packages_universe pu LEFT JOIN packages p ON p.purl = pu.purl WHERE pu.ecosystem = 'maven' + AND pu.purl IS NOT NULL AND pu.namespace IS NOT NULL + AND ($(isCriticalFilter)::boolean IS NULL OR pu.is_critical = $(isCriticalFilter)::boolean) AND ( - p.id IS NULL - OR p.last_synced_at < NOW() - ($(staleDays) || ' days')::interval + p.purl IS NULL + OR (pu.is_critical = false + AND p.last_synced_at < NOW() - ($(nonCriticalRefreshDays) || ' days')::interval) + OR (pu.is_critical = true + AND p.ingestion_source IN ('maven_index', 'packages_universe')) + OR (pu.is_critical = true + AND p.latest_release_at > p.last_synced_at) + OR (pu.is_critical = true + AND p.last_synced_at < NOW() - ($(fullRefreshDays) || ' days')::interval) ) ORDER BY pu.rank_in_ecosystem ASC NULLS LAST, pu.id ASC LIMIT $(limit) OFFSET $(offset) `, - { limit, offset, staleDays }, + { limit, offset, fullRefreshDays, nonCriticalRefreshDays, isCriticalFilter }, ) } @@ -67,10 +93,16 @@ export async function upsertPackage(qx: QueryExecutor, item: IDbPackageUpsert): name, description, homepage, + registry_url, declared_repository_url, + repository_url, licenses, licenses_raw, latest_version, + criticality_score, + dependent_packages_count, + dependent_repos_count, + downloads_last_month, ingestion_source, last_synced_at ) VALUES ( @@ -80,25 +112,45 @@ export async function upsertPackage(qx: QueryExecutor, item: IDbPackageUpsert): $(name), $(description), $(homepage), + $(registryUrl), $(declaredRepositoryUrl), + $(repositoryUrl), $(licenses)::text[], $(licensesRaw), $(latestVersion), + 
$(criticalityScore), + $(dependentPackagesCount), + $(dependentReposCount), + $(downloadsLastMonth), $(ingestionSource), NOW() ) ON CONFLICT (purl) DO UPDATE SET - description = EXCLUDED.description, - homepage = EXCLUDED.homepage, - declared_repository_url = EXCLUDED.declared_repository_url, - licenses = EXCLUDED.licenses, - licenses_raw = EXCLUDED.licenses_raw, - latest_version = COALESCE(EXCLUDED.latest_version, packages.latest_version), - ingestion_source = EXCLUDED.ingestion_source, - last_synced_at = NOW() + description = COALESCE(EXCLUDED.description, packages.description), + homepage = COALESCE(EXCLUDED.homepage, packages.homepage), + registry_url = COALESCE(EXCLUDED.registry_url, packages.registry_url), + declared_repository_url = COALESCE(EXCLUDED.declared_repository_url, packages.declared_repository_url), + repository_url = COALESCE(EXCLUDED.repository_url, packages.repository_url), + licenses = COALESCE(EXCLUDED.licenses, packages.licenses), + licenses_raw = COALESCE(EXCLUDED.licenses_raw, packages.licenses_raw), + latest_version = COALESCE(EXCLUDED.latest_version, packages.latest_version), + criticality_score = COALESCE(EXCLUDED.criticality_score, packages.criticality_score), + dependent_packages_count = COALESCE(EXCLUDED.dependent_packages_count, packages.dependent_packages_count), + dependent_repos_count = COALESCE(EXCLUDED.dependent_repos_count, packages.dependent_repos_count), + downloads_last_month = COALESCE(EXCLUDED.downloads_last_month, packages.downloads_last_month), + ingestion_source = EXCLUDED.ingestion_source, + last_synced_at = NOW() RETURNING id `, - item, + { + ...item, + registryUrl: item.registryUrl ?? null, + repositoryUrl: item.repositoryUrl ?? null, + criticalityScore: item.criticalityScore ?? null, + dependentPackagesCount: item.dependentPackagesCount ?? null, + dependentReposCount: item.dependentReposCount ?? null, + downloadsLastMonth: item.downloadsLastMonth ?? 
null, + }, ) return row.id as number } diff --git a/services/libs/data-access-layer/src/osspckgs/types.ts b/services/libs/data-access-layer/src/osspckgs/types.ts index 7553ff00a6..ee3d93b849 100644 --- a/services/libs/data-access-layer/src/osspckgs/types.ts +++ b/services/libs/data-access-layer/src/osspckgs/types.ts @@ -7,6 +7,11 @@ export interface IDbPackageUniverse { namespace: string | null name: string rankInEcosystem: number | null + isCritical: boolean + criticalityScore: number | null + dependentPackagesCount: number | null + dependentReposCount: number | null + downloads30d: bigint | null } // ─── packages ───────────────────────────────────────────────────────────────── @@ -23,6 +28,12 @@ export type IDbPackageUpsert = { licensesRaw: string | null latestVersion: string | null ingestionSource: string + criticalityScore?: number | null + dependentPackagesCount?: number | null + dependentReposCount?: number | null + downloadsLastMonth?: bigint | null + registryUrl?: string | null + repositoryUrl?: string | null } // ─── maintainers ────────────────────────────────────────────────────────────── From 21289d60963d72e5aa3f1cfcdb3639193ac2c7b0 Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Mon, 1 Jun 2026 12:20:05 +0200 Subject: [PATCH 06/22] feat: improve pom extractor performances Signed-off-by: Umberto Sgueglia --- backend/.env.dist.local | 24 ++ scripts/services/pom-fetcher.yaml | 5 +- .../packages_worker/src/bin/pom-fetcher.ts | 5 +- services/apps/packages_worker/src/config.ts | 10 +- .../packages_worker/src/pom-fetcher/README.md | 294 ++++++++++++++ .../src/pom-fetcher/extract.ts | 68 +++- .../src/pom-fetcher/metadata.ts | 34 +- .../src/pom-fetcher/runPomEnrichmentLoop.ts | 360 ++++++++++-------- .../data-access-layer/src/osspckgs/index.ts | 2 + .../src/osspckgs/packages.ts | 37 +- .../data-access-layer/src/osspckgs/repos.ts | 44 +++ .../data-access-layer/src/osspckgs/types.ts | 29 ++ .../src/osspckgs/versions.ts | 53 +++ 13 files changed, 749 
insertions(+), 216 deletions(-) create mode 100644 services/apps/packages_worker/src/pom-fetcher/README.md create mode 100644 services/libs/data-access-layer/src/osspckgs/versions.ts diff --git a/backend/.env.dist.local b/backend/.env.dist.local index b1a9c85f8d..91c19e85c1 100755 --- a/backend/.env.dist.local +++ b/backend/.env.dist.local @@ -206,3 +206,27 @@ POM_FETCHER_NON_CRITICAL_CONCURRENCY=20 # pom-fetcher critical (HTTP) POM_FETCHER_BATCH_SIZE=100 POM_FETCHER_CONCURRENCY=5 +# ── non-critical DB-only (used when DIRECT_POM_FOR_ALL=false) ───────────────── +# POM_FETCHER_NON_CRITICAL_BATCH_SIZE=500 +POM_FETCHER_NON_CRITICAL_CONCURRENCY=20 # DB writes only, no HTTP +POM_FETCHER_NON_CRITICAL_REFRESH_DAYS=1 + +# ── non-critical HTTP / direct-pom (used when DIRECT_POM_FOR_ALL=true) ──────── +# POM_FETCHER_NON_CRITICAL_POM_CONCURRENCY=5 +# POM_FETCHER_NON_CRITICAL_POM_REFRESH_DAYS=1 + +# ── critical HTTP / full-pom (always active) ────────────────────────────────── +# POM_FETCHER_BATCH_SIZE=100 +# POM_FETCHER_CONCURRENCY=5 +# POM_FETCHER_FULL_REFRESH_DAYS=1 + +# ── mode and rate limiting ──────────────────────────────────────────────────── +# POM_FETCHER_DIRECT_POM_FOR_ALL=true +# POM_FETCHER_GROUP_DELAY_MS=200 + + +POM_FETCHER_BATCH_SIZE=50 +POM_FETCHER_CONCURRENCY=3 +POM_FETCHER_REFRESH_DAYS=1 +POM_FETCHER_GROUP_DELAY_MS=500 +POM_FETCHER_IDLE_SLEEP_SEC=3600 diff --git a/scripts/services/pom-fetcher.yaml b/scripts/services/pom-fetcher.yaml index 4210778a6d..54f083c598 100644 --- a/scripts/services/pom-fetcher.yaml +++ b/scripts/services/pom-fetcher.yaml @@ -8,8 +8,9 @@ x-env-args: &env-args SUPPRESS_NO_CONFIG_WARNING: 'true' LOG_LEVEL: 'info' POM_FETCHER_BATCH_SIZE: '50' - POM_FETCHER_CONCURRENCY: '3' - POM_FETCHER_STALE_DAYS: '7' + POM_FETCHER_CONCURRENCY: '5' + POM_FETCHER_REFRESH_DAYS: '1' + POM_FETCHER_GROUP_DELAY_MS: '200' POM_FETCHER_IDLE_SLEEP_SEC: '3600' services: diff --git a/services/apps/packages_worker/src/bin/pom-fetcher.ts
b/services/apps/packages_worker/src/bin/pom-fetcher.ts index 5d2773f911..efac07125e 100644 --- a/services/apps/packages_worker/src/bin/pom-fetcher.ts +++ b/services/apps/packages_worker/src/bin/pom-fetcher.ts @@ -21,10 +21,7 @@ const main = async () => { log.info('pom-fetcher starting...') const config = getPomFetcherConfig() - log.info( - { batchSize: config.batchSize, concurrency: config.concurrency, fullRefreshDays: config.fullRefreshDays }, - 'Config loaded', - ) + log.info(config, 'Config loaded') const qx = await getPackagesDb() await qx.selectOne('SELECT 1') diff --git a/services/apps/packages_worker/src/config.ts b/services/apps/packages_worker/src/config.ts index 24f88d4a6c..6901fd9da2 100644 --- a/services/apps/packages_worker/src/config.ts +++ b/services/apps/packages_worker/src/config.ts @@ -48,14 +48,10 @@ export function getEnricherConfig() { export function getPomFetcherConfig() { return { - // critical packages — HTTP-bound, keep low batchSize: parseInt(process.env.POM_FETCHER_BATCH_SIZE ?? '50', 10), - concurrency: parseInt(process.env.POM_FETCHER_CONCURRENCY ?? '3', 10), - fullRefreshDays: parseInt(process.env.POM_FETCHER_FULL_REFRESH_DAYS ?? '90', 10), - // non-critical packages — DB-only, can go much higher - nonCriticalBatchSize: parseInt(process.env.POM_FETCHER_NON_CRITICAL_BATCH_SIZE ?? '500', 10), - nonCriticalConcurrency: parseInt(process.env.POM_FETCHER_NON_CRITICAL_CONCURRENCY ?? '20', 10), - nonCriticalRefreshDays: parseInt(process.env.POM_FETCHER_NON_CRITICAL_REFRESH_DAYS ?? '180', 10), + concurrency: parseInt(process.env.POM_FETCHER_CONCURRENCY ?? '5', 10), + refreshDays: parseInt(process.env.POM_FETCHER_REFRESH_DAYS ?? '1', 10), + groupDelayMs: parseInt(process.env.POM_FETCHER_GROUP_DELAY_MS ?? '200', 10), idleSleepSec: parseInt(process.env.POM_FETCHER_IDLE_SLEEP_SEC ?? 
'3600', 10), } } diff --git a/services/apps/packages_worker/src/pom-fetcher/README.md b/services/apps/packages_worker/src/pom-fetcher/README.md new file mode 100644 index 0000000000..8ca08638df --- /dev/null +++ b/services/apps/packages_worker/src/pom-fetcher/README.md @@ -0,0 +1,294 @@ +# Maven POM Fetcher + +Worker that syncs Maven package metadata from Maven Central into the `packages` DB. +Runs as a standalone entry point inside `packages_worker` on a daily Temporal schedule (4am UTC). + +--- + +## Architecture: Two-Tier Fetch + +All Maven packages in `packages_universe` are processed in two sequential phases per pass. + +### Phase 1 — Non-Critical + +| Mode | Trigger | What happens | +|------|---------|-------------| +| `POM_FETCHER_DIRECT_POM_FOR_ALL=false` (default) | `last_synced_at > 1 day` | DB-only: copies universe stats (criticality, downloads, dependents) into `packages`. No HTTP. | +| `POM_FETCHER_DIRECT_POM_FOR_ALL=true` | `last_synced_at > 30 days` | Fetches `maven-metadata.xml` + root POM (no parent chain). Populates description, homepage, SCM, maintainers, versions. | + +### Phase 2 — Critical + +Always active. Full POM extraction with parent chain resolution (max 7 hops). Runs when: +- Package not yet in `packages` table +- `ingestion_source IN ('maven_index', 'packages_universe')` — not yet POM-enriched +- New version released (`latest_release_at > last_synced_at`) +- Periodic full refresh (`last_synced_at > 90 days`) + +**Why two tiers?** Parent POM resolution is the expensive part — it requires multiple HTTP requests per package (up to 7 extra fetches). Running it on millions of non-critical packages every day is not feasible. For non-critical packages a single direct-POM fetch is sufficient; for critical packages the extra cost is justified by data quality. 
+ +--- + +## Coverage Matrix (`POM_FETCHER_DIRECT_POM_FOR_ALL=true`) + +### packages + +| Column | Source | Coverage | +|--------|--------|----------| +| purl | packages_universe | ✅ all | +| ecosystem | hardcoded `'maven'` | ✅ all | +| namespace | packages_universe.namespace (= groupId) | ✅ all | +| name | packages_universe.name (= artifactId) | ✅ all | +| registry_url | `https://central.sonatype.com/artifact/{ns}/{name}` | ✅ all | +| latest_version | maven-metadata.xml `<release>` | ✅ all | +| ingestion_source | see table below | ✅ all | +| last_synced_at | NOW() | ✅ all | +| description | POM `<description>` | ✅ best-effort¹ | +| homepage | POM `<url>` | ✅ best-effort¹ | +| declared_repository_url | POM `<scm><url>` raw | ✅ best-effort¹ | +| repository_url | normalized from declared_repository_url | ✅ best-effort¹ | +| licenses / licenses_raw | POM `<licenses>` | ✅ best-effort¹ / ✅ full for critical² | +| status | Sonatype: deprecated flag | 🔜 Sonatype | +| versions_count | Sonatype: COUNT of releases | 🔜 Sonatype | +| first_release_at | Sonatype: MIN release timestamp | 🔜 Sonatype | +| latest_release_at | Sonatype: MAX release timestamp | 🔜 Sonatype | +| keywords | not in Maven POM | ❌ | +| dist_tags_* | N/A — Maven ecosystem | ❌ | +| dependent_packages_count | not in Maven registry API | ❌ | +| dependent_repos_count | not in Maven registry API | ❌ | +| criticality_score | set by ranking function | ❌ | +| is_critical | set by ranking function | ❌ | +| last_rank_pass_at | set by ranking function | ❌ | + +### versions + +| Column | Source | Coverage | +|--------|--------|----------| +| package_id | FK from packages upsert | ✅ all | +| ecosystem | hardcoded `'maven'` | ✅ all | +| number | maven-metadata.xml `<versions>` | ✅ all | +| is_latest | `number === <release>` | ✅ all | +| is_prerelease | regex on version string³ | ✅ all | +| last_synced_at | NOW() | ✅ all | +| license | package-level license applied to all versions⁴ | ✅ best-effort¹ | +| published_at | Sonatype: release timestamp | 🔜 Sonatype | +| is_yanked | no yank 
mechanism in Maven | ❌ | +| download_count | no public per-version API | ❌ | + +### maintainers / package_maintainers + +| Column | Source | Coverage | +|--------|--------|----------| +| ecosystem | hardcoded `'maven'` | ✅ all | +| username | POM `<id>` | ✅ best-effort¹ | +| display_name | POM `<name>` | ✅ best-effort¹ | +| email_hash | SHA-256(`<email>`) — GDPR | ✅ best-effort¹ | +| url | POM `<url>` | ✅ best-effort¹ | +| role | `'author'` from `<developers>`, `'maintainer'` from `<contributors>` | ✅ best-effort¹ | +| github_login | requires identity resolution | ❌ | + +### repos / package_repos + +| Column | Source | Coverage | +|--------|--------|----------| +| repos.url | `repository_url` (normalized from POM `<scm>`) | ✅ best-effort¹ | +| repos.host | derived from URL (`github` / `gitlab` / `bitbucket` / `other`) | ✅ best-effort¹ | +| repos.owner | URL path segment | ✅ best-effort¹ | +| repos.name | URL path segment | ✅ best-effort¹ | +| repos.description / stars / forks / … | GitHub enricher | filled by github-repos-enricher | +| package_repos.source | `'declared'` (from POM `<scm>`) | ✅ best-effort¹ | +| package_repos.confidence | `0.80` | ✅ best-effort¹ | + +The POM fetcher seeds `repos` with URL-derivable fields only. The GitHub enricher then fills the rest (description, stars, forks, language, topics, etc.) because the repo row already exists. On conflict the `repos` upsert uses `COALESCE` — richer data from other enrichers is never overwritten. + +`package_repos.confidence` is updated with `GREATEST(new, existing)` so a higher-confidence link from deps.dev (`0.90`) is never downgraded by our `0.80` write. + +### Not supported (no Maven source) + +`package_funding_links` — no funding concept in Maven POM. +`package_name_history` — Maven coordinates are immutable; rename history does not exist. +`downloads_daily` — no public per-day download API from Maven Central. +`downloads_last_30d` — 🔜 Sonatype. + +--- + +**Notes:** + +> ¹ **best-effort**: field is populated when declared in the direct POM. 
If inherited from a parent POM (common for `licenses`, `<scm>` in Apache/Spring/Google projects), it is null for non-critical packages. +> +> ² **full resolution for critical**: parent chain is followed (max 7 hops), so inherited fields are resolved correctly. +> +> ³ **prerelease regex**: matches `-SNAPSHOT`, `-alpha`, `-beta`, `-rc`, `-M[0-9]+` (case-insensitive). +> +> ⁴ **license per version**: the package-level license (first entry from POM `<licenses>`) is applied to all versions. Per-version POM fetches are not performed. This is an approximation — Maven licenses rarely change between versions. + +--- + +## `ingestion_source` Values + +| Value | Meaning | +|-------|---------| +| `pom_fetcher` | Critical — full POM + parent resolution succeeded | +| `pom_fetcher_direct` | Non-critical — direct POM fetch succeeded (no parent resolution) | +| `pom_fetcher_not_on_central` | `maven-metadata.xml` not found on `repo1.maven.org` — artifact is hosted on a third-party repository (e.g. WSO2 Nexus, JBoss, Atlassian). Universe data came from an aggregator (deps.dev, OSV). | +| `pom_fetcher_no_version` | `maven-metadata.xml` found but `<release>` is empty — artifact has no stable release | +| `pom_fetcher_error` | `maven-metadata.xml` has a release version but the `.pom` file for that version is a 404. Typical cause: partial deploy to Maven Central (metadata updated, artifact not uploaded) or Eclipse P2 feature artifacts that don't publish a standard POM. | +| `pom_fetcher_rate_limited` | Maven Central returned 403/429 on all retry attempts. Package will be retried on the next pass. | +| `packages_universe` | Non-critical, DB-only mode (`POM_FETCHER_DIRECT_POM_FOR_ALL=false`) — only universe stats copied | + +--- + +## Known Exceptions + +### WSO2 (`org.wso2.*`) + +WSO2 publishes some artifacts exclusively to their own Nexus at `maven.wso2.org`. 
A subset of their artifacts appears in `packages_universe` (sourced from deps.dev/OSV which aggregates all Maven repositories) but are **not** available on `repo1.maven.org/maven2`. + +Affected pattern: `org.wso2.carbon.*` — specifically `.feature` Eclipse P2 artifacts and `.stub` artifacts. These are written with `ingestion_source = 'pom_fetcher_not_on_central'` and are not retried until the next `nonCriticalPomRefreshDays` window. + +### Eclipse P2 Feature Artifacts (`*.feature`) + +Eclipse/OSGi feature artifacts (e.g. `org.wso2.carbon.identity.xacml.server.feature`) are packaged as `.zip` files, not `.jar`. Some publishers update `maven-metadata.xml` on Central without uploading the corresponding `.pom`. These land in `pom_fetcher_error`. No fix is possible without the publisher correcting their CI/CD pipeline. + +### Maven Central 403 rate limiting + +Maven Central (`repo1.maven.org`) returns 403 as a throttling mechanism in addition to the canonical 429. This behavior is handled at two levels: + +1. **Retry with exponential backoff** — 403 and 429 responses are retried up to 3 times (2s base, ×2 per attempt). Handled in `getWithRetry` (extract.ts) and `resolveVersionsList` (metadata.ts). + +2. **DB fallback** — if all retries are exhausted, the package is written with `ingestion_source = 'pom_fetcher_rate_limited'` and `last_synced_at = NOW()`, avoiding infinite loops. It will be retried on the next refresh cycle. + +**Root cause of persistent 403s:** `packages_universe` is ordered by `rank_in_ecosystem`, so packages from the same namespace (e.g. `com.google.apis`, `org.wso2`, `software.amazon.awssdk`) cluster together in a batch and hit the same Maven Central CDN node in rapid succession. The rate limit kicks in systematically after ~150–200 processed packages. + +**Fix applied:** HTTP batches are shuffled before execution (`[...packages].sort(() => Math.random() - 0.5)`) to distribute namespaces evenly across the concurrent groups. 
A configurable delay between groups (`POM_FETCHER_GROUP_DELAY_MS`, default 200ms) further reduces the request rate. + +Namespaces known to trigger the rate limit because of their high artifact density: `com.google.apis`, `software.amazon.awssdk`, `org.wso2.*`. + +**Warm IP during local testing:** repeated runs on the same machine accumulate request history on the IP. Maven Central uses long throttle windows (1–4 hours), so even at concurrency=3 + delay=400ms the IP can stay throttled for the entire test session. This does not happen in production because runs are spaced 24 hours apart and the IP is always cold between one pass and the next. To check whether the IP is throttled: `curl -I https://repo1.maven.org/maven2/org/wso2/carbon/identity/framework/application-mgt/maven-metadata.xml` — an immediate 403 response confirms the throttle. + +### Partial Maven Central Deploys + +Occasionally a publisher's CI/CD updates `<release>` in `maven-metadata.xml` before the `.pom` is fully propagated to all Central mirrors. These appear as `pom_fetcher_error` on the first pass and usually resolve on the next periodic refresh. + +--- + +## Configuration Reference + +All variables are optional — defaults are shown. 
+ +| Env var | Default | Description | +|---------|---------|-------------| +| `POM_FETCHER_DIRECT_POM_FOR_ALL` | `false` | `true` = direct POM fetch for all packages + full resolution for critical | +| `POM_FETCHER_BATCH_SIZE` | `50` | Packages per batch — critical phase | +| `POM_FETCHER_CONCURRENCY` | `5` | Concurrent fetches — critical phase | +| `POM_FETCHER_FULL_REFRESH_DAYS` | `90` | Re-sync critical packages after N days | +| `POM_FETCHER_NON_CRITICAL_BATCH_SIZE` | `500` | Packages per batch — non-critical phase | +| `POM_FETCHER_NON_CRITICAL_CONCURRENCY` | `20` | Concurrent writes — non-critical DB-only mode | +| `POM_FETCHER_NON_CRITICAL_POM_CONCURRENCY` | `5` | Concurrent fetches — non-critical direct-pom mode | +| `POM_FETCHER_NON_CRITICAL_REFRESH_DAYS` | `1` | Re-sync non-critical stats after N days (DB-only) | +| `POM_FETCHER_NON_CRITICAL_POM_REFRESH_DAYS` | `30` | Re-sync non-critical POM data after N days (direct-pom) | +| `POM_FETCHER_IDLE_SLEEP_SEC` | `3600` | Sleep between passes | + +**Concurrency guidance:** Maven Central handles 10–15 concurrent requests per IP without throttling. Retry logic with exponential backoff handles 429s. Keep `POM_FETCHER_CONCURRENCY` + `POM_FETCHER_NON_CRITICAL_POM_CONCURRENCY` ≤ 15 in production. 
+ +--- + +## Performance + +Observed on ~2K packages (local dev, Maven Central over the network): + +| Phase | Mode | Throughput | Notes | +|-------|------|------------|-------| +| Non-critical | DB-only | ~1000 pkg/sec | Pure DB writes, no HTTP | +| Non-critical | direct-pom | ~25 pkg/sec | 2 HTTP requests/pkg: metadata.xml + POM | +| Critical | full-pom | ~15–25 pkg/sec | Faster when packages share parent POMs (CDN cache warm) | + +**Estimated time for 800K packages (10% critical) at default settings:** + +| Phase | Packages | Estimated time | +|-------|----------|---------------| +| Non-critical (DB-only, first pass) | 720K | ~12 min | +| Non-critical (direct-pom, first pass) | 720K | ~8 h | +| Critical (full-pom, first pass) | 80K | ~3–4 h | + +First pass is the expensive one. Subsequent daily passes are incremental: +- Non-critical DB-only: re-syncs all packages daily (~12 min) +- Non-critical direct-pom: re-syncs after 30 days (~8 h every 30 days) +- Critical: only packages with new versions or approaching 90-day refresh window + +The daily Temporal schedule has a **12-hour workflow timeout**. With `POM_FETCHER_DIRECT_POM_FOR_ALL=true`, the first pass is within budget (~11–12 h); increase `POM_FETCHER_NON_CRITICAL_POM_CONCURRENCY` to 10 to halve non-critical time if needed. + +--- + +## Scheduling + +Runs daily at **4am UTC** via Temporal schedule `maven-pom-fetcher`. +Overlap policy: `SKIP` — if a previous run is still active, the new trigger is dropped. +Catchup window: 1 hour. +Workflow timeout: 12 hours. +Max retries: 3 (30s initial, 2× backoff). + +--- + +## Known Data Anomalies + +### High version counts + +Maven packages released via automated CI/CD pipelines (every commit or every day) accumulate thousands of versions on Central. 
Observed examples on a 10K sample: + +| Package | Versions | +|---------|----------| +| io.joern/x2cpg_3 | ~2 166 | +| org.cdk8s/cdk8s | ~1 749 | +| io.joern/semanticcpg_3 | ~2 077 | +| org.janusgraph/* (×15 artifacts) | ~795 each | + +`maven-metadata.xml` `<versions>` lists **every version ever published**, including each snapshot, alpha, RC, and automated patch. On a 10K package run this produced ~3.8M rows in the `versions` table (~1 375 versions/package on average). + +This is correct data, not a bug. The high cardinality is expected given Maven's publishing model and is useful for `versions_count`, `first_release_at`, and `is_prerelease` derivation. `published_at` (pending Sonatype) will complete the picture. + +To inspect the distribution: + +```sql +SELECT + width_bucket(cnt, 0, 3000, 10) AS bucket, + min(cnt) AS min_versions, + max(cnt) AS max_versions, + count(*) AS packages +FROM (SELECT package_id, count(*) AS cnt FROM versions GROUP BY package_id) t +GROUP BY bucket ORDER BY bucket; +``` + +### Low repo coverage for non-critical packages + +On the same 10K sample only ~3–4% of packages produced a `repos` row. The root cause is that `<scm>` is frequently absent from the direct POM and inherited from a parent POM instead (Apache parent, Spring parent, Google parent, etc.). Since non-critical packages use direct-POM fetch without parent resolution, those SCM URLs are null and no repo row is written. 
+ +Coverage breakdown by ingestion source: + +```sql +SELECT p.ingestion_source, count(p.id) AS packages, count(pr.id) AS with_repo +FROM packages p +LEFT JOIN package_repos pr ON pr.package_id = p.id +WHERE p.ecosystem = 'maven' +GROUP BY p.ingestion_source +ORDER BY packages DESC; +``` + +Expected behaviour: +- `pom_fetcher` (critical, full resolution) → high repo coverage +- `pom_fetcher_direct` (non-critical, no parent resolution) → low repo coverage +- `pom_fetcher_not_on_central` / `pom_fetcher_error` → no repo (no POM data) + +Repo coverage will grow naturally as the critical package set expands and as non-critical packages hit their 30-day POM refresh window. + +--- + +## Pending: Sonatype Integration + +The following fields require data from the Sonatype API and are not yet populated: + +- `packages.status` — deprecated flag +- `packages.versions_count` — count of published releases +- `packages.first_release_at` — timestamp of first release +- `packages.latest_release_at` — timestamp of most recent release +- `versions.published_at` — per-version release timestamp +- `downloads_last_30d` — 30-day rolling download count diff --git a/services/apps/packages_worker/src/pom-fetcher/extract.ts b/services/apps/packages_worker/src/pom-fetcher/extract.ts index 58c200f0f6..4ce81fd25e 100644 --- a/services/apps/packages_worker/src/pom-fetcher/extract.ts +++ b/services/apps/packages_worker/src/pom-fetcher/extract.ts @@ -11,7 +11,6 @@ import { XMLParser } from 'fast-xml-parser' export interface PomMaintainer { username: string | null displayName: string | null - /** Raw email from POM — hash with SHA-256 before storing (GDPR) */ email: string | null url: string | null role: 'author' | 'maintainer' @@ -55,7 +54,7 @@ interface PomPerson { // ─── Config ─────────────────────────────────────────────────────────────────── const MAVEN_REPO = 'https://repo1.maven.org/maven2' -const MAX_PARENT_HOPS = 5 +export const MAX_PARENT_HOPS = 7 const REQUEST_TIMEOUT_MS = 15_000 const 
parser = new XMLParser({ @@ -80,8 +79,10 @@ async function getWithRetry(url: string): Promise { const res = await axios.get(url, { responseType: 'text', timeout: REQUEST_TIMEOUT_MS }) return res.data } catch (err) { - if (axios.isAxiosError(err) && err.response?.status === 429) { - if (attempt < MAX_RETRIES) { + if (axios.isAxiosError(err)) { + const status = err.response?.status + // 429 = explicit rate limit, 403 = CDN throttle (Maven Central uses both) + if ((status === 429 || status === 403) && attempt < MAX_RETRIES) { const delay = RETRY_BASE_MS * 2 ** attempt + Math.random() * 500 await sleep(delay) continue @@ -195,10 +196,65 @@ async function resolveWithInheritance( } } -// ─── Public entry point ─────────────────────────────────────────────────────── +// ─── Public entry points ────────────────────────────────────────────────────── /** - * Fetches and resolves POM metadata for the given Maven artifact. + * Fetches only the root POM without following the parent chain. + * Faster than extractArtifact — use for non-critical packages where inherited + * fields (licenses, SCM) may be missing but throughput matters more. + */ +export async function extractArtifactDirect( + groupId: string, + artifactId: string, + version: string, + log: (msg: string) => void = () => undefined, +): Promise { + const purl = `pkg:maven/${groupId}/${artifactId}@${version}` + const pom = await fetchPom(groupId, artifactId, version, log) + + if (!pom) { + return { + groupId, + artifactId, + version, + purl, + description: null, + licenses: [], + licensesRaw: null, + scmUrl: null, + homepageUrl: null, + developers: [], + contributors: [], + parentHops: 0, + error: `POM not found: ${buildPomUrl(groupId, artifactId, version)}`, + } + } + + const licenses = extractLicenses(pom) + const scmUrl = extractStr(pom.scm?.url ?? 
pom.scm?.connection) + const developers = extractPersons(pom.developers?.developer, 'author') + const contributors = extractPersons(pom.contributors?.contributor, 'maintainer') + + return { + groupId, + artifactId, + version, + purl, + description: extractStr(pom.description), + licenses, + licensesRaw: licenses.length > 0 ? licenses.join(', ') : null, + scmUrl, + homepageUrl: extractStr(pom.url), + developers, + contributors, + parentHops: 0, + error: null, + } +} + +/** + * Fetches and resolves POM metadata for the given Maven artifact, following + * the parent chain to inherit licenses and SCM when not in the direct POM. * Always returns a result object; errors are captured in `result.error`. */ export async function extractArtifact( diff --git a/services/apps/packages_worker/src/pom-fetcher/metadata.ts b/services/apps/packages_worker/src/pom-fetcher/metadata.ts index c6153c21ce..c755739b78 100644 --- a/services/apps/packages_worker/src/pom-fetcher/metadata.ts +++ b/services/apps/packages_worker/src/pom-fetcher/metadata.ts @@ -1,6 +1,6 @@ /** - * Resolves the latest release version of a Maven artifact using the - * maven-metadata.xml endpoint on Maven Central. + * Fetches maven-metadata.xml for a Maven artifact and returns the full version + * list plus the current release version. 
* * URL format: * https://repo1.maven.org/maven2/{groupPath}/{artifactId}/maven-metadata.xml @@ -24,14 +24,19 @@ const parser = new XMLParser({ parseAttributeValue: false, }) +export interface MavenVersionsMetadata { + versions: string[] + releaseVersion: string | null +} + async function sleep(ms: number): Promise { return new Promise((r) => setTimeout(r, ms)) } -export async function resolveLatestVersion( +export async function resolveVersionsList( groupId: string, artifactId: string, -): Promise { +): Promise { const groupPath = groupId.replace(/\./g, '/') const url = `${MAVEN_REPO}/${groupPath}/${artifactId}/maven-metadata.xml` @@ -45,11 +50,20 @@ export async function resolveLatestVersion( const release = typeof versioning?.release === 'string' ? versioning.release.trim() : null const latest = typeof versioning?.latest === 'string' ? versioning.latest.trim() : null - return release || latest || null + const rawVersions = versioning?.versions?.version + let versions: string[] = [] + if (Array.isArray(rawVersions)) { + versions = rawVersions.map((v: unknown) => String(v).trim()).filter(Boolean) + } else if (typeof rawVersions === 'string' && rawVersions.trim()) { + versions = [rawVersions.trim()] + } + + return { versions, releaseVersion: release || latest || null } } catch (err) { if (axios.isAxiosError(err)) { if (err.response?.status === 404) return null - if (err.response?.status === 429 && attempt < MAX_RETRIES) { + // 429 = explicit rate limit, 403 = CDN throttle (Maven Central uses both) + if ((err.response?.status === 429 || err.response?.status === 403) && attempt < MAX_RETRIES) { const delay = RETRY_BASE_MS * 2 ** attempt + Math.random() * 500 await sleep(delay) continue @@ -61,3 +75,11 @@ export async function resolveLatestVersion( return null } + +export async function resolveLatestVersion( + groupId: string, + artifactId: string, +): Promise { + const meta = await resolveVersionsList(groupId, artifactId) + return meta?.releaseVersion ?? 
null +} diff --git a/services/apps/packages_worker/src/pom-fetcher/runPomEnrichmentLoop.ts b/services/apps/packages_worker/src/pom-fetcher/runPomEnrichmentLoop.ts index 9b82da2515..5f28e1b12b 100644 --- a/services/apps/packages_worker/src/pom-fetcher/runPomEnrichmentLoop.ts +++ b/services/apps/packages_worker/src/pom-fetcher/runPomEnrichmentLoop.ts @@ -1,80 +1,148 @@ -import crypto from 'crypto' - import { - listMavenPackagesToSync, + listCriticalMavenPackagesToSync, upsertMaintainer, upsertPackage, upsertPackageMaintainer, + upsertPackageRepo, + upsertRepo, + upsertVersionsBatch, } from '@crowd/data-access-layer' import { QueryExecutor } from '@crowd/data-access-layer/src/queryExecutor' import { getServiceChildLogger } from '@crowd/logging' import { getPomFetcherConfig } from '../config' -import { extractArtifact, normalizeScmUrl } from './extract' -import { resolveLatestVersion } from './metadata' +import { MAX_PARENT_HOPS, extractArtifact, normalizeScmUrl } from './extract' +import { resolveVersionsList } from './metadata' const log = getServiceChildLogger('pom-fetcher') -// ─── Types ──────────────────────────────────────────────────────────────────── - export interface BatchResult { processed: number skipped: number errors: number } -type PackageToSync = Awaited>[number] +type PomFetcherConfig = ReturnType +type PackageRow = Awaited>[number] -// ─── Non-critical: copy universe stats into packages ───────────────────────── +// ─── Helpers ────────────────────────────────────────────────────────────────── function mavenRegistryUrl(groupId: string, artifactId: string): string { return `https://central.sonatype.com/artifact/${groupId}/${artifactId}` } -async function processNonCriticalPackage(qx: QueryExecutor, pkg: PackageToSync): Promise { - await upsertPackage(qx, { - purl: pkg.purl, - ecosystem: 'maven', - namespace: pkg.namespace, - name: pkg.name, - description: null, - homepage: null, - registryUrl: pkg.namespace ? 
mavenRegistryUrl(pkg.namespace, pkg.name) : null, - declaredRepositoryUrl: null, - repositoryUrl: null, - licenses: null, - licensesRaw: null, - latestVersion: null, - ingestionSource: 'packages_universe', - criticalityScore: pkg.criticalityScore, - dependentPackagesCount: pkg.dependentPackagesCount, - dependentReposCount: pkg.dependentReposCount, - downloadsLastMonth: pkg.downloads30d, - }) +function isPrerelease(version: string): boolean { + return /-(SNAPSHOT|alpha|beta|rc|m\d+)/i.test(version) } -// ─── Critical: full POM extraction ─────────────────────────────────────────── +function interleaveByNamespace(packages: PackageRow[]): PackageRow[] { + const byNamespace = new Map() + for (const pkg of packages) { + const ns = pkg.namespace ?? '__unknown__' + if (!byNamespace.has(ns)) byNamespace.set(ns, []) + byNamespace.get(ns)!.push(pkg) + } + const queues = [...byNamespace.values()] + const result: PackageRow[] = [] + let i = 0 + while (result.length < packages.length) { + const q = queues[i % queues.length] + if (q.length > 0) result.push(q.shift()!) + i++ + } + return result +} -async function processCriticalPackage( - qx: QueryExecutor, - pkg: PackageToSync, -): Promise<'processed' | 'skipped' | 'error'> { +interface PersonWithRole { + username: string | null + displayName: string | null + email: string | null + url: string | null + role: 'author' | 'maintainer' +} + +async function writeMaintainers(qx: QueryExecutor, packageId: number, people: PersonWithRole[]): Promise { + let count = 0 + for (const person of people) { + const username = person.username ?? person.email ?? 
person.displayName + if (!username) continue + const maintainerId = await upsertMaintainer(qx, { + ecosystem: 'maven', + username, + displayName: person.displayName, + url: person.url, + emailHash: person.email, /* NOTE(review): the raw email is passed as emailHash here, and this change removes the in-loop SHA-256 hashing that the previous code performed — confirm upsertMaintainer hashes it before storing */ + }) + await upsertPackageMaintainer(qx, { packageId, maintainerId, role: person.role }) + count++ + } + return count +} + +function parseRepoUrl(url: string): { host: string; owner: string | null; name: string | null } | null { /* Maps the hostname to github / gitlab / bitbucket / other (any hostname containing gitlab counts as gitlab) and takes the first two path segments as owner and name; returns null when the URL constructor throws. */ + try { + const parsed = new URL(url) + const h = parsed.hostname.toLowerCase() + let host: string + if (h === 'github.com' || h.endsWith('.github.com')) host = 'github' + else if (h === 'gitlab.com' || h.includes('gitlab')) host = 'gitlab' + else if (h === 'bitbucket.org') host = 'bitbucket' + else host = 'other' + const parts = parsed.pathname.split('/').filter(Boolean) + return { host, owner: parts[0] ?? null, name: parts[1] ?? null } + } catch { + return null + } +} + +async function writeRepoLink(qx: QueryExecutor, packageId: number, repositoryUrl: string | null): Promise { /* Does nothing when repositoryUrl is null or cannot be parsed; otherwise calls upsertRepo and then upsertPackageRepo with source declared and confidence 0.8. */ + if (!repositoryUrl) return + const parsed = parseRepoUrl(repositoryUrl) + if (!parsed) return + const repoId = await upsertRepo(qx, { url: repositoryUrl, ...parsed }) + await upsertPackageRepo(qx, { packageId, repoId, source: 'declared', confidence: 0.8 }) +} + +// ─── Package processing ─────────────────────────────────────────────────────── + +async function processPackage(qx: QueryExecutor, pkg: PackageRow): Promise<'processed' | 'skipped' | 'error'> { const groupId = pkg.namespace const artifactId = pkg.name if (!groupId) { - log.warn({ purl: pkg.purl }, 'Skipping critical package with null namespace (groupId)') + log.warn({ purl: pkg.purl }, 'Skipping: null namespace (groupId)') return 'skipped' } - let version = pkg.latestVersion ?? 
null - if (!version) { - log.debug({ groupId, artifactId }, 'No baseline version — falling back to maven-metadata.xml') - version = await resolveLatestVersion(groupId, artifactId) + const meta = await resolveVersionsList(groupId, artifactId) + + if (!meta) { + await upsertPackage(qx, { + purl: pkg.purl, + ecosystem: 'maven', + namespace: groupId, + name: artifactId, + description: null, + homepage: null, + registryUrl: mavenRegistryUrl(groupId, artifactId), + declaredRepositoryUrl: null, + repositoryUrl: null, + licenses: null, + licensesRaw: null, + latestVersion: pkg.latestVersion ?? null, + ingestionSource: 'pom_fetcher_not_on_central', + criticalityScore: pkg.criticalityScore, + dependentPackagesCount: pkg.dependentPackagesCount, + dependentReposCount: pkg.dependentReposCount, + downloadsLastMonth: pkg.downloads30d, + }) + log.warn({ groupId, artifactId }, 'Not on Maven Central — writing minimal record') + return 'skipped' } + const version = meta.releaseVersion + if (!version) { - log.warn({ groupId, artifactId }, 'Could not resolve latest version, skipping') await upsertPackage(qx, { purl: pkg.purl, ecosystem: 'maven', @@ -94,6 +162,7 @@ async function processCriticalPackage( dependentReposCount: pkg.dependentReposCount, downloadsLastMonth: pkg.downloads30d, }) + log.warn({ groupId, artifactId }, 'No release version in metadata — writing minimal record') return 'skipped' } @@ -102,10 +171,38 @@ async function processCriticalPackage( }) if (result.error) { - log.warn({ groupId, artifactId, version, error: result.error }, 'POM extraction error') + log.warn({ groupId, artifactId, version, error: result.error }, 'POM extraction failed') + await upsertPackage(qx, { + purl: pkg.purl, + ecosystem: 'maven', + namespace: groupId, + name: artifactId, + description: null, + homepage: null, + registryUrl: mavenRegistryUrl(groupId, artifactId), + declaredRepositoryUrl: null, + repositoryUrl: null, + licenses: null, + licensesRaw: null, + latestVersion: version, + 
ingestionSource: 'pom_fetcher_error', + criticalityScore: pkg.criticalityScore, + dependentPackagesCount: pkg.dependentPackagesCount, + dependentReposCount: pkg.dependentReposCount, + downloadsLastMonth: pkg.downloads30d, + }) return 'error' } + if (result.parentHops > MAX_PARENT_HOPS) { /* NOTE(review): the strict comparison only fires when parentHops exceeds MAX_PARENT_HOPS; if the resolver stops exactly at the limit this warning can never be logged — confirm whether >= was intended */ + log.warn( + { groupId, artifactId, parentHops: result.parentHops, missingLicenses: result.licenses.length === 0, missingScm: !result.scmUrl }, + 'Parent hop limit reached — data may be incomplete', + ) + } + + const repositoryUrl = normalizeScmUrl(result.scmUrl) + const packageId = await upsertPackage(qx, { purl: pkg.purl, ecosystem: 'maven', @@ -115,7 +212,7 @@ async function processCriticalPackage( homepage: result.homepageUrl, registryUrl: mavenRegistryUrl(groupId, artifactId), declaredRepositoryUrl: result.scmUrl, - repositoryUrl: normalizeScmUrl(result.scmUrl), + repositoryUrl, licenses: result.licenses.length > 0 ? result.licenses : null, licensesRaw: result.licensesRaw, latestVersion: version, @@ -130,176 +227,111 @@ async function processCriticalPackage( ...result.developers.map((d) => ({ ...d, role: 'author' as const })), ...result.contributors.map((c) => ({ ...c, role: 'maintainer' as const })), ] + const maintainerCount = await writeMaintainers(qx, packageId, allPeople) - for (const person of allPeople) { - const username = person.username ?? person.email ?? person.displayName - if (!username) continue - - const emailHash = person.email - ? crypto.createHash('sha256').update(person.email.toLowerCase().trim()).digest('hex') - : null - - const maintainerId = await upsertMaintainer(qx, { + const allVersions = meta.versions.length > 0 ? meta.versions : [version] + await upsertVersionsBatch( + qx, + allVersions.map((v) => ({ + packageId, ecosystem: 'maven', - username, - displayName: person.displayName, - url: person.url, - emailHash, - }) + number: v, + isLatest: v === meta.releaseVersion, + isPrerelease: isPrerelease(v), + license: result.licenses[0] ?? 
null, + })), + ) - await upsertPackageMaintainer(qx, { - packageId, - maintainerId, - role: person.role, - }) - } + await writeRepoLink(qx, packageId, repositoryUrl) + + log.info( + { groupId, artifactId, version, parentHops: result.parentHops, licenses: result.licenses.length, maintainers: maintainerCount, versions: allVersions.length }, + 'ok', + ) return 'processed' } -// ─── Batch processing ───────────────────────────────────────────────────────── +// ─── Batch ──────────────────────────────────────────────────────────────────── -export async function processBatch( - qx: QueryExecutor, - config: ReturnType, - isCritical: boolean, -): Promise { - const batchSize = isCritical ? config.batchSize : config.nonCriticalBatchSize - const concurrency = isCritical ? config.concurrency : config.nonCriticalConcurrency - - const packages = await listMavenPackagesToSync(qx, { - limit: batchSize, - offset: 0, - fullRefreshDays: config.fullRefreshDays, - nonCriticalRefreshDays: config.nonCriticalRefreshDays, - isCritical, +export async function processBatch(qx: QueryExecutor, config: PomFetcherConfig): Promise { + const packages = await listCriticalMavenPackagesToSync(qx, { + limit: config.batchSize, + refreshDays: config.refreshDays, }) - if (packages.length === 0) { - return { processed: 0, skipped: 0, errors: 0 } - } + if (packages.length === 0) return { processed: 0, skipped: 0, errors: 0 } - log.info({ count: packages.length, isCritical }, 'Processing batch...') + log.info({ count: packages.length }, 'Batch started') let processed = 0 let skipped = 0 let errors = 0 - const PROGRESS_EVERY = 25 + const queue = interleaveByNamespace(packages) + + for (let i = 0; i < queue.length; i += config.concurrency) { + const group = queue.slice(i, i + config.concurrency) - for (let i = 0; i < packages.length; i += concurrency) { - const group = packages.slice(i, i + concurrency) + if (config.groupDelayMs > 0 && i > 0) { + await new Promise((r) => setTimeout(r, config.groupDelayMs)) + 
} await Promise.all( group.map(async (pkg) => { try { - if (!isCritical) { - await processNonCriticalPackage(qx, pkg) - processed++ - return - } - - const status = await processCriticalPackage(qx, pkg) + const status = await processPackage(qx, pkg) if (status === 'processed') processed++ else if (status === 'skipped') skipped++ else errors++ } catch (err) { const message = err instanceof Error ? err.message : String(err) - log.error({ purl: pkg.purl, error: message }, 'Unexpected error processing package') + const isRateLimit = message.includes('403') || message.includes('429') + log.error( + { purl: pkg.purl, error: message }, + isRateLimit ? 'Rate limited — will retry next pass' : 'Unexpected error processing package', + ) errors++ } }), ) const done = i + group.length - const prevDone = i - const crossedBoundary = Math.floor(done / PROGRESS_EVERY) > Math.floor(prevDone / PROGRESS_EVERY) - if (crossedBoundary || done === packages.length) { - log.debug( - { done, total: packages.length, processed, skipped, errors }, - `Progress: ${done}/${packages.length}`, - ) + if (done % 25 === 0 || done === queue.length) { + log.debug({ done, total: queue.length, processed, skipped, errors }, 'Progress') } } return { processed, skipped, errors } } -// ─── Phase runner ───────────────────────────────────────────────────────────── - -async function runPhase( - qx: QueryExecutor, - config: ReturnType, - isCritical: boolean, - isShuttingDown: () => boolean, -): Promise<{ processed: number; skipped: number; errors: number }> { - const label = isCritical ? 
'critical' : 'non-critical' - let total = { processed: 0, skipped: 0, errors: 0 } - let batchNum = 0 - const phaseStartedAt = Date.now() - - log.info({ phase: label }, 'Phase started') - - while (!isShuttingDown()) { - const result = await processBatch(qx, config, isCritical) - - if (result.processed + result.skipped + result.errors === 0) { - const durationSec = Math.round((Date.now() - phaseStartedAt) / 1000) - log.info({ phase: label, ...total, durationSec }, 'Phase complete') - return total - } - - batchNum++ - total.processed += result.processed - total.skipped += result.skipped - total.errors += result.errors - - log.info( - { - phase: label, - batch: batchNum, - totalProcessed: total.processed, - totalSkipped: total.skipped, - totalErrors: total.errors, - elapsedSec: Math.round((Date.now() - phaseStartedAt) / 1000), - }, - 'Batch done', - ) - } - - return total -} - // ─── Main loop ──────────────────────────────────────────────────────────────── export async function runPomEnrichmentLoop( qx: QueryExecutor, - config: ReturnType, + config: PomFetcherConfig, isShuttingDown: () => boolean, ): Promise { + log.info({ batchSize: config.batchSize, concurrency: config.concurrency, refreshDays: config.refreshDays }, 'POM fetcher started') + let passNumber = 0 while (!isShuttingDown()) { passNumber++ const passStartedAt = Date.now() - log.info({ pass: passNumber }, 'Starting pass') - - // Phase 1: non-critical first — DB-only, high throughput - const nonCritical = await runPhase(qx, config, false, isShuttingDown) - - // Phase 2: critical — HTTP-bound, lower throughput - const critical = await runPhase(qx, config, true, isShuttingDown) - - const durationMs = Date.now() - passStartedAt - log.info( - { - pass: passNumber, - totalProcessed: nonCritical.processed + critical.processed, - totalSkipped: nonCritical.skipped + critical.skipped, - totalErrors: nonCritical.errors + critical.errors, - durationSec: Math.round(durationMs / 1000), - }, - `Pass complete. 
Sleeping ${config.idleSleepSec}s before next pass.`, - ) + log.info({ pass: passNumber }, 'Pass started') + + let total = { processed: 0, skipped: 0, errors: 0 } + + while (!isShuttingDown()) { + const result = await processBatch(qx, config) + if (result.processed + result.skipped + result.errors === 0) break + total.processed += result.processed + total.skipped += result.skipped + total.errors += result.errors + } + + const durationSec = Math.round((Date.now() - passStartedAt) / 1000) + log.info({ pass: passNumber, ...total, durationSec }, `Pass complete — sleeping ${config.idleSleepSec}s`) await new Promise((r) => setTimeout(r, config.idleSleepSec * 1000)) } diff --git a/services/libs/data-access-layer/src/osspckgs/index.ts b/services/libs/data-access-layer/src/osspckgs/index.ts index 49fc5f85e3..d235aa9713 100644 --- a/services/libs/data-access-layer/src/osspckgs/index.ts +++ b/services/libs/data-access-layer/src/osspckgs/index.ts @@ -1,3 +1,5 @@ export * from './types' export * from './packages' export * from './maintainers' +export * from './versions' +export * from './repos' diff --git a/services/libs/data-access-layer/src/osspckgs/packages.ts b/services/libs/data-access-layer/src/osspckgs/packages.ts index 9791ea76a8..78e3798b91 100644 --- a/services/libs/data-access-layer/src/osspckgs/packages.ts +++ b/services/libs/data-access-layer/src/osspckgs/packages.ts @@ -14,28 +14,19 @@ export async function findPackageIdsByPurl( // ─── packages_universe ──────────────────────────────────────────────────────── /** - * Returns a page of Maven packages from packages_universe that need syncing - * into the packages table. 
- * - * Eligibility rules: - * - p.purl IS NULL → never added to packages (any criticality) - * - is_critical = false → periodic refresh of universe stats (default 180d) - * - is_critical = true → not yet POM-enriched, new version released, or - * periodic full refresh (default 90d) - * - * Critical packages are returned first so POM enrichment is prioritised. + * Returns a page of critical Maven packages from packages_universe that need + * syncing into the packages table (never synced, or stale by refreshDays). */ -export async function listMavenPackagesToSync( +export async function listCriticalMavenPackagesToSync( qx: QueryExecutor, - options: { limit: number; offset: number; fullRefreshDays?: number; nonCriticalRefreshDays?: number; isCritical?: boolean }, + options: { limit: number; refreshDays: number }, ): Promise< - (Pick & { + (Pick & { purl: string latestVersion: string | null })[] > { - const { limit, offset, fullRefreshDays = 90, nonCriticalRefreshDays = 180, isCritical } = options - const isCriticalFilter = isCritical !== undefined ? 
isCritical : null + const { limit, refreshDays } = options return qx.select( ` @@ -44,7 +35,6 @@ export async function listMavenPackagesToSync( pu.purl, pu.namespace, pu.name, - pu.is_critical AS "isCritical", pu.criticality_score AS "criticalityScore", pu.dependent_packages_count AS "dependentPackagesCount", pu.dependent_repos_count AS "dependentReposCount", @@ -54,26 +44,19 @@ export async function listMavenPackagesToSync( LEFT JOIN packages p ON p.purl = pu.purl WHERE pu.ecosystem = 'maven' + AND pu.is_critical = true AND pu.purl IS NOT NULL AND pu.namespace IS NOT NULL - AND ($(isCriticalFilter)::boolean IS NULL OR pu.is_critical = $(isCriticalFilter)::boolean) AND ( p.purl IS NULL - OR (pu.is_critical = false - AND p.last_synced_at < NOW() - ($(nonCriticalRefreshDays) || ' days')::interval) - OR (pu.is_critical = true - AND p.ingestion_source IN ('maven_index', 'packages_universe')) - OR (pu.is_critical = true - AND p.latest_release_at > p.last_synced_at) - OR (pu.is_critical = true - AND p.last_synced_at < NOW() - ($(fullRefreshDays) || ' days')::interval) + OR p.last_synced_at < NOW() - ($(refreshDays) || ' days')::interval ) ORDER BY pu.rank_in_ecosystem ASC NULLS LAST, pu.id ASC - LIMIT $(limit) OFFSET $(offset) + LIMIT $(limit) `, - { limit, offset, fullRefreshDays, nonCriticalRefreshDays, isCriticalFilter }, + { limit, refreshDays }, ) } diff --git a/services/libs/data-access-layer/src/osspckgs/repos.ts b/services/libs/data-access-layer/src/osspckgs/repos.ts index c06bfe5216..a8fc92b9d8 100644 --- a/services/libs/data-access-layer/src/osspckgs/repos.ts +++ b/services/libs/data-access-layer/src/osspckgs/repos.ts @@ -1,4 +1,5 @@ import { QueryExecutor } from '../queryExecutor' +import { IDbPackageRepoUpsert, IDbRepoUpsert } from './types' export async function findRepoIdsByUrl( qx: QueryExecutor, @@ -8,3 +9,46 @@ export async function findRepoIdsByUrl( const rows = await qx.select(`SELECT id, url FROM repos WHERE url = ANY($(urls))`, { urls }) return new 
Map(rows.map((r: { url: string; id: number }) => [r.url, r.id])) } + +/** + * Inserts or updates a repo row keyed on url. + * Uses COALESCE so richer data from other enrichers (GitHub, deps.dev) is never + * overwritten with nulls from a partial write. + * Returns the repo id. + */ +export async function upsertRepo(qx: QueryExecutor, item: IDbRepoUpsert): Promise { + const row = await qx.selectOne( + ` + INSERT INTO repos (url, host, owner, name, last_synced_at) + VALUES ($(url), $(host), $(owner), $(name), NOW()) + ON CONFLICT (url) DO UPDATE SET + host = COALESCE(EXCLUDED.host, repos.host), + owner = COALESCE(EXCLUDED.owner, repos.owner), + name = COALESCE(EXCLUDED.name, repos.name), + last_synced_at = NOW() + RETURNING id + `, + item, + ) + return row.id as number +} + +/** + * Links a package to a repo with provenance metadata. + * On conflict keeps the higher confidence value and refreshes verified_at. + */ +export async function upsertPackageRepo( + qx: QueryExecutor, + item: IDbPackageRepoUpsert, +): Promise { + await qx.result( + ` + INSERT INTO package_repos (package_id, repo_id, source, confidence, verified_at) + VALUES ($(packageId), $(repoId), $(source), $(confidence), NOW()) + ON CONFLICT (package_id, repo_id) DO UPDATE SET + confidence = GREATEST(EXCLUDED.confidence, package_repos.confidence), + verified_at = NOW() + `, + item, + ) +} diff --git a/services/libs/data-access-layer/src/osspckgs/types.ts b/services/libs/data-access-layer/src/osspckgs/types.ts index ee3d93b849..5065d15114 100644 --- a/services/libs/data-access-layer/src/osspckgs/types.ts +++ b/services/libs/data-access-layer/src/osspckgs/types.ts @@ -53,3 +53,32 @@ export type IDbPackageMaintainerUpsert = { maintainerId: number role: 'author' | 'maintainer' | null } + +// ─── versions ───────────────────────────────────────────────────────────────── + +export type IDbVersionUpsert = { + packageId: number + ecosystem: string + number: string + isLatest: boolean + isPrerelease: boolean + 
license: string | null +} + +// ─── repos ──────────────────────────────────────────────────────────────────── + +export type IDbRepoUpsert = { + url: string + host: string | null + owner: string | null + name: string | null +} + +// ─── package_repos ──────────────────────────────────────────────────────────── + +export type IDbPackageRepoUpsert = { + packageId: number + repoId: number + source: 'declared' | 'deps_dev' | 'heuristic' | 'manual' + confidence: number +} diff --git a/services/libs/data-access-layer/src/osspckgs/versions.ts b/services/libs/data-access-layer/src/osspckgs/versions.ts new file mode 100644 index 0000000000..bbd1f1f55d --- /dev/null +++ b/services/libs/data-access-layer/src/osspckgs/versions.ts @@ -0,0 +1,53 @@ +import { QueryExecutor } from '../queryExecutor' + +import { IDbVersionUpsert } from './types' + +/** + * Bulk-upserts a list of versions for a single package. + * Uses UNNEST arrays to avoid N individual round-trips. + * On conflict (package_id, number) updates is_latest, is_prerelease, and + * license (never overwrites an existing license with NULL). 
+ */ +export async function upsertVersionsBatch( + qx: QueryExecutor, + versions: IDbVersionUpsert[], +): Promise { + if (versions.length === 0) return + + // maven-metadata.xml sometimes contains duplicate version strings — deduplicate + // by number before inserting to avoid "ON CONFLICT DO UPDATE command cannot affect + // row a second time" from PostgreSQL + const seen = new Set() + versions = versions.filter((v) => { + if (seen.has(v.number)) return false + seen.add(v.number) + return true + }) + + await qx.result( + ` + INSERT INTO versions (package_id, ecosystem, number, is_latest, is_prerelease, license, last_synced_at) + SELECT + UNNEST($(packageIds)::bigint[]), + UNNEST($(ecosystems)::text[]), + UNNEST($(numbers)::text[]), + UNNEST($(isLatests)::bool[]), + UNNEST($(isPreleases)::bool[]), + UNNEST($(licenses)::text[]), + NOW() + ON CONFLICT (package_id, number) DO UPDATE SET + is_latest = EXCLUDED.is_latest, + is_prerelease = EXCLUDED.is_prerelease, + license = COALESCE(EXCLUDED.license, versions.license), + last_synced_at = NOW() + `, + { + packageIds: versions.map((v) => v.packageId), + ecosystems: versions.map((v) => v.ecosystem), + numbers: versions.map((v) => v.number), + isLatests: versions.map((v) => v.isLatest), + isPreleases: versions.map((v) => v.isPrerelease), + licenses: versions.map((v) => v.license), + }, + ) +} From b595b717eb204f0afcd4bc2b29145aebfef360bf Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Tue, 2 Jun 2026 13:48:24 +0200 Subject: [PATCH 07/22] refactor: more solid and simple structure Signed-off-by: Umberto Sgueglia --- backend/.env.dist.local | 32 +- .../services/{pom-fetcher.yaml => maven.yaml} | 12 +- services/apps/packages_worker/package.json | 7 +- .../apps/packages_worker/src/activities.ts | 2 +- .../src/bin/{pom-fetcher.ts => maven.ts} | 16 +- .../src/bin/packages-worker.ts | 5 +- services/apps/packages_worker/src/config.ts | 15 +- .../src/{pom-fetcher => maven}/README.md | 28 +- .../src/{pom-fetcher => 
maven}/activities.ts | 14 +- .../src/{pom-fetcher => maven}/extract.ts | 65 +-- .../src/{pom-fetcher => maven}/metadata.ts | 0 .../src/maven/runMavenEnrichmentLoop.ts | 468 ++++++++++++++++++ .../packages_worker/src/maven/schedule.ts | 76 +++ .../packages_worker/src/maven/workflows.ts | 19 + .../src/pom-fetcher/runPomEnrichmentLoop.ts | 338 ------------- .../src/pom-fetcher/schedule.ts | 41 -- .../src/pom-fetcher/workflows.ts | 36 -- .../packages_worker/src/workflows/index.ts | 2 +- .../src/osspckgs/packages.ts | 72 ++- .../data-access-layer/src/osspckgs/types.ts | 1 + .../src/osspckgs/versions.ts | 4 +- 21 files changed, 712 insertions(+), 541 deletions(-) rename scripts/services/{pom-fetcher.yaml => maven.yaml} (94%) rename services/apps/packages_worker/src/bin/{pom-fetcher.ts => maven.ts} (58%) rename services/apps/packages_worker/src/{pom-fetcher => maven}/README.md (89%) rename services/apps/packages_worker/src/{pom-fetcher => maven}/activities.ts (58%) rename services/apps/packages_worker/src/{pom-fetcher => maven}/extract.ts (87%) rename services/apps/packages_worker/src/{pom-fetcher => maven}/metadata.ts (100%) create mode 100644 services/apps/packages_worker/src/maven/runMavenEnrichmentLoop.ts create mode 100644 services/apps/packages_worker/src/maven/schedule.ts create mode 100644 services/apps/packages_worker/src/maven/workflows.ts delete mode 100644 services/apps/packages_worker/src/pom-fetcher/runPomEnrichmentLoop.ts delete mode 100644 services/apps/packages_worker/src/pom-fetcher/schedule.ts delete mode 100644 services/apps/packages_worker/src/pom-fetcher/workflows.ts diff --git a/backend/.env.dist.local b/backend/.env.dist.local index 91c19e85c1..3f8bd2bdef 100755 --- a/backend/.env.dist.local +++ b/backend/.env.dist.local @@ -199,34 +199,14 @@ OSV_ECOSYSTEMS=npm,Maven OSV_TMP_DIR=/tmp/osv OSV_BATCH_SIZE=500 OSV_DERIVE_BATCH_SIZE=1000 -# pom-fetcher non-critical (DB-only) -POM_FETCHER_NON_CRITICAL_BATCH_SIZE=500 
-POM_FETCHER_NON_CRITICAL_CONCURRENCY=20 - -# pom-fetcher critical (HTTP) -POM_FETCHER_BATCH_SIZE=100 -POM_FETCHER_CONCURRENCY=5 -# ── non-critical DB-only (usato quando DIRECT_POM_FOR_ALL=false) ────────────── -# POM_FETCHER_NON_CRITICAL_BATCH_SIZE=500 -POM_FETCHER_NON_CRITICAL_CONCURRENCY=20 # solo DB writes, nessun HTTP -POM_FETCHER_NON_CRITICAL_REFRESH_DAYS=1 - -# ── non-critical HTTP / direct-pom (usato quando DIRECT_POM_FOR_ALL=true) ───── -# POM_FETCHER_NON_CRITICAL_POM_CONCURRENCY=5 -# POM_FETCHER_NON_CRITICAL_POM_REFRESH_DAYS=1 - -# ── critical HTTP / full-pom (sempre attivo) ────────────────────────────────── -# POM_FETCHER_BATCH_SIZE=100 -# POM_FETCHER_CONCURRENCY=5 -# POM_FETCHER_FULL_REFRESH_DAYS=1 - -# ── modalità e rate limiting ────────────────────────────────────────────────── -# POM_FETCHER_DIRECT_POM_FOR_ALL=true -# POM_FETCHER_GROUP_DELAY_MS=200 - - +# maven enricher POM_FETCHER_BATCH_SIZE=50 POM_FETCHER_CONCURRENCY=3 +POM_FETCHER_NON_CRITICAL_BATCH_SIZE=500 +POM_FETCHER_NON_CRITICAL_CONCURRENCY=20 POM_FETCHER_REFRESH_DAYS=1 POM_FETCHER_GROUP_DELAY_MS=500 POM_FETCHER_IDLE_SLEEP_SEC=3600 +# Set to 'true' on first run against a fresh/restored DB to skip the version-unchanged +# optimisation and force full POM extraction. Set to 'false' after the first pass. 
+POM_FETCHER_FORCE_FULL_EXTRACTION=false diff --git a/scripts/services/pom-fetcher.yaml b/scripts/services/maven.yaml similarity index 94% rename from scripts/services/pom-fetcher.yaml rename to scripts/services/maven.yaml index 54f083c598..a90e0f693a 100644 --- a/scripts/services/pom-fetcher.yaml +++ b/scripts/services/maven.yaml @@ -3,7 +3,7 @@ version: '3.1' x-env-args: &env-args DOCKER_BUILDKIT: 1 NODE_ENV: docker - SERVICE: pom-fetcher + SERVICE: maven SHELL: /bin/sh SUPPRESS_NO_CONFIG_WARNING: 'true' LOG_LEVEL: 'info' @@ -14,11 +14,11 @@ x-env-args: &env-args POM_FETCHER_IDLE_SLEEP_SEC: '3600' services: - pom-fetcher: + maven: build: context: ../../ dockerfile: ./scripts/services/docker/Dockerfile.packages-worker - command: 'pnpm run start:pom-fetcher' + command: 'pnpm run start:maven' working_dir: /usr/crowd/app/services/apps/packages_worker env_file: - ../../backend/.env.dist.local @@ -31,11 +31,11 @@ services: networks: - crowd-bridge - pom-fetcher-dev: + maven-dev: build: context: ../../ dockerfile: ./scripts/services/docker/Dockerfile.packages-worker - command: 'pnpm run dev:pom-fetcher' + command: 'pnpm run dev:maven' working_dir: /usr/crowd/app/services/apps/packages_worker # user: '${USER_ID}:${GROUP_ID}' env_file: @@ -45,7 +45,7 @@ services: - ../../backend/.env.override.composed environment: <<: *env-args - hostname: pom-fetcher + hostname: maven networks: - crowd-bridge volumes: diff --git a/services/apps/packages_worker/package.json b/services/apps/packages_worker/package.json index 364385dce7..cfef6bfe3a 100644 --- a/services/apps/packages_worker/package.json +++ b/services/apps/packages_worker/package.json @@ -8,10 +8,12 @@ "start:pom-fetcher": "SERVICE=pom-fetcher tsx src/bin/pom-fetcher.ts", "dev:packages-worker": "CROWD_TEMPORAL_TASKQUEUE=packages-worker CROWD_TEMPORAL_NAMESPACE=$CROWD_PACKAGES_TEMPORAL_NAMESPACE SERVICE=packages-worker LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9233 
src/bin/packages-worker.ts", "dev:deps-dev-ingest": "CROWD_TEMPORAL_TASKQUEUE=deps-dev-ingest CROWD_TEMPORAL_NAMESPACE=$CROWD_PACKAGES_TEMPORAL_NAMESPACE SERVICE=deps-dev-ingest nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9235 src/bin/deps-dev-ingest.ts", - "dev:deps-dev-ingest:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && CROWD_TEMPORAL_TASKQUEUE=deps-dev-ingest CROWD_TEMPORAL_NAMESPACE=$CROWD_PACKAGES_TEMPORAL_NAMESPACE SERVICE=deps-dev-ingest nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9235 src/bin/deps-dev-ingest.ts", + "start:maven": "SERVICE=maven tsx src/bin/maven.ts", "dev:github-repos-enricher": "SERVICE=github-repos-enricher LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9234 src/bin/github-repos-enricher.ts", - "dev:github-repos-enricher:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && SERVICE=github-repos-enricher LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9234 src/bin/github-repos-enricher.ts", + "dev:maven": "SERVICE=maven LOG_LEVEL=info nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9235 src/bin/maven.ts", "dev:packages-worker:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && CROWD_TEMPORAL_TASKQUEUE=packages-worker CROWD_TEMPORAL_NAMESPACE=$CROWD_PACKAGES_TEMPORAL_NAMESPACE SERVICE=packages-worker LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9233 src/bin/packages-worker.ts", + "dev:deps-dev-ingest:local": "set -a && . ../../../backend/.env.dist.local && . 
../../../backend/.env.override.local && set +a && CROWD_TEMPORAL_TASKQUEUE=deps-dev-ingest CROWD_TEMPORAL_NAMESPACE=$CROWD_PACKAGES_TEMPORAL_NAMESPACE SERVICE=deps-dev-ingest nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9235 src/bin/deps-dev-ingest.ts", + "dev:github-repos-enricher:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && SERVICE=github-repos-enricher LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9234 src/bin/github-repos-enricher.ts", "export-to-bucket": "SERVICE=deps-dev-ingest tsx src/scripts/exportToBucket.ts", "export-to-bucket:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && SERVICE=deps-dev-ingest tsx src/scripts/exportToBucket.ts", "monitor:osspckgs": "SERVICE=monitor tsx src/scripts/monitorOsspckgs.ts", @@ -20,6 +22,7 @@ "trigger-bootstrap:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && SERVICE=deps-dev-ingest tsx src/scripts/triggerBootstrap.ts", "dev:pom-fetcher": "SERVICE=pom-fetcher LOG_LEVEL=info nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9235 src/bin/pom-fetcher.ts", "dev:pom-fetcher:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && SERVICE=pom-fetcher LOG_LEVEL=info nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9235 src/bin/pom-fetcher.ts", + "dev:maven:local": "set -a && . ../../../backend/.env.dist.local && . 
../../../backend/.env.override.local && set +a && SERVICE=maven LOG_LEVEL=info nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9235 src/bin/maven.ts", "lint": "npx eslint --ext .ts src --max-warnings=0", "format": "npx prettier --write \"src/**/*.ts\"", "format-check": "npx prettier --check .", diff --git a/services/apps/packages_worker/src/activities.ts b/services/apps/packages_worker/src/activities.ts index 95dbcccfc0..988f6d5c64 100644 --- a/services/apps/packages_worker/src/activities.ts +++ b/services/apps/packages_worker/src/activities.ts @@ -1,4 +1,4 @@ export * from './deps-dev/activities' export * from './npm/activities' export { osvSyncEcosystem, osvDeriveCriticalFlag } from './osv/activities' -export { processMavenCriticalBatch, processMavenNonCriticalBatch } from './pom-fetcher/activities' +export { processMavenCriticalBatch, processMavenNonCriticalBatch } from './maven/activities' diff --git a/services/apps/packages_worker/src/bin/pom-fetcher.ts b/services/apps/packages_worker/src/bin/maven.ts similarity index 58% rename from services/apps/packages_worker/src/bin/pom-fetcher.ts rename to services/apps/packages_worker/src/bin/maven.ts index efac07125e..63a0e1a9af 100644 --- a/services/apps/packages_worker/src/bin/pom-fetcher.ts +++ b/services/apps/packages_worker/src/bin/maven.ts @@ -1,8 +1,8 @@ import { getServiceLogger } from '@crowd/logging' -import { getPomFetcherConfig } from '../config' +import { getMavenConfig } from '../config' import { getPackagesDb } from '../db' -import { runPomEnrichmentLoop } from '../pom-fetcher/runPomEnrichmentLoop' +import { runMavenEnrichmentLoop } from '../maven/runMavenEnrichmentLoop' const log = getServiceLogger() @@ -11,29 +11,29 @@ let shuttingDown = false const shutdown = async () => { if (shuttingDown) return shuttingDown = true - log.info('Shutting down pom-fetcher...') + log.info('Shutting down maven...') } process.on('SIGINT', shutdown) process.on('SIGTERM', shutdown) const main = 
async () => { - log.info('pom-fetcher starting...') + log.info('maven starting...') - const config = getPomFetcherConfig() + const config = getMavenConfig() log.info(config, 'Config loaded') const qx = await getPackagesDb() await qx.selectOne('SELECT 1') log.info('Connected to packages-db.') - await runPomEnrichmentLoop(qx, config, () => shuttingDown) + await runMavenEnrichmentLoop(qx, config, () => shuttingDown) - log.info('pom-fetcher stopped.') + log.info('maven stopped.') process.exit(0) } main().catch((err) => { - log.error({ err }, 'pom-fetcher fatal error') + log.error({ err }, 'maven fatal error') process.exit(1) }) diff --git a/services/apps/packages_worker/src/bin/packages-worker.ts b/services/apps/packages_worker/src/bin/packages-worker.ts index 4396c6a2d1..39c33ffc87 100644 --- a/services/apps/packages_worker/src/bin/packages-worker.ts +++ b/services/apps/packages_worker/src/bin/packages-worker.ts @@ -1,12 +1,13 @@ import { scheduleNpmIngest } from '../npm/schedule' import { scheduleOsvSync } from '../osv/schedule' -import { schedulePomFetcher } from '../pom-fetcher/schedule' +import { scheduleMavenCritical, scheduleMavenNonCritical } from '../maven/schedule' import { svc } from '../service' setImmediate(async () => { await svc.init() await scheduleNpmIngest() await scheduleOsvSync() - await schedulePomFetcher() + await scheduleMavenCritical() + await scheduleMavenNonCritical() await svc.start() }) diff --git a/services/apps/packages_worker/src/config.ts b/services/apps/packages_worker/src/config.ts index 6901fd9da2..66b6750214 100644 --- a/services/apps/packages_worker/src/config.ts +++ b/services/apps/packages_worker/src/config.ts @@ -46,12 +46,15 @@ export function getEnricherConfig() { } } -export function getPomFetcherConfig() { +export function getMavenConfig() { return { - batchSize: parseInt(process.env.POM_FETCHER_BATCH_SIZE ?? '50', 10), - concurrency: parseInt(process.env.POM_FETCHER_CONCURRENCY ?? 
'5', 10), - refreshDays: parseInt(process.env.POM_FETCHER_REFRESH_DAYS ?? '1', 10), - groupDelayMs: parseInt(process.env.POM_FETCHER_GROUP_DELAY_MS ?? '200', 10), - idleSleepSec: parseInt(process.env.POM_FETCHER_IDLE_SLEEP_SEC ?? '3600', 10), + batchSize: requireEnvInt('POM_FETCHER_BATCH_SIZE'), + concurrency: requireEnvInt('POM_FETCHER_CONCURRENCY'), + nonCriticalBatchSize: requireEnvInt('POM_FETCHER_NON_CRITICAL_BATCH_SIZE'), + nonCriticalConcurrency: requireEnvInt('POM_FETCHER_NON_CRITICAL_CONCURRENCY'), + refreshDays: requireEnvInt('POM_FETCHER_REFRESH_DAYS'), + groupDelayMs: requireEnvInt('POM_FETCHER_GROUP_DELAY_MS'), + idleSleepSec: requireEnvInt('POM_FETCHER_IDLE_SLEEP_SEC'), + forceFullExtraction: requireEnv('POM_FETCHER_FORCE_FULL_EXTRACTION') === 'true', } } diff --git a/services/apps/packages_worker/src/pom-fetcher/README.md b/services/apps/packages_worker/src/maven/README.md similarity index 89% rename from services/apps/packages_worker/src/pom-fetcher/README.md rename to services/apps/packages_worker/src/maven/README.md index 8ca08638df..f69a0d0af5 100644 --- a/services/apps/packages_worker/src/pom-fetcher/README.md +++ b/services/apps/packages_worker/src/maven/README.md @@ -127,12 +127,12 @@ The POM fetcher seeds `repos` with URL-derivable fields only. The GitHub enriche | Value | Meaning | |-------|---------| -| `pom_fetcher` | Critical — full POM + parent resolution succeeded | -| `pom_fetcher_direct` | Non-critical — direct POM fetch succeeded (no parent resolution) | -| `pom_fetcher_not_on_central` | `maven-metadata.xml` not found on `repo1.maven.org` — artifact is hosted on a third-party repository (e.g. WSO2 Nexus, JBoss, Atlassian). Universe data came from an aggregator (deps.dev, OSV). | -| `pom_fetcher_no_version` | `maven-metadata.xml` found but `` is empty — artifact has no stable release | -| `pom_fetcher_error` | `maven-metadata.xml` has a release version but the `.pom` file for that version is a 404. 
Typical cause: partial deploy to Maven Central (metadata updated, artifact not uploaded) or Eclipse P2 feature artifacts that don't publish a standard POM. | -| `pom_fetcher_rate_limited` | Maven Central returned 403/429 on all retry attempts. Package will be retried on the next pass. | +| `maven` | Critical — full POM + parent resolution succeeded | +| `maven_direct` | Non-critical — direct POM fetch succeeded (no parent resolution) | +| `maven_not_on_central` | `maven-metadata.xml` not found on `repo1.maven.org` — artifact is hosted on a third-party repository (e.g. WSO2 Nexus, JBoss, Atlassian). Universe data came from an aggregator (deps.dev, OSV). | +| `maven_no_version` | `maven-metadata.xml` found but `` is empty — artifact has no stable release | +| `maven_error` | `maven-metadata.xml` has a release version but the `.pom` file for that version is a 404. Typical cause: partial deploy to Maven Central (metadata updated, artifact not uploaded) or Eclipse P2 feature artifacts that don't publish a standard POM. | +| `maven_rate_limited` | Maven Central returned 403/429 on all retry attempts. Package will be retried on the next pass. | | `packages_universe` | Non-critical, DB-only mode (`POM_FETCHER_DIRECT_POM_FOR_ALL=false`) — only universe stats copied | --- @@ -143,11 +143,11 @@ The POM fetcher seeds `repos` with URL-derivable fields only. The GitHub enriche WSO2 publishes some artifacts exclusively to their own Nexus at `maven.wso2.org`. A subset of their artifacts appear in `packages_universe` (sourced from deps.dev/OSV which aggregates all Maven repositories) but are **not** available on `repo1.maven.org/maven2`. -Affected pattern: `org.wso2.carbon.*` — specifically `.feature` Eclipse P2 artifacts and `.stub` artifacts. These are written with `ingestion_source = 'pom_fetcher_not_on_central'` and are not retried until the next `nonCriticalPomRefreshDays` window. 
+Affected pattern: `org.wso2.carbon.*` — specifically `.feature` Eclipse P2 artifacts and `.stub` artifacts. These are written with `ingestion_source = 'maven_not_on_central'` and are not retried until the next `nonCriticalPomRefreshDays` window. ### Eclipse P2 Feature Artifacts (`*.feature`) -Eclipse/OSGi feature artifacts (e.g. `org.wso2.carbon.identity.xacml.server.feature`) are packaged as `.zip` files, not `.jar`. Some publishers update `maven-metadata.xml` on Central without uploading the corresponding `.pom`. These land in `pom_fetcher_error`. No fix is possible without the publisher correcting their CI/CD pipeline. +Eclipse/OSGi feature artifacts (e.g. `org.wso2.carbon.identity.xacml.server.feature`) are packaged as `.zip` files, not `.jar`. Some publishers update `maven-metadata.xml` on Central without uploading the corresponding `.pom`. These land in `maven_error`. No fix is possible without the publisher correcting their CI/CD pipeline. ### Maven Central 403 rate limiting @@ -155,7 +155,7 @@ Maven Central (`repo1.maven.org`) restituisce 403 come meccanismo di throttle ol 1. **Retry con backoff esponenziale** — 403 e 429 vengono ritentati fino a 3 volte (2s base, ×2 per tentativo). Gestito in `getWithRetry` (extract.ts) e `resolveVersionsList` (metadata.ts). -2. **Fallback su DB** — se tutti i retry esauriscono, il pacchetto viene scritto con `ingestion_source = 'pom_fetcher_rate_limited'` e `last_synced_at = NOW()`, evitando loop infiniti. Verrà ritentato al prossimo ciclo di refresh. +2. **Fallback su DB** — se tutti i retry esauriscono, il pacchetto viene scritto con `ingestion_source = 'maven_rate_limited'` e `last_synced_at = NOW()`, evitando loop infiniti. Verrà ritentato al prossimo ciclo di refresh. **Causa root dei 403 persistenti:** `packages_universe` è ordinato per `rank_in_ecosystem`, quindi pacchetti dello stesso namespace (es. 
`com.google.apis`, `org.wso2`, `software.amazon.awssdk`) si raggruppano nel batch e colpiscono lo stesso CDN node di Maven Central in rapida successione. Il rate limit scatta sistematicamente dopo ~150–200 pacchetti processati. @@ -167,7 +167,7 @@ Namespace noti per triggerare il rate limit a causa dell'alta densità di artefa ### Partial Maven Central Deploys -Occasionally a publisher's CI/CD updates `<release>` in `maven-metadata.xml` before the `.pom` is fully propagated to all Central mirrors. These appear as `pom_fetcher_error` on the first pass and usually resolve on the next periodic refresh. +Occasionally a publisher's CI/CD updates `<release>` in `maven-metadata.xml` before the `.pom` is fully propagated to all Central mirrors. These appear as `maven_error` on the first pass and usually resolve on the next periodic refresh. --- @@ -221,7 +221,7 @@ The daily Temporal schedule has a **12-hour workflow timeout**. With `POM_FETCHE ## Scheduling -Runs daily at **4am UTC** via Temporal schedule `maven-pom-fetcher`. +Runs daily at **4am UTC** via Temporal schedule `maven`. Overlap policy: `SKIP` — if a previous run is still active, the new trigger is dropped. Catchup window: 1 hour. Workflow timeout: 12 hours. @@ -274,9 +274,9 @@ ORDER BY packages DESC; ``` Expected behaviour: -- `pom_fetcher` (critical, full resolution) → high repo coverage -- `pom_fetcher_direct` (non-critical, no parent resolution) → low repo coverage -- `pom_fetcher_not_on_central` / `pom_fetcher_error` → no repo (no POM data) +- `maven` (critical, full resolution) → high repo coverage +- `maven_direct` (non-critical, no parent resolution) → low repo coverage +- `maven_not_on_central` / `maven_error` → no repo (no POM data) Repo coverage will grow naturally as the critical package set expands and as non-critical packages hit their 30-day POM refresh window.
diff --git a/services/apps/packages_worker/src/pom-fetcher/activities.ts b/services/apps/packages_worker/src/maven/activities.ts similarity index 58% rename from services/apps/packages_worker/src/pom-fetcher/activities.ts rename to services/apps/packages_worker/src/maven/activities.ts index 6417e02be9..e4974c3c08 100644 --- a/services/apps/packages_worker/src/pom-fetcher/activities.ts +++ b/services/apps/packages_worker/src/maven/activities.ts @@ -1,23 +1,23 @@ import { getServiceChildLogger } from '@crowd/logging' -import { getPomFetcherConfig } from '../config' +import { getMavenConfig } from '../config' import { getPackagesDb } from '../db' -import { BatchResult, processBatch } from './runPomEnrichmentLoop' +import { BatchResult, processBatch } from './runMavenEnrichmentLoop' -const log = getServiceChildLogger('pom-fetcher-activity') +const log = getServiceChildLogger('maven-activity') export async function processMavenCriticalBatch(): Promise { - const config = getPomFetcherConfig() + const config = getMavenConfig() const qx = await getPackagesDb() const result = await processBatch(qx, config, true) - log.info({ processed: result.processed, skipped: result.skipped, errors: result.errors }, 'Maven critical batch complete') + log.info({ processed: result.processed, skipped: result.skipped, unchanged: result.unchanged, error: result.error }, 'Maven critical batch complete') return result } export async function processMavenNonCriticalBatch(): Promise { - const config = getPomFetcherConfig() + const config = getMavenConfig() const qx = await getPackagesDb() const result = await processBatch(qx, config, false) - log.info({ processed: result.processed, skipped: result.skipped, errors: result.errors }, 'Maven non-critical batch complete') + log.info({ processed: result.processed, skipped: result.skipped, unchanged: result.unchanged, error: result.error }, 'Maven non-critical batch complete') return result } diff --git 
a/services/apps/packages_worker/src/pom-fetcher/extract.ts b/services/apps/packages_worker/src/maven/extract.ts similarity index 87% rename from services/apps/packages_worker/src/pom-fetcher/extract.ts rename to services/apps/packages_worker/src/maven/extract.ts index 4ce81fd25e..b9f558b01b 100644 --- a/services/apps/packages_worker/src/pom-fetcher/extract.ts +++ b/services/apps/packages_worker/src/maven/extract.ts @@ -6,6 +6,10 @@ import axios from 'axios' import { XMLParser } from 'fast-xml-parser' +import { getServiceChildLogger } from '@crowd/logging' + +const log = getServiceChildLogger('maven') + // ─── Types ──────────────────────────────────────────────────────────────────── export interface PomMaintainer { @@ -101,13 +105,7 @@ export function buildPomUrl(groupId: string, artifactId: string, version: string return `${MAVEN_REPO}/${groupPath}/${artifactId}/${version}/${artifactId}-${version}.pom` } -export async function fetchPom( - groupId: string, - artifactId: string, - version: string, - log?: (msg: string) => void, -): Promise { - const url = buildPomUrl(groupId, artifactId, version) +export async function fetchPom(groupId: string, artifactId: string, version: string, url: string): Promise { try { const data = await getWithRetry(url) const parsed = parser.parse(data) @@ -116,10 +114,10 @@ export async function fetchPom( if (axios.isAxiosError(err)) { const status = err.response?.status if (status === 404) { - log?.(`POM not found (404): ${url}`) + log.debug({ groupId, artifactId, version }, `POM not found (404): ${url}`) return null } - log?.(`HTTP ${status ?? 'unknown'} fetching POM: ${url}`) + log.debug({ groupId, artifactId, version }, `HTTP ${status ?? 
'unknown'} fetching POM: ${url}`) return null } throw err @@ -139,19 +137,13 @@ interface ResolvedFields { hops: number } -async function resolveWithInheritance( - groupId: string, - artifactId: string, - version: string, - log: (msg: string) => void, - depth = 0, -): Promise { +async function resolveWithInheritance(groupId: string, artifactId: string, version: string, depth = 0): Promise { if (depth > MAX_PARENT_HOPS) { - log(`Max parent hops (${MAX_PARENT_HOPS}) reached`) + log.debug({ groupId, artifactId, version }, `Max parent hops (${MAX_PARENT_HOPS}) reached`) return emptyFields(depth) } - const pom = await fetchPom(groupId, artifactId, version, log) + const pom = await fetchPom(groupId, artifactId, version, buildPomUrl(groupId, artifactId, version)) if (!pom) return emptyFields(depth) const licenses = extractLicenses(pom) @@ -164,14 +156,8 @@ async function resolveWithInheritance( const parent = extractParent(pom) if (parent && (missingLicense || missingScm)) { - log(`[hop ${depth + 1}] ${parent.groupId}:${parent.artifactId}:${parent.version}`) - const parentFields = await resolveWithInheritance( - parent.groupId, - parent.artifactId, - parent.version, - log, - depth + 1, - ) + log.debug({ groupId, artifactId, version }, `[hop ${depth + 1}] ${parent.groupId}:${parent.artifactId}:${parent.version}`) + const parentFields = await resolveWithInheritance(parent.groupId, parent.artifactId, parent.version, depth + 1) return { description: extractStr(pom.description) ?? parentFields.description, licenses: licenses.length > 0 ? licenses : parentFields.licenses, @@ -203,14 +189,10 @@ async function resolveWithInheritance( * Faster than extractArtifact — use for non-critical packages where inherited * fields (licenses, SCM) may be missing but throughput matters more. 
*/ -export async function extractArtifactDirect( - groupId: string, - artifactId: string, - version: string, - log: (msg: string) => void = () => undefined, -): Promise { +export async function extractArtifactDirect(groupId: string, artifactId: string, version: string): Promise { const purl = `pkg:maven/${groupId}/${artifactId}@${version}` - const pom = await fetchPom(groupId, artifactId, version, log) + const pomUrl = buildPomUrl(groupId, artifactId, version) + const pom = await fetchPom(groupId, artifactId, version, pomUrl) if (!pom) { return { @@ -226,7 +208,7 @@ export async function extractArtifactDirect( developers: [], contributors: [], parentHops: 0, - error: `POM not found: ${buildPomUrl(groupId, artifactId, version)}`, + error: `POM not found: ${pomUrl}`, } } @@ -257,17 +239,12 @@ export async function extractArtifactDirect( * the parent chain to inherit licenses and SCM when not in the direct POM. * Always returns a result object; errors are captured in `result.error`. */ -export async function extractArtifact( - groupId: string, - artifactId: string, - version: string, - log: (msg: string) => void = () => undefined, -): Promise { +export async function extractArtifact(groupId: string, artifactId: string, version: string): Promise { const purl = `pkg:maven/${groupId}/${artifactId}@${version}` - const rootPom = await fetchPom(groupId, artifactId, version, log) + const pomUrl = buildPomUrl(groupId, artifactId, version) + const rootPom = await fetchPom(groupId, artifactId, version, pomUrl) if (!rootPom) { - const pomUrl = buildPomUrl(groupId, artifactId, version) return { groupId, artifactId, @@ -286,7 +263,7 @@ export async function extractArtifact( } try { - const resolved = await resolveWithInheritance(groupId, artifactId, version, log) + const resolved = await resolveWithInheritance(groupId, artifactId, version) return { groupId, artifactId, @@ -304,7 +281,7 @@ export async function extractArtifact( } } catch (err) { const message = err instanceof Error 
? err.message : String(err) - log(`Error resolving POM: ${message}`) + log.debug({ groupId, artifactId, version }, `Error resolving POM: ${message}`) return { groupId, artifactId, diff --git a/services/apps/packages_worker/src/pom-fetcher/metadata.ts b/services/apps/packages_worker/src/maven/metadata.ts similarity index 100% rename from services/apps/packages_worker/src/pom-fetcher/metadata.ts rename to services/apps/packages_worker/src/maven/metadata.ts diff --git a/services/apps/packages_worker/src/maven/runMavenEnrichmentLoop.ts b/services/apps/packages_worker/src/maven/runMavenEnrichmentLoop.ts new file mode 100644 index 0000000000..0c3b6beebc --- /dev/null +++ b/services/apps/packages_worker/src/maven/runMavenEnrichmentLoop.ts @@ -0,0 +1,468 @@ +import crypto from 'crypto' + +import { + listMavenPackagesToSync, + logAuditFieldChange, + touchPackageSyncedAt, + upsertMaintainer, + upsertPackage, + upsertPackageMaintainer, + upsertPackageRepo, + upsertRepo, + upsertVersionsBatch, +} from '@crowd/data-access-layer' +import { QueryExecutor } from '@crowd/data-access-layer/src/queryExecutor' +import { getServiceChildLogger } from '@crowd/logging' + +import { getMavenConfig } from '../config' +import { MAX_PARENT_HOPS, extractArtifact, normalizeScmUrl } from './extract' +import { resolveVersionsList } from './metadata' + +const log = getServiceChildLogger('maven') + +// ─── Types ──────────────────────────────────────────────────────────────────── + +export interface BatchResult { + processed: number + skipped: number + error: number + unchanged: number +} + +type MavenConfig = ReturnType +type PackageRow = Awaited>[number] + +// ─── Helpers ────────────────────────────────────────────────────────────────── + +function mavenRegistryUrl(groupId: string, artifactId: string): string { + return `https://central.sonatype.com/artifact/${groupId}/${artifactId}` +} + +function isPrerelease(version: string): boolean { + return /-(SNAPSHOT|alpha|beta|rc|m\d+)/i.test(version) 
+} + +// Reorders packages so that consecutive items come from different namespaces (e.g. org.apache, com.google). +// This spreads Maven Central requests across different group IDs, avoiding bursts that could hit rate limits +// on the same namespace in a tight loop. +// function interleaveByNamespace(packages: PackageRow[]): PackageRow[] { +// const byNamespace = new Map() +// for (const pkg of packages) { +// const ns = pkg.namespace ?? '__unknown__' +// if (!byNamespace.has(ns)) byNamespace.set(ns, []) +// byNamespace.get(ns)!.push(pkg) +// } +// const queues = [...byNamespace.values()] +// const result: PackageRow[] = [] +// let i = 0 +// while (result.length < packages.length) { +// const q = queues[i % queues.length] +// if (q.length > 0) result.push(q.shift()!) +// i++ +// } +// return result +// } + +function parseRepoUrl(url: string): { host: string; owner: string | null; name: string | null } | null { + try { + const parsed = new URL(url) + const h = parsed.hostname.toLowerCase() + let host: string + if (h === 'github.com' || h.endsWith('.github.com')) host = 'github' + else if (h === 'gitlab.com' || h.includes('gitlab')) host = 'gitlab' + else if (h === 'bitbucket.org') host = 'bitbucket' + else host = 'other' + const parts = parsed.pathname.split('/').filter(Boolean) + return { host, owner: parts[0] ?? null, name: parts[1] ?? 
null } + } catch { + return null + } +} + +async function writeRepoLink(qx: QueryExecutor, packageId: number, repositoryUrl: string | null): Promise { + if (!repositoryUrl) return + const parsed = parseRepoUrl(repositoryUrl) + if (!parsed) return + const repoId = await upsertRepo(qx, { url: repositoryUrl, ...parsed }) + await upsertPackageRepo(qx, { packageId, repoId, source: 'declared', confidence: 0.8 }) +} + +// ─── Non-critical: copy universe stats into packages ───────────────────────── + +async function processNonCriticalPackage(qx: QueryExecutor, pkg: PackageRow): Promise { + await upsertPackage(qx, { + purl: pkg.purl, + ecosystem: 'maven', + namespace: pkg.namespace, + name: pkg.name, + description: null, + homepage: null, + registryUrl: pkg.namespace ? mavenRegistryUrl(pkg.namespace, pkg.name) : null, + declaredRepositoryUrl: null, + repositoryUrl: null, + licenses: null, + licensesRaw: null, + latestVersion: null, + ingestionSource: 'packages_universe', + criticalityScore: pkg.criticalityScore, + dependentPackagesCount: pkg.dependentPackagesCount, + dependentReposCount: pkg.dependentReposCount, + downloadsLastMonth: pkg.downloads30d, + }) +} + +// ─── Critical: full POM extraction ─────────────────────────────────────────── + +async function processCriticalPackage( + qx: QueryExecutor, + pkg: PackageRow, + forceFullExtraction: boolean, +): Promise<'processed' | 'skipped' | 'unchanged' | 'error'> { + const groupId = pkg.namespace + const artifactId = pkg.name + + if (!groupId) { + log.warn({ purl: pkg.purl }, 'Skipping: null namespace (groupId)') + return 'skipped' + } + + // Phase 1: lightweight metadata fetch to get the current upstream version. 
+ const metadata = await resolveVersionsList(groupId, artifactId) + + if (!metadata) { + await upsertPackage(qx, { + purl: pkg.purl, + ecosystem: 'maven', + namespace: groupId, + name: artifactId, + description: null, + homepage: null, + registryUrl: mavenRegistryUrl(groupId, artifactId), + declaredRepositoryUrl: null, + repositoryUrl: null, + licenses: null, + licensesRaw: null, + latestVersion: pkg.latestVersion ?? null, + ingestionSource: 'maven_not_on_central', + criticalityScore: pkg.criticalityScore, + dependentPackagesCount: pkg.dependentPackagesCount, + dependentReposCount: pkg.dependentReposCount, + downloadsLastMonth: pkg.downloads30d, + }) + log.warn({ groupId, artifactId }, 'Not on Maven Central — writing minimal record') + return 'skipped' + } + + const version = metadata.releaseVersion + + if (!version) { + await upsertPackage(qx, { + purl: pkg.purl, + ecosystem: 'maven', + namespace: groupId, + name: artifactId, + description: null, + homepage: null, + registryUrl: mavenRegistryUrl(groupId, artifactId), + declaredRepositoryUrl: null, + repositoryUrl: null, + licenses: null, + licensesRaw: null, + latestVersion: null, + ingestionSource: 'maven_no_version', + criticalityScore: pkg.criticalityScore, + dependentPackagesCount: pkg.dependentPackagesCount, + dependentReposCount: pkg.dependentReposCount, + downloadsLastMonth: pkg.downloads30d, + }) + log.warn({ groupId, artifactId }, 'No release version in metadata — writing minimal record') + return 'skipped' + } + + // Phase 2: skip full POM extraction when upstream version matches what we already have. + // This avoids 1-8 HTTP calls (POM + parent chain) for packages that haven't released + // a new version since the last sync. + // Skipped on forceFullExtraction (first run against a fresh/restored DB) because + // packages.latest_version may carry stale data from the dump. 
+ if (!forceFullExtraction && version === pkg.latestVersion) { + await touchPackageSyncedAt(qx, pkg.purl, { + criticalityScore: pkg.criticalityScore, + dependentPackagesCount: pkg.dependentPackagesCount, + dependentReposCount: pkg.dependentReposCount, + downloadsLastMonth: pkg.downloads30d, + }) + log.debug({ groupId, artifactId, version }, 'Version unchanged — skipping POM extraction') + return 'unchanged' + } + + // Phase 3: full POM extraction with parent-chain resolution. + const result = await extractArtifact(groupId, artifactId, version) + + if (result.error) { + log.warn({ groupId, artifactId, version, error: result.error }, 'POM extraction failed') + await upsertPackage(qx, { + purl: pkg.purl, + ecosystem: 'maven', + namespace: groupId, + name: artifactId, + description: null, + homepage: null, + registryUrl: mavenRegistryUrl(groupId, artifactId), + declaredRepositoryUrl: null, + repositoryUrl: null, + licenses: null, + licensesRaw: null, + latestVersion: version, + ingestionSource: 'maven_error', + criticalityScore: pkg.criticalityScore, + dependentPackagesCount: pkg.dependentPackagesCount, + dependentReposCount: pkg.dependentReposCount, + downloadsLastMonth: pkg.downloads30d, + }) + return 'error' + } + + if (result.parentHops > MAX_PARENT_HOPS) { + log.warn( + { groupId, artifactId, parentHops: result.parentHops, missingLicenses: result.licenses.length === 0, missingScm: !result.scmUrl }, + 'Parent hop limit reached — data may be incomplete', + ) + } + + const repositoryUrl = normalizeScmUrl(result.scmUrl) + + const packageId = await upsertPackage(qx, { + purl: pkg.purl, + ecosystem: 'maven', + namespace: groupId, + name: artifactId, + description: result.description, + homepage: result.homepageUrl, + registryUrl: mavenRegistryUrl(groupId, artifactId), + declaredRepositoryUrl: result.scmUrl, + repositoryUrl, + licenses: result.licenses.length > 0 ? 
result.licenses : null, + licensesRaw: result.licensesRaw, + latestVersion: version, + ingestionSource: 'maven', + criticalityScore: pkg.criticalityScore, + dependentPackagesCount: pkg.dependentPackagesCount, + dependentReposCount: pkg.dependentReposCount, + downloadsLastMonth: pkg.downloads30d, + }) + + const allPeople = [ + ...result.developers.map((d) => ({ ...d, role: 'author' as const })), + ...result.contributors.map((c) => ({ ...c, role: 'maintainer' as const })), + ] + + let maintainerCount = 0 + for (const person of allPeople) { + const username = person.username ?? person.email ?? person.displayName + if (!username) continue + const emailHash = person.email ? crypto.createHash('sha256').update(person.email.toLowerCase().trim()).digest('hex') : null + const maintainerId = await upsertMaintainer(qx, { + ecosystem: 'maven', + username, + displayName: person.displayName, + url: person.url, + emailHash, + }) + await upsertPackageMaintainer(qx, { packageId, maintainerId, role: person.role }) + maintainerCount++ + } + + const allVersions = metadata.versions.length > 0 ? metadata.versions : [version] + await upsertVersionsBatch( + qx, + allVersions.map((v) => ({ + packageId, + ecosystem: 'maven', + name: artifactId, + number: v, + isLatest: v === metadata.releaseVersion, + isPrerelease: isPrerelease(v), + license: result.licenses[0] ?? 
null, + })), + ) + + await writeRepoLink(qx, packageId, repositoryUrl) + + const auditFields = ['latest_version'] + if (result.licenses.length > 0) auditFields.push('licenses') + if (repositoryUrl) auditFields.push('repository_url') + if (result.description) auditFields.push('description') + await logAuditFieldChange(qx, 'maven', pkg.purl, auditFields) + + log.info( + { groupId, artifactId, version, parentHops: result.parentHops, licenses: result.licenses.length, maintainers: maintainerCount, versions: allVersions.length }, + 'ok', + ) + + return 'processed' +} + +// ─── Batch processing ───────────────────────────────────────────────────────── + +export async function processBatch( + qx: QueryExecutor, + config: MavenConfig, + isCritical: boolean, +): Promise { + const batchSize = isCritical ? config.batchSize : config.nonCriticalBatchSize + const concurrency = isCritical ? config.concurrency : config.nonCriticalConcurrency + const refreshDays = config.refreshDays + const forceFullExtraction = config.forceFullExtraction + + const packages = await listMavenPackagesToSync(qx, { limit: batchSize, refreshDays, isCritical }) + + if (packages.length === 0) return { processed: 0, skipped: 0, error: 0, unchanged: 0 } + + log.info({ count: packages.length, isCritical }, 'Batch started') + + const counts = { processed: 0, skipped: 0, error: 0, unchanged: 0 } + // interleaveByNamespace was introduced as a workaround when the local dev IP was throttled by Maven Central. + // In production runs are 24h apart so the IP is always cold — leaving packages in their natural order for now. + // const queue = isCritical ? 
interleaveByNamespace(packages) : packages + // const queue = packages + + for (let batchStart = 0; batchStart < packages.length; batchStart += concurrency) { + const group = packages.slice(batchStart, batchStart + concurrency) + + if (isCritical && config.groupDelayMs > 0 && batchStart > 0) { + await new Promise((r) => setTimeout(r, config.groupDelayMs)) + } + + await Promise.all( + group.map(async (pkg) => { + try { + if (!isCritical) { + await processNonCriticalPackage(qx, pkg) + counts.processed++ + return + } + + const status = await processCriticalPackage(qx, pkg, forceFullExtraction) + counts[status]++ + } catch (err) { + const message = err instanceof Error ? err.message : String(err) + const isRateLimit = message.includes('403') || message.includes('429') + log.error( + { purl: pkg.purl, error: message }, + isRateLimit ? 'Rate limited — will retry next pass' : 'Unexpected error processing package', + ) + counts.error++ + } + }), + ) + + const done = batchStart + group.length + if (done % 25 === 0 || done === packages.length) { + log.debug({ done, total: packages.length, ...counts }, 'Progress') + } + } + + return counts +} + +// ─── Phase runner ───────────────────────────────────────────────────────────── + +async function runPhase( + qx: QueryExecutor, + config: MavenConfig, + isCritical: boolean, + isShuttingDown: () => boolean, +): Promise { + const label = isCritical ? 
'critical' : 'non-critical' + const total: BatchResult = { processed: 0, skipped: 0, error: 0, unchanged: 0 } + let batchNum = 0 + const phaseStartedAt = Date.now() + + log.info({ phase: label }, 'Phase started') + + while (!isShuttingDown()) { + const result = await processBatch(qx, config, isCritical) + + if (result.processed + result.skipped + result.error + result.unchanged === 0) { + const durationSec = Math.round((Date.now() - phaseStartedAt) / 1000) + log.info({ phase: label, ...total, durationSec }, 'Phase complete') + return total + } + + batchNum++ + total.processed += result.processed + total.skipped += result.skipped + total.error += result.error + total.unchanged += result.unchanged + + log.info( + { + phase: label, + batch: batchNum, + totalProcessed: total.processed, + totalSkipped: total.skipped, + totalUnchanged: total.unchanged, + totalErrors: total.error, + elapsedSec: Math.round((Date.now() - phaseStartedAt) / 1000), + }, + 'Batch done', + ) + } + + return total +} + +// ─── Main loop ──────────────────────────────────────────────────────────────── + +export async function runMavenEnrichmentLoop( + qx: QueryExecutor, + config: MavenConfig, + isShuttingDown: () => boolean, +): Promise { + log.info( + { + batchSize: config.batchSize, + concurrency: config.concurrency, + nonCriticalBatchSize: config.nonCriticalBatchSize, + nonCriticalConcurrency: config.nonCriticalConcurrency, + refreshDays: config.refreshDays, + forceFullExtraction: config.forceFullExtraction, + }, + config.forceFullExtraction + ? 
'POM fetcher started — FORCE FULL EXTRACTION (version-unchanged check disabled)' + : 'POM fetcher started', + ) + + let passNumber = 0 + + while (!isShuttingDown()) { + passNumber++ + const passStartedAt = Date.now() + log.info({ pass: passNumber }, 'Pass started') + + // Phase 1: non-critical — DB-only, high throughput, no HTTP + // const nonCritical = await runPhase(qx, config, false, isShuttingDown) + + // Phase 2: critical — HTTP-bound, two-phase version check + POM extraction + const critical = await runPhase(qx, config, true, isShuttingDown) + + const durationSec = Math.round((Date.now() - passStartedAt) / 1000) + log.info( + { + pass: passNumber, + // totalProcessed: nonCritical.processed + critical.processed, + // totalSkipped: nonCritical.skipped + critical.skipped, + // totalUnchanged: nonCritical.unchanged + critical.unchanged, + // totalErrors: nonCritical.error + critical.error, + totalProcessed: critical.processed, + totalSkipped: critical.skipped, + totalUnchanged: critical.unchanged, + totalErrors: critical.error, + durationSec, + }, + `Pass complete — sleeping ${config.idleSleepSec}s`, + ) + + await new Promise((r) => setTimeout(r, config.idleSleepSec * 1000)) + } +} diff --git a/services/apps/packages_worker/src/maven/schedule.ts b/services/apps/packages_worker/src/maven/schedule.ts new file mode 100644 index 0000000000..5737c7a7d6 --- /dev/null +++ b/services/apps/packages_worker/src/maven/schedule.ts @@ -0,0 +1,76 @@ +import { ScheduleAlreadyRunning, ScheduleOverlapPolicy } from '@temporalio/client' + +import { svc } from '../service' +import { mavenCriticalWorkflow, mavenNonCriticalWorkflow } from '../workflows' + +export async function scheduleMavenCritical(): Promise { + const { temporal } = svc + if (!temporal) throw new Error('Temporal client not initialized') + + try { + await temporal.schedule.create({ + scheduleId: 'maven-critical', + spec: { + cronExpressions: ['*/5 * * * *'], + }, + policies: { + overlap: ScheduleOverlapPolicy.SKIP, + 
catchupWindow: '1 hour', + }, + action: { + type: 'startWorkflow', + workflowType: mavenCriticalWorkflow, + taskQueue: 'packages-worker', + workflowExecutionTimeout: '15 minutes', + retry: { + initialInterval: '30 seconds', + backoffCoefficient: 2, + maximumAttempts: 3, + }, + args: [], + }, + }) + } catch (err) { + if (err instanceof ScheduleAlreadyRunning) { + svc.log.info('Schedule maven-critical already registered.') + } else { + throw err + } + } +} + +export async function scheduleMavenNonCritical(): Promise { + const { temporal } = svc + if (!temporal) throw new Error('Temporal client not initialized') + + try { + await temporal.schedule.create({ + scheduleId: 'maven-non-critical', + spec: { + cronExpressions: ['*/10 * * * *'], + }, + policies: { + overlap: ScheduleOverlapPolicy.SKIP, + catchupWindow: '1 hour', + }, + action: { + type: 'startWorkflow', + workflowType: mavenNonCriticalWorkflow, + taskQueue: 'packages-worker', + workflowExecutionTimeout: '5 minutes', + retry: { + initialInterval: '30 seconds', + backoffCoefficient: 2, + maximumAttempts: 3, + }, + args: [], + }, + }) + } catch (err) { + if (err instanceof ScheduleAlreadyRunning) { + svc.log.info('Schedule maven-non-critical already registered.') + } else { + throw err + } + } +} diff --git a/services/apps/packages_worker/src/maven/workflows.ts b/services/apps/packages_worker/src/maven/workflows.ts new file mode 100644 index 0000000000..223cee6701 --- /dev/null +++ b/services/apps/packages_worker/src/maven/workflows.ts @@ -0,0 +1,19 @@ +import { proxyActivities } from '@temporalio/workflow' + +import type * as activities from './activities' + +const { processMavenCriticalBatch } = proxyActivities({ + startToCloseTimeout: '15 minutes', +}) + +const { processMavenNonCriticalBatch } = proxyActivities({ + startToCloseTimeout: '5 minutes', +}) + +export async function mavenCriticalWorkflow(): Promise { + await processMavenCriticalBatch() +} + +export async function mavenNonCriticalWorkflow(): Promise 
{ + await processMavenNonCriticalBatch() +} diff --git a/services/apps/packages_worker/src/pom-fetcher/runPomEnrichmentLoop.ts b/services/apps/packages_worker/src/pom-fetcher/runPomEnrichmentLoop.ts deleted file mode 100644 index 5f28e1b12b..0000000000 --- a/services/apps/packages_worker/src/pom-fetcher/runPomEnrichmentLoop.ts +++ /dev/null @@ -1,338 +0,0 @@ -import { - listCriticalMavenPackagesToSync, - upsertMaintainer, - upsertPackage, - upsertPackageMaintainer, - upsertPackageRepo, - upsertRepo, - upsertVersionsBatch, -} from '@crowd/data-access-layer' -import { QueryExecutor } from '@crowd/data-access-layer/src/queryExecutor' -import { getServiceChildLogger } from '@crowd/logging' - -import { getPomFetcherConfig } from '../config' -import { MAX_PARENT_HOPS, extractArtifact, normalizeScmUrl } from './extract' -import { resolveVersionsList } from './metadata' - -const log = getServiceChildLogger('pom-fetcher') - -export interface BatchResult { - processed: number - skipped: number - errors: number -} - -type PomFetcherConfig = ReturnType -type PackageRow = Awaited>[number] - -// ─── Helpers ────────────────────────────────────────────────────────────────── - -function mavenRegistryUrl(groupId: string, artifactId: string): string { - return `https://central.sonatype.com/artifact/${groupId}/${artifactId}` -} - -function isPrerelease(version: string): boolean { - return /-(SNAPSHOT|alpha|beta|rc|m\d+)/i.test(version) -} - -function interleaveByNamespace(packages: PackageRow[]): PackageRow[] { - const byNamespace = new Map() - for (const pkg of packages) { - const ns = pkg.namespace ?? '__unknown__' - if (!byNamespace.has(ns)) byNamespace.set(ns, []) - byNamespace.get(ns)!.push(pkg) - } - const queues = [...byNamespace.values()] - const result: PackageRow[] = [] - let i = 0 - while (result.length < packages.length) { - const q = queues[i % queues.length] - if (q.length > 0) result.push(q.shift()!) 
- i++ - } - return result -} - -interface PersonWithRole { - username: string | null - displayName: string | null - email: string | null - url: string | null - role: 'author' | 'maintainer' -} - -async function writeMaintainers(qx: QueryExecutor, packageId: number, people: PersonWithRole[]): Promise { - let count = 0 - for (const person of people) { - const username = person.username ?? person.email ?? person.displayName - if (!username) continue - const maintainerId = await upsertMaintainer(qx, { - ecosystem: 'maven', - username, - displayName: person.displayName, - url: person.url, - emailHash: person.email, - }) - await upsertPackageMaintainer(qx, { packageId, maintainerId, role: person.role }) - count++ - } - return count -} - -function parseRepoUrl(url: string): { host: string; owner: string | null; name: string | null } | null { - try { - const parsed = new URL(url) - const h = parsed.hostname.toLowerCase() - let host: string - if (h === 'github.com' || h.endsWith('.github.com')) host = 'github' - else if (h === 'gitlab.com' || h.includes('gitlab')) host = 'gitlab' - else if (h === 'bitbucket.org') host = 'bitbucket' - else host = 'other' - const parts = parsed.pathname.split('/').filter(Boolean) - return { host, owner: parts[0] ?? null, name: parts[1] ?? 
null } - } catch { - return null - } -} - -async function writeRepoLink(qx: QueryExecutor, packageId: number, repositoryUrl: string | null): Promise { - if (!repositoryUrl) return - const parsed = parseRepoUrl(repositoryUrl) - if (!parsed) return - const repoId = await upsertRepo(qx, { url: repositoryUrl, ...parsed }) - await upsertPackageRepo(qx, { packageId, repoId, source: 'declared', confidence: 0.8 }) -} - -// ─── Package processing ─────────────────────────────────────────────────────── - -async function processPackage(qx: QueryExecutor, pkg: PackageRow): Promise<'processed' | 'skipped' | 'error'> { - const groupId = pkg.namespace - const artifactId = pkg.name - - if (!groupId) { - log.warn({ purl: pkg.purl }, 'Skipping: null namespace (groupId)') - return 'skipped' - } - - const meta = await resolveVersionsList(groupId, artifactId) - - if (!meta) { - await upsertPackage(qx, { - purl: pkg.purl, - ecosystem: 'maven', - namespace: groupId, - name: artifactId, - description: null, - homepage: null, - registryUrl: mavenRegistryUrl(groupId, artifactId), - declaredRepositoryUrl: null, - repositoryUrl: null, - licenses: null, - licensesRaw: null, - latestVersion: pkg.latestVersion ?? 
null, - ingestionSource: 'pom_fetcher_not_on_central', - criticalityScore: pkg.criticalityScore, - dependentPackagesCount: pkg.dependentPackagesCount, - dependentReposCount: pkg.dependentReposCount, - downloadsLastMonth: pkg.downloads30d, - }) - log.warn({ groupId, artifactId }, 'Not on Maven Central — writing minimal record') - return 'skipped' - } - - const version = meta.releaseVersion - - if (!version) { - await upsertPackage(qx, { - purl: pkg.purl, - ecosystem: 'maven', - namespace: groupId, - name: artifactId, - description: null, - homepage: null, - registryUrl: mavenRegistryUrl(groupId, artifactId), - declaredRepositoryUrl: null, - repositoryUrl: null, - licenses: null, - licensesRaw: null, - latestVersion: null, - ingestionSource: 'pom_fetcher_no_version', - criticalityScore: pkg.criticalityScore, - dependentPackagesCount: pkg.dependentPackagesCount, - dependentReposCount: pkg.dependentReposCount, - downloadsLastMonth: pkg.downloads30d, - }) - log.warn({ groupId, artifactId }, 'No release version in metadata — writing minimal record') - return 'skipped' - } - - const result = await extractArtifact(groupId, artifactId, version, (msg) => { - log.debug({ groupId, artifactId, version }, msg) - }) - - if (result.error) { - log.warn({ groupId, artifactId, version, error: result.error }, 'POM extraction failed') - await upsertPackage(qx, { - purl: pkg.purl, - ecosystem: 'maven', - namespace: groupId, - name: artifactId, - description: null, - homepage: null, - registryUrl: mavenRegistryUrl(groupId, artifactId), - declaredRepositoryUrl: null, - repositoryUrl: null, - licenses: null, - licensesRaw: null, - latestVersion: version, - ingestionSource: 'pom_fetcher_error', - criticalityScore: pkg.criticalityScore, - dependentPackagesCount: pkg.dependentPackagesCount, - dependentReposCount: pkg.dependentReposCount, - downloadsLastMonth: pkg.downloads30d, - }) - return 'error' - } - - if (result.parentHops > MAX_PARENT_HOPS) { - log.warn( - { groupId, artifactId, 
parentHops: result.parentHops, missingLicenses: result.licenses.length === 0, missingScm: !result.scmUrl }, - 'Parent hop limit reached — data may be incomplete', - ) - } - - const repositoryUrl = normalizeScmUrl(result.scmUrl) - - const packageId = await upsertPackage(qx, { - purl: pkg.purl, - ecosystem: 'maven', - namespace: groupId, - name: artifactId, - description: result.description, - homepage: result.homepageUrl, - registryUrl: mavenRegistryUrl(groupId, artifactId), - declaredRepositoryUrl: result.scmUrl, - repositoryUrl, - licenses: result.licenses.length > 0 ? result.licenses : null, - licensesRaw: result.licensesRaw, - latestVersion: version, - ingestionSource: 'pom_fetcher', - criticalityScore: pkg.criticalityScore, - dependentPackagesCount: pkg.dependentPackagesCount, - dependentReposCount: pkg.dependentReposCount, - downloadsLastMonth: pkg.downloads30d, - }) - - const allPeople = [ - ...result.developers.map((d) => ({ ...d, role: 'author' as const })), - ...result.contributors.map((c) => ({ ...c, role: 'maintainer' as const })), - ] - const maintainerCount = await writeMaintainers(qx, packageId, allPeople) - - const allVersions = meta.versions.length > 0 ? meta.versions : [version] - await upsertVersionsBatch( - qx, - allVersions.map((v) => ({ - packageId, - ecosystem: 'maven', - number: v, - isLatest: v === meta.releaseVersion, - isPrerelease: isPrerelease(v), - license: result.licenses[0] ?? 
null, - })), - ) - - await writeRepoLink(qx, packageId, repositoryUrl) - - log.info( - { groupId, artifactId, version, parentHops: result.parentHops, licenses: result.licenses.length, maintainers: maintainerCount, versions: allVersions.length }, - 'ok', - ) - - return 'processed' -} - -// ─── Batch ──────────────────────────────────────────────────────────────────── - -export async function processBatch(qx: QueryExecutor, config: PomFetcherConfig): Promise { - const packages = await listCriticalMavenPackagesToSync(qx, { - limit: config.batchSize, - refreshDays: config.refreshDays, - }) - - if (packages.length === 0) return { processed: 0, skipped: 0, errors: 0 } - - log.info({ count: packages.length }, 'Batch started') - - let processed = 0 - let skipped = 0 - let errors = 0 - const queue = interleaveByNamespace(packages) - - for (let i = 0; i < queue.length; i += config.concurrency) { - const group = queue.slice(i, i + config.concurrency) - - if (config.groupDelayMs > 0 && i > 0) { - await new Promise((r) => setTimeout(r, config.groupDelayMs)) - } - - await Promise.all( - group.map(async (pkg) => { - try { - const status = await processPackage(qx, pkg) - if (status === 'processed') processed++ - else if (status === 'skipped') skipped++ - else errors++ - } catch (err) { - const message = err instanceof Error ? err.message : String(err) - const isRateLimit = message.includes('403') || message.includes('429') - log.error( - { purl: pkg.purl, error: message }, - isRateLimit ? 
'Rate limited — will retry next pass' : 'Unexpected error processing package', - ) - errors++ - } - }), - ) - - const done = i + group.length - if (done % 25 === 0 || done === queue.length) { - log.debug({ done, total: queue.length, processed, skipped, errors }, 'Progress') - } - } - - return { processed, skipped, errors } -} - -// ─── Main loop ──────────────────────────────────────────────────────────────── - -export async function runPomEnrichmentLoop( - qx: QueryExecutor, - config: PomFetcherConfig, - isShuttingDown: () => boolean, -): Promise { - log.info({ batchSize: config.batchSize, concurrency: config.concurrency, refreshDays: config.refreshDays }, 'POM fetcher started') - - let passNumber = 0 - - while (!isShuttingDown()) { - passNumber++ - const passStartedAt = Date.now() - log.info({ pass: passNumber }, 'Pass started') - - let total = { processed: 0, skipped: 0, errors: 0 } - - while (!isShuttingDown()) { - const result = await processBatch(qx, config) - if (result.processed + result.skipped + result.errors === 0) break - total.processed += result.processed - total.skipped += result.skipped - total.errors += result.errors - } - - const durationSec = Math.round((Date.now() - passStartedAt) / 1000) - log.info({ pass: passNumber, ...total, durationSec }, `Pass complete — sleeping ${config.idleSleepSec}s`) - - await new Promise((r) => setTimeout(r, config.idleSleepSec * 1000)) - } -} diff --git a/services/apps/packages_worker/src/pom-fetcher/schedule.ts b/services/apps/packages_worker/src/pom-fetcher/schedule.ts deleted file mode 100644 index e80bee8168..0000000000 --- a/services/apps/packages_worker/src/pom-fetcher/schedule.ts +++ /dev/null @@ -1,41 +0,0 @@ -import { ScheduleAlreadyRunning, ScheduleOverlapPolicy } from '@temporalio/client' - -import { svc } from '../service' -import { pomFetcherWorkflow } from '../workflows' - -export async function schedulePomFetcher(): Promise { - const { temporal } = svc - if (!temporal) throw new Error('Temporal client 
not initialized') - - try { - await temporal.schedule.create({ - scheduleId: 'maven-pom-fetcher', - spec: { - // Run daily at 4am UTC — off-peak, after nightly GitHub enrichment completes - cronExpressions: ['0 4 * * *'], - }, - policies: { - overlap: ScheduleOverlapPolicy.SKIP, - catchupWindow: '1 hour', - }, - action: { - type: 'startWorkflow', - workflowType: pomFetcherWorkflow, - taskQueue: 'packages-worker', - workflowExecutionTimeout: '12 hours', - retry: { - initialInterval: '30 seconds', - backoffCoefficient: 2, - maximumAttempts: 3, - }, - args: [], - }, - }) - } catch (err) { - if (err instanceof ScheduleAlreadyRunning) { - svc.log.info('Schedule maven-pom-fetcher already registered.') - } else { - throw err - } - } -} diff --git a/services/apps/packages_worker/src/pom-fetcher/workflows.ts b/services/apps/packages_worker/src/pom-fetcher/workflows.ts deleted file mode 100644 index f9d05eb03c..0000000000 --- a/services/apps/packages_worker/src/pom-fetcher/workflows.ts +++ /dev/null @@ -1,36 +0,0 @@ -import { proxyActivities } from '@temporalio/workflow' - -import type * as activities from './activities' - -const { processMavenCriticalBatch } = proxyActivities({ - startToCloseTimeout: '15 minutes', -}) - -const { processMavenNonCriticalBatch } = proxyActivities({ - startToCloseTimeout: '5 minutes', -}) - -/** - * Temporal workflow: runs a full pass of Maven package syncing. - * - * Phase 1 — non-critical: copies universe stats into packages (DB-only, no HTTP). - * Phase 2 — critical: full POM enrichment (HTTP calls to Maven Central). - * - * Each phase loops until its batch returns empty, then the workflow exits. - * The Temporal schedule re-triggers this workflow on the configured interval. 
- */ -export async function pomFetcherWorkflow(): Promise { - // Phase 1: non-critical — DB-only, fast - // eslint-disable-next-line no-constant-condition - while (true) { - const result = await processMavenNonCriticalBatch() - if (result.processed + result.skipped + result.errors === 0) break - } - - // Phase 2: critical — HTTP-bound, slower - // eslint-disable-next-line no-constant-condition - while (true) { - const result = await processMavenCriticalBatch() - if (result.processed + result.skipped + result.errors === 0) break - } -} diff --git a/services/apps/packages_worker/src/workflows/index.ts b/services/apps/packages_worker/src/workflows/index.ts index b52ba11277..88471a3f28 100644 --- a/services/apps/packages_worker/src/workflows/index.ts +++ b/services/apps/packages_worker/src/workflows/index.ts @@ -10,4 +10,4 @@ export { } from '../deps-dev/workflows' export { npmHello } from '../npm/workflows' export { osvSync } from '../osv/workflows' -export { pomFetcherWorkflow } from '../pom-fetcher/workflows' +export { mavenCriticalWorkflow, mavenNonCriticalWorkflow } from '../maven/workflows' diff --git a/services/libs/data-access-layer/src/osspckgs/packages.ts b/services/libs/data-access-layer/src/osspckgs/packages.ts index 78e3798b91..777504d44c 100644 --- a/services/libs/data-access-layer/src/osspckgs/packages.ts +++ b/services/libs/data-access-layer/src/osspckgs/packages.ts @@ -14,19 +14,22 @@ export async function findPackageIdsByPurl( // ─── packages_universe ──────────────────────────────────────────────────────── /** - * Returns a page of critical Maven packages from packages_universe that need - * syncing into the packages table (never synced, or stale by refreshDays). + * Returns a page of Maven packages from packages_universe that need syncing + * into the packages table (never synced, or stale by refreshDays). 
+ * + * isCritical=true → critical packages queued for full POM extraction + * isCritical=false → non-critical packages queued for universe-stats refresh (DB-only) */ -export async function listCriticalMavenPackagesToSync( +export async function listMavenPackagesToSync( qx: QueryExecutor, - options: { limit: number; refreshDays: number }, + options: { limit: number; refreshDays: number; isCritical: boolean }, ): Promise< - (Pick & { + (Pick & { purl: string latestVersion: string | null })[] > { - const { limit, refreshDays } = options + const { limit, refreshDays, isCritical } = options return qx.select( ` @@ -35,6 +38,7 @@ export async function listCriticalMavenPackagesToSync( pu.purl, pu.namespace, pu.name, + pu.is_critical AS "isCritical", pu.criticality_score AS "criticalityScore", pu.dependent_packages_count AS "dependentPackagesCount", pu.dependent_repos_count AS "dependentReposCount", @@ -44,7 +48,7 @@ export async function listCriticalMavenPackagesToSync( LEFT JOIN packages p ON p.purl = pu.purl WHERE pu.ecosystem = 'maven' - AND pu.is_critical = true + AND pu.is_critical = $(isCritical) AND pu.purl IS NOT NULL AND pu.namespace IS NOT NULL AND ( @@ -56,7 +60,59 @@ export async function listCriticalMavenPackagesToSync( pu.id ASC LIMIT $(limit) `, - { limit, refreshDays }, + { limit, refreshDays, isCritical }, + ) +} + +// ─── packages touch ─────────────────────────────────────────────────────────── + +/** + * Bumps last_synced_at without re-fetching POM data. + * Used when the upstream version is unchanged — avoids a full extraction pass + * while keeping the staleness timer fresh and syncing latest universe metrics. 
+ */ +export async function touchPackageSyncedAt( + qx: QueryExecutor, + purl: string, + metrics: { + criticalityScore: number | null | undefined + dependentPackagesCount: number | null | undefined + dependentReposCount: number | null | undefined + downloadsLastMonth: bigint | null | undefined + }, +): Promise { + await qx.result( + ` + UPDATE packages SET + last_synced_at = NOW(), + criticality_score = COALESCE($(criticalityScore), criticality_score), + dependent_packages_count = COALESCE($(dependentPackagesCount), dependent_packages_count), + dependent_repos_count = COALESCE($(dependentReposCount), dependent_repos_count), + downloads_last_month = COALESCE($(downloadsLastMonth), downloads_last_month) + WHERE purl = $(purl) + `, + { + purl, + criticalityScore: metrics.criticalityScore ?? null, + dependentPackagesCount: metrics.dependentPackagesCount ?? null, + dependentReposCount: metrics.dependentReposCount ?? null, + downloadsLastMonth: metrics.downloadsLastMonth ?? null, + }, + ) +} + +// ─── audit ──────────────────────────────────────────────────────────────────── + +export async function logAuditFieldChange( + qx: QueryExecutor, + worker: string, + purl: string, + changedFields: string[], +): Promise { + if (changedFields.length === 0) return + await qx.result( + `INSERT INTO audit_field_changes (worker, purl, changed_fields) VALUES ($(worker), $(purl), $(changedFields)::text[])`, + { worker, purl, changedFields }, ) } diff --git a/services/libs/data-access-layer/src/osspckgs/types.ts b/services/libs/data-access-layer/src/osspckgs/types.ts index 5065d15114..c678738787 100644 --- a/services/libs/data-access-layer/src/osspckgs/types.ts +++ b/services/libs/data-access-layer/src/osspckgs/types.ts @@ -59,6 +59,7 @@ export type IDbPackageMaintainerUpsert = { export type IDbVersionUpsert = { packageId: number ecosystem: string + name: string number: string isLatest: boolean isPrerelease: boolean diff --git a/services/libs/data-access-layer/src/osspckgs/versions.ts 
b/services/libs/data-access-layer/src/osspckgs/versions.ts index bbd1f1f55d..9434693240 100644 --- a/services/libs/data-access-layer/src/osspckgs/versions.ts +++ b/services/libs/data-access-layer/src/osspckgs/versions.ts @@ -26,10 +26,11 @@ export async function upsertVersionsBatch( await qx.result( ` - INSERT INTO versions (package_id, ecosystem, number, is_latest, is_prerelease, license, last_synced_at) + INSERT INTO versions (package_id, ecosystem, name, number, is_latest, is_prerelease, license, last_synced_at) SELECT UNNEST($(packageIds)::bigint[]), UNNEST($(ecosystems)::text[]), + UNNEST($(names)::text[]), UNNEST($(numbers)::text[]), UNNEST($(isLatests)::bool[]), UNNEST($(isPreleases)::bool[]), @@ -44,6 +45,7 @@ export async function upsertVersionsBatch( { packageIds: versions.map((v) => v.packageId), ecosystems: versions.map((v) => v.ecosystem), + names: versions.map((v) => v.name), numbers: versions.map((v) => v.number), isLatests: versions.map((v) => v.isLatest), isPreleases: versions.map((v) => v.isPrerelease), From 6472752c2a4d08c7372c30b34f1247d6fa2a83d3 Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Tue, 2 Jun 2026 15:52:59 +0200 Subject: [PATCH 08/22] fix: lint Signed-off-by: Umberto Sgueglia --- .../libs/data-access-layer/src/osspckgs/packages.ts | 13 ++++++++++++- .../libs/data-access-layer/src/osspckgs/repos.ts | 1 + 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/services/libs/data-access-layer/src/osspckgs/packages.ts b/services/libs/data-access-layer/src/osspckgs/packages.ts index 777504d44c..dcb673182d 100644 --- a/services/libs/data-access-layer/src/osspckgs/packages.ts +++ b/services/libs/data-access-layer/src/osspckgs/packages.ts @@ -1,4 +1,5 @@ import { QueryExecutor } from '../queryExecutor' + import { IDbPackageUniverse, IDbPackageUpsert } from './types' export async function findPackageIdsByPurl( @@ -24,7 +25,17 @@ export async function listMavenPackagesToSync( qx: QueryExecutor, options: { limit: number; 
refreshDays: number; isCritical: boolean }, ): Promise< - (Pick & { + (Pick< + IDbPackageUniverse, + | 'id' + | 'namespace' + | 'name' + | 'isCritical' + | 'criticalityScore' + | 'dependentPackagesCount' + | 'dependentReposCount' + | 'downloads30d' + > & { purl: string latestVersion: string | null })[] diff --git a/services/libs/data-access-layer/src/osspckgs/repos.ts b/services/libs/data-access-layer/src/osspckgs/repos.ts index a8fc92b9d8..e8089366dc 100644 --- a/services/libs/data-access-layer/src/osspckgs/repos.ts +++ b/services/libs/data-access-layer/src/osspckgs/repos.ts @@ -1,4 +1,5 @@ import { QueryExecutor } from '../queryExecutor' + import { IDbPackageRepoUpsert, IDbRepoUpsert } from './types' export async function findRepoIdsByUrl( From b3ac70e7a4fa93cc9ba5f20ff5a0cc56aa689ff9 Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Tue, 2 Jun 2026 17:15:46 +0200 Subject: [PATCH 09/22] refactor: align the structure with npm Signed-off-by: Umberto Sgueglia --- .../src/maven/__tests__/normalize.test.ts | 130 +++++++++ .../packages_worker/src/maven/metadata.ts | 25 +- .../packages_worker/src/maven/normalize.ts | 19 ++ .../src/maven/runMavenEnrichmentLoop.ts | 269 ++++++++---------- services/libs/data-access-layer/src/index.ts | 2 + .../src/osspckgs/maintainers.ts | 89 +++--- .../src/osspckgs/packages.ts | 112 ++++---- .../data-access-layer/src/osspckgs/repos.ts | 30 +- .../src/osspckgs/versions.ts | 54 ++-- 9 files changed, 457 insertions(+), 273 deletions(-) create mode 100644 services/apps/packages_worker/src/maven/__tests__/normalize.test.ts create mode 100644 services/apps/packages_worker/src/maven/normalize.ts diff --git a/services/apps/packages_worker/src/maven/__tests__/normalize.test.ts b/services/apps/packages_worker/src/maven/__tests__/normalize.test.ts new file mode 100644 index 0000000000..5f34412959 --- /dev/null +++ b/services/apps/packages_worker/src/maven/__tests__/normalize.test.ts @@ -0,0 +1,130 @@ +import { describe, expect, it } from 
'vitest' + +import { normalizeScmUrl } from '../extract' +import { isPrerelease, parseRepoUrl } from '../normalize' + +describe('isPrerelease', () => { + it('returns false for a stable version', () => { + expect(isPrerelease('3.12.0')).toBe(false) + }) + + it('detects SNAPSHOT', () => { + expect(isPrerelease('1.0.0-SNAPSHOT')).toBe(true) + }) + + it('detects alpha', () => { + expect(isPrerelease('2.0.0-alpha')).toBe(true) + expect(isPrerelease('2.0.0-ALPHA.1')).toBe(true) + }) + + it('detects beta', () => { + expect(isPrerelease('1.5.0-beta.2')).toBe(true) + }) + + it('detects rc', () => { + expect(isPrerelease('4.0.0-rc1')).toBe(true) + expect(isPrerelease('4.0.0-RC.2')).toBe(true) + }) + + it('detects milestone (m1, m10)', () => { + expect(isPrerelease('5.3.0-m1')).toBe(true) + expect(isPrerelease('5.3.0-M10')).toBe(true) + }) + + it('returns false for versions with numbers that are not milestones', () => { + expect(isPrerelease('1.2.3')).toBe(false) + expect(isPrerelease('10.0.0')).toBe(false) + }) +}) + +describe('parseRepoUrl', () => { + it('identifies github.com', () => { + expect(parseRepoUrl('https://github.com/apache/commons-lang')).toEqual({ + host: 'github', + owner: 'apache', + name: 'commons-lang', + }) + }) + + it('identifies gitlab.com', () => { + expect(parseRepoUrl('https://gitlab.com/owner/repo')).toEqual({ + host: 'gitlab', + owner: 'owner', + name: 'repo', + }) + }) + + it('identifies bitbucket.org', () => { + expect(parseRepoUrl('https://bitbucket.org/owner/repo')).toEqual({ + host: 'bitbucket', + owner: 'owner', + name: 'repo', + }) + }) + + it('returns other for unknown hosts', () => { + const result = parseRepoUrl('https://svn.example.com/repo') + expect(result?.host).toBe('other') + }) + + it('returns null for invalid URLs', () => { + expect(parseRepoUrl('not-a-url')).toBeNull() + }) + + it('handles URLs with no path segments', () => { + const result = parseRepoUrl('https://github.com/') + expect(result).toEqual({ host: 'github', owner: 
null, name: null }) + }) +}) + +describe('normalizeScmUrl', () => { + it('returns null for null input', () => { + expect(normalizeScmUrl(null)).toBeNull() + }) + + it('strips scm:git: prefix', () => { + expect(normalizeScmUrl('scm:git:https://github.com/apache/commons-lang')).toBe( + 'https://github.com/apache/commons-lang', + ) + }) + + it('converts SSH git@ to https', () => { + expect(normalizeScmUrl('git@github.com:apache/commons-lang.git')).toBe( + 'https://github.com/apache/commons-lang', + ) + }) + + it('converts git:// to https://', () => { + expect(normalizeScmUrl('git://github.com/apache/commons-lang.git')).toBe( + 'https://github.com/apache/commons-lang', + ) + }) + + it('strips trailing .git', () => { + expect(normalizeScmUrl('https://github.com/apache/commons-lang.git')).toBe( + 'https://github.com/apache/commons-lang', + ) + }) + + it('strips /tree/... path suffix', () => { + expect(normalizeScmUrl('https://github.com/apache/commons-lang/tree/master')).toBe( + 'https://github.com/apache/commons-lang', + ) + }) + + it('strips trailing slash', () => { + expect(normalizeScmUrl('https://github.com/apache/commons-lang/')).toBe( + 'https://github.com/apache/commons-lang', + ) + }) + + it('handles combined scm:git: + SSH form', () => { + expect(normalizeScmUrl('scm:git:git@github.com:apache/commons-lang.git')).toBe( + 'https://github.com/apache/commons-lang', + ) + }) + + it('returns null for non-https result', () => { + expect(normalizeScmUrl('svn://svn.apache.org/repos/commons-lang')).toBeNull() + }) +}) diff --git a/services/apps/packages_worker/src/maven/metadata.ts b/services/apps/packages_worker/src/maven/metadata.ts index c755739b78..f437bb6667 100644 --- a/services/apps/packages_worker/src/maven/metadata.ts +++ b/services/apps/packages_worker/src/maven/metadata.ts @@ -29,6 +29,15 @@ export interface MavenVersionsMetadata { releaseVersion: string | null } +export type MavenFetchError = + | { kind: 'NOT_FOUND' } + | { kind: 'RATE_LIMIT'; status: number 
} + | { kind: 'TRANSIENT'; message: string } + +export function isMavenFetchError(v: unknown): v is MavenFetchError { + return typeof v === 'object' && v !== null && 'kind' in v +} + async function sleep(ms: number): Promise { return new Promise((r) => setTimeout(r, ms)) } @@ -36,7 +45,7 @@ async function sleep(ms: number): Promise { export async function resolveVersionsList( groupId: string, artifactId: string, -): Promise { +): Promise { const groupPath = groupId.replace(/\./g, '/') const url = `${MAVEN_REPO}/${groupPath}/${artifactId}/maven-metadata.xml` @@ -61,19 +70,22 @@ export async function resolveVersionsList( return { versions, releaseVersion: release || latest || null } } catch (err) { if (axios.isAxiosError(err)) { - if (err.response?.status === 404) return null + const status = err.response?.status + if (status === 404) return { kind: 'NOT_FOUND' } // 429 = explicit rate limit, 403 = CDN throttle (Maven Central uses both) - if ((err.response?.status === 429 || err.response?.status === 403) && attempt < MAX_RETRIES) { + if ((status === 429 || status === 403) && attempt < MAX_RETRIES) { const delay = RETRY_BASE_MS * 2 ** attempt + Math.random() * 500 await sleep(delay) continue } + if (status === 429 || status === 403) return { kind: 'RATE_LIMIT', status: status! } } - throw err + const message = err instanceof Error ? err.message : String(err) + return { kind: 'TRANSIENT', message } } } - return null + return { kind: 'RATE_LIMIT', status: 429 } } export async function resolveLatestVersion( @@ -81,5 +93,6 @@ export async function resolveLatestVersion( artifactId: string, ): Promise { const meta = await resolveVersionsList(groupId, artifactId) - return meta?.releaseVersion ?? 
null + if (isMavenFetchError(meta)) return null + return meta.releaseVersion } diff --git a/services/apps/packages_worker/src/maven/normalize.ts b/services/apps/packages_worker/src/maven/normalize.ts new file mode 100644 index 0000000000..b19829e02f --- /dev/null +++ b/services/apps/packages_worker/src/maven/normalize.ts @@ -0,0 +1,19 @@ +export function isPrerelease(version: string): boolean { + return /-(SNAPSHOT|alpha|beta|rc|m\d+)/i.test(version) +} + +export function parseRepoUrl(url: string): { host: string; owner: string | null; name: string | null } | null { + try { + const parsed = new URL(url) + const h = parsed.hostname.toLowerCase() + let host: string + if (h === 'github.com' || h.endsWith('.github.com')) host = 'github' + else if (h === 'gitlab.com' || h.includes('gitlab')) host = 'gitlab' + else if (h === 'bitbucket.org') host = 'bitbucket' + else host = 'other' + const parts = parsed.pathname.split('/').filter(Boolean) + return { host, owner: parts[0] ?? null, name: parts[1] ?? 
null } + } catch { + return null + } +} diff --git a/services/apps/packages_worker/src/maven/runMavenEnrichmentLoop.ts b/services/apps/packages_worker/src/maven/runMavenEnrichmentLoop.ts index 0c3b6beebc..32c9740a49 100644 --- a/services/apps/packages_worker/src/maven/runMavenEnrichmentLoop.ts +++ b/services/apps/packages_worker/src/maven/runMavenEnrichmentLoop.ts @@ -3,10 +3,10 @@ import crypto from 'crypto' import { listMavenPackagesToSync, logAuditFieldChange, + replacePackageMaintainers, touchPackageSyncedAt, upsertMaintainer, upsertPackage, - upsertPackageMaintainer, upsertPackageRepo, upsertRepo, upsertVersionsBatch, @@ -16,7 +16,8 @@ import { getServiceChildLogger } from '@crowd/logging' import { getMavenConfig } from '../config' import { MAX_PARENT_HOPS, extractArtifact, normalizeScmUrl } from './extract' -import { resolveVersionsList } from './metadata' +import { isMavenFetchError, resolveVersionsList } from './metadata' +import { isPrerelease, parseRepoUrl } from './normalize' const log = getServiceChildLogger('maven') @@ -38,53 +39,18 @@ function mavenRegistryUrl(groupId: string, artifactId: string): string { return `https://central.sonatype.com/artifact/${groupId}/${artifactId}` } -function isPrerelease(version: string): boolean { - return /-(SNAPSHOT|alpha|beta|rc|m\d+)/i.test(version) -} - -// Reorders packages so that consecutive items come from different namespaces (e.g. org.apache, com.google). -// This spreads Maven Central requests across different group IDs, avoiding bursts that could hit rate limits -// on the same namespace in a tight loop. -// function interleaveByNamespace(packages: PackageRow[]): PackageRow[] { -// const byNamespace = new Map() -// for (const pkg of packages) { -// const ns = pkg.namespace ?? 
'__unknown__' -// if (!byNamespace.has(ns)) byNamespace.set(ns, []) -// byNamespace.get(ns)!.push(pkg) -// } -// const queues = [...byNamespace.values()] -// const result: PackageRow[] = [] -// let i = 0 -// while (result.length < packages.length) { -// const q = queues[i % queues.length] -// if (q.length > 0) result.push(q.shift()!) -// i++ -// } -// return result -// } - -function parseRepoUrl(url: string): { host: string; owner: string | null; name: string | null } | null { - try { - const parsed = new URL(url) - const h = parsed.hostname.toLowerCase() - let host: string - if (h === 'github.com' || h.endsWith('.github.com')) host = 'github' - else if (h === 'gitlab.com' || h.includes('gitlab')) host = 'gitlab' - else if (h === 'bitbucket.org') host = 'bitbucket' - else host = 'other' - const parts = parsed.pathname.split('/').filter(Boolean) - return { host, owner: parts[0] ?? null, name: parts[1] ?? null } - } catch { - return null - } -} - -async function writeRepoLink(qx: QueryExecutor, packageId: number, repositoryUrl: string | null): Promise { +async function writeRepoLink( + qx: QueryExecutor, + packageId: number, + repositoryUrl: string | null, + changed: Set, +): Promise { if (!repositoryUrl) return const parsed = parseRepoUrl(repositoryUrl) if (!parsed) return const repoId = await upsertRepo(qx, { url: repositoryUrl, ...parsed }) - await upsertPackageRepo(qx, { packageId, repoId, source: 'declared', confidence: 0.8 }) + const repoChanged = await upsertPackageRepo(qx, { packageId, repoId, source: 'declared', confidence: 0.8 }) + repoChanged.forEach((f) => changed.add(f)) } // ─── Non-critical: copy universe stats into packages ───────────────────────── @@ -129,28 +95,35 @@ async function processCriticalPackage( // Phase 1: lightweight metadata fetch to get the current upstream version. 
const metadata = await resolveVersionsList(groupId, artifactId) - if (!metadata) { - await upsertPackage(qx, { - purl: pkg.purl, - ecosystem: 'maven', - namespace: groupId, - name: artifactId, - description: null, - homepage: null, - registryUrl: mavenRegistryUrl(groupId, artifactId), - declaredRepositoryUrl: null, - repositoryUrl: null, - licenses: null, - licensesRaw: null, - latestVersion: pkg.latestVersion ?? null, - ingestionSource: 'maven_not_on_central', - criticalityScore: pkg.criticalityScore, - dependentPackagesCount: pkg.dependentPackagesCount, - dependentReposCount: pkg.dependentReposCount, - downloadsLastMonth: pkg.downloads30d, - }) - log.warn({ groupId, artifactId }, 'Not on Maven Central — writing minimal record') - return 'skipped' + if (isMavenFetchError(metadata)) { + if (metadata.kind === 'NOT_FOUND') { + await upsertPackage(qx, { + purl: pkg.purl, + ecosystem: 'maven', + namespace: groupId, + name: artifactId, + description: null, + homepage: null, + registryUrl: mavenRegistryUrl(groupId, artifactId), + declaredRepositoryUrl: null, + repositoryUrl: null, + licenses: null, + licensesRaw: null, + latestVersion: pkg.latestVersion ?? null, + ingestionSource: 'maven_not_on_central', + criticalityScore: pkg.criticalityScore, + dependentPackagesCount: pkg.dependentPackagesCount, + dependentReposCount: pkg.dependentReposCount, + downloadsLastMonth: pkg.downloads30d, + }) + log.warn({ groupId, artifactId }, 'Not on Maven Central — writing minimal record') + return 'skipped' + } + if (metadata.kind === 'RATE_LIMIT') { + log.warn({ groupId, artifactId, status: metadata.status }, 'Rate limited — will retry next pass') + return 'error' + } + throw new Error(`Transient error fetching metadata for ${groupId}:${artifactId} — ${metadata.message}`) } const version = metadata.releaseVersion @@ -180,10 +153,6 @@ async function processCriticalPackage( } // Phase 2: skip full POM extraction when upstream version matches what we already have. 
- // This avoids 1-8 HTTP calls (POM + parent chain) for packages that haven't released - // a new version since the last sync. - // Skipped on forceFullExtraction (first run against a fresh/restored DB) because - // packages.latest_version may carry stale data from the dump. if (!forceFullExtraction && version === pkg.latestVersion) { await touchPackageSyncedAt(qx, pkg.purl, { criticalityScore: pkg.criticalityScore, @@ -195,7 +164,8 @@ async function processCriticalPackage( return 'unchanged' } - // Phase 3: full POM extraction with parent-chain resolution. + // Phase 3: full POM extraction with parent-chain resolution — wrapped in a + // transaction so partial writes never leave the package in an inconsistent state. const result = await extractArtifact(groupId, artifactId, version) if (result.error) { @@ -231,73 +201,82 @@ async function processCriticalPackage( const repositoryUrl = normalizeScmUrl(result.scmUrl) - const packageId = await upsertPackage(qx, { - purl: pkg.purl, - ecosystem: 'maven', - namespace: groupId, - name: artifactId, - description: result.description, - homepage: result.homepageUrl, - registryUrl: mavenRegistryUrl(groupId, artifactId), - declaredRepositoryUrl: result.scmUrl, - repositoryUrl, - licenses: result.licenses.length > 0 ? result.licenses : null, - licensesRaw: result.licensesRaw, - latestVersion: version, - ingestionSource: 'maven', - criticalityScore: pkg.criticalityScore, - dependentPackagesCount: pkg.dependentPackagesCount, - dependentReposCount: pkg.dependentReposCount, - downloadsLastMonth: pkg.downloads30d, - }) + await qx.tx(async (t) => { + const changed = new Set() - const allPeople = [ - ...result.developers.map((d) => ({ ...d, role: 'author' as const })), - ...result.contributors.map((c) => ({ ...c, role: 'maintainer' as const })), - ] - - let maintainerCount = 0 - for (const person of allPeople) { - const username = person.username ?? person.email ?? 
person.displayName - if (!username) continue - const emailHash = person.email ? crypto.createHash('sha256').update(person.email.toLowerCase().trim()).digest('hex') : null - const maintainerId = await upsertMaintainer(qx, { + const { id: packageId, changedFields: pkgChanged } = await upsertPackage(t, { + purl: pkg.purl, ecosystem: 'maven', - username, - displayName: person.displayName, - url: person.url, - emailHash, + namespace: groupId, + name: artifactId, + description: result.description, + homepage: result.homepageUrl, + registryUrl: mavenRegistryUrl(groupId, artifactId), + declaredRepositoryUrl: result.scmUrl, + repositoryUrl, + licenses: result.licenses.length > 0 ? result.licenses : null, + licensesRaw: result.licensesRaw, + latestVersion: version, + ingestionSource: 'maven-registry', + criticalityScore: pkg.criticalityScore, + dependentPackagesCount: pkg.dependentPackagesCount, + dependentReposCount: pkg.dependentReposCount, + downloadsLastMonth: pkg.downloads30d, }) - await upsertPackageMaintainer(qx, { packageId, maintainerId, role: person.role }) - maintainerCount++ - } + pkgChanged.forEach((f) => changed.add(f)) + + const allVersions = metadata.versions.length > 0 ? metadata.versions : [version] + const verChanged = await upsertVersionsBatch( + t, + allVersions.map((v) => ({ + packageId, + ecosystem: 'maven', + name: artifactId, + number: v, + isLatest: v === metadata.releaseVersion, + isPrerelease: isPrerelease(v), + license: result.licenses[0] ?? null, + })), + ) + verChanged.forEach((f) => changed.add(f)) + + const allPeople = [ + ...result.developers.map((d) => ({ ...d, role: 'author' as const })), + ...result.contributors.map((c) => ({ ...c, role: 'maintainer' as const })), + ] + + const maintainerLinks: Array<{ maintainerId: number; role: 'author' | 'maintainer' }> = [] + for (const person of allPeople) { + const username = person.username ?? person.email ?? person.displayName + if (!username) continue + const emailHash = person.email + ? 
crypto.createHash('sha256').update(person.email.toLowerCase().trim()).digest('hex') + : null + const { id: maintainerId, changedFields: mChanged } = await upsertMaintainer(t, { + ecosystem: 'maven', + username, + displayName: person.displayName, + url: person.url, + emailHash, + }) + mChanged.forEach((f) => changed.add(f)) + maintainerLinks.push({ maintainerId, role: person.role }) + } - const allVersions = metadata.versions.length > 0 ? metadata.versions : [version] - await upsertVersionsBatch( - qx, - allVersions.map((v) => ({ - packageId, - ecosystem: 'maven', - name: artifactId, - number: v, - isLatest: v === metadata.releaseVersion, - isPrerelease: isPrerelease(v), - license: result.licenses[0] ?? null, - })), - ) + if (maintainerLinks.length > 0) { + const pmChanged = await replacePackageMaintainers(t, packageId, maintainerLinks) + pmChanged.forEach((f) => changed.add(f)) + } - await writeRepoLink(qx, packageId, repositoryUrl) + await writeRepoLink(t, packageId, repositoryUrl, changed) - const auditFields = ['latest_version'] - if (result.licenses.length > 0) auditFields.push('licenses') - if (repositoryUrl) auditFields.push('repository_url') - if (result.description) auditFields.push('description') - await logAuditFieldChange(qx, 'maven', pkg.purl, auditFields) + await logAuditFieldChange(t, 'maven', pkg.purl, Array.from(changed)) - log.info( - { groupId, artifactId, version, parentHops: result.parentHops, licenses: result.licenses.length, maintainers: maintainerCount, versions: allVersions.length }, - 'ok', - ) + log.info( + { groupId, artifactId, version, parentHops: result.parentHops, licenses: result.licenses.length, maintainers: maintainerLinks.length, versions: allVersions.length }, + 'ok', + ) + }) return 'processed' } @@ -321,10 +300,6 @@ export async function processBatch( log.info({ count: packages.length, isCritical }, 'Batch started') const counts = { processed: 0, skipped: 0, error: 0, unchanged: 0 } - // interleaveByNamespace was introduced as 
a workaround when the local dev IP was throttled by Maven Central. - // In production runs are 24h apart so the IP is always cold — leaving packages in their natural order for now. - // const queue = isCritical ? interleaveByNamespace(packages) : packages - // const queue = packages for (let batchStart = 0; batchStart < packages.length; batchStart += concurrency) { const group = packages.slice(batchStart, batchStart + concurrency) @@ -346,11 +321,7 @@ export async function processBatch( counts[status]++ } catch (err) { const message = err instanceof Error ? err.message : String(err) - const isRateLimit = message.includes('403') || message.includes('429') - log.error( - { purl: pkg.purl, error: message }, - isRateLimit ? 'Rate limited — will retry next pass' : 'Unexpected error processing package', - ) + log.error({ purl: pkg.purl, error: message }, 'Unexpected error processing package') counts.error++ } }), @@ -440,24 +411,16 @@ export async function runMavenEnrichmentLoop( const passStartedAt = Date.now() log.info({ pass: passNumber }, 'Pass started') - // Phase 1: non-critical — DB-only, high throughput, no HTTP - // const nonCritical = await runPhase(qx, config, false, isShuttingDown) - - // Phase 2: critical — HTTP-bound, two-phase version check + POM extraction const critical = await runPhase(qx, config, true, isShuttingDown) const durationSec = Math.round((Date.now() - passStartedAt) / 1000) log.info( { pass: passNumber, - // totalProcessed: nonCritical.processed + critical.processed, - // totalSkipped: nonCritical.skipped + critical.skipped, - // totalUnchanged: nonCritical.unchanged + critical.unchanged, - // totalErrors: nonCritical.error + critical.error, - totalProcessed: critical.processed, - totalSkipped: critical.skipped, - totalUnchanged: critical.unchanged, - totalErrors: critical.error, + totalProcessed: critical.processed, + totalSkipped: critical.skipped, + totalUnchanged: critical.unchanged, + totalErrors: critical.error, durationSec, }, `Pass 
complete — sleeping ${config.idleSleepSec}s`, diff --git a/services/libs/data-access-layer/src/index.ts b/services/libs/data-access-layer/src/index.ts index 3c319ac5e0..ab7c60db59 100644 --- a/services/libs/data-access-layer/src/index.ts +++ b/services/libs/data-access-layer/src/index.ts @@ -17,5 +17,7 @@ export * from './maintainers' export * from './packages' export * from './project-catalog' export * from './osspckgs/ingestJobs' +export * from './osspckgs/maintainers' export * from './osspckgs/packages' export * from './osspckgs/repos' +export * from './osspckgs/versions' diff --git a/services/libs/data-access-layer/src/osspckgs/maintainers.ts b/services/libs/data-access-layer/src/osspckgs/maintainers.ts index ebd9a77767..ab8d7fa953 100644 --- a/services/libs/data-access-layer/src/osspckgs/maintainers.ts +++ b/services/libs/data-access-layer/src/osspckgs/maintainers.ts @@ -4,52 +4,77 @@ import { IDbMaintainerUpsert, IDbPackageMaintainerUpsert } from './types' /** * Inserts or updates a maintainer row. - * Returns the maintainer id. + * Returns the maintainer id and the list of fields that actually changed. 
*/ export async function upsertMaintainer( qx: QueryExecutor, item: IDbMaintainerUpsert, -): Promise { +): Promise<{ id: number; changedFields: string[] }> { const row = await qx.selectOne( ` - INSERT INTO maintainers ( - ecosystem, - username, - display_name, - url, - email_hash - ) VALUES ( - $(ecosystem), - $(username), - $(displayName), - $(url), - $(emailHash) + WITH old AS ( + SELECT display_name, url, email_hash + FROM maintainers WHERE ecosystem = $(ecosystem) AND username = $(username) + ), + ins AS ( + INSERT INTO maintainers (ecosystem, username, display_name, url, email_hash) + VALUES ($(ecosystem), $(username), $(displayName), $(url), $(emailHash)) + ON CONFLICT (ecosystem, username) DO UPDATE SET + display_name = COALESCE(EXCLUDED.display_name, maintainers.display_name), + url = COALESCE(EXCLUDED.url, maintainers.url), + email_hash = COALESCE(EXCLUDED.email_hash, maintainers.email_hash) + RETURNING id, display_name, url, email_hash ) - ON CONFLICT (ecosystem, username) DO UPDATE SET - display_name = COALESCE(EXCLUDED.display_name, maintainers.display_name), - url = COALESCE(EXCLUDED.url, maintainers.url), - email_hash = COALESCE(EXCLUDED.email_hash, maintainers.email_hash) - RETURNING id + SELECT ins.id, + array_remove(ARRAY[ + CASE WHEN o.display_name IS DISTINCT FROM ins.display_name THEN 'maintainers.display_name' END, + CASE WHEN o.url IS DISTINCT FROM ins.url THEN 'maintainers.url' END, + CASE WHEN o.email_hash IS DISTINCT FROM ins.email_hash THEN 'maintainers.email_hash' END + ], NULL) AS changed_fields + FROM ins LEFT JOIN old o ON true `, item, ) - return row.id as number + return { id: row.id as number, changedFields: row.changed_fields as string[] } } /** - * Links a maintainer to a package with the given role. - * Does nothing on conflict. + * Replaces all maintainer links for a package with the given list. + * Deletes links that are no longer present and inserts/updates new ones. 
+ * Returns the list of fields that changed (additions, removals, role changes). */ -export async function upsertPackageMaintainer( +export async function replacePackageMaintainers( qx: QueryExecutor, - item: IDbPackageMaintainerUpsert, -): Promise { - await qx.result( - ` - INSERT INTO package_maintainers (package_id, maintainer_id, role) - VALUES ($(packageId), $(maintainerId), $(role)) - ON CONFLICT (package_id, maintainer_id) DO NOTHING - `, - item, + packageId: number, + links: Array>, +): Promise { + const before: Array<{ maintainer_id: number; role: string | null }> = await qx.select( + `SELECT maintainer_id, role FROM package_maintainers WHERE package_id = $(packageId)`, + { packageId }, ) + const beforeMap = new Map(before.map((r) => [r.maintainer_id, r.role])) + + await qx.result(`DELETE FROM package_maintainers WHERE package_id = $(packageId)`, { packageId }) + + const afterMap = new Map() + for (const { maintainerId, role } of links) { + await qx.result( + `INSERT INTO package_maintainers (package_id, maintainer_id, role) + VALUES ($(packageId), $(maintainerId), $(role)) + ON CONFLICT (package_id, maintainer_id) DO UPDATE SET role = EXCLUDED.role`, + { packageId, maintainerId, role }, + ) + afterMap.set(maintainerId, role) + } + + const changed = new Set() + for (const id of beforeMap.keys()) { + if (!afterMap.has(id)) changed.add('package_maintainers.maintainer_id') + } + for (const [id, role] of afterMap) { + if (!beforeMap.has(id)) changed.add('package_maintainers.maintainer_id') + else if (beforeMap.get(id) !== role) changed.add('package_maintainers.role') + } + + return Array.from(changed) } diff --git a/services/libs/data-access-layer/src/osspckgs/packages.ts b/services/libs/data-access-layer/src/osspckgs/packages.ts index dcb673182d..34825026bb 100644 --- a/services/libs/data-access-layer/src/osspckgs/packages.ts +++ b/services/libs/data-access-layer/src/osspckgs/packages.ts @@ -131,66 +131,64 @@ export async function logAuditFieldChange( /** * 
Inserts or updates a row in `packages`. - * Returns the id of the upserted row. + * Returns the id and the list of fields that actually changed value. */ -export async function upsertPackage(qx: QueryExecutor, item: IDbPackageUpsert): Promise { +export async function upsertPackage( + qx: QueryExecutor, + item: IDbPackageUpsert, +): Promise<{ id: number; changedFields: string[] }> { const row = await qx.selectOne( ` - INSERT INTO packages ( - purl, - ecosystem, - namespace, - name, - description, - homepage, - registry_url, - declared_repository_url, - repository_url, - licenses, - licenses_raw, - latest_version, - criticality_score, - dependent_packages_count, - dependent_repos_count, - downloads_last_month, - ingestion_source, - last_synced_at - ) VALUES ( - $(purl), - $(ecosystem), - $(namespace), - $(name), - $(description), - $(homepage), - $(registryUrl), - $(declaredRepositoryUrl), - $(repositoryUrl), - $(licenses)::text[], - $(licensesRaw), - $(latestVersion), - $(criticalityScore), - $(dependentPackagesCount), - $(dependentReposCount), - $(downloadsLastMonth), - $(ingestionSource), - NOW() + WITH old AS ( + SELECT description, homepage, registry_url, declared_repository_url, repository_url, + licenses, licenses_raw, latest_version, ingestion_source + FROM packages WHERE purl = $(purl) + ), + ins AS ( + INSERT INTO packages ( + purl, ecosystem, namespace, name, + description, homepage, registry_url, declared_repository_url, repository_url, + licenses, licenses_raw, latest_version, + criticality_score, dependent_packages_count, dependent_repos_count, downloads_last_month, + ingestion_source, last_synced_at + ) VALUES ( + $(purl), $(ecosystem), $(namespace), $(name), + $(description), $(homepage), $(registryUrl), $(declaredRepositoryUrl), $(repositoryUrl), + $(licenses)::text[], $(licensesRaw), $(latestVersion), + $(criticalityScore), $(dependentPackagesCount), $(dependentReposCount), $(downloadsLastMonth), + $(ingestionSource), NOW() + ) + ON CONFLICT (purl) 
DO UPDATE SET + description = COALESCE(EXCLUDED.description, packages.description), + homepage = COALESCE(EXCLUDED.homepage, packages.homepage), + registry_url = COALESCE(EXCLUDED.registry_url, packages.registry_url), + declared_repository_url = COALESCE(EXCLUDED.declared_repository_url, packages.declared_repository_url), + repository_url = COALESCE(EXCLUDED.repository_url, packages.repository_url), + licenses = COALESCE(EXCLUDED.licenses, packages.licenses), + licenses_raw = COALESCE(EXCLUDED.licenses_raw, packages.licenses_raw), + latest_version = COALESCE(EXCLUDED.latest_version, packages.latest_version), + criticality_score = COALESCE(EXCLUDED.criticality_score, packages.criticality_score), + dependent_packages_count = COALESCE(EXCLUDED.dependent_packages_count, packages.dependent_packages_count), + dependent_repos_count = COALESCE(EXCLUDED.dependent_repos_count, packages.dependent_repos_count), + downloads_last_month = COALESCE(EXCLUDED.downloads_last_month, packages.downloads_last_month), + ingestion_source = EXCLUDED.ingestion_source, + last_synced_at = NOW() + RETURNING id, description, homepage, registry_url, declared_repository_url, repository_url, + licenses, licenses_raw, latest_version, ingestion_source ) - ON CONFLICT (purl) DO UPDATE SET - description = COALESCE(EXCLUDED.description, packages.description), - homepage = COALESCE(EXCLUDED.homepage, packages.homepage), - registry_url = COALESCE(EXCLUDED.registry_url, packages.registry_url), - declared_repository_url = COALESCE(EXCLUDED.declared_repository_url, packages.declared_repository_url), - repository_url = COALESCE(EXCLUDED.repository_url, packages.repository_url), - licenses = COALESCE(EXCLUDED.licenses, packages.licenses), - licenses_raw = COALESCE(EXCLUDED.licenses_raw, packages.licenses_raw), - latest_version = COALESCE(EXCLUDED.latest_version, packages.latest_version), - criticality_score = COALESCE(EXCLUDED.criticality_score, packages.criticality_score), - dependent_packages_count = 
COALESCE(EXCLUDED.dependent_packages_count, packages.dependent_packages_count), - dependent_repos_count = COALESCE(EXCLUDED.dependent_repos_count, packages.dependent_repos_count), - downloads_last_month = COALESCE(EXCLUDED.downloads_last_month, packages.downloads_last_month), - ingestion_source = EXCLUDED.ingestion_source, - last_synced_at = NOW() - RETURNING id + SELECT ins.id, + array_remove(ARRAY[ + CASE WHEN o.description IS DISTINCT FROM ins.description THEN 'packages.description' END, + CASE WHEN o.homepage IS DISTINCT FROM ins.homepage THEN 'packages.homepage' END, + CASE WHEN o.registry_url IS DISTINCT FROM ins.registry_url THEN 'packages.registry_url' END, + CASE WHEN o.declared_repository_url IS DISTINCT FROM ins.declared_repository_url THEN 'packages.declared_repository_url' END, + CASE WHEN o.repository_url IS DISTINCT FROM ins.repository_url THEN 'packages.repository_url' END, + CASE WHEN o.licenses IS DISTINCT FROM ins.licenses THEN 'packages.licenses' END, + CASE WHEN o.licenses_raw IS DISTINCT FROM ins.licenses_raw THEN 'packages.licenses_raw' END, + CASE WHEN o.latest_version IS DISTINCT FROM ins.latest_version THEN 'packages.latest_version' END, + CASE WHEN o.ingestion_source IS DISTINCT FROM ins.ingestion_source THEN 'packages.ingestion_source' END + ], NULL) AS changed_fields + FROM ins LEFT JOIN old o ON true `, { ...item, @@ -202,5 +200,5 @@ export async function upsertPackage(qx: QueryExecutor, item: IDbPackageUpsert): downloadsLastMonth: item.downloadsLastMonth ?? 
null, }, ) - return row.id as number + return { id: row.id as number, changedFields: row.changed_fields as string[] } } diff --git a/services/libs/data-access-layer/src/osspckgs/repos.ts b/services/libs/data-access-layer/src/osspckgs/repos.ts index e8089366dc..a85d521e3f 100644 --- a/services/libs/data-access-layer/src/osspckgs/repos.ts +++ b/services/libs/data-access-layer/src/osspckgs/repos.ts @@ -37,19 +37,35 @@ export async function upsertRepo(qx: QueryExecutor, item: IDbRepoUpsert): Promis /** * Links a package to a repo with provenance metadata. * On conflict keeps the higher confidence value and refreshes verified_at. + * Returns the list of fields that actually changed. */ export async function upsertPackageRepo( qx: QueryExecutor, item: IDbPackageRepoUpsert, -): Promise { - await qx.result( +): Promise { + const row: { changed_fields: string[] } = await qx.selectOne( ` - INSERT INTO package_repos (package_id, repo_id, source, confidence, verified_at) - VALUES ($(packageId), $(repoId), $(source), $(confidence), NOW()) - ON CONFLICT (package_id, repo_id) DO UPDATE SET - confidence = GREATEST(EXCLUDED.confidence, package_repos.confidence), - verified_at = NOW() + WITH old AS ( + SELECT source, confidence FROM package_repos + WHERE package_id = $(packageId) AND repo_id = $(repoId) + ), + ins AS ( + INSERT INTO package_repos (package_id, repo_id, source, confidence, verified_at) + VALUES ($(packageId), $(repoId), $(source), $(confidence), NOW()) + ON CONFLICT (package_id, repo_id) DO UPDATE SET + confidence = GREATEST(EXCLUDED.confidence, package_repos.confidence), + verified_at = NOW() + RETURNING source, confidence + ) + SELECT array_remove(ARRAY[ + CASE WHEN o.source IS NULL THEN 'package_repos.repo_id' END, + CASE WHEN o.source IS NULL THEN 'package_repos.source' END, + CASE WHEN o.source IS NULL + OR o.confidence IS DISTINCT FROM ins.confidence THEN 'package_repos.confidence' END + ], NULL) AS changed_fields + FROM ins LEFT JOIN old o ON true `, item, ) + 
return row.changed_fields } diff --git a/services/libs/data-access-layer/src/osspckgs/versions.ts b/services/libs/data-access-layer/src/osspckgs/versions.ts index 9434693240..a9a2b0335a 100644 --- a/services/libs/data-access-layer/src/osspckgs/versions.ts +++ b/services/libs/data-access-layer/src/osspckgs/versions.ts @@ -7,12 +7,13 @@ import { IDbVersionUpsert } from './types' * Uses UNNEST arrays to avoid N individual round-trips. * On conflict (package_id, number) updates is_latest, is_prerelease, and * license (never overwrites an existing license with NULL). + * Returns the list of fields that actually changed across all versions. */ export async function upsertVersionsBatch( qx: QueryExecutor, versions: IDbVersionUpsert[], -): Promise { - if (versions.length === 0) return +): Promise { + if (versions.length === 0) return [] // maven-metadata.xml sometimes contains duplicate version strings — deduplicate // by number before inserting to avoid "ON CONFLICT DO UPDATE command cannot affect @@ -24,25 +25,41 @@ export async function upsertVersionsBatch( return true }) - await qx.result( + const row: { changed_fields: string[] } = await qx.selectOne( ` - INSERT INTO versions (package_id, ecosystem, name, number, is_latest, is_prerelease, license, last_synced_at) - SELECT - UNNEST($(packageIds)::bigint[]), - UNNEST($(ecosystems)::text[]), - UNNEST($(names)::text[]), - UNNEST($(numbers)::text[]), - UNNEST($(isLatests)::bool[]), - UNNEST($(isPreleases)::bool[]), - UNNEST($(licenses)::text[]), - NOW() - ON CONFLICT (package_id, number) DO UPDATE SET - is_latest = EXCLUDED.is_latest, - is_prerelease = EXCLUDED.is_prerelease, - license = COALESCE(EXCLUDED.license, versions.license), - last_synced_at = NOW() + WITH old AS ( + SELECT number, is_latest, is_prerelease, license + FROM versions + WHERE package_id = $(packageId)::bigint AND number = ANY($(numbers)::text[]) + ), + ins AS ( + INSERT INTO versions (package_id, ecosystem, name, number, is_latest, is_prerelease, 
license, last_synced_at) + SELECT + UNNEST($(packageIds)::bigint[]), + UNNEST($(ecosystems)::text[]), + UNNEST($(names)::text[]), + UNNEST($(numbers)::text[]), + UNNEST($(isLatests)::bool[]), + UNNEST($(isPreleases)::bool[]), + UNNEST($(licenses)::text[]), + NOW() + ON CONFLICT (package_id, number) DO UPDATE SET + is_latest = EXCLUDED.is_latest, + is_prerelease = EXCLUDED.is_prerelease, + license = COALESCE(EXCLUDED.license, versions.license), + last_synced_at = NOW() + RETURNING number, is_latest, is_prerelease, license + ) + SELECT array_remove(ARRAY[ + CASE WHEN bool_or(o.number IS NULL) THEN 'versions.number' END, + CASE WHEN bool_or(o.is_latest IS DISTINCT FROM ins.is_latest) THEN 'versions.is_latest' END, + CASE WHEN bool_or(o.is_prerelease IS DISTINCT FROM ins.is_prerelease) THEN 'versions.is_prerelease' END, + CASE WHEN bool_or(o.license IS DISTINCT FROM ins.license) THEN 'versions.license' END + ], NULL) AS changed_fields + FROM ins LEFT JOIN old o ON o.number = ins.number `, { + packageId: versions[0].packageId, packageIds: versions.map((v) => v.packageId), ecosystems: versions.map((v) => v.ecosystem), names: versions.map((v) => v.name), @@ -52,4 +69,5 @@ export async function upsertVersionsBatch( licenses: versions.map((v) => v.license), }, ) + return row.changed_fields } From 301df78a6d1fa713b641b16d96a8ba68803b3bf8 Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Wed, 3 Jun 2026 21:36:05 +0200 Subject: [PATCH 10/22] refactor: simplify the schedule Signed-off-by: Umberto Sgueglia --- backend/.env.dist.local | 19 +- scripts/services/maven.yaml | 71 --- scripts/services/packages.yaml | 4 - services/apps/packages_worker/package.json | 15 +- .../packages_worker/src/bin/maven-backfill.ts | 42 ++ .../apps/packages_worker/src/bin/maven.ts | 39 -- .../src/bin/packages-worker.ts | 3 +- services/apps/packages_worker/src/config.ts | 33 +- .../apps/packages_worker/src/maven/README.md | 193 ++++++--- .../packages_worker/src/maven/activities.ts | 60 ++- 
.../packages_worker/src/maven/deltaApi.ts | 128 ++++++ .../apps/packages_worker/src/maven/extract.ts | 169 +++++++- .../packages_worker/src/maven/metadata.ts | 8 +- .../src/maven/runMavenEnrichmentLoop.ts | 404 ++++++++++++------ .../packages_worker/src/maven/schedule.ts | 118 ++--- .../src/scripts/benchmarkMavenDelta.ts | 82 ++++ .../src/scripts/validateDataQuality.ts | 88 ++++ .../src/osspckgs/packages.ts | 135 +++++- .../src/osspckgs/versions.ts | 34 +- 19 files changed, 1211 insertions(+), 434 deletions(-) delete mode 100644 scripts/services/maven.yaml create mode 100644 services/apps/packages_worker/src/bin/maven-backfill.ts delete mode 100644 services/apps/packages_worker/src/bin/maven.ts create mode 100644 services/apps/packages_worker/src/maven/deltaApi.ts create mode 100644 services/apps/packages_worker/src/scripts/benchmarkMavenDelta.ts create mode 100644 services/apps/packages_worker/src/scripts/validateDataQuality.ts diff --git a/backend/.env.dist.local b/backend/.env.dist.local index 3f8bd2bdef..fe988f7c08 100755 --- a/backend/.env.dist.local +++ b/backend/.env.dist.local @@ -185,9 +185,9 @@ ENRICHER_BATCH_SIZE=100 ENRICHER_REPO_UPDATE_INTERVAL_HOURS=24 ENRICHER_IDLE_SLEEP_SEC=60 -OSSPCKGS_GCP_PROJECT= -OSSPCKGS_GCS_BUCKET= -OSSPCKGS_GCP_CREDENTIALS_B64= +OSSPCKGS_GCP_PROJECT=local-dev +OSSPCKGS_GCS_BUCKET=local-dev +OSSPCKGS_GCP_CREDENTIALS_B64=e30= # osv-sync (Temporal-scheduled; see services/apps/packages_worker/src/osv/schedule.ts) # OSV_ECOSYSTEMS uses OSV's canonical bucket case (npm lowercase, Maven titlecase) because @@ -200,13 +200,16 @@ OSV_TMP_DIR=/tmp/osv OSV_BATCH_SIZE=500 OSV_DERIVE_BATCH_SIZE=1000 # maven enricher -POM_FETCHER_BATCH_SIZE=50 -POM_FETCHER_CONCURRENCY=3 + +POM_FETCHER_BATCH_SIZE=2000 +POM_FETCHER_CONCURRENCY=10 POM_FETCHER_NON_CRITICAL_BATCH_SIZE=500 POM_FETCHER_NON_CRITICAL_CONCURRENCY=20 POM_FETCHER_REFRESH_DAYS=1 -POM_FETCHER_GROUP_DELAY_MS=500 -POM_FETCHER_IDLE_SLEEP_SEC=3600 +POM_FETCHER_GROUP_DELAY_MS=100 # Set to 
'true' on first run against a fresh/restored DB to skip the version-unchanged # optimisation and force full POM extraction. Set to 'false' after the first pass. -POM_FETCHER_FORCE_FULL_EXTRACTION=false +POM_FETCHER_FORCE_FULL_EXTRACTION=true +POM_FETCHER_MAVEN_BASE_URL=https://maven-central.storage-download.googleapis.com/maven2 +MAVEN_SYNC_SOURCE=both +MAVEN_DELTA_API_URL=https://maven-fetcher-production.up.railway.app \ No newline at end of file diff --git a/scripts/services/maven.yaml b/scripts/services/maven.yaml deleted file mode 100644 index a90e0f693a..0000000000 --- a/scripts/services/maven.yaml +++ /dev/null @@ -1,71 +0,0 @@ -version: '3.1' - -x-env-args: &env-args - DOCKER_BUILDKIT: 1 - NODE_ENV: docker - SERVICE: maven - SHELL: /bin/sh - SUPPRESS_NO_CONFIG_WARNING: 'true' - LOG_LEVEL: 'info' - POM_FETCHER_BATCH_SIZE: '50' - POM_FETCHER_CONCURRENCY: '5' - POM_FETCHER_REFRESH_DAYS: '1' - POM_FETCHER_GROUP_DELAY_MS: '200' - POM_FETCHER_IDLE_SLEEP_SEC: '3600' - -services: - maven: - build: - context: ../../ - dockerfile: ./scripts/services/docker/Dockerfile.packages-worker - command: 'pnpm run start:maven' - working_dir: /usr/crowd/app/services/apps/packages_worker - env_file: - - ../../backend/.env.dist.local - - ../../backend/.env.dist.composed - - ../../backend/.env.override.local - - ../../backend/.env.override.composed - environment: - <<: *env-args - restart: always - networks: - - crowd-bridge - - maven-dev: - build: - context: ../../ - dockerfile: ./scripts/services/docker/Dockerfile.packages-worker - command: 'pnpm run dev:maven' - working_dir: /usr/crowd/app/services/apps/packages_worker - # user: '${USER_ID}:${GROUP_ID}' - env_file: - - ../../backend/.env.dist.local - - ../../backend/.env.dist.composed - - ../../backend/.env.override.local - - ../../backend/.env.override.composed - environment: - <<: *env-args - hostname: maven - networks: - - crowd-bridge - volumes: - - 
../../services/libs/audit-logs/src:/usr/crowd/app/services/libs/audit-logs/src - - ../../services/libs/common/src:/usr/crowd/app/services/libs/common/src - - ../../services/libs/common_services/src:/usr/crowd/app/services/libs/common_services/src - - ../../services/libs/data-access-layer/src:/usr/crowd/app/services/libs/data-access-layer/src - - ../../services/libs/database/src:/usr/crowd/app/services/libs/database/src - - ../../services/libs/integrations/src:/usr/crowd/app/services/libs/integrations/src - - ../../services/libs/logging/src:/usr/crowd/app/services/libs/logging/src - - ../../services/libs/nango/src:/usr/crowd/app/services/libs/nango/src - - ../../services/libs/opensearch/src:/usr/crowd/app/services/libs/opensearch/src - - ../../services/libs/queue/src:/usr/crowd/app/services/libs/queue/src - - ../../services/libs/redis/src:/usr/crowd/app/services/libs/redis/src - - ../../services/libs/snowflake/src:/usr/crowd/app/services/libs/snowflake/src - - ../../services/libs/telemetry/src:/usr/crowd/app/services/libs/telemetry/src - - ../../services/libs/temporal/src:/usr/crowd/app/services/libs/temporal/src - - ../../services/libs/types/src:/usr/crowd/app/services/libs/types/src - - ../../services/apps/packages_worker/src:/usr/crowd/app/services/apps/packages_worker/src - -networks: - crowd-bridge: - external: true diff --git a/scripts/services/packages.yaml b/scripts/services/packages.yaml index bd6f6499fe..459f780ebd 100644 --- a/scripts/services/packages.yaml +++ b/scripts/services/packages.yaml @@ -8,10 +8,6 @@ x-env-args: &env-args CROWD_TEMPORAL_NAMESPACE: ${CROWD_PACKAGES_TEMPORAL_NAMESPACE} SHELL: /bin/sh SUPPRESS_NO_CONFIG_WARNING: 'true' - POM_FETCHER_BATCH_SIZE: '50' - POM_FETCHER_CONCURRENCY: '3' - POM_FETCHER_STALE_DAYS: '7' - POM_FETCHER_IDLE_SLEEP_SEC: '3600' services: packages: diff --git a/services/apps/packages_worker/package.json b/services/apps/packages_worker/package.json index cfef6bfe3a..2719ba7dcd 100644 --- 
a/services/apps/packages_worker/package.json +++ b/services/apps/packages_worker/package.json @@ -6,23 +6,22 @@ "start:github-repos-enricher": "SERVICE=github-repos-enricher tsx src/bin/github-repos-enricher.ts", "start:packages-worker": "CROWD_TEMPORAL_TASKQUEUE=packages-worker CROWD_TEMPORAL_NAMESPACE=$CROWD_PACKAGES_TEMPORAL_NAMESPACE SERVICE=packages-worker tsx src/bin/packages-worker.ts", "start:pom-fetcher": "SERVICE=pom-fetcher tsx src/bin/pom-fetcher.ts", + "backfill:maven": "SERVICE=maven tsx src/bin/maven-backfill.ts", "dev:packages-worker": "CROWD_TEMPORAL_TASKQUEUE=packages-worker CROWD_TEMPORAL_NAMESPACE=$CROWD_PACKAGES_TEMPORAL_NAMESPACE SERVICE=packages-worker LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9233 src/bin/packages-worker.ts", "dev:deps-dev-ingest": "CROWD_TEMPORAL_TASKQUEUE=deps-dev-ingest CROWD_TEMPORAL_NAMESPACE=$CROWD_PACKAGES_TEMPORAL_NAMESPACE SERVICE=deps-dev-ingest nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9235 src/bin/deps-dev-ingest.ts", "start:maven": "SERVICE=maven tsx src/bin/maven.ts", "dev:github-repos-enricher": "SERVICE=github-repos-enricher LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9234 src/bin/github-repos-enricher.ts", - "dev:maven": "SERVICE=maven LOG_LEVEL=info nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9235 src/bin/maven.ts", "dev:packages-worker:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && CROWD_TEMPORAL_TASKQUEUE=packages-worker CROWD_TEMPORAL_NAMESPACE=$CROWD_PACKAGES_TEMPORAL_NAMESPACE SERVICE=packages-worker LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9233 src/bin/packages-worker.ts", "dev:deps-dev-ingest:local": "set -a && . ../../../backend/.env.dist.local && . 
../../../backend/.env.override.local && set +a && CROWD_TEMPORAL_TASKQUEUE=deps-dev-ingest CROWD_TEMPORAL_NAMESPACE=$CROWD_PACKAGES_TEMPORAL_NAMESPACE SERVICE=deps-dev-ingest nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9235 src/bin/deps-dev-ingest.ts", "dev:github-repos-enricher:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && SERVICE=github-repos-enricher LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9234 src/bin/github-repos-enricher.ts", "export-to-bucket": "SERVICE=deps-dev-ingest tsx src/scripts/exportToBucket.ts", "export-to-bucket:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && SERVICE=deps-dev-ingest tsx src/scripts/exportToBucket.ts", - "monitor:osspckgs": "SERVICE=monitor tsx src/scripts/monitorOsspckgs.ts", - "monitor:osspckgs:local": "bash -c 'set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && SERVICE=monitor tsx src/scripts/monitorOsspckgs.ts'", - "trigger-bootstrap": "SERVICE=deps-dev-ingest tsx src/scripts/triggerBootstrap.ts", - "trigger-bootstrap:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && SERVICE=deps-dev-ingest tsx src/scripts/triggerBootstrap.ts", - "dev:pom-fetcher": "SERVICE=pom-fetcher LOG_LEVEL=info nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9235 src/bin/pom-fetcher.ts", - "dev:pom-fetcher:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && SERVICE=pom-fetcher LOG_LEVEL=info nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9235 src/bin/pom-fetcher.ts", - "dev:maven:local": "set -a && . ../../../backend/.env.dist.local && . 
../../../backend/.env.override.local && set +a && SERVICE=maven LOG_LEVEL=info nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9235 src/bin/maven.ts", + "monitor:osspckgs:local": "bash -c 'set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && node ../../../scripts/monitor-osspckgs.mjs'", + "backfill:maven:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && SERVICE=maven LOG_LEVEL=info tsx src/bin/maven-backfill.ts", + "benchmark:maven-delta": "SERVICE=maven tsx src/scripts/benchmarkMavenDelta.ts", + "benchmark:maven-delta:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && SERVICE=maven LOG_LEVEL=info tsx src/scripts/benchmarkMavenDelta.ts", + "validate:maven-quality": "SERVICE=maven tsx src/scripts/validateDataQuality.ts", + "validate:maven-quality:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && SERVICE=maven LOG_LEVEL=info tsx src/scripts/validateDataQuality.ts", "lint": "npx eslint --ext .ts src --max-warnings=0", "format": "npx prettier --write \"src/**/*.ts\"", "format-check": "npx prettier --check .", diff --git a/services/apps/packages_worker/src/bin/maven-backfill.ts b/services/apps/packages_worker/src/bin/maven-backfill.ts new file mode 100644 index 0000000000..6550feeefc --- /dev/null +++ b/services/apps/packages_worker/src/bin/maven-backfill.ts @@ -0,0 +1,42 @@ +import { getServiceLogger } from '@crowd/logging' + +import { getMavenConfig } from '../config' +import { getPackagesDb } from '../db' +import { runMavenCriticalBackfill } from '../maven/runMavenEnrichmentLoop' + +const log = getServiceLogger() + +let shuttingDown = false + +// Graceful stop: finish the in-flight batch, then exit. 
Safe to interrupt — every +// write is an idempotent upsert and the DB state is the cursor, so re-running +// resumes where it left off. +const shutdown = () => { + if (shuttingDown) return + shuttingDown = true + log.info('Shutting down maven backfill (stopping after the current batch)...') +} + +process.on('SIGINT', shutdown) +process.on('SIGTERM', shutdown) + +const main = async () => { + log.info('maven backfill starting (one-shot, full extraction)...') + + const config = getMavenConfig() + log.info(config, 'Config loaded') + + const qx = await getPackagesDb() + await qx.selectOne('SELECT 1') + log.info('Connected to packages-db.') + + const totals = await runMavenCriticalBackfill(qx, config, () => shuttingDown) + + log.info({ ...totals }, 'maven backfill complete') + process.exit(0) +} + +main().catch((err) => { + log.error({ err }, 'maven backfill fatal error') + process.exit(1) +}) diff --git a/services/apps/packages_worker/src/bin/maven.ts b/services/apps/packages_worker/src/bin/maven.ts deleted file mode 100644 index 63a0e1a9af..0000000000 --- a/services/apps/packages_worker/src/bin/maven.ts +++ /dev/null @@ -1,39 +0,0 @@ -import { getServiceLogger } from '@crowd/logging' - -import { getMavenConfig } from '../config' -import { getPackagesDb } from '../db' -import { runMavenEnrichmentLoop } from '../maven/runMavenEnrichmentLoop' - -const log = getServiceLogger() - -let shuttingDown = false - -const shutdown = async () => { - if (shuttingDown) return - shuttingDown = true - log.info('Shutting down maven...') -} - -process.on('SIGINT', shutdown) -process.on('SIGTERM', shutdown) - -const main = async () => { - log.info('maven starting...') - - const config = getMavenConfig() - log.info(config, 'Config loaded') - - const qx = await getPackagesDb() - await qx.selectOne('SELECT 1') - log.info('Connected to packages-db.') - - await runMavenEnrichmentLoop(qx, config, () => shuttingDown) - - log.info('maven stopped.') - process.exit(0) -} - -main().catch((err) => { 
- log.error({ err }, 'maven fatal error') - process.exit(1) -}) diff --git a/services/apps/packages_worker/src/bin/packages-worker.ts b/services/apps/packages_worker/src/bin/packages-worker.ts index 39c33ffc87..0752db6a06 100644 --- a/services/apps/packages_worker/src/bin/packages-worker.ts +++ b/services/apps/packages_worker/src/bin/packages-worker.ts @@ -1,6 +1,6 @@ +import { scheduleMavenCritical } from '../maven/schedule' import { scheduleNpmIngest } from '../npm/schedule' import { scheduleOsvSync } from '../osv/schedule' -import { scheduleMavenCritical, scheduleMavenNonCritical } from '../maven/schedule' import { svc } from '../service' setImmediate(async () => { @@ -8,6 +8,5 @@ setImmediate(async () => { await scheduleNpmIngest() await scheduleOsvSync() await scheduleMavenCritical() - await scheduleMavenNonCritical() await svc.start() }) diff --git a/services/apps/packages_worker/src/config.ts b/services/apps/packages_worker/src/config.ts index 66b6750214..30726fdd19 100644 --- a/services/apps/packages_worker/src/config.ts +++ b/services/apps/packages_worker/src/config.ts @@ -46,7 +46,26 @@ export function getEnricherConfig() { } } +// Which source drives the critical Maven sync: +// 'maven' → poll packages_universe by staleness (current behaviour, default/fallback) +// 'api' → only enrich what our delta feed reports as changed +// 'both' → run both passes in the same Temporal tick +export type MavenSyncSource = 'api' | 'maven' | 'both' + +function parseMavenSyncSource(raw: string | undefined): MavenSyncSource { + if (raw === 'api' || raw === 'both') return raw + // Anything else (unset, typo, legacy value) falls back to the current behaviour. + return 'maven' +} + export function getMavenConfig() { + const syncSource = parseMavenSyncSource(process.env.MAVEN_SYNC_SOURCE) + const deltaApiBaseUrl = (process.env.MAVEN_DELTA_API_URL ?? 
'').replace(/\/+$/, '') + + if (syncSource !== 'maven' && !deltaApiBaseUrl) { + throw new Error(`MAVEN_SYNC_SOURCE='${syncSource}' requires MAVEN_DELTA_API_URL to be set`) + } + return { batchSize: requireEnvInt('POM_FETCHER_BATCH_SIZE'), concurrency: requireEnvInt('POM_FETCHER_CONCURRENCY'), @@ -54,7 +73,17 @@ export function getMavenConfig() { nonCriticalConcurrency: requireEnvInt('POM_FETCHER_NON_CRITICAL_CONCURRENCY'), refreshDays: requireEnvInt('POM_FETCHER_REFRESH_DAYS'), groupDelayMs: requireEnvInt('POM_FETCHER_GROUP_DELAY_MS'), - idleSleepSec: requireEnvInt('POM_FETCHER_IDLE_SLEEP_SEC'), - forceFullExtraction: requireEnv('POM_FETCHER_FORCE_FULL_EXTRACTION') === 'true', + syncSource, + deltaApi: { + baseUrl: deltaApiBaseUrl, + token: process.env.MAVEN_DELTA_API_TOKEN || undefined, + pageSize: process.env.MAVEN_DELTA_API_PAGE_SIZE + ? parseInt(process.env.MAVEN_DELTA_API_PAGE_SIZE, 10) + : 100, + lookbackMinutes: process.env.MAVEN_DELTA_API_LOOKBACK_MINUTES + ? parseInt(process.env.MAVEN_DELTA_API_LOOKBACK_MINUTES, 10) + : 15, + includePrerelease: process.env.MAVEN_DELTA_API_INCLUDE_PRERELEASE === 'true', + }, } } diff --git a/services/apps/packages_worker/src/maven/README.md b/services/apps/packages_worker/src/maven/README.md index f69a0d0af5..ac8671af4e 100644 --- a/services/apps/packages_worker/src/maven/README.md +++ b/services/apps/packages_worker/src/maven/README.md @@ -1,34 +1,87 @@ # Maven POM Fetcher Worker that syncs Maven package metadata from Maven Central into the `packages` DB. -Runs as a standalone entry point inside `packages_worker` on a daily Temporal schedule (4am UTC). +Lives in `packages_worker` and can run in two ways: + +- **Temporal (production, incremental):** entry point `bin/packages-worker.ts` registers the + `maven-critical` Temporal schedule. Each tick runs a **single batch** as a Temporal activity, + and **skips POM extraction when the version is unchanged**. See [Scheduling](#scheduling). 
+- **One-shot backfill:** entry point `bin/maven-backfill.ts` (`pnpm backfill:maven`) drains the + Tier 2 critical queue once with **full extraction** (no version short-circuit), then exits. + Run it `exec`ed into the `packages-worker` container for the initial fill / periodic full + refresh. It is resumable — the DB state is the cursor, so re-running picks up where it left off. --- ## Architecture: Two-Tier Fetch -All Maven packages in `packages_universe` are processed in two sequential phases per pass. +Both phases pull candidates from `packages_universe` (filtered by `is_critical`, ordered by +`rank_in_ecosystem`) and write into `packages`. A package is a candidate when it is not yet +in `packages` (`last_synced_at IS NULL`) or its `last_synced_at` is older than +`POM_FETCHER_REFRESH_DAYS`. The two phases run as **separate Temporal schedules** (or, in the +standalone loop, only the critical phase runs). + +### Non-Critical phase (`is_critical = false`) + +DB-only. Copies universe stats (criticality score, downloads, dependent counts) into +`packages` with `ingestion_source = 'packages_universe'`. **No HTTP.** Fast (~1000 pkg/sec). + +### Critical phase (`is_critical = true`) + +Full POM extraction from Maven Central with parent-chain resolution (max 8 hops). Populates +description, homepage, SCM/repo, licenses, maintainers and the full version list. + +Whether the version short-circuit applies is fixed per **entry point** (not a runtime flag): + +| Entry point | Mode | Behaviour | +|-------------|------|-----------| +| Standalone `bin/maven-backfill.ts` | **backfill** | Always runs full POM extraction for every selected critical package, regardless of version. Use for the initial fill / periodic full refresh. | +| Temporal `mavenCriticalWorkflow` | **incremental** | If the upstream release version equals the stored `latest_version`, skips the POM fetch and only bumps `last_synced_at` (status `unchanged`).
Full extraction runs only for new packages or when the version changed. | -### Phase 1 — Non-Critical +This is passed as the `forceFullExtraction` argument to `processBatch` — `true` from the +standalone loop, `false` from the Temporal activity. There is no env variable for it. -| Mode | Trigger | What happens | -|------|---------|-------------| -| `POM_FETCHER_DIRECT_POM_FOR_ALL=false` (default) | `last_synced_at > 1 day` | DB-only: copies universe stats (criticality, downloads, dependents) into `packages`. No HTTP. | -| `POM_FETCHER_DIRECT_POM_FOR_ALL=true` | `last_synced_at > 30 days` | Fetches `maven-metadata.xml` + root POM (no parent chain). Populates description, homepage, SCM, maintainers, versions. | +**Why two tiers?** Parent POM resolution is the expensive part — multiple HTTP requests per +package (up to 8 extra fetches). Running it on millions of non-critical packages is not +feasible; for them the DB-only universe stats are enough. The extra cost is reserved for +critical packages, where data quality matters. -### Phase 2 — Critical +### Parent POM cache -Always active. Full POM extraction with parent chain resolution (max 5 hops). Runs when: -- Package not yet in `packages` table -- `ingestion_source IN ('maven_index', 'packages_universe')` — not yet POM-enriched -- New version released (`latest_release_at > last_synced_at`) -- Periodic full refresh (`last_synced_at > 90 days`) +Parent POMs are shared across many artifacts of the same namespace (`org.apache:apache`, +`org.springframework.boot:spring-boot-starter-parent`, `com.google.cloud:google-cloud-shared-config`, …). +Because the queue is ordered by `rank_in_ecosystem`, those siblings are processed close +together. 
A module-level, coordinate-keyed in-process cache in `extract.ts` collapses the +repeated parent fetches into a **single** HTTP request, and also removes the redundant second +fetch of each artifact's own POM (`extractArtifact` fetches the leaf, then +`resolveWithInheritance` would fetch it again at depth 0). This is the **single biggest lever +against Maven Central rate limiting** — and it works *because* of the namespace clustering, so +shuffling the batch (which would break the clustering that the queue's `rank` ordering produces) would be counter-productive. -**Why two tiers?** Parent POM resolution is the expensive part — it requires multiple HTTP requests per package (up to 5 extra fetches). Running it on millions of non-critical packages every day is not feasible. For non-critical packages a single direct-POM fetch is sufficient; for critical packages the extra cost is justified by data quality. +- **Only successful fetches are cached.** `fetchPom` returns `null` for both a real 404 and a + transient failure (throttle/timeout), so caching `null` would poison the cache — it is never + done. Missing/failed POMs are simply re-fetched on the next pass. +- **No TTL.** Maven coordinates are immutable, so a cached POM never goes stale. The cache is + bounded by an LRU size cap (`POM_CACHE_MAX_ENTRIES`, default 5000) purely to cap memory. +- **Request coalescing.** Concurrent fetches for the same coordinates share a single in-flight + request instead of issuing duplicates. +- **Observability.** `getPomCacheStats()` returns `{ size, hits, coalesced, misses, evictions, + hitRate }`; the critical batch logs it once per batch under message **`POM cache`**, so you + can watch the hit rate climb as the cache warms. + +> The cache lives for the lifetime of the worker process. Under Temporal it persists **across +> batches/ticks** (same process), so the hit rate keeps improving across the run; in the +> standalone loop it persists across passes until the process is restarted.
--- -## Coverage Matrix (`POM_FETCHER_DIRECT_POM_FOR_ALL=true`) +## Coverage Matrix (critical packages — full POM extraction) + +The matrix below describes the **critical** path (full POM + parent resolution). +Non-critical packages are DB-only: they receive just the universe-stat columns +(`criticality_score`, `dependent_packages_count`, `dependent_repos_count`, +`downloads_last_month`) plus `purl`/`namespace`/`name`/`registry_url`/`last_synced_at`; +all POM-derived columns stay null for them. ### packages @@ -69,7 +122,7 @@ Always active. Full POM extraction with parent chain resolution (max 5 hops). Ru | is_latest | `number === ` | ✅ all | | is_prerelease | regex on version string³ | ✅ all | | last_synced_at | NOW() | ✅ all | -| license | package-level license applied to all versions⁴ | ✅ best-effort¹ | +| licenses | package-level license applied to all versions⁴ (stored as a single-element `text[]`) | ✅ best-effort¹ | | published_at | Sonatype: release timestamp | 🔜 Sonatype | | is_yanked | no yank mechanism in Maven | ❌ | | download_count | no public per-version API | ❌ | @@ -113,9 +166,9 @@ The POM fetcher seeds `repos` with URL-derivable fields only. The GitHub enriche **Notes:** -> ¹ **best-effort**: field is populated when declared in the direct POM. If inherited from a parent POM (common for `licenses`, `` in Apache/Spring/Google projects), it is null for non-critical packages. +> ¹ **best-effort**: field is populated only when present in the resolved POM chain. Non-critical packages skip POM fetching entirely (DB-only), so these columns are always null for them. > -> ² **full resolution for critical**: parent chain is followed (max 5 hops), so inherited fields are resolved correctly. +> ² **full resolution for critical**: parent chain is followed (max 8 hops), so inherited fields are resolved correctly. > > ³ **prerelease regex**: matches `-SNAPSHOT`, `-alpha`, `-beta`, `-rc`, `-M[0-9]+` (case-insensitive). 
> @@ -127,13 +180,14 @@ The POM fetcher seeds `repos` with URL-derivable fields only. The GitHub enriche | Value | Meaning | |-------|---------| -| `maven` | Critical — full POM + parent resolution succeeded | -| `maven_direct` | Non-critical — direct POM fetch succeeded (no parent resolution) | +| `maven-registry` | Critical — full POM + parent resolution succeeded | +| `packages_universe` | Non-critical (DB-only) — only universe stats copied, no POM fetch | | `maven_not_on_central` | `maven-metadata.xml` not found on `repo1.maven.org` — artifact is hosted on a third-party repository (e.g. WSO2 Nexus, JBoss, Atlassian). Universe data came from an aggregator (deps.dev, OSV). | | `maven_no_version` | `maven-metadata.xml` found but `` is empty — artifact has no stable release | | `maven_error` | `maven-metadata.xml` has a release version but the `.pom` file for that version is a 404. Typical cause: partial deploy to Maven Central (metadata updated, artifact not uploaded) or Eclipse P2 feature artifacts that don't publish a standard POM. | -| `maven_rate_limited` | Maven Central returned 403/429 on all retry attempts. Package will be retried on the next pass. | -| `packages_universe` | Non-critical, DB-only mode (`POM_FETCHER_DIRECT_POM_FOR_ALL=false`) — only universe stats copied | + +> On a 403/429 rate-limit or a transient network error, **no sentinel record is written**: +> the batch counts the package as an error and it is simply retried on the next tick/pass. --- @@ -155,11 +209,17 @@ Maven Central (`repo1.maven.org`) restituisce 403 come meccanismo di throttle ol 1. **Retry con backoff esponenziale** — 403 e 429 vengono ritentati fino a 3 volte (2s base, ×2 per tentativo). Gestito in `getWithRetry` (extract.ts) e `resolveVersionsList` (metadata.ts). -2. **Fallback su DB** — se tutti i retry esauriscono, il pacchetto viene scritto con `ingestion_source = 'maven_rate_limited'` e `last_synced_at = NOW()`, evitando loop infiniti. 
Verrà ritentato al prossimo ciclo di refresh. +2. **Retry al prossimo pass** — se tutti i retry esauriscono, il batch conta il pacchetto come errore (nessun record sentinel viene scritto su `packages`) e lo riprenderà al tick/pass successivo, quando l'IP è di nuovo freddo. **Causa root dei 403 persistenti:** `packages_universe` è ordinato per `rank_in_ecosystem`, quindi pacchetti dello stesso namespace (es. `com.google.apis`, `org.wso2`, `software.amazon.awssdk`) si raggruppano nel batch e colpiscono lo stesso CDN node di Maven Central in rapida successione. Il rate limit scatta sistematicamente dopo ~150–200 pacchetti processati. -**Fix applicato:** i batch HTTP vengono shufflati prima dell'esecuzione (`[...packages].sort(() => Math.random() - 0.5)`) per distribuire i namespace uniformemente nei gruppi concorrenti. Un delay configurabile tra i gruppi (`POM_FETCHER_GROUP_DELAY_MS`, default 200ms) riduce ulteriormente il rate di richieste. +**Mitigazione applicata, in ordine di efficacia:** + +1. **Cache in-process dei parent POM** (vedi [Parent POM cache](#parent-pom-cache)) — sfrutta il clustering per namespace per collassare i fetch dei parent condivisi e il doppio fetch del leaf POM, riducendo il **volume totale** di richieste. È la leva principale: il throttle è volume-based per IP, quindi meno richieste = meno 403. +2. Un delay configurabile tra i gruppi concorrenti (`POM_FETCHER_GROUP_DELAY_MS`) + `POM_FETCHER_CONCURRENCY` basso (≤5) → abbassano il rate istantaneo. +3. Backoff di retry con jitter (`±500ms`, vedi `extract.ts` / `metadata.ts`) → evita retry sincronizzati. + +> Nota: **lo shuffle dei batch non aiuta** — riordina gli stessi N request nella stessa finestra temporale (stesso volume → stesso throttle) e in più romperebbe la località che rende efficace la cache dei parent. Namespace noti per triggerare il rate limit a causa dell'alta densità di artefatti: `com.google.apis`, `software.amazon.awssdk`, `org.wso2.*`. 
@@ -173,22 +233,38 @@ Occasionally a publisher's CI/CD updates `` in `maven-metadata.xml` bef ## Configuration Reference -All variables are optional — defaults are shown. +**All variables are required** — `getMavenConfig()` (`config.ts`) calls `requireEnv` for each, +so the worker throws on startup if any is missing. Suggested values shown. -| Env var | Default | Description | -|---------|---------|-------------| -| `POM_FETCHER_DIRECT_POM_FOR_ALL` | `false` | `true` = direct POM fetch for all packages + full resolution for critical | +| Env var | Suggested | Description | +|---------|-----------|-------------| | `POM_FETCHER_BATCH_SIZE` | `50` | Packages per batch — critical phase | | `POM_FETCHER_CONCURRENCY` | `5` | Concurrent fetches — critical phase | -| `POM_FETCHER_FULL_REFRESH_DAYS` | `90` | Re-sync critical packages after N days | | `POM_FETCHER_NON_CRITICAL_BATCH_SIZE` | `500` | Packages per batch — non-critical phase | -| `POM_FETCHER_NON_CRITICAL_CONCURRENCY` | `20` | Concurrent writes — non-critical DB-only mode | -| `POM_FETCHER_NON_CRITICAL_POM_CONCURRENCY` | `5` | Concurrent fetches — non-critical direct-pom mode | -| `POM_FETCHER_NON_CRITICAL_REFRESH_DAYS` | `1` | Re-sync non-critical stats after N days (DB-only) | -| `POM_FETCHER_NON_CRITICAL_POM_REFRESH_DAYS` | `30` | Re-sync non-critical POM data after N days (direct-pom) | -| `POM_FETCHER_IDLE_SLEEP_SEC` | `3600` | Sleep between passes | +| `POM_FETCHER_NON_CRITICAL_CONCURRENCY` | `20` | Concurrent writes — non-critical DB-only phase | +| `POM_FETCHER_REFRESH_DAYS` | `1` | Staleness window — re-sync a package once its `last_synced_at` is older than N days (applies to both phases) | +| `POM_FETCHER_GROUP_DELAY_MS` | `200`–`400` | Delay between concurrent groups in the critical phase (rate-limit mitigation) | + +### Sync source (Temporal critical path) + +These select **where the critical sync gets its work from**. 
They affect only the Temporal +`processMavenCriticalBatch` activity — the standalone backfill loop is unaffected. All are +optional; unset/invalid values fall back to the current universe-polling behaviour. + +| Env var | Default | Description | +|---------|---------|-------------| +| `MAVEN_SYNC_SOURCE` | `maven` | `maven` = poll `packages_universe` by staleness (current behaviour). `api` = enrich only what the delta feed reports. `both` = run both passes per tick. | +| `MAVEN_DELTA_API_URL` | — | Base URL of our delta feed (e.g. the Railway deployment). **Required** when source is `api` or `both`. | +| `MAVEN_DELTA_API_TOKEN` | — | Optional bearer token for the delta feed. | +| `MAVEN_DELTA_API_PAGE_SIZE` | `100` | Page size for `/api/changes` pagination. | +| `MAVEN_DELTA_API_LOOKBACK_MINUTES` | `15` | Rolling window size: each tick fetches `[now-N, now)`. Overlaps the cron interval on purpose — re-processing is safe (idempotent upserts). | +| `MAVEN_DELTA_API_INCLUDE_PRERELEASE` | `false` | Forwarded as `includePrerelease` to the feed. | + +The delta-API path always runs **full extraction** (the feed is an explicit "this changed" +signal) and only enriches packages that are `is_critical` in `packages_universe`; non-critical +purls in the feed are dropped. -**Concurrency guidance:** Maven Central handles 10–15 concurrent requests per IP without throttling. Retry logic with exponential backoff handles 429s. Keep `POM_FETCHER_CONCURRENCY` + `POM_FETCHER_NON_CRITICAL_POM_CONCURRENCY` ≤ 15 in production. +**Concurrency guidance:** Maven Central handles 10–15 concurrent requests per IP without throttling. Retry logic with exponential backoff handles 429/403s. Keep `POM_FETCHER_CONCURRENCY` ≤ 5 locally — repeated local runs heat the IP (see [Known Exceptions](#maven-central-403-rate-limiting)). 
--- @@ -202,30 +278,49 @@ Observed on ~2K packages (local dev, Maven Central over the network): | Non-critical | direct-pom | ~25 pkg/sec | 2 HTTP requests/pkg: metadata.xml + POM | | Critical | full-pom | ~15–25 pkg/sec | Faster when packages share parent POMs (CDN cache warm) | -**Estimated time for 800K packages (10% critical) at default settings:** +**Estimated time for ~800K packages (≈18% critical):** | Phase | Packages | Estimated time | |-------|----------|---------------| -| Non-critical (DB-only, first pass) | 720K | ~12 min | -| Non-critical (direct-pom, first pass) | 720K | ~8 h | -| Critical (full-pom, first pass) | 80K | ~3–4 h | +| Non-critical (DB-only) | ~670K | ~12 min | +| Critical (full POM, first extraction) | ~150K | several hours | -First pass is the expensive one. Subsequent daily passes are incremental: -- Non-critical DB-only: re-syncs all packages daily (~12 min) -- Non-critical direct-pom: re-syncs after 30 days (~8 h every 30 days) -- Critical: only packages with new versions or approaching 90-day refresh window +The first critical extraction is the expensive part — run it with the standalone backfill +loop. Afterwards the Temporal schedules keep things incremental: non-critical re-syncs cheaply +every `POM_FETCHER_REFRESH_DAYS`, and critical packages are re-fetched only when a new release +is published (the Temporal path skips unchanged versions) +or once their refresh window elapses. -The daily Temporal schedule has a **12-hour workflow timeout**. With `POM_FETCHER_DIRECT_POM_FOR_ALL=true`, the first pass is within budget (~11–12 h); increase `POM_FETCHER_NON_CRITICAL_POM_CONCURRENCY` to 10 to halve non-critical time if needed. +Under Temporal **each tick processes one batch** within its workflow timeout (15 min critical / +5 min non-critical); the backlog is drained over many ticks, not in one long pass. 
To go +faster, raise `POM_FETCHER_BATCH_SIZE` / `POM_FETCHER_CONCURRENCY` (keep concurrency ≤ 15 to +avoid Maven Central throttling) or trigger the schedule manually. --- ## Scheduling -Runs daily at **4am UTC** via Temporal schedule `maven`. -Overlap policy: `SKIP` — if a previous run is still active, the new trigger is dropped. -Catchup window: 1 hour. -Workflow timeout: 12 hours. -Max retries: 3 (30s initial, 2× backoff). +Two Temporal schedules are registered on startup of `bin/packages-worker.ts` +(see `maven/schedule.ts`): + +| Schedule ID | Cron | Workflow | Activity | Workflow timeout | +|-------------|------|----------|----------|------------------| +| `maven-critical` | `*/5 * * * *` (every 5 min) | `mavenCriticalWorkflow` | `processMavenCriticalBatch` → one critical batch | 15 min | +| `maven-non-critical` | `*/10 * * * *` (every 10 min) | `mavenNonCriticalWorkflow` | `processMavenNonCriticalBatch` → one non-critical batch | 5 min | + +Both: overlap policy `SKIP` (a tick is dropped if the previous run is still active), +catchup window 1 hour, retry 3× (30s initial, 2× backoff). + +**Each tick processes a single batch** (`POM_FETCHER_BATCH_SIZE` / +`POM_FETCHER_NON_CRITICAL_BATCH_SIZE`), not a full pass — the queue is drained incrementally +across ticks. 
+ +To run a batch on demand instead of waiting for the cron, trigger the schedule from the +Temporal UI (the schedule's **Trigger** button) or the CLI: + +```bash +temporal schedule trigger --schedule-id maven-critical +``` --- diff --git a/services/apps/packages_worker/src/maven/activities.ts b/services/apps/packages_worker/src/maven/activities.ts index e4974c3c08..66db62dd2e 100644 --- a/services/apps/packages_worker/src/maven/activities.ts +++ b/services/apps/packages_worker/src/maven/activities.ts @@ -2,22 +2,70 @@ import { getServiceChildLogger } from '@crowd/logging' import { getMavenConfig } from '../config' import { getPackagesDb } from '../db' -import { BatchResult, processBatch } from './runMavenEnrichmentLoop' + +import { BatchResult, processApiChangesBatch, processBatch } from './runMavenEnrichmentLoop' const log = getServiceChildLogger('maven-activity') +function addBatchResult(into: BatchResult, from: BatchResult): void { + into.processed += from.processed + into.skipped += from.skipped + into.error += from.error + into.unchanged += from.unchanged + into.hopLimitReached += from.hopLimitReached +} + export async function processMavenCriticalBatch(): Promise { const config = getMavenConfig() const qx = await getPackagesDb() - const result = await processBatch(qx, config, true) - log.info({ processed: result.processed, skipped: result.skipped, unchanged: result.unchanged, error: result.error }, 'Maven critical batch complete') - return result + + const total: BatchResult = { + processed: 0, + skipped: 0, + error: 0, + unchanged: 0, + hopLimitReached: 0, + } + + // Delta-API pass: enrich what our feed reports as changed (forces full extraction). 
+ if (config.syncSource === 'api' || config.syncSource === 'both') { + try { + const apiResult = await processApiChangesBatch(qx, config) + log.info({ ...apiResult }, 'Maven delta-API batch complete') + addBatchResult(total, apiResult) + } catch (err) { + // In 'both' mode the universe-polling pass is the reliable backbone, so a flaky + // delta feed must never block it — log and continue. In 'api' mode there is no + // fallback, so let the activity fail and have Temporal retry it. + if (config.syncSource === 'api') throw err + const message = err instanceof Error ? err.message : String(err) + log.warn({ error: message }, 'Delta-API pass failed — continuing with universe-polling pass') + } + } + + // Universe-polling pass: current behaviour — skip POM extraction when version is unchanged. + if (config.syncSource === 'maven' || config.syncSource === 'both') { + const mavenResult = await processBatch(qx, config, true, false) + log.info({ ...mavenResult }, 'Maven critical batch complete') + addBatchResult(total, mavenResult) + } + + return total } export async function processMavenNonCriticalBatch(): Promise { const config = getMavenConfig() const qx = await getPackagesDb() - const result = await processBatch(qx, config, false) - log.info({ processed: result.processed, skipped: result.skipped, unchanged: result.unchanged, error: result.error }, 'Maven non-critical batch complete') + // Non-critical is DB-only (no POM fetch); the flag is unused on this path. 
+ const result = await processBatch(qx, config, false, false) + log.info( + { + processed: result.processed, + skipped: result.skipped, + unchanged: result.unchanged, + error: result.error, + }, + 'Maven non-critical batch complete', + ) return result } diff --git a/services/apps/packages_worker/src/maven/deltaApi.ts b/services/apps/packages_worker/src/maven/deltaApi.ts new file mode 100644 index 0000000000..0b9a24a505 --- /dev/null +++ b/services/apps/packages_worker/src/maven/deltaApi.ts @@ -0,0 +1,128 @@ +/** + * Client for our own Maven delta feed (deployed separately, e.g. on Railway). + * It diffs the Maven Central index and exposes the artifacts that changed in a + * time window, so we can enrich exactly the packages that moved instead of + * polling the whole universe. + * + * Endpoint: + * GET {baseUrl}/api/changes?since=&until=&pageSize=&includePrerelease= + * + * Response: + * { + * "window": { "since": "...", "until": "..." }, + * "changes": [ + * { "purl", "groupId", "artifactId", "version", "publishedAt", + * "isPrerelease", "changeType" }, ... + * ], + * "nextCursor": "..." // present while more pages remain + * } + */ +import axios from 'axios' + +import { getServiceChildLogger } from '@crowd/logging' + +const log = getServiceChildLogger('maven-delta-api') + +const REQUEST_TIMEOUT_MS = 15_000 +// Hard stop so a misbehaving cursor can never spin forever. +const MAX_PAGES = 1_000 +const MAX_RETRIES = 3 +const RETRY_BASE_MS = 1_000 + +function sleep(ms: number): Promise { + return new Promise((r) => setTimeout(r, ms)) +} + +// Transient: no HTTP response at all (socket aborted/reset, timeout — the +// 'Error: aborted' we see when Railway drops the gzip stream mid-flight) or a +// retryable status (5xx / 429). Everything else (4xx, parse errors) is fatal. 
+function isTransientError(err: unknown): boolean { + if (!axios.isAxiosError(err)) return false + const status = err.response?.status + if (status === undefined) return true + return status >= 500 || status === 429 +} + +export interface MavenChange { + purl: string + groupId: string + artifactId: string + version: string + publishedAt: string + isPrerelease: boolean + changeType: string +} + +interface ChangesResponse { + window: { since: string; until: string } + changes: MavenChange[] + nextCursor?: string +} + +export interface FetchChangesOptions { + baseUrl: string + token?: string + since: string // ISO timestamp + until: string // ISO timestamp + pageSize: number + includePrerelease: boolean +} + +/** + * Fetches every change in [since, until), following `nextCursor` pagination. + */ +export async function fetchMavenChanges(opts: FetchChangesOptions): Promise { + const all: MavenChange[] = [] + let cursor: string | undefined + let page = 0 + + do { + const params: Record = { + since: opts.since, + until: opts.until, + pageSize: opts.pageSize, + includePrerelease: opts.includePrerelease, + } + if (cursor) params.cursor = cursor + + let res: { data: ChangesResponse } | undefined + for (let attempt = 0; ; attempt++) { + try { + res = await axios.get(`${opts.baseUrl}/api/changes`, { + params, + timeout: REQUEST_TIMEOUT_MS, + headers: opts.token ? { Authorization: `Bearer ${opts.token}` } : undefined, + }) + break + } catch (err) { + if (attempt < MAX_RETRIES && isTransientError(err)) { + const delay = RETRY_BASE_MS * 2 ** attempt + Math.random() * 300 + log.warn( + { page, attempt, error: err instanceof Error ? err.message : String(err) }, + 'Delta page fetch failed — retrying', + ) + await sleep(delay) + continue + } + throw err + } + } + + const changes = res.data?.changes ?? 
[] + all.push(...changes) + cursor = res.data?.nextCursor || undefined + page++ + + log.debug( + { page, batch: changes.length, total: all.length, hasMore: Boolean(cursor) }, + 'Fetched delta page', + ) + + if (page >= MAX_PAGES) { + log.warn({ page, total: all.length }, 'Hit MAX_PAGES — stopping pagination early') + break + } + } while (cursor) + + return all +} diff --git a/services/apps/packages_worker/src/maven/extract.ts b/services/apps/packages_worker/src/maven/extract.ts index b9f558b01b..5c9c5f4a47 100644 --- a/services/apps/packages_worker/src/maven/extract.ts +++ b/services/apps/packages_worker/src/maven/extract.ts @@ -2,7 +2,6 @@ * Core POM extraction logic — pure functions (no I/O side-effects, no DB calls). * Callers are responsible for concurrency, retries, and persistence. */ - import axios from 'axios' import { XMLParser } from 'fast-xml-parser' @@ -57,8 +56,11 @@ interface PomPerson { // ─── Config ─────────────────────────────────────────────────────────────────── -const MAVEN_REPO = 'https://repo1.maven.org/maven2' -export const MAX_PARENT_HOPS = 7 +// Base URL for fetching POMs/metadata. Defaults to canonical Central (repo1, Fastly — +// aggressive per-IP throttling). Point POM_FETCHER_MAVEN_BASE_URL at a high-throughput +// mirror (e.g. the Google GCS mirror) for bulk backfills. +const MAVEN_REPO = process.env.POM_FETCHER_MAVEN_BASE_URL ?? 
'https://repo1.maven.org/maven2' +export const MAX_PARENT_HOPS = 8 const REQUEST_TIMEOUT_MS = 15_000 const parser = new XMLParser({ @@ -80,7 +82,10 @@ async function sleep(ms: number): Promise { async function getWithRetry(url: string): Promise { for (let attempt = 0; attempt <= MAX_RETRIES; attempt++) { try { - const res = await axios.get(url, { responseType: 'text', timeout: REQUEST_TIMEOUT_MS }) + const res = await axios.get(url, { + responseType: 'text', + timeout: REQUEST_TIMEOUT_MS, + }) return res.data } catch (err) { if (axios.isAxiosError(err)) { @@ -105,7 +110,12 @@ export function buildPomUrl(groupId: string, artifactId: string, version: string return `${MAVEN_REPO}/${groupPath}/${artifactId}/${version}/${artifactId}-${version}.pom` } -export async function fetchPom(groupId: string, artifactId: string, version: string, url: string): Promise { +export async function fetchPom( + groupId: string, + artifactId: string, + version: string, + url: string, +): Promise { try { const data = await getWithRetry(url) const parsed = parser.parse(data) @@ -117,13 +127,123 @@ export async function fetchPom(groupId: string, artifactId: string, version: str log.debug({ groupId, artifactId, version }, `POM not found (404): ${url}`) return null } - log.debug({ groupId, artifactId, version }, `HTTP ${status ?? 'unknown'} fetching POM: ${url}`) + log.debug( + { groupId, artifactId, version }, + `HTTP ${status ?? 'unknown'} fetching POM: ${url}`, + ) return null } throw err } } +// ─── POM cache ────────────────────────────────────────────────────────────── +// +// Parent POMs are heavily shared across artifacts of the same namespace +// (e.g. org.apache:apache, org.springframework.boot:spring-boot-starter-parent), +// and the sync queue is ordered by rank_in_ecosystem, so those siblings are +// processed close together. 
A module-level, coordinate-keyed in-process cache +// collapses those repeated parent fetches into a single HTTP request — the single +// biggest lever against Maven Central rate limiting. It also removes the redundant +// second fetch of each artifact's own POM (extractArtifact fetches the leaf, then +// resolveWithInheritance fetches it again at depth 0). +// +// Only *successful* fetches are cached: fetchPom() returns null for both a real 404 +// and a transient failure (throttle/timeout), so caching null would poison the cache +// with transient errors — we never do it. Maven coordinates are immutable, so a cached +// POM never goes stale; the LRU size cap is purely to bound memory. + +const POM_CACHE_MAX_ENTRIES = 5_000 + +const pomCache = new Map() +const inFlight = new Map>() +const pomCacheStats = { hits: 0, coalesced: 0, misses: 0, evictions: 0 } + +function pomCacheKey(groupId: string, artifactId: string, version: string): string { + return `${groupId}:${artifactId}:${version}` +} + +function cacheSet(key: string, pom: PomData): void { + pomCache.delete(key) // re-insert to refresh recency (LRU) + pomCache.set(key, pom) + if (pomCache.size > POM_CACHE_MAX_ENTRIES) { + const oldest = pomCache.keys().next().value + if (oldest !== undefined) { + pomCache.delete(oldest) + pomCacheStats.evictions++ + } + } +} + +/** + * Cached + request-coalescing wrapper around fetchPom(). + * - Cache hit → returns the stored POM, no HTTP. + * - In-flight → a concurrent fetch for the same coordinates is already running; + * await it instead of issuing a duplicate request. + * - Miss → performs the network fetch; caches the result only if non-null. 
+ */ +async function fetchPomCached( + groupId: string, + artifactId: string, + version: string, +): Promise { + const key = pomCacheKey(groupId, artifactId, version) + + const cached = pomCache.get(key) + if (cached !== undefined) { + pomCacheStats.hits++ + pomCache.delete(key) // refresh recency on read (LRU) + pomCache.set(key, cached) + return cached + } + + const pending = inFlight.get(key) + if (pending) { + pomCacheStats.coalesced++ + return pending + } + + pomCacheStats.misses++ + const promise = fetchPom(groupId, artifactId, version, buildPomUrl(groupId, artifactId, version)) + .then((pom) => { + if (pom) cacheSet(key, pom) + return pom + }) + .finally(() => { + inFlight.delete(key) + }) + + inFlight.set(key, promise) + return promise +} + +/** Snapshot of cache effectiveness — logged once per critical batch by the enrichment loop. */ +export function getPomCacheStats(): { + size: number + hits: number + coalesced: number + misses: number + evictions: number + hitRate: number +} { + const lookups = pomCacheStats.hits + pomCacheStats.coalesced + pomCacheStats.misses + const hitRate = + lookups === 0 + ? 0 + : Math.round(((pomCacheStats.hits + pomCacheStats.coalesced) / lookups) * 100) / 100 + return { size: pomCache.size, ...pomCacheStats, hitRate } +} + +/** Clears the cache and counters. Intended for tests. 
*/ +export function resetPomCache(): void { + pomCache.clear() + inFlight.clear() + pomCacheStats.hits = 0 + pomCacheStats.coalesced = 0 + pomCacheStats.misses = 0 + pomCacheStats.evictions = 0 +} + // ─── Inheritance resolution ─────────────────────────────────────────────────── interface ResolvedFields { @@ -137,13 +257,18 @@ interface ResolvedFields { hops: number } -async function resolveWithInheritance(groupId: string, artifactId: string, version: string, depth = 0): Promise { +async function resolveWithInheritance( + groupId: string, + artifactId: string, + version: string, + depth = 0, +): Promise { if (depth > MAX_PARENT_HOPS) { log.debug({ groupId, artifactId, version }, `Max parent hops (${MAX_PARENT_HOPS}) reached`) return emptyFields(depth) } - const pom = await fetchPom(groupId, artifactId, version, buildPomUrl(groupId, artifactId, version)) + const pom = await fetchPomCached(groupId, artifactId, version) if (!pom) return emptyFields(depth) const licenses = extractLicenses(pom) @@ -156,8 +281,16 @@ async function resolveWithInheritance(groupId: string, artifactId: string, versi const parent = extractParent(pom) if (parent && (missingLicense || missingScm)) { - log.debug({ groupId, artifactId, version }, `[hop ${depth + 1}] ${parent.groupId}:${parent.artifactId}:${parent.version}`) - const parentFields = await resolveWithInheritance(parent.groupId, parent.artifactId, parent.version, depth + 1) + log.debug( + { groupId, artifactId, version }, + `[hop ${depth + 1}] ${parent.groupId}:${parent.artifactId}:${parent.version}`, + ) + const parentFields = await resolveWithInheritance( + parent.groupId, + parent.artifactId, + parent.version, + depth + 1, + ) return { description: extractStr(pom.description) ?? parentFields.description, licenses: licenses.length > 0 ? 
licenses : parentFields.licenses, @@ -189,10 +322,14 @@ async function resolveWithInheritance(groupId: string, artifactId: string, versi * Faster than extractArtifact — use for non-critical packages where inherited * fields (licenses, SCM) may be missing but throughput matters more. */ -export async function extractArtifactDirect(groupId: string, artifactId: string, version: string): Promise { +export async function extractArtifactDirect( + groupId: string, + artifactId: string, + version: string, +): Promise { const purl = `pkg:maven/${groupId}/${artifactId}@${version}` const pomUrl = buildPomUrl(groupId, artifactId, version) - const pom = await fetchPom(groupId, artifactId, version, pomUrl) + const pom = await fetchPomCached(groupId, artifactId, version) if (!pom) { return { @@ -239,11 +376,15 @@ export async function extractArtifactDirect(groupId: string, artifactId: string, * the parent chain to inherit licenses and SCM when not in the direct POM. * Always returns a result object; errors are captured in `result.error`. */ -export async function extractArtifact(groupId: string, artifactId: string, version: string): Promise { +export async function extractArtifact( + groupId: string, + artifactId: string, + version: string, +): Promise { const purl = `pkg:maven/${groupId}/${artifactId}@${version}` const pomUrl = buildPomUrl(groupId, artifactId, version) - const rootPom = await fetchPom(groupId, artifactId, version, pomUrl) + const rootPom = await fetchPomCached(groupId, artifactId, version) if (!rootPom) { return { groupId, diff --git a/services/apps/packages_worker/src/maven/metadata.ts b/services/apps/packages_worker/src/maven/metadata.ts index f437bb6667..18a7b317d4 100644 --- a/services/apps/packages_worker/src/maven/metadata.ts +++ b/services/apps/packages_worker/src/maven/metadata.ts @@ -8,11 +8,10 @@ * Returns null when the artifact is not found (404) or the metadata is * malformed. 
*/ - import axios from 'axios' import { XMLParser } from 'fast-xml-parser' -const MAVEN_REPO = 'https://repo1.maven.org/maven2' +const MAVEN_REPO = process.env.POM_FETCHER_MAVEN_BASE_URL ?? 'https://repo1.maven.org/maven2' const REQUEST_TIMEOUT_MS = 10_000 const MAX_RETRIES = 3 const RETRY_BASE_MS = 2_000 @@ -51,7 +50,10 @@ export async function resolveVersionsList( for (let attempt = 0; attempt <= MAX_RETRIES; attempt++) { try { - const res = await axios.get(url, { responseType: 'text', timeout: REQUEST_TIMEOUT_MS }) + const res = await axios.get(url, { + responseType: 'text', + timeout: REQUEST_TIMEOUT_MS, + }) const parsed = parser.parse(res.data) // Prefer over — release excludes snapshots/alphas diff --git a/services/apps/packages_worker/src/maven/runMavenEnrichmentLoop.ts b/services/apps/packages_worker/src/maven/runMavenEnrichmentLoop.ts index 32c9740a49..84f9961b71 100644 --- a/services/apps/packages_worker/src/maven/runMavenEnrichmentLoop.ts +++ b/services/apps/packages_worker/src/maven/runMavenEnrichmentLoop.ts @@ -1,6 +1,8 @@ import crypto from 'crypto' import { + MavenPackageToSync, + listMavenPackagesByPurls, listMavenPackagesToSync, logAuditFieldChange, replacePackageMaintainers, @@ -15,7 +17,9 @@ import { QueryExecutor } from '@crowd/data-access-layer/src/queryExecutor' import { getServiceChildLogger } from '@crowd/logging' import { getMavenConfig } from '../config' -import { MAX_PARENT_HOPS, extractArtifact, normalizeScmUrl } from './extract' + +import { fetchMavenChanges } from './deltaApi' +import { MAX_PARENT_HOPS, extractArtifact, getPomCacheStats, normalizeScmUrl } from './extract' import { isMavenFetchError, resolveVersionsList } from './metadata' import { isPrerelease, parseRepoUrl } from './normalize' @@ -28,10 +32,19 @@ export interface BatchResult { skipped: number error: number unchanged: number + // critical packages whose parent chain was truncated at MAX_PARENT_HOPS (POM data may be incomplete) + hopLimitReached: number +} + +type 
CriticalStatus = 'processed' | 'skipped' | 'unchanged' | 'error' + +interface CriticalPackageResult { + status: CriticalStatus + hopLimitReached: boolean } type MavenConfig = ReturnType -type PackageRow = Awaited>[number] +type PackageRow = MavenPackageToSync // ─── Helpers ────────────────────────────────────────────────────────────────── @@ -49,10 +62,36 @@ async function writeRepoLink( const parsed = parseRepoUrl(repositoryUrl) if (!parsed) return const repoId = await upsertRepo(qx, { url: repositoryUrl, ...parsed }) - const repoChanged = await upsertPackageRepo(qx, { packageId, repoId, source: 'declared', confidence: 0.8 }) + const repoChanged = await upsertPackageRepo(qx, { + packageId, + repoId, + source: 'declared', + confidence: 0.8, + }) repoChanged.forEach((f) => changed.add(f)) } +// Postgres deadlock (40P01) is transient: concurrent transactions upserting the same shared +// rows (e.g. maintainer 'hboutemy' across many org.apache packages, or the shared apache repo) +// can form a lock cycle. Re-running the whole transaction resolves it — the upserts are idempotent. 
+async function withDeadlockRetry(fn: () => Promise, maxAttempts = 4): Promise { + for (let attempt = 1; ; attempt++) { + try { + return await fn() + } catch (err) { + const code = (err as { code?: string }).code + const isDeadlock = + code === '40P01' || /deadlock detected/i.test(String((err as Error)?.message)) + if (isDeadlock && attempt < maxAttempts) { + await new Promise((r) => setTimeout(r, 50 * attempt + Math.random() * 100)) + log.debug({ attempt }, 'Deadlock detected — retrying transaction') + continue + } + throw err + } + } +} + // ─── Non-critical: copy universe stats into packages ───────────────────────── async function processNonCriticalPackage(qx: QueryExecutor, pkg: PackageRow): Promise { @@ -83,13 +122,13 @@ async function processCriticalPackage( qx: QueryExecutor, pkg: PackageRow, forceFullExtraction: boolean, -): Promise<'processed' | 'skipped' | 'unchanged' | 'error'> { +): Promise { const groupId = pkg.namespace const artifactId = pkg.name if (!groupId) { log.warn({ purl: pkg.purl }, 'Skipping: null namespace (groupId)') - return 'skipped' + return { status: 'skipped', hopLimitReached: false } } // Phase 1: lightweight metadata fetch to get the current upstream version. 
@@ -117,13 +156,18 @@ async function processCriticalPackage( downloadsLastMonth: pkg.downloads30d, }) log.warn({ groupId, artifactId }, 'Not on Maven Central — writing minimal record') - return 'skipped' + return { status: 'skipped', hopLimitReached: false } } if (metadata.kind === 'RATE_LIMIT') { - log.warn({ groupId, artifactId, status: metadata.status }, 'Rate limited — will retry next pass') - return 'error' + log.warn( + { groupId, artifactId, status: metadata.status }, + 'Rate limited — will retry next pass', + ) + return { status: 'error', hopLimitReached: false } } - throw new Error(`Transient error fetching metadata for ${groupId}:${artifactId} — ${metadata.message}`) + throw new Error( + `Transient error fetching metadata for ${groupId}:${artifactId} — ${metadata.message}`, + ) } const version = metadata.releaseVersion @@ -149,7 +193,7 @@ async function processCriticalPackage( downloadsLastMonth: pkg.downloads30d, }) log.warn({ groupId, artifactId }, 'No release version in metadata — writing minimal record') - return 'skipped' + return { status: 'skipped', hopLimitReached: false } } // Phase 2: skip full POM extraction when upstream version matches what we already have. 
@@ -161,7 +205,7 @@ async function processCriticalPackage( downloadsLastMonth: pkg.downloads30d, }) log.debug({ groupId, artifactId, version }, 'Version unchanged — skipping POM extraction') - return 'unchanged' + return { status: 'unchanged', hopLimitReached: false } } // Phase 3: full POM extraction with parent-chain resolution — wrapped in a @@ -189,96 +233,119 @@ async function processCriticalPackage( dependentReposCount: pkg.dependentReposCount, downloadsLastMonth: pkg.downloads30d, }) - return 'error' + return { status: 'error', hopLimitReached: false } } - if (result.parentHops > MAX_PARENT_HOPS) { + const hopLimitReached = result.parentHops > MAX_PARENT_HOPS + if (hopLimitReached) { log.warn( - { groupId, artifactId, parentHops: result.parentHops, missingLicenses: result.licenses.length === 0, missingScm: !result.scmUrl }, + { + groupId, + artifactId, + parentHops: result.parentHops, + missingLicenses: result.licenses.length === 0, + missingScm: !result.scmUrl, + }, 'Parent hop limit reached — data may be incomplete', ) } const repositoryUrl = normalizeScmUrl(result.scmUrl) - await qx.tx(async (t) => { - const changed = new Set() + await withDeadlockRetry(() => + qx.tx(async (t) => { + const changed = new Set() - const { id: packageId, changedFields: pkgChanged } = await upsertPackage(t, { - purl: pkg.purl, - ecosystem: 'maven', - namespace: groupId, - name: artifactId, - description: result.description, - homepage: result.homepageUrl, - registryUrl: mavenRegistryUrl(groupId, artifactId), - declaredRepositoryUrl: result.scmUrl, - repositoryUrl, - licenses: result.licenses.length > 0 ? 
result.licenses : null, - licensesRaw: result.licensesRaw, - latestVersion: version, - ingestionSource: 'maven-registry', - criticalityScore: pkg.criticalityScore, - dependentPackagesCount: pkg.dependentPackagesCount, - dependentReposCount: pkg.dependentReposCount, - downloadsLastMonth: pkg.downloads30d, - }) - pkgChanged.forEach((f) => changed.add(f)) - - const allVersions = metadata.versions.length > 0 ? metadata.versions : [version] - const verChanged = await upsertVersionsBatch( - t, - allVersions.map((v) => ({ - packageId, + const { id: packageId, changedFields: pkgChanged } = await upsertPackage(t, { + purl: pkg.purl, ecosystem: 'maven', + namespace: groupId, name: artifactId, - number: v, - isLatest: v === metadata.releaseVersion, - isPrerelease: isPrerelease(v), - license: result.licenses[0] ?? null, - })), - ) - verChanged.forEach((f) => changed.add(f)) - - const allPeople = [ - ...result.developers.map((d) => ({ ...d, role: 'author' as const })), - ...result.contributors.map((c) => ({ ...c, role: 'maintainer' as const })), - ] - - const maintainerLinks: Array<{ maintainerId: number; role: 'author' | 'maintainer' }> = [] - for (const person of allPeople) { - const username = person.username ?? person.email ?? person.displayName - if (!username) continue - const emailHash = person.email - ? crypto.createHash('sha256').update(person.email.toLowerCase().trim()).digest('hex') - : null - const { id: maintainerId, changedFields: mChanged } = await upsertMaintainer(t, { - ecosystem: 'maven', - username, - displayName: person.displayName, - url: person.url, - emailHash, + description: result.description, + homepage: result.homepageUrl, + registryUrl: mavenRegistryUrl(groupId, artifactId), + declaredRepositoryUrl: result.scmUrl, + repositoryUrl, + licenses: result.licenses.length > 0 ? 
result.licenses : null, + licensesRaw: result.licensesRaw, + latestVersion: version, + ingestionSource: 'maven-registry', + criticalityScore: pkg.criticalityScore, + dependentPackagesCount: pkg.dependentPackagesCount, + dependentReposCount: pkg.dependentReposCount, + downloadsLastMonth: pkg.downloads30d, + }) + pkgChanged.forEach((f) => changed.add(f)) + + const allVersions = metadata.versions.length > 0 ? metadata.versions : [version] + const verChanged = await upsertVersionsBatch( + t, + allVersions.map((v) => ({ + packageId, + ecosystem: 'maven', + name: artifactId, + number: v, + isLatest: v === metadata.releaseVersion, + isPrerelease: isPrerelease(v), + license: result.licenses[0] ?? null, + })), + ) + verChanged.forEach((f) => changed.add(f)) + + const allPeople = [ + ...result.developers.map((d) => ({ ...d, role: 'author' as const })), + ...result.contributors.map((c) => ({ ...c, role: 'maintainer' as const })), + ].sort((a, b) => { + // Stable order on the shared maintainers table so concurrent transactions acquire + // row locks in the same order → no deadlock cycles. + const ka = a.username ?? a.email ?? a.displayName ?? '' + const kb = b.username ?? b.email ?? b.displayName ?? '' + return ka < kb ? -1 : ka > kb ? 
1 : 0 }) - mChanged.forEach((f) => changed.add(f)) - maintainerLinks.push({ maintainerId, role: person.role }) - } - - if (maintainerLinks.length > 0) { - const pmChanged = await replacePackageMaintainers(t, packageId, maintainerLinks) - pmChanged.forEach((f) => changed.add(f)) - } - - await writeRepoLink(t, packageId, repositoryUrl, changed) - - await logAuditFieldChange(t, 'maven', pkg.purl, Array.from(changed)) - log.info( - { groupId, artifactId, version, parentHops: result.parentHops, licenses: result.licenses.length, maintainers: maintainerLinks.length, versions: allVersions.length }, - 'ok', - ) - }) + const maintainerLinks: Array<{ maintainerId: number; role: 'author' | 'maintainer' }> = [] + for (const person of allPeople) { + const username = person.username ?? person.email ?? person.displayName + if (!username) continue + const emailHash = person.email + ? crypto.createHash('sha256').update(person.email.toLowerCase().trim()).digest('hex') + : null + const { id: maintainerId, changedFields: mChanged } = await upsertMaintainer(t, { + ecosystem: 'maven', + username, + displayName: person.displayName, + url: person.url, + emailHash, + }) + mChanged.forEach((f) => changed.add(f)) + maintainerLinks.push({ maintainerId, role: person.role }) + } + + if (maintainerLinks.length > 0) { + const pmChanged = await replacePackageMaintainers(t, packageId, maintainerLinks) + pmChanged.forEach((f) => changed.add(f)) + } + + await writeRepoLink(t, packageId, repositoryUrl, changed) + + await logAuditFieldChange(t, 'maven', pkg.purl, Array.from(changed)) + + log.info( + { + groupId, + artifactId, + version, + parentHops: result.parentHops, + licenses: result.licenses.length, + maintainers: maintainerLinks.length, + versions: allVersions.length, + }, + 'ok', + ) + }), + ) - return 'processed' + return { status: 'processed', hopLimitReached } } // ─── Batch processing ───────────────────────────────────────────────────────── @@ -287,19 +354,33 @@ export async function 
processBatch( qx: QueryExecutor, config: MavenConfig, isCritical: boolean, + forceFullExtraction: boolean, ): Promise { const batchSize = isCritical ? config.batchSize : config.nonCriticalBatchSize - const concurrency = isCritical ? config.concurrency : config.nonCriticalConcurrency const refreshDays = config.refreshDays - const forceFullExtraction = config.forceFullExtraction const packages = await listMavenPackagesToSync(qx, { limit: batchSize, refreshDays, isCritical }) - if (packages.length === 0) return { processed: 0, skipped: 0, error: 0, unchanged: 0 } + return processPackages(qx, config, packages, isCritical, forceFullExtraction) +} + +// Runs a concrete list of packages through the enrichment pipeline. Shared by the +// universe-polling path (processBatch) and the delta-API path (processApiChangesBatch). +async function processPackages( + qx: QueryExecutor, + config: MavenConfig, + packages: PackageRow[], + isCritical: boolean, + forceFullExtraction: boolean, +): Promise { + const concurrency = isCritical ? config.concurrency : config.nonCriticalConcurrency + + if (packages.length === 0) + return { processed: 0, skipped: 0, error: 0, unchanged: 0, hopLimitReached: 0 } log.info({ count: packages.length, isCritical }, 'Batch started') - const counts = { processed: 0, skipped: 0, error: 0, unchanged: 0 } + const counts = { processed: 0, skipped: 0, error: 0, unchanged: 0, hopLimitReached: 0 } for (let batchStart = 0; batchStart < packages.length; batchStart += concurrency) { const group = packages.slice(batchStart, batchStart + concurrency) @@ -317,8 +398,9 @@ export async function processBatch( return } - const status = await processCriticalPackage(qx, pkg, forceFullExtraction) - counts[status]++ + const res = await processCriticalPackage(qx, pkg, forceFullExtraction) + counts[res.status]++ + if (res.hopLimitReached) counts.hopLimitReached++ } catch (err) { const message = err instanceof Error ? 
err.message : String(err) log.error({ purl: pkg.purl, error: message }, 'Unexpected error processing package') @@ -333,9 +415,82 @@ export async function processBatch( } } + if (isCritical) { + // POM cache only fills on the critical path (parent-chain resolution). + log.info(getPomCacheStats(), 'POM cache') + } + return counts } +// ─── Delta-API batch ────────────────────────────────────────────────────────── + +// BatchResult plus delta-feed-specific counters and a fetch/process timing split — +// handy for the benchmark script and for spotting whether time goes to the feed or +// to POM extraction. Extra fields are ignored by callers that only need BatchResult. +export interface DeltaApiBatchResult extends BatchResult { + apiChanges: number + uniquePackages: number + matchedCritical: number + fetchMs: number + processMs: number +} + +// Pulls the changed artifacts from our delta feed over a rolling [now-lookback, now) +// window and enriches the critical ones. The window deliberately overlaps the +// Temporal schedule interval; re-processing is safe because every write is an +// idempotent upsert. Always forces full extraction — the feed is an explicit +// "this changed" signal, so we never trust the version-unchanged shortcut here. +export async function processApiChangesBatch( + qx: QueryExecutor, + config: MavenConfig, +): Promise { + const until = new Date() + const since = new Date(until.getTime() - config.deltaApi.lookbackMinutes * 60_000) + + const fetchStartedAt = Date.now() + + const changes = await fetchMavenChanges({ + baseUrl: config.deltaApi.baseUrl, + token: config.deltaApi.token, + since: since.toISOString(), + until: until.toISOString(), + pageSize: config.deltaApi.pageSize, + includePrerelease: config.deltaApi.includePrerelease, + }) + + // Collapse to package-level purls (drop the @version) and dedup — the feed + // reports one entry per version, but we enrich the package as a whole. 
+ const purls = Array.from(new Set(changes.map((c) => `pkg:maven/${c.groupId}/${c.artifactId}`))) + + const packages = await listMavenPackagesByPurls(qx, purls) + const fetchMs = Date.now() - fetchStartedAt + + log.info( + { + changes: changes.length, + uniquePackages: purls.length, + matchedCritical: packages.length, + lookbackMinutes: config.deltaApi.lookbackMinutes, + fetchMs, + }, + 'Delta-API window fetched', + ) + + const processStartedAt = Date.now() + const counts = await processPackages(qx, config, packages, true, true) + const processMs = Date.now() - processStartedAt + + return { + ...counts, + apiChanges: changes.length, + uniquePackages: purls.length, + matchedCritical: packages.length, + fetchMs, + processMs, + } +} + // ─── Phase runner ───────────────────────────────────────────────────────────── async function runPhase( @@ -345,14 +500,21 @@ async function runPhase( isShuttingDown: () => boolean, ): Promise { const label = isCritical ? 'critical' : 'non-critical' - const total: BatchResult = { processed: 0, skipped: 0, error: 0, unchanged: 0 } + const total: BatchResult = { + processed: 0, + skipped: 0, + error: 0, + unchanged: 0, + hopLimitReached: 0, + } let batchNum = 0 const phaseStartedAt = Date.now() log.info({ phase: label }, 'Phase started') while (!isShuttingDown()) { - const result = await processBatch(qx, config, isCritical) + // The standalone loop is the backfill entry point → always full extraction. 
+ const result = await processBatch(qx, config, isCritical, true) if (result.processed + result.skipped + result.error + result.unchanged === 0) { const durationSec = Math.round((Date.now() - phaseStartedAt) / 1000) @@ -365,6 +527,7 @@ async function runPhase( total.skipped += result.skipped total.error += result.error total.unchanged += result.unchanged + total.hopLimitReached += result.hopLimitReached log.info( { @@ -374,6 +537,7 @@ async function runPhase( totalSkipped: total.skipped, totalUnchanged: total.unchanged, totalErrors: total.error, + totalHopLimitReached: total.hopLimitReached, elapsedSec: Math.round((Date.now() - phaseStartedAt) / 1000), }, 'Batch done', @@ -383,49 +547,19 @@ async function runPhase( return total } -// ─── Main loop ──────────────────────────────────────────────────────────────── +// ─── One-shot backfill ────────────────────────────────────────────────────────── -export async function runMavenEnrichmentLoop( +/** + * Drains the Tier 2 critical queue once, with full POM extraction, and returns + * the totals. It does NOT idle-loop — it runs until a batch comes back empty (or + * shutdown is requested) and then returns, so the caller can exit. Meant to be + * triggered manually (e.g. `pnpm backfill:maven` execed into the packages-worker + * container). + */ +export async function runMavenCriticalBackfill( qx: QueryExecutor, config: MavenConfig, isShuttingDown: () => boolean, -): Promise { - log.info( - { - batchSize: config.batchSize, - concurrency: config.concurrency, - nonCriticalBatchSize: config.nonCriticalBatchSize, - nonCriticalConcurrency: config.nonCriticalConcurrency, - refreshDays: config.refreshDays, - forceFullExtraction: config.forceFullExtraction, - }, - config.forceFullExtraction - ? 
'POM fetcher started — FORCE FULL EXTRACTION (version-unchanged check disabled)' - : 'POM fetcher started', - ) - - let passNumber = 0 - - while (!isShuttingDown()) { - passNumber++ - const passStartedAt = Date.now() - log.info({ pass: passNumber }, 'Pass started') - - const critical = await runPhase(qx, config, true, isShuttingDown) - - const durationSec = Math.round((Date.now() - passStartedAt) / 1000) - log.info( - { - pass: passNumber, - totalProcessed: critical.processed, - totalSkipped: critical.skipped, - totalUnchanged: critical.unchanged, - totalErrors: critical.error, - durationSec, - }, - `Pass complete — sleeping ${config.idleSleepSec}s`, - ) - - await new Promise((r) => setTimeout(r, config.idleSleepSec * 1000)) - } +): Promise { + return runPhase(qx, config, true, isShuttingDown) } diff --git a/services/apps/packages_worker/src/maven/schedule.ts b/services/apps/packages_worker/src/maven/schedule.ts index 5737c7a7d6..26e9890f5b 100644 --- a/services/apps/packages_worker/src/maven/schedule.ts +++ b/services/apps/packages_worker/src/maven/schedule.ts @@ -7,70 +7,76 @@ export async function scheduleMavenCritical(): Promise { const { temporal } = svc if (!temporal) throw new Error('Temporal client not initialized') - try { - await temporal.schedule.create({ - scheduleId: 'maven-critical', - spec: { - cronExpressions: ['*/5 * * * *'], - }, - policies: { - overlap: ScheduleOverlapPolicy.SKIP, - catchupWindow: '1 hour', - }, - action: { - type: 'startWorkflow', - workflowType: mavenCriticalWorkflow, - taskQueue: 'packages-worker', - workflowExecutionTimeout: '15 minutes', - retry: { - initialInterval: '30 seconds', - backoffCoefficient: 2, - maximumAttempts: 3, - }, - args: [], + const scheduleOptions: Parameters[0] = { + scheduleId: 'maven-critical', + spec: { + cronExpressions: ['*/1 * * * *'], + }, + policies: { + overlap: ScheduleOverlapPolicy.SKIP, + catchupWindow: '1 hour', + }, + action: { + type: 'startWorkflow', + workflowType: mavenCriticalWorkflow, 
+ taskQueue: 'packages-worker', + workflowExecutionTimeout: '15 minutes', + retry: { + initialInterval: '30 seconds', + backoffCoefficient: 2, + maximumAttempts: 3, }, - }) - } catch (err) { - if (err instanceof ScheduleAlreadyRunning) { - svc.log.info('Schedule maven-critical already registered.') - } else { - throw err - } + args: [], + }, } -} - -export async function scheduleMavenNonCritical(): Promise { - const { temporal } = svc - if (!temporal) throw new Error('Temporal client not initialized') try { - await temporal.schedule.create({ - scheduleId: 'maven-non-critical', - spec: { - cronExpressions: ['*/10 * * * *'], - }, - policies: { - overlap: ScheduleOverlapPolicy.SKIP, - catchupWindow: '1 hour', - }, - action: { - type: 'startWorkflow', - workflowType: mavenNonCriticalWorkflow, - taskQueue: 'packages-worker', - workflowExecutionTimeout: '5 minutes', - retry: { - initialInterval: '30 seconds', - backoffCoefficient: 2, - maximumAttempts: 3, - }, - args: [], - }, - }) + await temporal.schedule.create(scheduleOptions) } catch (err) { if (err instanceof ScheduleAlreadyRunning) { - svc.log.info('Schedule maven-non-critical already registered.') + // Schedule exists → delete and recreate so cron/spec changes take effect on + // restart (schedule.create is a no-op when the id exists → it would keep the old cron). 
+ await temporal.schedule.getHandle('maven-critical').delete() + await temporal.schedule.create(scheduleOptions) + svc.log.info('Schedule maven-critical recreated (cron synced).') } else { throw err } } } + +// export async function scheduleMavenNonCritical(): Promise { +// const { temporal } = svc +// if (!temporal) throw new Error('Temporal client not initialized') + +// try { +// await temporal.schedule.create({ +// scheduleId: 'maven-non-critical', +// spec: { +// cronExpressions: ['*/10 * * * *'], +// }, +// policies: { +// overlap: ScheduleOverlapPolicy.SKIP, +// catchupWindow: '1 hour', +// }, +// action: { +// type: 'startWorkflow', +// workflowType: mavenNonCriticalWorkflow, +// taskQueue: 'packages-worker', +// workflowExecutionTimeout: '5 minutes', +// retry: { +// initialInterval: '30 seconds', +// backoffCoefficient: 2, +// maximumAttempts: 3, +// }, +// args: [], +// }, +// }) +// } catch (err) { +// if (err instanceof ScheduleAlreadyRunning) { +// svc.log.info('Schedule maven-non-critical already registered.') +// } else { +// throw err +// } +// } +// } diff --git a/services/apps/packages_worker/src/scripts/benchmarkMavenDelta.ts b/services/apps/packages_worker/src/scripts/benchmarkMavenDelta.ts new file mode 100644 index 0000000000..7b5c7c27b4 --- /dev/null +++ b/services/apps/packages_worker/src/scripts/benchmarkMavenDelta.ts @@ -0,0 +1,82 @@ +/** + * One-shot benchmark for the Maven delta-API sync path. + * + * Runs a single processApiChangesBatch() against the configured delta feed and + * prints a performance summary (fetch vs. process split, throughput, per-package + * average). Use it to gather numbers before wiring the path into Temporal. + * + * Run with env loaded: + * pnpm run benchmark:maven-delta:local + * + * Requires MAVEN_DELTA_API_URL; set MAVEN_SYNC_SOURCE=api|both so the config + * validates. Widen MAVEN_DELTA_API_LOOKBACK_MINUTES to pull a bigger window for a + * more meaningful sample. 
+ */ +import { getServiceLogger } from '@crowd/logging' + +import { getMavenConfig } from '../config' +import { getPackagesDb } from '../db' +import { processApiChangesBatch } from '../maven/runMavenEnrichmentLoop' + +const log = getServiceLogger() + +const main = async () => { + const config = getMavenConfig() + + if (!config.deltaApi.baseUrl) { + throw new Error('MAVEN_DELTA_API_URL is required to benchmark the delta-API path') + } + + log.info( + { + baseUrl: config.deltaApi.baseUrl, + lookbackMinutes: config.deltaApi.lookbackMinutes, + pageSize: config.deltaApi.pageSize, + includePrerelease: config.deltaApi.includePrerelease, + concurrency: config.concurrency, + groupDelayMs: config.groupDelayMs, + }, + 'Delta-API benchmark starting', + ) + + const qx = await getPackagesDb() + await qx.selectOne('SELECT 1') + + const startedAt = Date.now() + const r = await processApiChangesBatch(qx, config) + const totalMs = Date.now() - startedAt + + const enriched = r.processed + r.skipped + r.error + r.unchanged + const throughputPerSec = r.processMs > 0 ? +(enriched / (r.processMs / 1000)).toFixed(2) : 0 + const avgMsPerPkg = enriched > 0 ? 
+(r.processMs / enriched).toFixed(1) : 0 + + log.info( + { + // window + apiChanges: r.apiChanges, + uniquePackages: r.uniquePackages, + matchedCritical: r.matchedCritical, + // outcomes + processed: r.processed, + skipped: r.skipped, + unchanged: r.unchanged, + error: r.error, + hopLimitReached: r.hopLimitReached, + // timing + fetchMs: r.fetchMs, + processMs: r.processMs, + totalMs, + // perf + throughputPerSec, + avgMsPerPkg, + }, + 'Delta-API benchmark complete', + ) + + process.exit(0) +} + +main().catch((err) => { + log.error({ err }, 'Delta-API benchmark failed') + process.exit(1) +}) diff --git a/services/apps/packages_worker/src/scripts/validateDataQuality.ts b/services/apps/packages_worker/src/scripts/validateDataQuality.ts new file mode 100644 index 0000000000..abec5301f7 --- /dev/null +++ b/services/apps/packages_worker/src/scripts/validateDataQuality.ts @@ -0,0 +1,88 @@ +/** + * Data-quality scorecard for the Maven enrichment output. + * + * Runs the read-only checks in ../maven/data_quality.sql against the packages DB + * and prints a grouped table (coverage, ingestion-source breakdown, anomalies, + * integrity, freshness). Useful before/after a backfill, and to validate prod. + * + * Run with env loaded: + * pnpm run validate:maven-quality:local + * + * Read-only: a single SELECT, safe to point at a prod read-replica. + * Exits non-zero if any check has status FAIL (handy for CI / gating a deploy). + */ +import { readFileSync } from 'fs' +import { join } from 'path' + +import { getServiceLogger } from '@crowd/logging' + +import { getPackagesDb } from '../db' + +const log = getServiceLogger() + +interface ReportRow { + section: string + metric: string + value: string | number + pct: string | null + status: string +} + +const SQL_PATH = join(__dirname, '../maven/data_quality.sql') + +function pad(s: string, len: number): string { + return s.length >= len ? 
s : s + ' '.repeat(len - s.length) +} + +function padLeft(s: string, len: number): string { + return s.length >= len ? s : ' '.repeat(len - s.length) + s +} + +function render(rows: ReportRow[]): string { + const lines: string[] = ['', 'Maven enrichment — data quality scorecard', ''] + let currentSection = '' + + for (const r of rows) { + if (r.section !== currentSection) { + currentSection = r.section + lines.push('', currentSection) + } + const value = padLeft(String(r.value), 10) + const pct = padLeft(r.pct ?? '', 8) + const status = pad(r.status, 4) + lines.push(` [${status}] ${pad(r.metric, 44)} ${value} ${pct}`) + } + lines.push('') + return lines.join('\n') +} + +const main = async () => { + const qx = await getPackagesDb() + const sql = readFileSync(SQL_PATH, 'utf8') + + const rows: ReportRow[] = await qx.select(sql) + + process.stdout.write(render(rows)) + + const failures = rows.filter((r) => r.status === 'FAIL') + const warnings = rows.filter((r) => r.status === 'WARN' || r.status === 'POOR' || r.status === 'LOW') + + if (failures.length > 0) { + log.error( + { failures: failures.map((f) => `${f.section} / ${f.metric} = ${f.value}`) }, + `Data quality: ${failures.length} FAIL check(s)`, + ) + process.exit(1) + } + + log.info( + { warnings: warnings.length, checks: rows.length }, + `Data quality OK — no FAIL checks (${warnings.length} warning(s))`, + ) + process.exit(0) +} + +main().catch((err) => { + log.error({ err }, 'Data quality validation failed') + process.exit(1) +}) diff --git a/services/libs/data-access-layer/src/osspckgs/packages.ts b/services/libs/data-access-layer/src/osspckgs/packages.ts index 34825026bb..33f04edba6 100644 --- a/services/libs/data-access-layer/src/osspckgs/packages.ts +++ b/services/libs/data-access-layer/src/osspckgs/packages.ts @@ -15,33 +15,86 @@ export async function findPackageIdsByPurl( // ─── packages_universe ──────────────────────────────────────────────────────── /** - * Returns a page of Maven packages from 
packages_universe that need syncing - * into the packages table (never synced, or stale by refreshDays). + * Carries everything the Maven enrichment path needs to sync a package. + * For the Tier 2 (critical) path these fields come from `packages`; the disabled + * non-critical path reads the same shape from `packages_universe`. + */ +export type MavenPackageToSync = Pick< + IDbPackageUniverse, + | 'id' + | 'namespace' + | 'name' + | 'criticalityScore' + | 'dependentPackagesCount' + | 'dependentReposCount' + | 'downloads30d' +> & { + purl: string + latestVersion: string | null +} + +// ingestion_source values this worker writes once it has attempted a package +// (success or a terminal outcome). A `packages` row carrying any other value — +// e.g. the marker the criticality worker sets when it promotes a package to +// Tier 2 — has never been POM-enriched, so we pick it up immediately instead of +// waiting for the staleness window. Errors/skips re-run only once stale, so a +// broken package isn't retried every pass. +const MAVEN_WORKER_OUTCOMES = [ + 'maven-registry', + 'maven_error', + 'maven_not_on_central', + 'maven_no_version', +] + +/** + * Returns a page of Maven packages that need syncing via POM extraction. * - * isCritical=true → critical packages queued for full POM extraction - * isCritical=false → non-critical packages queued for universe-stats refresh (DB-only) + * isCritical=true → Tier 2: reads from `packages` (populated by the criticality + * worker, which writes ingestion_source + last_synced_at). + * A row is due when it hasn't been POM-enriched yet, or is + * stale by refreshDays. Ordered by criticality_score. + * isCritical=false → disabled non-critical path: reads from `packages_universe`. + * Kept for reference only — the universe→packages copy is owned + * by the criticality worker and this path is not scheduled. 
*/ export async function listMavenPackagesToSync( qx: QueryExecutor, options: { limit: number; refreshDays: number; isCritical: boolean }, -): Promise< - (Pick< - IDbPackageUniverse, - | 'id' - | 'namespace' - | 'name' - | 'isCritical' - | 'criticalityScore' - | 'dependentPackagesCount' - | 'dependentReposCount' - | 'downloads30d' - > & { - purl: string - latestVersion: string | null - })[] -> { +): Promise { const { limit, refreshDays, isCritical } = options + if (isCritical) { + return qx.select( + ` + SELECT + p.id, + p.purl, + p.namespace, + p.name, + p.criticality_score AS "criticalityScore", + p.dependent_packages_count AS "dependentPackagesCount", + p.dependent_repos_count AS "dependentReposCount", + p.downloads_last_month AS "downloads30d", + p.latest_version AS "latestVersion" + FROM packages p + WHERE + p.ecosystem = 'maven' + AND p.namespace IS NOT NULL + AND ( + p.ingestion_source IS NULL + OR p.ingestion_source <> ALL($(workerOutcomes)::text[]) + OR p.last_synced_at < NOW() - ($(refreshDays) || ' days')::interval + ) + ORDER BY + p.criticality_score DESC NULLS LAST, + p.id ASC + LIMIT $(limit) + `, + { limit, refreshDays, workerOutcomes: MAVEN_WORKER_OUTCOMES }, + ) + } + + // Disabled non-critical path — reads from packages_universe (not scheduled). 
return qx.select( ` SELECT @@ -49,7 +102,6 @@ export async function listMavenPackagesToSync( pu.purl, pu.namespace, pu.name, - pu.is_critical AS "isCritical", pu.criticality_score AS "criticalityScore", pu.dependent_packages_count AS "dependentPackagesCount", pu.dependent_repos_count AS "dependentReposCount", @@ -59,7 +111,7 @@ export async function listMavenPackagesToSync( LEFT JOIN packages p ON p.purl = pu.purl WHERE pu.ecosystem = 'maven' - AND pu.is_critical = $(isCritical) + AND pu.is_critical = false AND pu.purl IS NOT NULL AND pu.namespace IS NOT NULL AND ( @@ -71,7 +123,44 @@ export async function listMavenPackagesToSync( pu.id ASC LIMIT $(limit) `, - { limit, refreshDays, isCritical }, + { limit, refreshDays }, + ) +} + +/** + * Loads Tier 2 Maven packages (from `packages`) by package-level purl, regardless + * of staleness. Used by the delta-API sync path: the upstream feed already told us + * these packages changed, so we (re)extract them now. Purls not present in + * `packages` (i.e. not promoted to Tier 2 by the criticality worker) are dropped. 
+ */ +export async function listMavenPackagesByPurls( + qx: QueryExecutor, + purls: string[], +): Promise { + if (purls.length === 0) return [] + + return qx.select( + ` + SELECT + p.id, + p.purl, + p.namespace, + p.name, + p.criticality_score AS "criticalityScore", + p.dependent_packages_count AS "dependentPackagesCount", + p.dependent_repos_count AS "dependentReposCount", + p.downloads_last_month AS "downloads30d", + p.latest_version AS "latestVersion" + FROM packages p + WHERE + p.ecosystem = 'maven' + AND p.namespace IS NOT NULL + AND p.purl = ANY($(purls)) + ORDER BY + p.criticality_score DESC NULLS LAST, + p.id ASC + `, + { purls }, ) } diff --git a/services/libs/data-access-layer/src/osspckgs/versions.ts b/services/libs/data-access-layer/src/osspckgs/versions.ts index a9a2b0335a..c1a621a6cb 100644 --- a/services/libs/data-access-layer/src/osspckgs/versions.ts +++ b/services/libs/data-access-layer/src/osspckgs/versions.ts @@ -6,7 +6,9 @@ import { IDbVersionUpsert } from './types' * Bulk-upserts a list of versions for a single package. * Uses UNNEST arrays to avoid N individual round-trips. * On conflict (package_id, number) updates is_latest, is_prerelease, and - * license (never overwrites an existing license with NULL). + * licenses (never overwrites an existing licenses array with NULL). + * The per-version `license` input is stored as a single-element text[] in the + * `licenses` column (the schema is an array to match packages.licenses). * Returns the list of fields that actually changed across all versions. 
*/ export async function upsertVersionsBatch( @@ -28,36 +30,40 @@ export async function upsertVersionsBatch( const row: { changed_fields: string[] } = await qx.selectOne( ` WITH old AS ( - SELECT number, is_latest, is_prerelease, license + SELECT number, is_latest, is_prerelease, licenses FROM versions WHERE package_id = $(packageId)::bigint AND number = ANY($(numbers)::text[]) ), ins AS ( - INSERT INTO versions (package_id, ecosystem, name, number, is_latest, is_prerelease, license, last_synced_at) + INSERT INTO versions (package_id, ecosystem, name, number, is_latest, is_prerelease, licenses, last_synced_at) SELECT - UNNEST($(packageIds)::bigint[]), - UNNEST($(ecosystems)::text[]), - UNNEST($(names)::text[]), - UNNEST($(numbers)::text[]), - UNNEST($(isLatests)::bool[]), - UNNEST($(isPreleases)::bool[]), - UNNEST($(licenses)::text[]), + t.package_id, t.ecosystem, t.name, t.number, t.is_latest, t.is_prerelease, + CASE WHEN t.license IS NULL THEN NULL ELSE ARRAY[t.license] END, NOW() + FROM UNNEST( + $(packageIds)::bigint[], + $(ecosystems)::text[], + $(names)::text[], + $(numbers)::text[], + $(isLatests)::bool[], + $(isPreleases)::bool[], + $(licenses)::text[] + ) AS t(package_id, ecosystem, name, number, is_latest, is_prerelease, license) ON CONFLICT (package_id, number) DO UPDATE SET is_latest = EXCLUDED.is_latest, is_prerelease = EXCLUDED.is_prerelease, - license = COALESCE(EXCLUDED.license, versions.license), + licenses = COALESCE(EXCLUDED.licenses, versions.licenses), last_synced_at = NOW() - RETURNING number, is_latest, is_prerelease, license + RETURNING number, is_latest, is_prerelease, licenses ) SELECT array_remove(ARRAY[ CASE WHEN bool_or(o.number IS NULL) THEN 'versions.number' END, CASE WHEN bool_or(o.is_latest IS DISTINCT FROM ins.is_latest) THEN 'versions.is_latest' END, CASE WHEN bool_or(o.is_prerelease IS DISTINCT FROM ins.is_prerelease) THEN 'versions.is_prerelease' END, - CASE WHEN bool_or(o.license IS DISTINCT FROM ins.license) THEN 
'versions.license' END + CASE WHEN bool_or(o.licenses IS DISTINCT FROM ins.licenses) THEN 'versions.licenses' END ], NULL) AS changed_fields FROM ins LEFT JOIN old o ON o.number = ins.number - `, +`, { packageId: versions[0].packageId, packageIds: versions.map((v) => v.packageId), From 55d227da0aadaf8356787c239a1e40e641312185 Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Wed, 3 Jun 2026 22:13:09 +0200 Subject: [PATCH 11/22] fix: lint Signed-off-by: Umberto Sgueglia --- scripts/cli | 2 +- services/apps/packages_worker/package.json | 2 -- services/apps/packages_worker/src/maven/metadata.ts | 2 +- services/apps/packages_worker/src/maven/normalize.ts | 4 +++- services/apps/packages_worker/src/maven/schedule.ts | 2 +- .../apps/packages_worker/src/scripts/validateDataQuality.ts | 4 +++- services/libs/data-access-layer/src/osspckgs/types.ts | 2 +- 7 files changed, 10 insertions(+), 8 deletions(-) diff --git a/scripts/cli b/scripts/cli index 6e863b5c44..9f3ce75aac 100755 --- a/scripts/cli +++ b/scripts/cli @@ -1060,7 +1060,7 @@ while test $# -gt 0; do exit ;; clean-start-dev) - IGNORED_SERVICES=("python-worker" "job-generator" "discord-ws" "webhook-api" "profiles-worker" "organizations-enrichment-worker" "merge-suggestions-worker" "members-enrichment-worker" "exports-worker" "entity-merging-worker") + # IGNORED_SERVICES=("python-worker" "job-generator" "discord-ws" "webhook-api" "profiles-worker" "organizations-enrichment-worker" "merge-suggestions-worker" "members-enrichment-worker" "exports-worker" "entity-merging-worker") CLEAN_START=1 DEV=1 start diff --git a/services/apps/packages_worker/package.json b/services/apps/packages_worker/package.json index 2719ba7dcd..9b31642f68 100644 --- a/services/apps/packages_worker/package.json +++ b/services/apps/packages_worker/package.json @@ -5,11 +5,9 @@ "start:deps-dev-ingest": "CROWD_TEMPORAL_TASKQUEUE=deps-dev-ingest CROWD_TEMPORAL_NAMESPACE=$CROWD_PACKAGES_TEMPORAL_NAMESPACE SERVICE=deps-dev-ingest tsx 
src/bin/deps-dev-ingest.ts", "start:github-repos-enricher": "SERVICE=github-repos-enricher tsx src/bin/github-repos-enricher.ts", "start:packages-worker": "CROWD_TEMPORAL_TASKQUEUE=packages-worker CROWD_TEMPORAL_NAMESPACE=$CROWD_PACKAGES_TEMPORAL_NAMESPACE SERVICE=packages-worker tsx src/bin/packages-worker.ts", - "start:pom-fetcher": "SERVICE=pom-fetcher tsx src/bin/pom-fetcher.ts", "backfill:maven": "SERVICE=maven tsx src/bin/maven-backfill.ts", "dev:packages-worker": "CROWD_TEMPORAL_TASKQUEUE=packages-worker CROWD_TEMPORAL_NAMESPACE=$CROWD_PACKAGES_TEMPORAL_NAMESPACE SERVICE=packages-worker LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9233 src/bin/packages-worker.ts", "dev:deps-dev-ingest": "CROWD_TEMPORAL_TASKQUEUE=deps-dev-ingest CROWD_TEMPORAL_NAMESPACE=$CROWD_PACKAGES_TEMPORAL_NAMESPACE SERVICE=deps-dev-ingest nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9235 src/bin/deps-dev-ingest.ts", - "start:maven": "SERVICE=maven tsx src/bin/maven.ts", "dev:github-repos-enricher": "SERVICE=github-repos-enricher LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9234 src/bin/github-repos-enricher.ts", "dev:packages-worker:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && CROWD_TEMPORAL_TASKQUEUE=packages-worker CROWD_TEMPORAL_NAMESPACE=$CROWD_PACKAGES_TEMPORAL_NAMESPACE SERVICE=packages-worker LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9233 src/bin/packages-worker.ts", "dev:deps-dev-ingest:local": "set -a && . ../../../backend/.env.dist.local && . 
../../../backend/.env.override.local && set +a && CROWD_TEMPORAL_TASKQUEUE=deps-dev-ingest CROWD_TEMPORAL_NAMESPACE=$CROWD_PACKAGES_TEMPORAL_NAMESPACE SERVICE=deps-dev-ingest nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9235 src/bin/deps-dev-ingest.ts", diff --git a/services/apps/packages_worker/src/maven/metadata.ts b/services/apps/packages_worker/src/maven/metadata.ts index 18a7b317d4..680f38e5fe 100644 --- a/services/apps/packages_worker/src/maven/metadata.ts +++ b/services/apps/packages_worker/src/maven/metadata.ts @@ -80,7 +80,7 @@ export async function resolveVersionsList( await sleep(delay) continue } - if (status === 429 || status === 403) return { kind: 'RATE_LIMIT', status: status! } + if (status === 429 || status === 403) return { kind: 'RATE_LIMIT', status } } const message = err instanceof Error ? err.message : String(err) return { kind: 'TRANSIENT', message } diff --git a/services/apps/packages_worker/src/maven/normalize.ts b/services/apps/packages_worker/src/maven/normalize.ts index b19829e02f..00df7986aa 100644 --- a/services/apps/packages_worker/src/maven/normalize.ts +++ b/services/apps/packages_worker/src/maven/normalize.ts @@ -2,7 +2,9 @@ export function isPrerelease(version: string): boolean { return /-(SNAPSHOT|alpha|beta|rc|m\d+)/i.test(version) } -export function parseRepoUrl(url: string): { host: string; owner: string | null; name: string | null } | null { +export function parseRepoUrl( + url: string, +): { host: string; owner: string | null; name: string | null } | null { try { const parsed = new URL(url) const h = parsed.hostname.toLowerCase() diff --git a/services/apps/packages_worker/src/maven/schedule.ts b/services/apps/packages_worker/src/maven/schedule.ts index 26e9890f5b..c8c004842d 100644 --- a/services/apps/packages_worker/src/maven/schedule.ts +++ b/services/apps/packages_worker/src/maven/schedule.ts @@ -1,7 +1,7 @@ import { ScheduleAlreadyRunning, ScheduleOverlapPolicy } from '@temporalio/client' 
import { svc } from '../service' -import { mavenCriticalWorkflow, mavenNonCriticalWorkflow } from '../workflows' +import { mavenCriticalWorkflow } from '../workflows' export async function scheduleMavenCritical(): Promise { const { temporal } = svc diff --git a/services/apps/packages_worker/src/scripts/validateDataQuality.ts b/services/apps/packages_worker/src/scripts/validateDataQuality.ts index abec5301f7..123274c10d 100644 --- a/services/apps/packages_worker/src/scripts/validateDataQuality.ts +++ b/services/apps/packages_worker/src/scripts/validateDataQuality.ts @@ -65,7 +65,9 @@ const main = async () => { process.stdout.write(render(rows)) const failures = rows.filter((r) => r.status === 'FAIL') - const warnings = rows.filter((r) => r.status === 'WARN' || r.status === 'POOR' || r.status === 'LOW') + const warnings = rows.filter( + (r) => r.status === 'WARN' || r.status === 'POOR' || r.status === 'LOW', + ) if (failures.length > 0) { log.error( diff --git a/services/libs/data-access-layer/src/osspckgs/types.ts b/services/libs/data-access-layer/src/osspckgs/types.ts index c678738787..a6f4150ad7 100644 --- a/services/libs/data-access-layer/src/osspckgs/types.ts +++ b/services/libs/data-access-layer/src/osspckgs/types.ts @@ -1,7 +1,7 @@ // ─── packages_universe ──────────────────────────────────────────────────────── export interface IDbPackageUniverse { - id: number + id: string purl: string | null ecosystem: string namespace: string | null From 47aed0951f960f11788b3c68175c54067ea5c6d9 Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Wed, 3 Jun 2026 22:21:32 +0200 Subject: [PATCH 12/22] fix: lint Signed-off-by: Umberto Sgueglia --- .../apps/packages_worker/src/maven/README.md | 207 +++++++++--------- 1 file changed, 104 insertions(+), 103 deletions(-) diff --git a/services/apps/packages_worker/src/maven/README.md b/services/apps/packages_worker/src/maven/README.md index ac8671af4e..70855b6144 100644 --- a/services/apps/packages_worker/src/maven/README.md +++ 
b/services/apps/packages_worker/src/maven/README.md @@ -33,9 +33,9 @@ description, homepage, SCM/repo, licenses, maintainers and the full version list Whether the version short-circuit applies is fixed per **entry point** (not a runtime flag): -| Entry point | Mode | Behaviour | -|-------------|------|-----------| -| Standalone `bin/maven.ts` | **backfill** | Always runs full POM extraction for every selected critical package, regardless of version. Use for the initial fill / periodic full refresh. | +| Entry point | Mode | Behaviour | +| -------------------------------- | --------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Standalone `bin/maven.ts` | **backfill** | Always runs full POM extraction for every selected critical package, regardless of version. Use for the initial fill / periodic full refresh. | | Temporal `mavenCriticalWorkflow` | **incremental** | If the upstream release version equals the stored `latest_version`, skips the POM fetch and only bumps `last_synced_at` (status `unchanged`). Full extraction runs only for new packages or when the version changed. | This is passed as the `forceFullExtraction` argument to `processBatch` — `true` from the @@ -55,7 +55,7 @@ together. A module-level, coordinate-keyed in-process cache in `extract.ts` coll repeated parent fetches into a **single** HTTP request, and also removes the redundant second fetch of each artifact's own POM (`extractArtifact` fetches the leaf, then `resolveWithInheritance` would fetch it again at depth 0). 
This is the **single biggest lever -against Maven Central rate limiting** — and it works *because* of the namespace clustering, so +against Maven Central rate limiting** — and it works _because_ of the namespace clustering, so shuffling the batch (which the queue's `rank` ordering produces) would be counter-productive. - **Only successful fetches are cached.** `fetchPom` returns `null` for both a real 404 and a @@ -66,7 +66,7 @@ shuffling the batch (which the queue's `rank` ordering produces) would be counte - **Request coalescing.** Concurrent fetches for the same coordinates share a single in-flight request instead of issuing duplicates. - **Observability.** `getPomCacheStats()` returns `{ size, hits, coalesced, misses, evictions, - hitRate }`; the critical batch logs it once per batch under message **`POM cache`**, so you +hitRate }`; the critical batch logs it once per batch under message **`POM cache`**, so you can watch the hit rate climb as the cache warms. > The cache lives for the lifetime of the worker process. Under Temporal it persists **across @@ -85,71 +85,71 @@ all POM-derived columns stay null for them. 
### packages -| Column | Source | Coverage | -|--------|--------|----------| -| purl | packages_universe | ✅ all | -| ecosystem | hardcoded `'maven'` | ✅ all | -| namespace | packages_universe.namespace (= groupId) | ✅ all | -| name | packages_universe.name (= artifactId) | ✅ all | -| registry_url | `https://central.sonatype.com/artifact/{ns}/{name}` | ✅ all | -| latest_version | maven-metadata.xml `` | ✅ all | -| ingestion_source | see table below | ✅ all | -| last_synced_at | NOW() | ✅ all | -| description | POM `` | ✅ best-effort¹ | -| homepage | POM `` | ✅ best-effort¹ | -| declared_repository_url | POM `` raw | ✅ best-effort¹ | -| repository_url | normalized from declared_repository_url | ✅ best-effort¹ | -| licenses / licenses_raw | POM `` | ✅ best-effort¹ / ✅ full for critical² | -| status | Sonatype: deprecated flag | 🔜 Sonatype | -| versions_count | Sonatype: COUNT of releases | 🔜 Sonatype | -| first_release_at | Sonatype: MIN release timestamp | 🔜 Sonatype | -| latest_release_at | Sonatype: MAX release timestamp | 🔜 Sonatype | -| keywords | not in Maven POM | ❌ | -| dist_tags_* | N/A — Maven ecosystem | ❌ | -| dependent_packages_count | not in Maven registry API | ❌ | -| dependent_repos_count | not in Maven registry API | ❌ | -| criticality_score | set by ranking function | ❌ | -| is_critical | set by ranking function | ❌ | -| last_rank_pass_at | set by ranking function | ❌ | +| Column | Source | Coverage | +| ------------------------ | --------------------------------------------------- | --------------------------------------- | +| purl | packages_universe | ✅ all | +| ecosystem | hardcoded `'maven'` | ✅ all | +| namespace | packages_universe.namespace (= groupId) | ✅ all | +| name | packages_universe.name (= artifactId) | ✅ all | +| registry_url | `https://central.sonatype.com/artifact/{ns}/{name}` | ✅ all | +| latest_version | maven-metadata.xml `` | ✅ all | +| ingestion_source | see table below | ✅ all | +| last_synced_at | NOW() | ✅ all | +| 
description | POM `` | ✅ best-effort¹ | +| homepage | POM `` | ✅ best-effort¹ | +| declared_repository_url | POM `` raw | ✅ best-effort¹ | +| repository_url | normalized from declared_repository_url | ✅ best-effort¹ | +| licenses / licenses_raw | POM `` | ✅ best-effort¹ / ✅ full for critical² | +| status | Sonatype: deprecated flag | 🔜 Sonatype | +| versions_count | Sonatype: COUNT of releases | 🔜 Sonatype | +| first_release_at | Sonatype: MIN release timestamp | 🔜 Sonatype | +| latest_release_at | Sonatype: MAX release timestamp | 🔜 Sonatype | +| keywords | not in Maven POM | ❌ | +| dist\_tags\_\* | N/A — Maven ecosystem | ❌ | +| dependent_packages_count | not in Maven registry API | ❌ | +| dependent_repos_count | not in Maven registry API | ❌ | +| criticality_score | set by ranking function | ❌ | +| is_critical | set by ranking function | ❌ | +| last_rank_pass_at | set by ranking function | ❌ | ### versions -| Column | Source | Coverage | -|--------|--------|----------| -| package_id | FK from packages upsert | ✅ all | -| ecosystem | hardcoded `'maven'` | ✅ all | -| number | maven-metadata.xml `` | ✅ all | -| is_latest | `number === ` | ✅ all | -| is_prerelease | regex on version string³ | ✅ all | -| last_synced_at | NOW() | ✅ all | -| licenses | package-level license applied to all versions⁴ (stored as a single-element `text[]`) | ✅ best-effort¹ | -| published_at | Sonatype: release timestamp | 🔜 Sonatype | -| is_yanked | no yank mechanism in Maven | ❌ | -| download_count | no public per-version API | ❌ | +| Column | Source | Coverage | +| -------------- | ------------------------------------------------------------------------------------ | --------------- | +| package_id | FK from packages upsert | ✅ all | +| ecosystem | hardcoded `'maven'` | ✅ all | +| number | maven-metadata.xml `` | ✅ all | +| is_latest | `number === ` | ✅ all | +| is_prerelease | regex on version string³ | ✅ all | +| last_synced_at | NOW() | ✅ all | +| licenses | package-level license
applied to all versions⁴ (stored as a single-element `text[]`) | ✅ best-effort¹ | +| published_at | Sonatype: release timestamp | 🔜 Sonatype | +| is_yanked | no yank mechanism in Maven | ❌ | +| download_count | no public per-version API | ❌ | ### maintainers / package_maintainers -| Column | Source | Coverage | -|--------|--------|----------| -| ecosystem | hardcoded `'maven'` | ✅ all | -| username | POM `` | ✅ best-effort¹ | -| display_name | POM `` | ✅ best-effort¹ | -| email_hash | SHA-256(``) — GDPR | ✅ best-effort¹ | -| url | POM `` | ✅ best-effort¹ | -| role | `'author'` from ``, `'maintainer'` from `` | ✅ best-effort¹ | -| github_login | requires identity resolution | ❌ | +| Column | Source | Coverage | +| ------------ | -------------------------------------------------------------------- | --------------- | +| ecosystem | hardcoded `'maven'` | ✅ all | +| username | POM `` | ✅ best-effort¹ | +| display_name | POM `` | ✅ best-effort¹ | +| email_hash | SHA-256(``) — GDPR | ✅ best-effort¹ | +| url | POM `` | ✅ best-effort¹ | +| role | `'author'` from ``, `'maintainer'` from `` | ✅ best-effort¹ | +| github_login | requires identity resolution | ❌ | ### repos / package_repos -| Column | Source | Coverage | -|--------|--------|----------| -| repos.url | `repository_url` (normalized from POM ``) | ✅ best-effort¹ | -| repos.host | derived from URL (`github` / `gitlab` / `bitbucket` / `other`) | ✅ best-effort¹ | -| repos.owner | URL path segment | ✅ best-effort¹ | -| repos.name | URL path segment | ✅ best-effort¹ | -| repos.description / stars / forks / … | GitHub enricher | filled by github-repos-enricher | -| package_repos.source | `'declared'` (from POM ``) | ✅ best-effort¹ | -| package_repos.confidence | `0.80` | ✅ best-effort¹ | +| Column | Source | Coverage | +| ------------------------------------- | -------------------------------------------------------------- | ------------------------------- | +| repos.url | `repository_url` (normalized from POM ``) | ✅ 
best-effort¹ | +| repos.host | derived from URL (`github` / `gitlab` / `bitbucket` / `other`) | ✅ best-effort¹ | +| repos.owner | URL path segment | ✅ best-effort¹ | +| repos.name | URL path segment | ✅ best-effort¹ | +| repos.description / stars / forks / … | GitHub enricher | filled by github-repos-enricher | +| package_repos.source | `'declared'` (from POM ``) | ✅ best-effort¹ | +| package_repos.confidence | `0.80` | ✅ best-effort¹ | The POM fetcher seeds `repos` with URL-derivable fields only. The GitHub enricher then fills the rest (description, stars, forks, language, topics, etc.) because the repo row already exists. On conflict the `repos` upsert uses `COALESCE` — richer data from other enrichers is never overwritten. @@ -178,13 +178,13 @@ The POM fetcher seeds `repos` with URL-derivable fields only. The GitHub enriche ## `ingestion_source` Values -| Value | Meaning | -|-------|---------| -| `maven-registry` | Critical — full POM + parent resolution succeeded | -| `packages_universe` | Non-critical (DB-only) — only universe stats copied, no POM fetch | -| `maven_not_on_central` | `maven-metadata.xml` not found on `repo1.maven.org` — artifact is hosted on a third-party repository (e.g. WSO2 Nexus, JBoss, Atlassian). Universe data came from an aggregator (deps.dev, OSV). | -| `maven_no_version` | `maven-metadata.xml` found but `` is empty — artifact has no stable release | -| `maven_error` | `maven-metadata.xml` has a release version but the `.pom` file for that version is a 404. Typical cause: partial deploy to Maven Central (metadata updated, artifact not uploaded) or Eclipse P2 feature artifacts that don't publish a standard POM. 
| +| Value | Meaning | +| ---------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `maven-registry` | Critical — full POM + parent resolution succeeded | +| `packages_universe` | Non-critical (DB-only) — only universe stats copied, no POM fetch | +| `maven_not_on_central` | `maven-metadata.xml` not found on `repo1.maven.org` — artifact is hosted on a third-party repository (e.g. WSO2 Nexus, JBoss, Atlassian). Universe data came from an aggregator (deps.dev, OSV). | +| `maven_no_version` | `maven-metadata.xml` found but `` is empty — artifact has no stable release | +| `maven_error` | `maven-metadata.xml` has a release version but the `.pom` file for that version is a 404. Typical cause: partial deploy to Maven Central (metadata updated, artifact not uploaded) or Eclipse P2 feature artifacts that don't publish a standard POM. | > On a 403/429 rate-limit or a transient network error, **no sentinel record is written**: > the batch counts the package as an error and it is simply retried on the next tick/pass. @@ -236,14 +236,14 @@ Occasionally a publisher's CI/CD updates `` in `maven-metadata.xml` bef **All variables are required** — `getMavenConfig()` (`config.ts`) calls `requireEnv` for each, so the worker throws on startup if any is missing. Suggested values shown. 
-| Env var | Suggested | Description | -|---------|-----------|-------------| -| `POM_FETCHER_BATCH_SIZE` | `50` | Packages per batch — critical phase | -| `POM_FETCHER_CONCURRENCY` | `5` | Concurrent fetches — critical phase | -| `POM_FETCHER_NON_CRITICAL_BATCH_SIZE` | `500` | Packages per batch — non-critical phase | -| `POM_FETCHER_NON_CRITICAL_CONCURRENCY` | `20` | Concurrent writes — non-critical DB-only phase | -| `POM_FETCHER_REFRESH_DAYS` | `1` | Staleness window — re-sync a package once its `last_synced_at` is older than N days (applies to both phases) | -| `POM_FETCHER_GROUP_DELAY_MS` | `200`–`400` | Delay between concurrent groups in the critical phase (rate-limit mitigation) | +| Env var | Suggested | Description | +| -------------------------------------- | ----------- | ------------------------------------------------------------------------------------------------------------ | +| `POM_FETCHER_BATCH_SIZE` | `50` | Packages per batch — critical phase | +| `POM_FETCHER_CONCURRENCY` | `5` | Concurrent fetches — critical phase | +| `POM_FETCHER_NON_CRITICAL_BATCH_SIZE` | `500` | Packages per batch — non-critical phase | +| `POM_FETCHER_NON_CRITICAL_CONCURRENCY` | `20` | Concurrent writes — non-critical DB-only phase | +| `POM_FETCHER_REFRESH_DAYS` | `1` | Staleness window — re-sync a package once its `last_synced_at` is older than N days (applies to both phases) | +| `POM_FETCHER_GROUP_DELAY_MS` | `200`–`400` | Delay between concurrent groups in the critical phase (rate-limit mitigation) | ### Sync source (Temporal critical path) @@ -251,14 +251,14 @@ These select **where the critical sync gets its work from**. They affect only th `processMavenCriticalBatch` activity — the standalone backfill loop is unaffected. All are optional; unset/invalid values fall back to the current universe-polling behaviour. 
-| Env var | Default | Description | -|---------|---------|-------------| -| `MAVEN_SYNC_SOURCE` | `maven` | `maven` = poll `packages_universe` by staleness (current behaviour). `api` = enrich only what the delta feed reports. `both` = run both passes per tick. | -| `MAVEN_DELTA_API_URL` | — | Base URL of our delta feed (e.g. the Railway deployment). **Required** when source is `api` or `both`. | -| `MAVEN_DELTA_API_TOKEN` | — | Optional bearer token for the delta feed. | -| `MAVEN_DELTA_API_PAGE_SIZE` | `100` | Page size for `/api/changes` pagination. | -| `MAVEN_DELTA_API_LOOKBACK_MINUTES` | `15` | Rolling window size: each tick fetches `[now-N, now)`. Overlaps the cron interval on purpose — re-processing is safe (idempotent upserts). | -| `MAVEN_DELTA_API_INCLUDE_PRERELEASE` | `false` | Forwarded as `includePrerelease` to the feed. | +| Env var | Default | Description | +| ------------------------------------ | ------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `MAVEN_SYNC_SOURCE` | `maven` | `maven` = poll `packages_universe` by staleness (current behaviour). `api` = enrich only what the delta feed reports. `both` = run both passes per tick. | +| `MAVEN_DELTA_API_URL` | — | Base URL of our delta feed (e.g. the Railway deployment). **Required** when source is `api` or `both`. | +| `MAVEN_DELTA_API_TOKEN` | — | Optional bearer token for the delta feed. | +| `MAVEN_DELTA_API_PAGE_SIZE` | `100` | Page size for `/api/changes` pagination. | +| `MAVEN_DELTA_API_LOOKBACK_MINUTES` | `15` | Rolling window size: each tick fetches `[now-N, now)`. Overlaps the cron interval on purpose — re-processing is safe (idempotent upserts). | +| `MAVEN_DELTA_API_INCLUDE_PRERELEASE` | `false` | Forwarded as `includePrerelease` to the feed. 
| The delta-API path always runs **full extraction** (the feed is an explicit "this changed" signal) and only enriches packages that are `is_critical` in `packages_universe`; non-critical @@ -272,18 +272,18 @@ purls in the feed are dropped. Observed on ~2K packages (local dev, Maven Central over the network): -| Phase | Mode | Throughput | Notes | -|-------|------|------------|-------| -| Non-critical | DB-only | ~1000 pkg/sec | Pure DB writes, no HTTP | -| Non-critical | direct-pom | ~25 pkg/sec | 2 HTTP requests/pkg: metadata.xml + POM | -| Critical | full-pom | ~15–25 pkg/sec | Faster when packages share parent POMs (CDN cache warm) | +| Phase | Mode | Throughput | Notes | +| ------------ | ---------- | -------------- | ------------------------------------------------------- | +| Non-critical | DB-only | ~1000 pkg/sec | Pure DB writes, no HTTP | +| Non-critical | direct-pom | ~25 pkg/sec | 2 HTTP requests/pkg: metadata.xml + POM | +| Critical | full-pom | ~15–25 pkg/sec | Faster when packages share parent POMs (CDN cache warm) | **Estimated time for ~800K packages (≈18% critical):** -| Phase | Packages | Estimated time | -|-------|----------|---------------| -| Non-critical (DB-only) | ~670K | ~12 min | -| Critical (full POM, first extraction) | ~150K | several hours | +| Phase | Packages | Estimated time | +| ------------------------------------- | -------- | -------------- | +| Non-critical (DB-only) | ~670K | ~12 min | +| Critical (full POM, first extraction) | ~150K | several hours | The first critical extraction is the expensive part — run it with the standalone backfill loop. Afterwards the Temporal schedules keep things incremental: non-critical re-syncs cheaply @@ -303,10 +303,10 @@ avoid Maven Central throttling) or trigger the schedule manually. 
Two Temporal schedules are registered on startup of `bin/packages-worker.ts` (see `maven/schedule.ts`): -| Schedule ID | Cron | Workflow | Activity | Workflow timeout | -|-------------|------|----------|----------|------------------| -| `maven-critical` | `*/5 * * * *` (every 5 min) | `mavenCriticalWorkflow` | `processMavenCriticalBatch` → one critical batch | 15 min | -| `maven-non-critical` | `*/10 * * * *` (every 10 min) | `mavenNonCriticalWorkflow` | `processMavenNonCriticalBatch` → one non-critical batch | 5 min | +| Schedule ID | Cron | Workflow | Activity | Workflow timeout | +| -------------------- | ----------------------------- | -------------------------- | ------------------------------------------------------- | ---------------- | +| `maven-critical` | `*/5 * * * *` (every 5 min) | `mavenCriticalWorkflow` | `processMavenCriticalBatch` → one critical batch | 15 min | +| `maven-non-critical` | `*/10 * * * *` (every 10 min) | `mavenNonCriticalWorkflow` | `processMavenNonCriticalBatch` → one non-critical batch | 5 min | Both: overlap policy `SKIP` (a tick is dropped if the previous run is still active), catchup window 1 hour, retry 3× (30s initial, 2× backoff). @@ -330,12 +330,12 @@ temporal schedule trigger --schedule-id maven-critical Maven packages released via automated CI/CD pipelines (every commit or every day) accumulate thousands of versions on Central. Observed examples on a 10K sample: -| Package | Versions | -|---------|----------| -| io.joern/x2cpg_3 | ~2 166 | -| org.cdk8s/cdk8s | ~1 749 | -| io.joern/semanticcpg_3 | ~2 077 | -| org.janusgraph/* (×15 artifacts) | ~795 each | +| Package | Versions | +| --------------------------------- | --------- | +| io.joern/x2cpg_3 | ~2 166 | +| org.cdk8s/cdk8s | ~1 749 | +| io.joern/semanticcpg_3 | ~2 077 | +| org.janusgraph/\* (×15 artifacts) | ~795 each | `maven-metadata.xml` `` lists **every version ever published**, including each snapshot, alpha, RC, and automated patch. 
On a 10K package run this produced ~3.8M rows in the `versions` table (~1 375 versions/package on average). @@ -369,6 +369,7 @@ ORDER BY packages DESC; ``` Expected behaviour: + - `maven` (critical, full resolution) → high repo coverage - `maven_direct` (non-critical, no parent resolution) → low repo coverage - `maven_not_on_central` / `maven_error` → no repo (no POM data) From e23d9d65190d2f4e352935dee048d1e63b8f7504 Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Wed, 3 Jun 2026 22:31:15 +0200 Subject: [PATCH 13/22] fix: dependet column Signed-off-by: Umberto Sgueglia --- .../libs/data-access-layer/src/osspckgs/packages.ts | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/services/libs/data-access-layer/src/osspckgs/packages.ts b/services/libs/data-access-layer/src/osspckgs/packages.ts index 33f04edba6..1878255954 100644 --- a/services/libs/data-access-layer/src/osspckgs/packages.ts +++ b/services/libs/data-access-layer/src/osspckgs/packages.ts @@ -72,7 +72,7 @@ export async function listMavenPackagesToSync( p.namespace, p.name, p.criticality_score AS "criticalityScore", - p.dependent_packages_count AS "dependentPackagesCount", + p.dependent_count AS "dependentPackagesCount", p.dependent_repos_count AS "dependentReposCount", p.downloads_last_month AS "downloads30d", p.latest_version AS "latestVersion" @@ -103,7 +103,7 @@ export async function listMavenPackagesToSync( pu.namespace, pu.name, pu.criticality_score AS "criticalityScore", - pu.dependent_packages_count AS "dependentPackagesCount", + pu.dependent_count AS "dependentPackagesCount", pu.dependent_repos_count AS "dependentReposCount", pu.downloads_30d AS "downloads30d", p.latest_version AS "latestVersion" @@ -147,7 +147,7 @@ export async function listMavenPackagesByPurls( p.namespace, p.name, p.criticality_score AS "criticalityScore", - p.dependent_packages_count AS "dependentPackagesCount", + p.dependent_count AS "dependentPackagesCount", p.dependent_repos_count AS 
"dependentReposCount", p.downloads_last_month AS "downloads30d", p.latest_version AS "latestVersion" @@ -186,7 +186,7 @@ export async function touchPackageSyncedAt( UPDATE packages SET last_synced_at = NOW(), criticality_score = COALESCE($(criticalityScore), criticality_score), - dependent_packages_count = COALESCE($(dependentPackagesCount), dependent_packages_count), + dependent_count = COALESCE($(dependentPackagesCount), dependent_count), dependent_repos_count = COALESCE($(dependentReposCount), dependent_repos_count), downloads_last_month = COALESCE($(downloadsLastMonth), downloads_last_month) WHERE purl = $(purl) @@ -238,7 +238,7 @@ export async function upsertPackage( purl, ecosystem, namespace, name, description, homepage, registry_url, declared_repository_url, repository_url, licenses, licenses_raw, latest_version, - criticality_score, dependent_packages_count, dependent_repos_count, downloads_last_month, + criticality_score, dependent_count, dependent_repos_count, downloads_last_month, ingestion_source, last_synced_at ) VALUES ( $(purl), $(ecosystem), $(namespace), $(name), @@ -257,7 +257,7 @@ export async function upsertPackage( licenses_raw = COALESCE(EXCLUDED.licenses_raw, packages.licenses_raw), latest_version = COALESCE(EXCLUDED.latest_version, packages.latest_version), criticality_score = COALESCE(EXCLUDED.criticality_score, packages.criticality_score), - dependent_packages_count = COALESCE(EXCLUDED.dependent_packages_count, packages.dependent_packages_count), + dependent_count = COALESCE(EXCLUDED.dependent_count, packages.dependent_count), dependent_repos_count = COALESCE(EXCLUDED.dependent_repos_count, packages.dependent_repos_count), downloads_last_month = COALESCE(EXCLUDED.downloads_last_month, packages.downloads_last_month), ingestion_source = EXCLUDED.ingestion_source, From 5db5779f0fbb1940514868c2cc1ac1e8476c44d0 Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Wed, 3 Jun 2026 22:40:30 +0200 Subject: [PATCH 14/22] fix: dependet column 
Signed-off-by: Umberto Sgueglia --- services/libs/data-access-layer/src/osspckgs/packages.ts | 2 ++ 1 file changed, 2 insertions(+) diff --git a/services/libs/data-access-layer/src/osspckgs/packages.ts b/services/libs/data-access-layer/src/osspckgs/packages.ts index 1878255954..8fa24572ab 100644 --- a/services/libs/data-access-layer/src/osspckgs/packages.ts +++ b/services/libs/data-access-layer/src/osspckgs/packages.ts @@ -79,6 +79,7 @@ export async function listMavenPackagesToSync( FROM packages p WHERE p.ecosystem = 'maven' + AND p.is_critical AND p.namespace IS NOT NULL AND ( p.ingestion_source IS NULL @@ -154,6 +155,7 @@ export async function listMavenPackagesByPurls( FROM packages p WHERE p.ecosystem = 'maven' + AND p.is_critical AND p.namespace IS NOT NULL AND p.purl = ANY($(purls)) ORDER BY From f21f5a43170f3b4bdb903a0d59437a85456e3139 Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Thu, 4 Jun 2026 09:14:34 +0200 Subject: [PATCH 15/22] fix: cache logic Signed-off-by: Umberto Sgueglia --- .../apps/packages_worker/src/maven/README.md | 16 +++++++++------- .../src/maven/runMavenEnrichmentLoop.ts | 13 +++++++++++++ .../libs/data-access-layer/src/osspckgs/types.ts | 1 + .../data-access-layer/src/osspckgs/versions.ts | 9 ++++++--- 4 files changed, 29 insertions(+), 10 deletions(-) diff --git a/services/apps/packages_worker/src/maven/README.md b/services/apps/packages_worker/src/maven/README.md index 70855b6144..4fb2f4a0b5 100644 --- a/services/apps/packages_worker/src/maven/README.md +++ b/services/apps/packages_worker/src/maven/README.md @@ -50,13 +50,15 @@ critical packages, where data quality matters. Parent POMs are shared across many artifacts of the same namespace (`org.apache:apache`, `org.springframework.boot:spring-boot-starter-parent`, `com.google.cloud:google-cloud-shared-config`, …). -Because the queue is ordered by `rank_in_ecosystem`, those siblings are processed close -together. 
A module-level, coordinate-keyed in-process cache in `extract.ts` collapses the -repeated parent fetches into a **single** HTTP request, and also removes the redundant second -fetch of each artifact's own POM (`extractArtifact` fetches the leaf, then -`resolveWithInheritance` would fetch it again at depth 0). This is the **single biggest lever -against Maven Central rate limiting** — and it works _because_ of the namespace clustering, so -shuffling the batch (which the queue's `rank` ordering produces) would be counter-productive. +The batch is **selected** by criticality (`rank_in_ecosystem` / `criticality_score` via the SQL +`LIMIT`), but that ordering does **not** group same-namespace siblings — so before processing, +`processPackages` re-sorts each critical batch by `(namespace, name)`. That sort is what puts the +siblings adjacent, so a parent fetched for one is still cached when the next arrives. A +module-level, coordinate-keyed in-process cache in `extract.ts` then collapses the repeated parent +fetches into a **single** HTTP request, and also removes the redundant second fetch of each +artifact's own POM (`extractArtifact` fetches the leaf, then `resolveWithInheritance` would fetch +it again at depth 0). This is the **single biggest lever against Maven Central rate limiting** — and +it works _because_ of the namespace sort, so re-shuffling the batch would be counter-productive. 
- **Only successful fetches are cached.** `fetchPom` returns `null` for both a real 404 and a transient failure (throttle/timeout), so caching `null` would poison the cache — it is never diff --git a/services/apps/packages_worker/src/maven/runMavenEnrichmentLoop.ts b/services/apps/packages_worker/src/maven/runMavenEnrichmentLoop.ts index 84f9961b71..f48d7464ff 100644 --- a/services/apps/packages_worker/src/maven/runMavenEnrichmentLoop.ts +++ b/services/apps/packages_worker/src/maven/runMavenEnrichmentLoop.ts @@ -283,6 +283,7 @@ async function processCriticalPackage( allVersions.map((v) => ({ packageId, ecosystem: 'maven', + namespace: groupId, name: artifactId, number: v, isLatest: v === metadata.releaseVersion, @@ -378,6 +379,18 @@ async function processPackages( if (packages.length === 0) return { processed: 0, skipped: 0, error: 0, unchanged: 0, hopLimitReached: 0 } + // Cluster the batch by namespace so artifacts sharing a parent POM are processed + // adjacently — this is what makes the parent-POM cache effective. The criticality + // ordering only decides *which* packages are in the batch (via the SQL LIMIT); + // it does not group same-namespace siblings, so we reorder here. Only matters on + // the critical path (the non-critical path issues no POM/parent HTTP). + if (isCritical) { + packages.sort( + (a, b) => + (a.namespace ?? '').localeCompare(b.namespace ?? 
'') || a.name.localeCompare(b.name), + ) + } + log.info({ count: packages.length, isCritical }, 'Batch started') const counts = { processed: 0, skipped: 0, error: 0, unchanged: 0, hopLimitReached: 0 } diff --git a/services/libs/data-access-layer/src/osspckgs/types.ts b/services/libs/data-access-layer/src/osspckgs/types.ts index a6f4150ad7..a1b6f1bc6d 100644 --- a/services/libs/data-access-layer/src/osspckgs/types.ts +++ b/services/libs/data-access-layer/src/osspckgs/types.ts @@ -59,6 +59,7 @@ export type IDbPackageMaintainerUpsert = { export type IDbVersionUpsert = { packageId: number ecosystem: string + namespace: string | null name: string number: string isLatest: boolean diff --git a/services/libs/data-access-layer/src/osspckgs/versions.ts b/services/libs/data-access-layer/src/osspckgs/versions.ts index c1a621a6cb..cdbaa9c40d 100644 --- a/services/libs/data-access-layer/src/osspckgs/versions.ts +++ b/services/libs/data-access-layer/src/osspckgs/versions.ts @@ -35,21 +35,23 @@ export async function upsertVersionsBatch( WHERE package_id = $(packageId)::bigint AND number = ANY($(numbers)::text[]) ), ins AS ( - INSERT INTO versions (package_id, ecosystem, name, number, is_latest, is_prerelease, licenses, last_synced_at) + INSERT INTO versions (package_id, ecosystem, namespace, name, number, is_latest, is_prerelease, licenses, last_synced_at) SELECT - t.package_id, t.ecosystem, t.name, t.number, t.is_latest, t.is_prerelease, + t.package_id, t.ecosystem, t.namespace, t.name, t.number, t.is_latest, t.is_prerelease, CASE WHEN t.license IS NULL THEN NULL ELSE ARRAY[t.license] END, NOW() FROM UNNEST( $(packageIds)::bigint[], $(ecosystems)::text[], + $(namespaces)::text[], $(names)::text[], $(numbers)::text[], $(isLatests)::bool[], $(isPreleases)::bool[], $(licenses)::text[] - ) AS t(package_id, ecosystem, name, number, is_latest, is_prerelease, license) + ) AS t(package_id, ecosystem, namespace, name, number, is_latest, is_prerelease, license) ON CONFLICT (package_id, 
number) DO UPDATE SET + namespace = COALESCE(EXCLUDED.namespace, versions.namespace), is_latest = EXCLUDED.is_latest, is_prerelease = EXCLUDED.is_prerelease, licenses = COALESCE(EXCLUDED.licenses, versions.licenses), @@ -68,6 +70,7 @@ export async function upsertVersionsBatch( packageId: versions[0].packageId, packageIds: versions.map((v) => v.packageId), ecosystems: versions.map((v) => v.ecosystem), + namespaces: versions.map((v) => v.namespace), names: versions.map((v) => v.name), numbers: versions.map((v) => v.number), isLatests: versions.map((v) => v.isLatest), From 7242d8272d48414b63cf3ab65d3dd3cdf709c5b5 Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Thu, 4 Jun 2026 09:55:35 +0200 Subject: [PATCH 16/22] feat: deployment Signed-off-by: Umberto Sgueglia --- backend/.env.dist.local | 2 - scripts/builders/packages-worker.env | 4 + scripts/services/maven-worker.yaml | 67 +++++++ services/apps/packages_worker/package.json | 9 +- .../packages_worker/src/bin/maven-worker.ts | 12 ++ .../src/maven/data_quality.sql | 166 ++++++++++++++++++ 6 files changed, 253 insertions(+), 7 deletions(-) create mode 100644 scripts/builders/packages-worker.env create mode 100644 scripts/services/maven-worker.yaml create mode 100644 services/apps/packages_worker/src/bin/maven-worker.ts create mode 100644 services/apps/packages_worker/src/maven/data_quality.sql diff --git a/backend/.env.dist.local b/backend/.env.dist.local index fe988f7c08..83a2e4ecf9 100755 --- a/backend/.env.dist.local +++ b/backend/.env.dist.local @@ -207,8 +207,6 @@ POM_FETCHER_NON_CRITICAL_BATCH_SIZE=500 POM_FETCHER_NON_CRITICAL_CONCURRENCY=20 POM_FETCHER_REFRESH_DAYS=1 POM_FETCHER_GROUP_DELAY_MS=100 -# Set to 'true' on first run against a fresh/restored DB to skip the version-unchanged -# optimisation and force full POM extraction. Set to 'false' after the first pass. 
POM_FETCHER_FORCE_FULL_EXTRACTION=true POM_FETCHER_MAVEN_BASE_URL=https://maven-central.storage-download.googleapis.com/maven2 MAVEN_SYNC_SOURCE=both diff --git a/scripts/builders/packages-worker.env b/scripts/builders/packages-worker.env new file mode 100644 index 0000000000..9c8d44032c --- /dev/null +++ b/scripts/builders/packages-worker.env @@ -0,0 +1,4 @@ +DOCKERFILE="./services/docker/Dockerfile.packages-worker" +CONTEXT="../" +REPO="sjc.ocir.io/axbydjxa5zuh/packages-worker" +SERVICES="maven-worker" diff --git a/scripts/services/maven-worker.yaml b/scripts/services/maven-worker.yaml new file mode 100644 index 0000000000..c0142c22fd --- /dev/null +++ b/scripts/services/maven-worker.yaml @@ -0,0 +1,67 @@ +version: '3.1' + +x-env-args: &env-args + DOCKER_BUILDKIT: 1 + NODE_ENV: docker + SERVICE: maven-worker + CROWD_TEMPORAL_TASKQUEUE: packages-worker + CROWD_TEMPORAL_NAMESPACE: ${CROWD_PACKAGES_TEMPORAL_NAMESPACE} + SHELL: /bin/sh + SUPPRESS_NO_CONFIG_WARNING: 'true' + +services: + maven-worker: + build: + context: ../../ + dockerfile: ./scripts/services/docker/Dockerfile.packages-worker + command: 'pnpm run start:maven-worker' + working_dir: /usr/crowd/app/services/apps/packages_worker + env_file: + - ../../backend/.env.dist.local + - ../../backend/.env.dist.composed + - ../../backend/.env.override.local + - ../../backend/.env.override.composed + environment: + <<: *env-args + restart: always + networks: + - crowd-bridge + + maven-worker-dev: + build: + context: ../../ + dockerfile: ./scripts/services/docker/Dockerfile.packages-worker + command: 'pnpm run dev:maven-worker' + working_dir: /usr/crowd/app/services/apps/packages_worker + # user: '${USER_ID}:${GROUP_ID}' + env_file: + - ../../backend/.env.dist.local + - ../../backend/.env.dist.composed + - ../../backend/.env.override.local + - ../../backend/.env.override.composed + environment: + <<: *env-args + hostname: maven-worker + networks: + - crowd-bridge + volumes: + - 
../../services/libs/audit-logs/src:/usr/crowd/app/services/libs/audit-logs/src + - ../../services/libs/common/src:/usr/crowd/app/services/libs/common/src + - ../../services/libs/common_services/src:/usr/crowd/app/services/libs/common_services/src + - ../../services/libs/data-access-layer/src:/usr/crowd/app/services/libs/data-access-layer/src + - ../../services/libs/database/src:/usr/crowd/app/services/libs/database/src + - ../../services/libs/integrations/src:/usr/crowd/app/services/libs/integrations/src + - ../../services/libs/logging/src:/usr/crowd/app/services/libs/logging/src + - ../../services/libs/nango/src:/usr/crowd/app/services/libs/nango/src + - ../../services/libs/opensearch/src:/usr/crowd/app/services/libs/opensearch/src + - ../../services/libs/queue/src:/usr/crowd/app/services/libs/queue/src + - ../../services/libs/redis/src:/usr/crowd/app/services/libs/redis/src + - ../../services/libs/snowflake/src:/usr/crowd/app/services/libs/snowflake/src + - ../../services/libs/telemetry/src:/usr/crowd/app/services/libs/telemetry/src + - ../../services/libs/temporal/src:/usr/crowd/app/services/libs/temporal/src + - ../../services/libs/types/src:/usr/crowd/app/services/libs/types/src + - ../../services/apps/packages_worker/src:/usr/crowd/app/services/apps/packages_worker/src + +networks: + crowd-bridge: + external: true diff --git a/services/apps/packages_worker/package.json b/services/apps/packages_worker/package.json index 9b31642f68..7d5ba86dd0 100644 --- a/services/apps/packages_worker/package.json +++ b/services/apps/packages_worker/package.json @@ -5,21 +5,20 @@ "start:deps-dev-ingest": "CROWD_TEMPORAL_TASKQUEUE=deps-dev-ingest CROWD_TEMPORAL_NAMESPACE=$CROWD_PACKAGES_TEMPORAL_NAMESPACE SERVICE=deps-dev-ingest tsx src/bin/deps-dev-ingest.ts", "start:github-repos-enricher": "SERVICE=github-repos-enricher tsx src/bin/github-repos-enricher.ts", "start:packages-worker": "CROWD_TEMPORAL_TASKQUEUE=packages-worker 
CROWD_TEMPORAL_NAMESPACE=$CROWD_PACKAGES_TEMPORAL_NAMESPACE SERVICE=packages-worker tsx src/bin/packages-worker.ts", + "start:maven-worker": "CROWD_TEMPORAL_TASKQUEUE=packages-worker CROWD_TEMPORAL_NAMESPACE=$CROWD_PACKAGES_TEMPORAL_NAMESPACE SERVICE=maven-worker tsx src/bin/maven-worker.ts", + "backfill:maven:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && SERVICE=maven LOG_LEVEL=info tsx src/bin/maven-backfill.ts", "backfill:maven": "SERVICE=maven tsx src/bin/maven-backfill.ts", "dev:packages-worker": "CROWD_TEMPORAL_TASKQUEUE=packages-worker CROWD_TEMPORAL_NAMESPACE=$CROWD_PACKAGES_TEMPORAL_NAMESPACE SERVICE=packages-worker LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9233 src/bin/packages-worker.ts", + "dev:maven-worker": "CROWD_TEMPORAL_TASKQUEUE=packages-worker CROWD_TEMPORAL_NAMESPACE=$CROWD_PACKAGES_TEMPORAL_NAMESPACE SERVICE=maven-worker LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9236 src/bin/maven-worker.ts", "dev:deps-dev-ingest": "CROWD_TEMPORAL_TASKQUEUE=deps-dev-ingest CROWD_TEMPORAL_NAMESPACE=$CROWD_PACKAGES_TEMPORAL_NAMESPACE SERVICE=deps-dev-ingest nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9235 src/bin/deps-dev-ingest.ts", "dev:github-repos-enricher": "SERVICE=github-repos-enricher LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9234 src/bin/github-repos-enricher.ts", "dev:packages-worker:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && CROWD_TEMPORAL_TASKQUEUE=packages-worker CROWD_TEMPORAL_NAMESPACE=$CROWD_PACKAGES_TEMPORAL_NAMESPACE SERVICE=packages-worker LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9233 src/bin/packages-worker.ts", + "dev:maven-worker:local": "set -a && . 
../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && CROWD_TEMPORAL_TASKQUEUE=packages-worker CROWD_TEMPORAL_NAMESPACE=$CROWD_PACKAGES_TEMPORAL_NAMESPACE SERVICE=maven-worker LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9236 src/bin/maven-worker.ts", "dev:deps-dev-ingest:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && CROWD_TEMPORAL_TASKQUEUE=deps-dev-ingest CROWD_TEMPORAL_NAMESPACE=$CROWD_PACKAGES_TEMPORAL_NAMESPACE SERVICE=deps-dev-ingest nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9235 src/bin/deps-dev-ingest.ts", "dev:github-repos-enricher:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && SERVICE=github-repos-enricher LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9234 src/bin/github-repos-enricher.ts", "export-to-bucket": "SERVICE=deps-dev-ingest tsx src/scripts/exportToBucket.ts", "export-to-bucket:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && SERVICE=deps-dev-ingest tsx src/scripts/exportToBucket.ts", "monitor:osspckgs:local": "bash -c 'set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && node ../../../scripts/monitor-osspckgs.mjs'", - "backfill:maven:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && SERVICE=maven LOG_LEVEL=info tsx src/bin/maven-backfill.ts", - "benchmark:maven-delta": "SERVICE=maven tsx src/scripts/benchmarkMavenDelta.ts", - "benchmark:maven-delta:local": "set -a && . ../../../backend/.env.dist.local && . 
../../../backend/.env.override.local && set +a && SERVICE=maven LOG_LEVEL=info tsx src/scripts/benchmarkMavenDelta.ts", - "validate:maven-quality": "SERVICE=maven tsx src/scripts/validateDataQuality.ts", - "validate:maven-quality:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && SERVICE=maven LOG_LEVEL=info tsx src/scripts/validateDataQuality.ts", "lint": "npx eslint --ext .ts src --max-warnings=0", "format": "npx prettier --write \"src/**/*.ts\"", "format-check": "npx prettier --check .", diff --git a/services/apps/packages_worker/src/bin/maven-worker.ts b/services/apps/packages_worker/src/bin/maven-worker.ts new file mode 100644 index 0000000000..54a4e7c672 --- /dev/null +++ b/services/apps/packages_worker/src/bin/maven-worker.ts @@ -0,0 +1,12 @@ +import { scheduleMavenCritical } from '../maven/schedule' +import { svc } from '../service' + +// Maven-only worker: runs on the shared `packages-worker` taskqueue (so it picks up +// the same bundled workflows/activities) but registers ONLY the maven-critical +// schedule. Intended for local dev — lets you run Maven in isolation without also +// firing the npm/osv schedules that bin/packages-worker.ts registers. +setImmediate(async () => { + await svc.init() + await scheduleMavenCritical() + await svc.start() +}) diff --git a/services/apps/packages_worker/src/maven/data_quality.sql b/services/apps/packages_worker/src/maven/data_quality.sql new file mode 100644 index 0000000000..b3b4ca99af --- /dev/null +++ b/services/apps/packages_worker/src/maven/data_quality.sql @@ -0,0 +1,166 @@ +-- ──────────────────────────────────────────────────────────────────────────── +-- Maven enrichment — data quality scorecard +-- +-- One read-only statement. Returns one row per check so it works in any SQL +-- client (psql, DBeaver, prod read-replica) and via validateDataQuality.ts. +-- +-- Scope: the CRITICAL Maven set (packages_universe.is_critical = true) — i.e. 
the +-- set the POM fetcher is responsible for. Coverage % is over the whole critical +-- universe, so packages not yet enriched count as "not covered". +-- +-- Columns: +-- section | metric | value | pct (of critical) | status +-- status: OK / LOW / POOR for coverage; OK / FAIL for anomalies & integrity. +-- Coverage thresholds (80% / 50%) are deliberately simple — tune as needed. +-- ──────────────────────────────────────────────────────────────────────────── + +WITH crit AS ( + -- denominator: the critical universe + SELECT pu.purl + FROM packages_universe pu + WHERE pu.ecosystem = 'maven' + AND pu.is_critical = true + AND pu.purl IS NOT NULL +), +crit_n AS (SELECT count(*)::numeric AS n FROM crit), + +-- per-package version aggregates (only for packages we care about) +ver AS ( + SELECT + v.package_id, + count(*) AS versions_count, + count(*) FILTER (WHERE v.is_latest) AS latest_count, + count(*) FILTER (WHERE v.is_latest AND v.is_prerelease) AS latest_prerelease_count, + max(v.number) FILTER (WHERE v.is_latest) AS latest_number + FROM versions v + GROUP BY v.package_id +), + +-- enriched critical packages with derived quality flags +pkg AS ( + SELECT + pk.id, + pk.ingestion_source, + pk.repository_url, + pk.latest_version, + pk.last_synced_at, + (pk.repository_url IS NOT NULL) AS has_repo, + (pk.licenses IS NOT NULL AND array_length(pk.licenses, 1) >= 1) AS has_license, + (pk.description IS NOT NULL AND length(btrim(pk.description)) > 0) AS has_description, + (pk.homepage IS NOT NULL) AS has_homepage, + COALESCE(ver.versions_count, 0) AS versions_count, + COALESCE(ver.latest_count, 0) AS latest_count, + COALESCE(ver.latest_prerelease_count, 0) AS latest_prerelease_count, + ver.latest_number, + (pm.package_id IS NOT NULL) AS has_maintainer + FROM packages pk + JOIN crit c ON c.purl = pk.purl + LEFT JOIN ver ON ver.package_id = pk.id + LEFT JOIN (SELECT DISTINCT package_id FROM package_maintainers) pm ON pm.package_id = pk.id +), + +-- all critical-scoped counts 
in a single row +agg AS ( + SELECT + (SELECT n FROM crit_n) AS critical_total, + count(*) AS enriched, + count(*) FILTER (WHERE ingestion_source = 'maven-registry') AS src_registry, + count(*) FILTER (WHERE ingestion_source = 'maven_not_on_central') AS src_not_central, + count(*) FILTER (WHERE ingestion_source = 'maven_no_version') AS src_no_version, + count(*) FILTER (WHERE ingestion_source = 'maven_error') AS src_error, + count(*) FILTER (WHERE ingestion_source = 'packages_universe') AS src_universe, + count(*) FILTER (WHERE has_repo) AS has_repo, + count(*) FILTER (WHERE has_license) AS has_license, + count(*) FILTER (WHERE versions_count > 0) AS has_versions, + count(*) FILTER (WHERE has_maintainer) AS has_maintainer, + count(*) FILTER (WHERE has_description) AS has_description, + count(*) FILTER (WHERE has_homepage) AS has_homepage, + count(*) FILTER (WHERE latest_count > 1) AS multi_latest, + count(*) FILTER (WHERE versions_count > 0 AND latest_count = 0) AS no_latest, + count(*) FILTER (WHERE latest_count = 1 + AND latest_number IS DISTINCT FROM latest_version) AS latest_mismatch, + count(*) FILTER (WHERE latest_prerelease_count > 0) AS prerelease_latest, + count(*) FILTER (WHERE has_repo AND repository_url !~ '^https?://') AS repo_not_http, + count(*) FILTER (WHERE last_synced_at >= now() - interval '24 hours') AS synced_24h, + count(*) FILTER (WHERE last_synced_at < now() - interval '24 hours' + AND last_synced_at >= now() - interval '7 days') AS synced_week, + count(*) FILTER (WHERE last_synced_at < now() - interval '7 days') AS synced_old + FROM pkg +), + +-- global integrity (not limited to the critical set) +integ AS ( + SELECT + (SELECT count(*) FROM (SELECT purl FROM packages GROUP BY purl HAVING count(*) > 1) d) AS dup_purl, + (SELECT count(*) FROM versions v LEFT JOIN packages p ON p.id = v.package_id WHERE p.id IS NULL) AS orphan_versions, + (SELECT count(*) FROM package_repos pr LEFT JOIN packages p ON p.id = pr.package_id WHERE p.id IS NULL) AS 
orphan_repos, + (SELECT count(*) FROM package_maintainers pm LEFT JOIN packages p ON p.id = pm.package_id WHERE p.id IS NULL) AS orphan_maintainers, + (SELECT count(*) FROM maintainers WHERE username IS NULL OR btrim(username) = '') AS maintainer_no_username +), + +-- coverage % + status helper +cov AS ( + SELECT + metric, value, + round(100.0 * value / nullif((SELECT critical_total FROM agg), 0), 1) AS pct + FROM (VALUES + ('enriched (row exists)', (SELECT enriched FROM agg)), + ('ingestion=maven-registry', (SELECT src_registry FROM agg)), + ('has repository_url', (SELECT has_repo FROM agg)), + ('has license', (SELECT has_license FROM agg)), + ('has versions', (SELECT has_versions FROM agg)), + ('has maintainer', (SELECT has_maintainer FROM agg)), + ('has description', (SELECT has_description FROM agg)), + ('has homepage', (SELECT has_homepage FROM agg)) + ) AS t(metric, value) +) + +-- ─── report ─────────────────────────────────────────────────────────────────── +SELECT ord, section, metric, value, pct_txt AS pct, status FROM ( + -- totals + SELECT 0 AS ord, '1. TOTALS' AS section, 'critical packages (universe)' AS metric, + (SELECT critical_total FROM agg) AS value, '100.0%' AS pct_txt, 'INFO' AS status + UNION ALL + SELECT 1, '1. TOTALS', 'never enriched (no packages row)', + (SELECT critical_total - enriched FROM agg), + to_char(round(100.0 * (SELECT critical_total - enriched FROM agg) + / nullif((SELECT critical_total FROM agg), 0), 1), 'FM990.0') || '%', + CASE WHEN (SELECT critical_total - enriched FROM agg) = 0 THEN 'OK' ELSE 'INFO' END + + -- coverage + UNION ALL + SELECT 10 + row_number() OVER (ORDER BY metric), '2. COVERAGE', metric, value, + to_char(pct, 'FM990.0') || '%', + CASE WHEN pct >= 80 THEN 'OK' WHEN pct >= 50 THEN 'LOW' ELSE 'POOR' END + FROM cov + + -- ingestion_source breakdown (enriched rows) + UNION ALL SELECT 30, '3. 
INGESTION SOURCE', 'maven-registry (full POM ok)', (SELECT src_registry FROM agg), NULL, 'INFO' + UNION ALL SELECT 31, '3. INGESTION SOURCE', 'maven_not_on_central', (SELECT src_not_central FROM agg), NULL, 'INFO' + UNION ALL SELECT 32, '3. INGESTION SOURCE', 'maven_no_version', (SELECT src_no_version FROM agg), NULL, 'INFO' + UNION ALL SELECT 33, '3. INGESTION SOURCE', 'maven_error', (SELECT src_error FROM agg), NULL, + CASE WHEN (SELECT src_error FROM agg) = 0 THEN 'OK' ELSE 'WARN' END + UNION ALL SELECT 34, '3. INGESTION SOURCE', 'packages_universe (unexpected on critical)', (SELECT src_universe FROM agg), NULL, + CASE WHEN (SELECT src_universe FROM agg) = 0 THEN 'OK' ELSE 'WARN' END + + -- anomalies (expect 0) + UNION ALL SELECT 40, '4. ANOMALIES (expect 0)', 'packages with >1 is_latest version', (SELECT multi_latest FROM agg), NULL, CASE WHEN (SELECT multi_latest FROM agg)=0 THEN 'OK' ELSE 'FAIL' END + UNION ALL SELECT 41, '4. ANOMALIES (expect 0)', 'has versions but no is_latest', (SELECT no_latest FROM agg), NULL, CASE WHEN (SELECT no_latest FROM agg)=0 THEN 'OK' ELSE 'FAIL' END + UNION ALL SELECT 42, '4. ANOMALIES (expect 0)', 'latest_version != is_latest version', (SELECT latest_mismatch FROM agg), NULL, CASE WHEN (SELECT latest_mismatch FROM agg)=0 THEN 'OK' ELSE 'FAIL' END + UNION ALL SELECT 43, '4. ANOMALIES (expect 0)', 'prerelease flagged as latest', (SELECT prerelease_latest FROM agg), NULL, CASE WHEN (SELECT prerelease_latest FROM agg)=0 THEN 'OK' ELSE 'WARN' END + UNION ALL SELECT 44, '4. ANOMALIES (expect 0)', 'repository_url not http(s)', (SELECT repo_not_http FROM agg), NULL, CASE WHEN (SELECT repo_not_http FROM agg)=0 THEN 'OK' ELSE 'WARN' END + + -- global integrity (expect 0) + UNION ALL SELECT 50, '5. INTEGRITY (expect 0)', 'duplicate purls in packages', (SELECT dup_purl FROM integ), NULL, CASE WHEN (SELECT dup_purl FROM integ)=0 THEN 'OK' ELSE 'FAIL' END + UNION ALL SELECT 51, '5. 
INTEGRITY (expect 0)', 'orphan versions', (SELECT orphan_versions FROM integ), NULL, CASE WHEN (SELECT orphan_versions FROM integ)=0 THEN 'OK' ELSE 'FAIL' END + UNION ALL SELECT 52, '5. INTEGRITY (expect 0)', 'orphan package_repos', (SELECT orphan_repos FROM integ), NULL, CASE WHEN (SELECT orphan_repos FROM integ)=0 THEN 'OK' ELSE 'FAIL' END + UNION ALL SELECT 53, '5. INTEGRITY (expect 0)', 'orphan package_maintainers', (SELECT orphan_maintainers FROM integ), NULL, CASE WHEN (SELECT orphan_maintainers FROM integ)=0 THEN 'OK' ELSE 'FAIL' END + UNION ALL SELECT 54, '5. INTEGRITY (expect 0)', 'maintainers without username', (SELECT maintainer_no_username FROM integ), NULL, CASE WHEN (SELECT maintainer_no_username FROM integ)=0 THEN 'OK' ELSE 'FAIL' END + + -- freshness (enriched critical) + UNION ALL SELECT 60, '6. FRESHNESS', 'synced in last 24h', (SELECT synced_24h FROM agg), NULL, 'INFO' + UNION ALL SELECT 61, '6. FRESHNESS', 'synced 24h–7d ago', (SELECT synced_week FROM agg), NULL, 'INFO' + UNION ALL SELECT 62, '6. 
FRESHNESS', 'synced > 7d ago (stale)', (SELECT synced_old FROM agg), NULL, + CASE WHEN (SELECT synced_old FROM agg) = 0 THEN 'OK' ELSE 'WARN' END +) report +ORDER BY ord; From e18960969c37165ba09275dc7f42f23bf05b4921 Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Thu, 4 Jun 2026 11:03:37 +0200 Subject: [PATCH 17/22] fix: simplify pr for deployment Signed-off-by: Umberto Sgueglia --- backend/.env.dist.local | 4 +- services/apps/packages_worker/src/config.ts | 31 ---- .../apps/packages_worker/src/maven/README.md | 19 -- .../packages_worker/src/maven/activities.ts | 46 +---- .../src/maven/data_quality.sql | 166 ------------------ .../packages_worker/src/maven/deltaApi.ts | 128 -------------- .../packages_worker/src/maven/metadata.ts | 25 ++- .../src/maven/runMavenEnrichmentLoop.ts | 75 +------- .../src/scripts/benchmarkMavenDelta.ts | 82 --------- .../src/scripts/validateDataQuality.ts | 90 ---------- .../src/osspckgs/packages.ts | 52 ++---- .../data-access-layer/src/osspckgs/types.ts | 2 + 12 files changed, 45 insertions(+), 675 deletions(-) delete mode 100644 services/apps/packages_worker/src/maven/data_quality.sql delete mode 100644 services/apps/packages_worker/src/maven/deltaApi.ts delete mode 100644 services/apps/packages_worker/src/scripts/benchmarkMavenDelta.ts delete mode 100644 services/apps/packages_worker/src/scripts/validateDataQuality.ts diff --git a/backend/.env.dist.local b/backend/.env.dist.local index 83a2e4ecf9..614e6dd66d 100755 --- a/backend/.env.dist.local +++ b/backend/.env.dist.local @@ -208,6 +208,4 @@ POM_FETCHER_NON_CRITICAL_CONCURRENCY=20 POM_FETCHER_REFRESH_DAYS=1 POM_FETCHER_GROUP_DELAY_MS=100 POM_FETCHER_FORCE_FULL_EXTRACTION=true -POM_FETCHER_MAVEN_BASE_URL=https://maven-central.storage-download.googleapis.com/maven2 -MAVEN_SYNC_SOURCE=both -MAVEN_DELTA_API_URL=https://maven-fetcher-production.up.railway.app \ No newline at end of file +POM_FETCHER_MAVEN_BASE_URL=https://maven-central.storage-download.googleapis.com/maven2 \ No 
newline at end of file diff --git a/services/apps/packages_worker/src/config.ts b/services/apps/packages_worker/src/config.ts index 30726fdd19..30d1f66ca0 100644 --- a/services/apps/packages_worker/src/config.ts +++ b/services/apps/packages_worker/src/config.ts @@ -46,26 +46,7 @@ export function getEnricherConfig() { } } -// Which source drives the critical Maven sync: -// 'maven' → poll packages_universe by staleness (current behaviour, default/fallback) -// 'api' → only enrich what our delta feed reports as changed -// 'both' → run both passes in the same Temporal tick -export type MavenSyncSource = 'api' | 'maven' | 'both' - -function parseMavenSyncSource(raw: string | undefined): MavenSyncSource { - if (raw === 'api' || raw === 'both') return raw - // Anything else (unset, typo, legacy value) falls back to the current behaviour. - return 'maven' -} - export function getMavenConfig() { - const syncSource = parseMavenSyncSource(process.env.MAVEN_SYNC_SOURCE) - const deltaApiBaseUrl = (process.env.MAVEN_DELTA_API_URL ?? '').replace(/\/+$/, '') - - if (syncSource !== 'maven' && !deltaApiBaseUrl) { - throw new Error(`MAVEN_SYNC_SOURCE='${syncSource}' requires MAVEN_DELTA_API_URL to be set`) - } - return { batchSize: requireEnvInt('POM_FETCHER_BATCH_SIZE'), concurrency: requireEnvInt('POM_FETCHER_CONCURRENCY'), @@ -73,17 +54,5 @@ export function getMavenConfig() { nonCriticalConcurrency: requireEnvInt('POM_FETCHER_NON_CRITICAL_CONCURRENCY'), refreshDays: requireEnvInt('POM_FETCHER_REFRESH_DAYS'), groupDelayMs: requireEnvInt('POM_FETCHER_GROUP_DELAY_MS'), - syncSource, - deltaApi: { - baseUrl: deltaApiBaseUrl, - token: process.env.MAVEN_DELTA_API_TOKEN || undefined, - pageSize: process.env.MAVEN_DELTA_API_PAGE_SIZE - ? parseInt(process.env.MAVEN_DELTA_API_PAGE_SIZE, 10) - : 100, - lookbackMinutes: process.env.MAVEN_DELTA_API_LOOKBACK_MINUTES - ? 
parseInt(process.env.MAVEN_DELTA_API_LOOKBACK_MINUTES, 10) - : 15, - includePrerelease: process.env.MAVEN_DELTA_API_INCLUDE_PRERELEASE === 'true', - }, } } diff --git a/services/apps/packages_worker/src/maven/README.md b/services/apps/packages_worker/src/maven/README.md index 4fb2f4a0b5..acda3acd8a 100644 --- a/services/apps/packages_worker/src/maven/README.md +++ b/services/apps/packages_worker/src/maven/README.md @@ -247,25 +247,6 @@ so the worker throws on startup if any is missing. Suggested values shown. | `POM_FETCHER_REFRESH_DAYS` | `1` | Staleness window — re-sync a package once its `last_synced_at` is older than N days (applies to both phases) | | `POM_FETCHER_GROUP_DELAY_MS` | `200`–`400` | Delay between concurrent groups in the critical phase (rate-limit mitigation) | -### Sync source (Temporal critical path) - -These select **where the critical sync gets its work from**. They affect only the Temporal -`processMavenCriticalBatch` activity — the standalone backfill loop is unaffected. All are -optional; unset/invalid values fall back to the current universe-polling behaviour. - -| Env var | Default | Description | -| ------------------------------------ | ------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `MAVEN_SYNC_SOURCE` | `maven` | `maven` = poll `packages_universe` by staleness (current behaviour). `api` = enrich only what the delta feed reports. `both` = run both passes per tick. | -| `MAVEN_DELTA_API_URL` | — | Base URL of our delta feed (e.g. the Railway deployment). **Required** when source is `api` or `both`. | -| `MAVEN_DELTA_API_TOKEN` | — | Optional bearer token for the delta feed. | -| `MAVEN_DELTA_API_PAGE_SIZE` | `100` | Page size for `/api/changes` pagination. | -| `MAVEN_DELTA_API_LOOKBACK_MINUTES` | `15` | Rolling window size: each tick fetches `[now-N, now)`. 
Overlaps the cron interval on purpose — re-processing is safe (idempotent upserts). | -| `MAVEN_DELTA_API_INCLUDE_PRERELEASE` | `false` | Forwarded as `includePrerelease` to the feed. | - -The delta-API path always runs **full extraction** (the feed is an explicit "this changed" -signal) and only enriches packages that are `is_critical` in `packages_universe`; non-critical -purls in the feed are dropped. - **Concurrency guidance:** Maven Central handles 10–15 concurrent requests per IP without throttling. Retry logic with exponential backoff handles 429/403s. Keep `POM_FETCHER_CONCURRENCY` ≤ 5 locally — repeated local runs heat the IP (see [Known Exceptions](#maven-central-403-rate-limiting)). --- diff --git a/services/apps/packages_worker/src/maven/activities.ts b/services/apps/packages_worker/src/maven/activities.ts index 66db62dd2e..2380ec7a7d 100644 --- a/services/apps/packages_worker/src/maven/activities.ts +++ b/services/apps/packages_worker/src/maven/activities.ts @@ -3,54 +3,18 @@ import { getServiceChildLogger } from '@crowd/logging' import { getMavenConfig } from '../config' import { getPackagesDb } from '../db' -import { BatchResult, processApiChangesBatch, processBatch } from './runMavenEnrichmentLoop' +import { BatchResult, processBatch } from './runMavenEnrichmentLoop' const log = getServiceChildLogger('maven-activity') -function addBatchResult(into: BatchResult, from: BatchResult): void { - into.processed += from.processed - into.skipped += from.skipped - into.error += from.error - into.unchanged += from.unchanged - into.hopLimitReached += from.hopLimitReached -} - export async function processMavenCriticalBatch(): Promise { const config = getMavenConfig() const qx = await getPackagesDb() - const total: BatchResult = { - processed: 0, - skipped: 0, - error: 0, - unchanged: 0, - hopLimitReached: 0, - } - - // Delta-API pass: enrich what our feed reports as changed (forces full extraction). 
- if (config.syncSource === 'api' || config.syncSource === 'both') { - try { - const apiResult = await processApiChangesBatch(qx, config) - log.info({ ...apiResult }, 'Maven delta-API batch complete') - addBatchResult(total, apiResult) - } catch (err) { - // In 'both' mode the universe-polling pass is the reliable backbone, so a flaky - // delta feed must never block it — log and continue. In 'api' mode there is no - // fallback, so let the activity fail and have Temporal retry it. - if (config.syncSource === 'api') throw err - const message = err instanceof Error ? err.message : String(err) - log.warn({ error: message }, 'Delta-API pass failed — continuing with universe-polling pass') - } - } - - // Universe-polling pass: current behaviour — skip POM extraction when version is unchanged. - if (config.syncSource === 'maven' || config.syncSource === 'both') { - const mavenResult = await processBatch(qx, config, true, false) - log.info({ ...mavenResult }, 'Maven critical batch complete') - addBatchResult(total, mavenResult) - } - - return total + // Universe-polling pass: skip POM extraction when version is unchanged. + const result = await processBatch(qx, config, true, false) + log.info({ ...result }, 'Maven critical batch complete') + return result } export async function processMavenNonCriticalBatch(): Promise { diff --git a/services/apps/packages_worker/src/maven/data_quality.sql b/services/apps/packages_worker/src/maven/data_quality.sql deleted file mode 100644 index b3b4ca99af..0000000000 --- a/services/apps/packages_worker/src/maven/data_quality.sql +++ /dev/null @@ -1,166 +0,0 @@ --- ──────────────────────────────────────────────────────────────────────────── --- Maven enrichment — data quality scorecard --- --- One read-only statement. Returns one row per check so it works in any SQL --- client (psql, DBeaver, prod read-replica) and via validateDataQuality.ts. --- --- Scope: the CRITICAL Maven set (packages_universe.is_critical = true) — i.e. 
the --- set the POM fetcher is responsible for. Coverage % is over the whole critical --- universe, so packages not yet enriched count as "not covered". --- --- Columns: --- section | metric | value | pct (of critical) | status --- status: OK / LOW / POOR for coverage; OK / FAIL for anomalies & integrity. --- Coverage thresholds (80% / 50%) are deliberately simple — tune as needed. --- ──────────────────────────────────────────────────────────────────────────── - -WITH crit AS ( - -- denominator: the critical universe - SELECT pu.purl - FROM packages_universe pu - WHERE pu.ecosystem = 'maven' - AND pu.is_critical = true - AND pu.purl IS NOT NULL -), -crit_n AS (SELECT count(*)::numeric AS n FROM crit), - --- per-package version aggregates (only for packages we care about) -ver AS ( - SELECT - v.package_id, - count(*) AS versions_count, - count(*) FILTER (WHERE v.is_latest) AS latest_count, - count(*) FILTER (WHERE v.is_latest AND v.is_prerelease) AS latest_prerelease_count, - max(v.number) FILTER (WHERE v.is_latest) AS latest_number - FROM versions v - GROUP BY v.package_id -), - --- enriched critical packages with derived quality flags -pkg AS ( - SELECT - pk.id, - pk.ingestion_source, - pk.repository_url, - pk.latest_version, - pk.last_synced_at, - (pk.repository_url IS NOT NULL) AS has_repo, - (pk.licenses IS NOT NULL AND array_length(pk.licenses, 1) >= 1) AS has_license, - (pk.description IS NOT NULL AND length(btrim(pk.description)) > 0) AS has_description, - (pk.homepage IS NOT NULL) AS has_homepage, - COALESCE(ver.versions_count, 0) AS versions_count, - COALESCE(ver.latest_count, 0) AS latest_count, - COALESCE(ver.latest_prerelease_count, 0) AS latest_prerelease_count, - ver.latest_number, - (pm.package_id IS NOT NULL) AS has_maintainer - FROM packages pk - JOIN crit c ON c.purl = pk.purl - LEFT JOIN ver ON ver.package_id = pk.id - LEFT JOIN (SELECT DISTINCT package_id FROM package_maintainers) pm ON pm.package_id = pk.id -), - --- all critical-scoped counts 
in a single row -agg AS ( - SELECT - (SELECT n FROM crit_n) AS critical_total, - count(*) AS enriched, - count(*) FILTER (WHERE ingestion_source = 'maven-registry') AS src_registry, - count(*) FILTER (WHERE ingestion_source = 'maven_not_on_central') AS src_not_central, - count(*) FILTER (WHERE ingestion_source = 'maven_no_version') AS src_no_version, - count(*) FILTER (WHERE ingestion_source = 'maven_error') AS src_error, - count(*) FILTER (WHERE ingestion_source = 'packages_universe') AS src_universe, - count(*) FILTER (WHERE has_repo) AS has_repo, - count(*) FILTER (WHERE has_license) AS has_license, - count(*) FILTER (WHERE versions_count > 0) AS has_versions, - count(*) FILTER (WHERE has_maintainer) AS has_maintainer, - count(*) FILTER (WHERE has_description) AS has_description, - count(*) FILTER (WHERE has_homepage) AS has_homepage, - count(*) FILTER (WHERE latest_count > 1) AS multi_latest, - count(*) FILTER (WHERE versions_count > 0 AND latest_count = 0) AS no_latest, - count(*) FILTER (WHERE latest_count = 1 - AND latest_number IS DISTINCT FROM latest_version) AS latest_mismatch, - count(*) FILTER (WHERE latest_prerelease_count > 0) AS prerelease_latest, - count(*) FILTER (WHERE has_repo AND repository_url !~ '^https?://') AS repo_not_http, - count(*) FILTER (WHERE last_synced_at >= now() - interval '24 hours') AS synced_24h, - count(*) FILTER (WHERE last_synced_at < now() - interval '24 hours' - AND last_synced_at >= now() - interval '7 days') AS synced_week, - count(*) FILTER (WHERE last_synced_at < now() - interval '7 days') AS synced_old - FROM pkg -), - --- global integrity (not limited to the critical set) -integ AS ( - SELECT - (SELECT count(*) FROM (SELECT purl FROM packages GROUP BY purl HAVING count(*) > 1) d) AS dup_purl, - (SELECT count(*) FROM versions v LEFT JOIN packages p ON p.id = v.package_id WHERE p.id IS NULL) AS orphan_versions, - (SELECT count(*) FROM package_repos pr LEFT JOIN packages p ON p.id = pr.package_id WHERE p.id IS NULL) AS 
orphan_repos, - (SELECT count(*) FROM package_maintainers pm LEFT JOIN packages p ON p.id = pm.package_id WHERE p.id IS NULL) AS orphan_maintainers, - (SELECT count(*) FROM maintainers WHERE username IS NULL OR btrim(username) = '') AS maintainer_no_username -), - --- coverage % + status helper -cov AS ( - SELECT - metric, value, - round(100.0 * value / nullif((SELECT critical_total FROM agg), 0), 1) AS pct - FROM (VALUES - ('enriched (row exists)', (SELECT enriched FROM agg)), - ('ingestion=maven-registry', (SELECT src_registry FROM agg)), - ('has repository_url', (SELECT has_repo FROM agg)), - ('has license', (SELECT has_license FROM agg)), - ('has versions', (SELECT has_versions FROM agg)), - ('has maintainer', (SELECT has_maintainer FROM agg)), - ('has description', (SELECT has_description FROM agg)), - ('has homepage', (SELECT has_homepage FROM agg)) - ) AS t(metric, value) -) - --- ─── report ─────────────────────────────────────────────────────────────────── -SELECT ord, section, metric, value, pct_txt AS pct, status FROM ( - -- totals - SELECT 0 AS ord, '1. TOTALS' AS section, 'critical packages (universe)' AS metric, - (SELECT critical_total FROM agg) AS value, '100.0%' AS pct_txt, 'INFO' AS status - UNION ALL - SELECT 1, '1. TOTALS', 'never enriched (no packages row)', - (SELECT critical_total - enriched FROM agg), - to_char(round(100.0 * (SELECT critical_total - enriched FROM agg) - / nullif((SELECT critical_total FROM agg), 0), 1), 'FM990.0') || '%', - CASE WHEN (SELECT critical_total - enriched FROM agg) = 0 THEN 'OK' ELSE 'INFO' END - - -- coverage - UNION ALL - SELECT 10 + row_number() OVER (ORDER BY metric), '2. COVERAGE', metric, value, - to_char(pct, 'FM990.0') || '%', - CASE WHEN pct >= 80 THEN 'OK' WHEN pct >= 50 THEN 'LOW' ELSE 'POOR' END - FROM cov - - -- ingestion_source breakdown (enriched rows) - UNION ALL SELECT 30, '3. 
INGESTION SOURCE', 'maven-registry (full POM ok)', (SELECT src_registry FROM agg), NULL, 'INFO' - UNION ALL SELECT 31, '3. INGESTION SOURCE', 'maven_not_on_central', (SELECT src_not_central FROM agg), NULL, 'INFO' - UNION ALL SELECT 32, '3. INGESTION SOURCE', 'maven_no_version', (SELECT src_no_version FROM agg), NULL, 'INFO' - UNION ALL SELECT 33, '3. INGESTION SOURCE', 'maven_error', (SELECT src_error FROM agg), NULL, - CASE WHEN (SELECT src_error FROM agg) = 0 THEN 'OK' ELSE 'WARN' END - UNION ALL SELECT 34, '3. INGESTION SOURCE', 'packages_universe (unexpected on critical)', (SELECT src_universe FROM agg), NULL, - CASE WHEN (SELECT src_universe FROM agg) = 0 THEN 'OK' ELSE 'WARN' END - - -- anomalies (expect 0) - UNION ALL SELECT 40, '4. ANOMALIES (expect 0)', 'packages with >1 is_latest version', (SELECT multi_latest FROM agg), NULL, CASE WHEN (SELECT multi_latest FROM agg)=0 THEN 'OK' ELSE 'FAIL' END - UNION ALL SELECT 41, '4. ANOMALIES (expect 0)', 'has versions but no is_latest', (SELECT no_latest FROM agg), NULL, CASE WHEN (SELECT no_latest FROM agg)=0 THEN 'OK' ELSE 'FAIL' END - UNION ALL SELECT 42, '4. ANOMALIES (expect 0)', 'latest_version != is_latest version', (SELECT latest_mismatch FROM agg), NULL, CASE WHEN (SELECT latest_mismatch FROM agg)=0 THEN 'OK' ELSE 'FAIL' END - UNION ALL SELECT 43, '4. ANOMALIES (expect 0)', 'prerelease flagged as latest', (SELECT prerelease_latest FROM agg), NULL, CASE WHEN (SELECT prerelease_latest FROM agg)=0 THEN 'OK' ELSE 'WARN' END - UNION ALL SELECT 44, '4. ANOMALIES (expect 0)', 'repository_url not http(s)', (SELECT repo_not_http FROM agg), NULL, CASE WHEN (SELECT repo_not_http FROM agg)=0 THEN 'OK' ELSE 'WARN' END - - -- global integrity (expect 0) - UNION ALL SELECT 50, '5. INTEGRITY (expect 0)', 'duplicate purls in packages', (SELECT dup_purl FROM integ), NULL, CASE WHEN (SELECT dup_purl FROM integ)=0 THEN 'OK' ELSE 'FAIL' END - UNION ALL SELECT 51, '5. 
INTEGRITY (expect 0)', 'orphan versions', (SELECT orphan_versions FROM integ), NULL, CASE WHEN (SELECT orphan_versions FROM integ)=0 THEN 'OK' ELSE 'FAIL' END - UNION ALL SELECT 52, '5. INTEGRITY (expect 0)', 'orphan package_repos', (SELECT orphan_repos FROM integ), NULL, CASE WHEN (SELECT orphan_repos FROM integ)=0 THEN 'OK' ELSE 'FAIL' END - UNION ALL SELECT 53, '5. INTEGRITY (expect 0)', 'orphan package_maintainers', (SELECT orphan_maintainers FROM integ), NULL, CASE WHEN (SELECT orphan_maintainers FROM integ)=0 THEN 'OK' ELSE 'FAIL' END - UNION ALL SELECT 54, '5. INTEGRITY (expect 0)', 'maintainers without username', (SELECT maintainer_no_username FROM integ), NULL, CASE WHEN (SELECT maintainer_no_username FROM integ)=0 THEN 'OK' ELSE 'FAIL' END - - -- freshness (enriched critical) - UNION ALL SELECT 60, '6. FRESHNESS', 'synced in last 24h', (SELECT synced_24h FROM agg), NULL, 'INFO' - UNION ALL SELECT 61, '6. FRESHNESS', 'synced 24h–7d ago', (SELECT synced_week FROM agg), NULL, 'INFO' - UNION ALL SELECT 62, '6. FRESHNESS', 'synced > 7d ago (stale)', (SELECT synced_old FROM agg), NULL, - CASE WHEN (SELECT synced_old FROM agg) = 0 THEN 'OK' ELSE 'WARN' END -) report -ORDER BY ord; diff --git a/services/apps/packages_worker/src/maven/deltaApi.ts b/services/apps/packages_worker/src/maven/deltaApi.ts deleted file mode 100644 index 0b9a24a505..0000000000 --- a/services/apps/packages_worker/src/maven/deltaApi.ts +++ /dev/null @@ -1,128 +0,0 @@ -/** - * Client for our own Maven delta feed (deployed separately, e.g. on Railway). - * It diffs the Maven Central index and exposes the artifacts that changed in a - * time window, so we can enrich exactly the packages that moved instead of - * polling the whole universe. - * - * Endpoint: - * GET {baseUrl}/api/changes?since=&until=&pageSize=&includePrerelease= - * - * Response: - * { - * "window": { "since": "...", "until": "..." 
}, - * "changes": [ - * { "purl", "groupId", "artifactId", "version", "publishedAt", - * "isPrerelease", "changeType" }, ... - * ], - * "nextCursor": "..." // present while more pages remain - * } - */ -import axios from 'axios' - -import { getServiceChildLogger } from '@crowd/logging' - -const log = getServiceChildLogger('maven-delta-api') - -const REQUEST_TIMEOUT_MS = 15_000 -// Hard stop so a misbehaving cursor can never spin forever. -const MAX_PAGES = 1_000 -const MAX_RETRIES = 3 -const RETRY_BASE_MS = 1_000 - -function sleep(ms: number): Promise { - return new Promise((r) => setTimeout(r, ms)) -} - -// Transient: no HTTP response at all (socket aborted/reset, timeout — the -// 'Error: aborted' we see when Railway drops the gzip stream mid-flight) or a -// retryable status (5xx / 429). Everything else (4xx, parse errors) is fatal. -function isTransientError(err: unknown): boolean { - if (!axios.isAxiosError(err)) return false - const status = err.response?.status - if (status === undefined) return true - return status >= 500 || status === 429 -} - -export interface MavenChange { - purl: string - groupId: string - artifactId: string - version: string - publishedAt: string - isPrerelease: boolean - changeType: string -} - -interface ChangesResponse { - window: { since: string; until: string } - changes: MavenChange[] - nextCursor?: string -} - -export interface FetchChangesOptions { - baseUrl: string - token?: string - since: string // ISO timestamp - until: string // ISO timestamp - pageSize: number - includePrerelease: boolean -} - -/** - * Fetches every change in [since, until), following `nextCursor` pagination. 
- */ -export async function fetchMavenChanges(opts: FetchChangesOptions): Promise { - const all: MavenChange[] = [] - let cursor: string | undefined - let page = 0 - - do { - const params: Record = { - since: opts.since, - until: opts.until, - pageSize: opts.pageSize, - includePrerelease: opts.includePrerelease, - } - if (cursor) params.cursor = cursor - - let res: { data: ChangesResponse } | undefined - for (let attempt = 0; ; attempt++) { - try { - res = await axios.get(`${opts.baseUrl}/api/changes`, { - params, - timeout: REQUEST_TIMEOUT_MS, - headers: opts.token ? { Authorization: `Bearer ${opts.token}` } : undefined, - }) - break - } catch (err) { - if (attempt < MAX_RETRIES && isTransientError(err)) { - const delay = RETRY_BASE_MS * 2 ** attempt + Math.random() * 300 - log.warn( - { page, attempt, error: err instanceof Error ? err.message : String(err) }, - 'Delta page fetch failed — retrying', - ) - await sleep(delay) - continue - } - throw err - } - } - - const changes = res.data?.changes ?? [] - all.push(...changes) - cursor = res.data?.nextCursor || undefined - page++ - - log.debug( - { page, batch: changes.length, total: all.length, hasMore: Boolean(cursor) }, - 'Fetched delta page', - ) - - if (page >= MAX_PAGES) { - log.warn({ page, total: all.length }, 'Hit MAX_PAGES — stopping pagination early') - break - } - } while (cursor) - - return all -} diff --git a/services/apps/packages_worker/src/maven/metadata.ts b/services/apps/packages_worker/src/maven/metadata.ts index 680f38e5fe..5043d3b682 100644 --- a/services/apps/packages_worker/src/maven/metadata.ts +++ b/services/apps/packages_worker/src/maven/metadata.ts @@ -26,6 +26,25 @@ const parser = new XMLParser({ export interface MavenVersionsMetadata { versions: string[] releaseVersion: string | null + lastUpdated: Date | null +} + +// maven-metadata.xml carries yyyyMMddHHmmss in UTC — +// the timestamp of the most recent publish. 
Parse it into a Date for +// packages.latest_release_at; return null on anything malformed. +function parseMavenLastUpdated(raw: unknown): Date | null { + if (typeof raw !== 'string') { + // fast-xml-parser may coerce the all-digits value to a number + raw = typeof raw === 'number' ? String(raw) : null + } + const m = + typeof raw === 'string' + ? raw.trim().match(/^(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})$/) + : null + if (!m) return null + const [, y, mo, d, h, mi, s] = m + const ts = Date.UTC(+y, +mo - 1, +d, +h, +mi, +s) + return Number.isNaN(ts) ? null : new Date(ts) } export type MavenFetchError = @@ -69,7 +88,11 @@ export async function resolveVersionsList( versions = [rawVersions.trim()] } - return { versions, releaseVersion: release || latest || null } + return { + versions, + releaseVersion: release || latest || null, + lastUpdated: parseMavenLastUpdated(versioning?.lastUpdated), + } } catch (err) { if (axios.isAxiosError(err)) { const status = err.response?.status diff --git a/services/apps/packages_worker/src/maven/runMavenEnrichmentLoop.ts b/services/apps/packages_worker/src/maven/runMavenEnrichmentLoop.ts index f48d7464ff..cafc135309 100644 --- a/services/apps/packages_worker/src/maven/runMavenEnrichmentLoop.ts +++ b/services/apps/packages_worker/src/maven/runMavenEnrichmentLoop.ts @@ -2,7 +2,6 @@ import crypto from 'crypto' import { MavenPackageToSync, - listMavenPackagesByPurls, listMavenPackagesToSync, logAuditFieldChange, replacePackageMaintainers, @@ -18,7 +17,6 @@ import { getServiceChildLogger } from '@crowd/logging' import { getMavenConfig } from '../config' -import { fetchMavenChanges } from './deltaApi' import { MAX_PARENT_HOPS, extractArtifact, getPomCacheStats, normalizeScmUrl } from './extract' import { isMavenFetchError, resolveVersionsList } from './metadata' import { isPrerelease, parseRepoUrl } from './normalize' @@ -269,6 +267,8 @@ async function processCriticalPackage( licenses: result.licenses.length > 0 ? 
result.licenses : null, licensesRaw: result.licensesRaw, latestVersion: version, + versionsCount: metadata.versions.length > 0 ? metadata.versions.length : null, + latestReleaseAt: metadata.lastUpdated, ingestionSource: 'maven-registry', criticalityScore: pkg.criticalityScore, dependentPackagesCount: pkg.dependentPackagesCount, @@ -365,8 +365,7 @@ export async function processBatch( return processPackages(qx, config, packages, isCritical, forceFullExtraction) } -// Runs a concrete list of packages through the enrichment pipeline. Shared by the -// universe-polling path (processBatch) and the delta-API path (processApiChangesBatch). +// Runs a concrete list of packages through the enrichment pipeline. async function processPackages( qx: QueryExecutor, config: MavenConfig, @@ -436,74 +435,6 @@ async function processPackages( return counts } -// ─── Delta-API batch ────────────────────────────────────────────────────────── - -// BatchResult plus delta-feed-specific counters and a fetch/process timing split — -// handy for the benchmark script and for spotting whether time goes to the feed or -// to POM extraction. Extra fields are ignored by callers that only need BatchResult. -export interface DeltaApiBatchResult extends BatchResult { - apiChanges: number - uniquePackages: number - matchedCritical: number - fetchMs: number - processMs: number -} - -// Pulls the changed artifacts from our delta feed over a rolling [now-lookback, now) -// window and enriches the critical ones. The window deliberately overlaps the -// Temporal schedule interval; re-processing is safe because every write is an -// idempotent upsert. Always forces full extraction — the feed is an explicit -// "this changed" signal, so we never trust the version-unchanged shortcut here. 
-export async function processApiChangesBatch( - qx: QueryExecutor, - config: MavenConfig, -): Promise { - const until = new Date() - const since = new Date(until.getTime() - config.deltaApi.lookbackMinutes * 60_000) - - const fetchStartedAt = Date.now() - - const changes = await fetchMavenChanges({ - baseUrl: config.deltaApi.baseUrl, - token: config.deltaApi.token, - since: since.toISOString(), - until: until.toISOString(), - pageSize: config.deltaApi.pageSize, - includePrerelease: config.deltaApi.includePrerelease, - }) - - // Collapse to package-level purls (drop the @version) and dedup — the feed - // reports one entry per version, but we enrich the package as a whole. - const purls = Array.from(new Set(changes.map((c) => `pkg:maven/${c.groupId}/${c.artifactId}`))) - - const packages = await listMavenPackagesByPurls(qx, purls) - const fetchMs = Date.now() - fetchStartedAt - - log.info( - { - changes: changes.length, - uniquePackages: purls.length, - matchedCritical: packages.length, - lookbackMinutes: config.deltaApi.lookbackMinutes, - fetchMs, - }, - 'Delta-API window fetched', - ) - - const processStartedAt = Date.now() - const counts = await processPackages(qx, config, packages, true, true) - const processMs = Date.now() - processStartedAt - - return { - ...counts, - apiChanges: changes.length, - uniquePackages: purls.length, - matchedCritical: packages.length, - fetchMs, - processMs, - } -} - // ─── Phase runner ───────────────────────────────────────────────────────────── async function runPhase( diff --git a/services/apps/packages_worker/src/scripts/benchmarkMavenDelta.ts b/services/apps/packages_worker/src/scripts/benchmarkMavenDelta.ts deleted file mode 100644 index 7b5c7c27b4..0000000000 --- a/services/apps/packages_worker/src/scripts/benchmarkMavenDelta.ts +++ /dev/null @@ -1,82 +0,0 @@ -/** - * One-shot benchmark for the Maven delta-API sync path. 
- * - * Runs a single processApiChangesBatch() against the configured delta feed and - * prints a performance summary (fetch vs. process split, throughput, per-package - * average). Use it to gather numbers before wiring the path into Temporal. - * - * Run with env loaded: - * pnpm run benchmark:maven-delta:local - * - * Requires MAVEN_DELTA_API_URL; set MAVEN_SYNC_SOURCE=api|both so the config - * validates. Widen MAVEN_DELTA_API_LOOKBACK_MINUTES to pull a bigger window for a - * more meaningful sample. - */ -import { getServiceLogger } from '@crowd/logging' - -import { getMavenConfig } from '../config' -import { getPackagesDb } from '../db' -import { processApiChangesBatch } from '../maven/runMavenEnrichmentLoop' - -const log = getServiceLogger() - -const main = async () => { - const config = getMavenConfig() - - if (!config.deltaApi.baseUrl) { - throw new Error('MAVEN_DELTA_API_URL is required to benchmark the delta-API path') - } - - log.info( - { - baseUrl: config.deltaApi.baseUrl, - lookbackMinutes: config.deltaApi.lookbackMinutes, - pageSize: config.deltaApi.pageSize, - includePrerelease: config.deltaApi.includePrerelease, - concurrency: config.concurrency, - groupDelayMs: config.groupDelayMs, - }, - 'Delta-API benchmark starting', - ) - - const qx = await getPackagesDb() - await qx.selectOne('SELECT 1') - - const startedAt = Date.now() - const r = await processApiChangesBatch(qx, config) - const totalMs = Date.now() - startedAt - - const enriched = r.processed + r.skipped + r.error + r.unchanged - const throughputPerSec = r.processMs > 0 ? +(enriched / (r.processMs / 1000)).toFixed(2) : 0 - const avgMsPerPkg = enriched > 0 ? 
+(r.processMs / enriched).toFixed(1) : 0 - - log.info( - { - // window - apiChanges: r.apiChanges, - uniquePackages: r.uniquePackages, - matchedCritical: r.matchedCritical, - // outcomes - processed: r.processed, - skipped: r.skipped, - unchanged: r.unchanged, - error: r.error, - hopLimitReached: r.hopLimitReached, - // timing - fetchMs: r.fetchMs, - processMs: r.processMs, - totalMs, - // perf - throughputPerSec, - avgMsPerPkg, - }, - 'Delta-API benchmark complete', - ) - - process.exit(0) -} - -main().catch((err) => { - log.error({ err }, 'Delta-API benchmark failed') - process.exit(1) -}) diff --git a/services/apps/packages_worker/src/scripts/validateDataQuality.ts b/services/apps/packages_worker/src/scripts/validateDataQuality.ts deleted file mode 100644 index 123274c10d..0000000000 --- a/services/apps/packages_worker/src/scripts/validateDataQuality.ts +++ /dev/null @@ -1,90 +0,0 @@ -/** - * Data-quality scorecard for the Maven enrichment output. - * - * Runs the read-only checks in ../maven/data_quality.sql against the packages DB - * and prints a grouped table (coverage, ingestion-source breakdown, anomalies, - * integrity, freshness). Useful before/after a backfill, and to validate prod. - * - * Run with env loaded: - * pnpm run validate:maven-quality:local - * - * Read-only: a single SELECT, safe to point at a prod read-replica. - * Exits non-zero if any check has status FAIL (handy for CI / gating a deploy). - */ -import { readFileSync } from 'fs' -import { join } from 'path' - -import { getServiceLogger } from '@crowd/logging' - -import { getPackagesDb } from '../db' - -const log = getServiceLogger() - -interface ReportRow { - section: string - metric: string - value: string | number - pct: string | null - status: string -} - -const SQL_PATH = join(__dirname, '../maven/data_quality.sql') - -function pad(s: string, len: number): string { - return s.length >= len ? 
s : s + ' '.repeat(len - s.length) -} - -function padLeft(s: string, len: number): string { - return s.length >= len ? s : ' '.repeat(len - s.length) + s -} - -function render(rows: ReportRow[]): string { - const lines: string[] = ['', 'Maven enrichment — data quality scorecard', ''] - let currentSection = '' - - for (const r of rows) { - if (r.section !== currentSection) { - currentSection = r.section - lines.push('', currentSection) - } - const value = padLeft(String(r.value), 10) - const pct = padLeft(r.pct ?? '', 8) - const status = pad(r.status, 4) - lines.push(` [${status}] ${pad(r.metric, 44)} ${value} ${pct}`) - } - lines.push('') - return lines.join('\n') -} - -const main = async () => { - const qx = await getPackagesDb() - const sql = readFileSync(SQL_PATH, 'utf8') - - const rows: ReportRow[] = await qx.select(sql) - - process.stdout.write(render(rows)) - - const failures = rows.filter((r) => r.status === 'FAIL') - const warnings = rows.filter( - (r) => r.status === 'WARN' || r.status === 'POOR' || r.status === 'LOW', - ) - - if (failures.length > 0) { - log.error( - { failures: failures.map((f) => `${f.section} / ${f.metric} = ${f.value}`) }, - `Data quality: ${failures.length} FAIL check(s)`, - ) - process.exit(1) - } - - log.info( - { warnings: warnings.length, checks: rows.length }, - `Data quality OK — no FAIL checks (${warnings.length} warning(s))`, - ) - process.exit(0) -} - -main().catch((err) => { - log.error({ err }, 'Data quality validation failed') - process.exit(1) -}) diff --git a/services/libs/data-access-layer/src/osspckgs/packages.ts b/services/libs/data-access-layer/src/osspckgs/packages.ts index 8fa24572ab..995a6139b7 100644 --- a/services/libs/data-access-layer/src/osspckgs/packages.ts +++ b/services/libs/data-access-layer/src/osspckgs/packages.ts @@ -128,44 +128,6 @@ export async function listMavenPackagesToSync( ) } -/** - * Loads Tier 2 Maven packages (from `packages`) by package-level purl, regardless - * of staleness. 
Used by the delta-API sync path: the upstream feed already told us - * these packages changed, so we (re)extract them now. Purls not present in - * `packages` (i.e. not promoted to Tier 2 by the criticality worker) are dropped. - */ -export async function listMavenPackagesByPurls( - qx: QueryExecutor, - purls: string[], -): Promise { - if (purls.length === 0) return [] - - return qx.select( - ` - SELECT - p.id, - p.purl, - p.namespace, - p.name, - p.criticality_score AS "criticalityScore", - p.dependent_count AS "dependentPackagesCount", - p.dependent_repos_count AS "dependentReposCount", - p.downloads_last_month AS "downloads30d", - p.latest_version AS "latestVersion" - FROM packages p - WHERE - p.ecosystem = 'maven' - AND p.is_critical - AND p.namespace IS NOT NULL - AND p.purl = ANY($(purls)) - ORDER BY - p.criticality_score DESC NULLS LAST, - p.id ASC - `, - { purls }, - ) -} - // ─── packages touch ─────────────────────────────────────────────────────────── /** @@ -232,20 +194,20 @@ export async function upsertPackage( ` WITH old AS ( SELECT description, homepage, registry_url, declared_repository_url, repository_url, - licenses, licenses_raw, latest_version, ingestion_source + licenses, licenses_raw, latest_version, versions_count, latest_release_at, ingestion_source FROM packages WHERE purl = $(purl) ), ins AS ( INSERT INTO packages ( purl, ecosystem, namespace, name, description, homepage, registry_url, declared_repository_url, repository_url, - licenses, licenses_raw, latest_version, + licenses, licenses_raw, latest_version, versions_count, latest_release_at, criticality_score, dependent_count, dependent_repos_count, downloads_last_month, ingestion_source, last_synced_at ) VALUES ( $(purl), $(ecosystem), $(namespace), $(name), $(description), $(homepage), $(registryUrl), $(declaredRepositoryUrl), $(repositoryUrl), - $(licenses)::text[], $(licensesRaw), $(latestVersion), + $(licenses)::text[], $(licensesRaw), $(latestVersion), $(versionsCount), 
$(latestReleaseAt), $(criticalityScore), $(dependentPackagesCount), $(dependentReposCount), $(downloadsLastMonth), $(ingestionSource), NOW() ) @@ -258,6 +220,8 @@ export async function upsertPackage( licenses = COALESCE(EXCLUDED.licenses, packages.licenses), licenses_raw = COALESCE(EXCLUDED.licenses_raw, packages.licenses_raw), latest_version = COALESCE(EXCLUDED.latest_version, packages.latest_version), + versions_count = COALESCE(EXCLUDED.versions_count, packages.versions_count), + latest_release_at = COALESCE(EXCLUDED.latest_release_at, packages.latest_release_at), criticality_score = COALESCE(EXCLUDED.criticality_score, packages.criticality_score), dependent_count = COALESCE(EXCLUDED.dependent_count, packages.dependent_count), dependent_repos_count = COALESCE(EXCLUDED.dependent_repos_count, packages.dependent_repos_count), @@ -265,7 +229,7 @@ export async function upsertPackage( ingestion_source = EXCLUDED.ingestion_source, last_synced_at = NOW() RETURNING id, description, homepage, registry_url, declared_repository_url, repository_url, - licenses, licenses_raw, latest_version, ingestion_source + licenses, licenses_raw, latest_version, versions_count, latest_release_at, ingestion_source ) SELECT ins.id, array_remove(ARRAY[ @@ -277,6 +241,8 @@ export async function upsertPackage( CASE WHEN o.licenses IS DISTINCT FROM ins.licenses THEN 'packages.licenses' END, CASE WHEN o.licenses_raw IS DISTINCT FROM ins.licenses_raw THEN 'packages.licenses_raw' END, CASE WHEN o.latest_version IS DISTINCT FROM ins.latest_version THEN 'packages.latest_version' END, + CASE WHEN o.versions_count IS DISTINCT FROM ins.versions_count THEN 'packages.versions_count' END, + CASE WHEN o.latest_release_at IS DISTINCT FROM ins.latest_release_at THEN 'packages.latest_release_at' END, CASE WHEN o.ingestion_source IS DISTINCT FROM ins.ingestion_source THEN 'packages.ingestion_source' END ], NULL) AS changed_fields FROM ins LEFT JOIN old o ON true @@ -285,6 +251,8 @@ export async function 
upsertPackage( ...item, registryUrl: item.registryUrl ?? null, repositoryUrl: item.repositoryUrl ?? null, + versionsCount: item.versionsCount ?? null, + latestReleaseAt: item.latestReleaseAt ?? null, criticalityScore: item.criticalityScore ?? null, dependentPackagesCount: item.dependentPackagesCount ?? null, dependentReposCount: item.dependentReposCount ?? null, diff --git a/services/libs/data-access-layer/src/osspckgs/types.ts b/services/libs/data-access-layer/src/osspckgs/types.ts index a1b6f1bc6d..6b62d91f88 100644 --- a/services/libs/data-access-layer/src/osspckgs/types.ts +++ b/services/libs/data-access-layer/src/osspckgs/types.ts @@ -27,6 +27,8 @@ export type IDbPackageUpsert = { licenses: string[] | null licensesRaw: string | null latestVersion: string | null + versionsCount?: number | null + latestReleaseAt?: Date | null ingestionSource: string criticalityScore?: number | null dependentPackagesCount?: number | null From 8a7bc960cdca6d467916c62d109d3decd891c9d2 Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Thu, 4 Jun 2026 11:28:28 +0200 Subject: [PATCH 18/22] fix: review Signed-off-by: Umberto Sgueglia --- backend/.env.dist.local | 1 - .../apps/packages_worker/src/maven/README.md | 22 +++++++++---------- .../src/maven/runMavenEnrichmentLoop.ts | 2 +- 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/backend/.env.dist.local b/backend/.env.dist.local index 614e6dd66d..680b6275d8 100755 --- a/backend/.env.dist.local +++ b/backend/.env.dist.local @@ -207,5 +207,4 @@ POM_FETCHER_NON_CRITICAL_BATCH_SIZE=500 POM_FETCHER_NON_CRITICAL_CONCURRENCY=20 POM_FETCHER_REFRESH_DAYS=1 POM_FETCHER_GROUP_DELAY_MS=100 -POM_FETCHER_FORCE_FULL_EXTRACTION=true POM_FETCHER_MAVEN_BASE_URL=https://maven-central.storage-download.googleapis.com/maven2 \ No newline at end of file diff --git a/services/apps/packages_worker/src/maven/README.md b/services/apps/packages_worker/src/maven/README.md index acda3acd8a..b4569e4e87 100644 --- 
a/services/apps/packages_worker/src/maven/README.md +++ b/services/apps/packages_worker/src/maven/README.md @@ -207,25 +207,25 @@ Eclipse/OSGi feature artifacts (e.g. `org.wso2.carbon.identity.xacml.server.feat ### Maven Central 403 rate limiting -Maven Central (`repo1.maven.org`) restituisce 403 come meccanismo di throttle oltre al canonico 429. Il comportamento è gestito a due livelli: +Maven Central (`repo1.maven.org`) returns 403 as a throttling mechanism in addition to the canonical 429. The behaviour is handled at two levels: -1. **Retry con backoff esponenziale** — 403 e 429 vengono ritentati fino a 3 volte (2s base, ×2 per tentativo). Gestito in `getWithRetry` (extract.ts) e `resolveVersionsList` (metadata.ts). +1. **Exponential backoff retry** — 403 and 429 are retried up to 3 times (2s base, ×2 per attempt). Handled in `getWithRetry` (extract.ts) and `resolveVersionsList` (metadata.ts). -2. **Retry al prossimo pass** — se tutti i retry esauriscono, il batch conta il pacchetto come errore (nessun record sentinel viene scritto su `packages`) e lo riprenderà al tick/pass successivo, quando l'IP è di nuovo freddo. +2. **Retry on the next pass** — if all retries are exhausted, the batch counts the package as an error (no sentinel record is written to `packages`) and picks it up again on the next tick/pass, when the IP has cooled down. -**Causa root dei 403 persistenti:** `packages_universe` è ordinato per `rank_in_ecosystem`, quindi pacchetti dello stesso namespace (es. `com.google.apis`, `org.wso2`, `software.amazon.awssdk`) si raggruppano nel batch e colpiscono lo stesso CDN node di Maven Central in rapida successione. Il rate limit scatta sistematicamente dopo ~150–200 pacchetti processati. +**Root cause of persistent 403s:** `packages_universe` is ordered by `rank_in_ecosystem`, so packages from the same namespace (e.g. 
`com.google.apis`, `org.wso2`, `software.amazon.awssdk`) cluster together in the batch and hit the same Maven Central CDN node in rapid succession. The rate limit consistently kicks in after ~150–200 packages processed. -**Mitigazione applicata, in ordine di efficacia:** +**Mitigations applied, in order of effectiveness:** -1. **Cache in-process dei parent POM** (vedi [Parent POM cache](#parent-pom-cache)) — sfrutta il clustering per namespace per collassare i fetch dei parent condivisi e il doppio fetch del leaf POM, riducendo il **volume totale** di richieste. È la leva principale: il throttle è volume-based per IP, quindi meno richieste = meno 403. -2. Un delay configurabile tra i gruppi concorrenti (`POM_FETCHER_GROUP_DELAY_MS`) + `POM_FETCHER_CONCURRENCY` basso (≤5) → abbassano il rate istantaneo. -3. Backoff di retry con jitter (`±500ms`, vedi `extract.ts` / `metadata.ts`) → evita retry sincronizzati. +1. **In-process parent POM cache** (see [Parent POM cache](#parent-pom-cache)) — leverages namespace clustering to collapse fetches of shared parents and the double fetch of the leaf POM, reducing the **total volume** of requests. This is the main lever: throttling is volume-based per IP, so fewer requests = fewer 403s. +2. A configurable delay between concurrent groups (`POM_FETCHER_GROUP_DELAY_MS`) + low `POM_FETCHER_CONCURRENCY` (≤5) → lower the instantaneous rate. +3. Retry backoff with jitter (`±500ms`, see `extract.ts` / `metadata.ts`) → avoids synchronized retries. -> Nota: **lo shuffle dei batch non aiuta** — riordina gli stessi N request nella stessa finestra temporale (stesso volume → stesso throttle) e in più romperebbe la località che rende efficace la cache dei parent. +> Note: **shuffling the batch does not help** — it reorders the same N requests within the same time window (same volume → same throttle) and would additionally break the locality that makes the parent cache effective. 
-Namespace noti per triggerare il rate limit a causa dell'alta densità di artefatti: `com.google.apis`, `software.amazon.awssdk`, `org.wso2.*`. +Namespaces known to trigger the rate limit due to their high artifact density: `com.google.apis`, `software.amazon.awssdk`, `org.wso2.*`. -**IP caldo durante i test locali:** run ripetute sulla stessa macchina accumulano request history sull'IP. Maven Central usa finestre di throttle lunghe (1–4 ore), quindi anche a concurrency=3 + delay=400ms l'IP può rimanere in stato di throttle per tutta la sessione di test. In produzione questo non accade perché le run sono distanziate di 24 ore e l'IP è sempre freddo tra un pass e l'altro. Per verificare se l'IP è throttlato: `curl -I https://repo1.maven.org/maven2/org/wso2/carbon/identity/framework/application-mgt/maven-metadata.xml` — risposta 403 immediata conferma il throttle. +**Hot IP during local testing:** repeated runs on the same machine accumulate request history on the IP. Maven Central uses long throttle windows (1–4 hours), so even at concurrency=3 + delay=400ms the IP can stay throttled for the entire test session. This does not happen in production because runs are spaced 24 hours apart and the IP is always cold between passes. To check whether the IP is throttled: `curl -I https://repo1.maven.org/maven2/org/wso2/carbon/identity/framework/application-mgt/maven-metadata.xml` — an immediate 403 response confirms the throttle. 
### Partial Maven Central Deploys diff --git a/services/apps/packages_worker/src/maven/runMavenEnrichmentLoop.ts b/services/apps/packages_worker/src/maven/runMavenEnrichmentLoop.ts index cafc135309..607bd05c07 100644 --- a/services/apps/packages_worker/src/maven/runMavenEnrichmentLoop.ts +++ b/services/apps/packages_worker/src/maven/runMavenEnrichmentLoop.ts @@ -2,6 +2,7 @@ import crypto from 'crypto' import { MavenPackageToSync, + QueryExecutor, listMavenPackagesToSync, logAuditFieldChange, replacePackageMaintainers, @@ -12,7 +13,6 @@ import { upsertRepo, upsertVersionsBatch, } from '@crowd/data-access-layer' -import { QueryExecutor } from '@crowd/data-access-layer/src/queryExecutor' import { getServiceChildLogger } from '@crowd/logging' import { getMavenConfig } from '../config' From 1837aa39497793dc457c75fa67e0daa06d7054aa Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Thu, 4 Jun 2026 13:44:16 +0200 Subject: [PATCH 19/22] fix: review Signed-off-by: Umberto Sgueglia --- .../apps/packages_worker/src/maven/README.md | 4 ++-- .../src/maven/runMavenEnrichmentLoop.ts | 6 ------ .../data-access-layer/src/osspckgs/packages.ts | 16 ++++------------ .../libs/data-access-layer/src/osspckgs/types.ts | 1 - 4 files changed, 6 insertions(+), 21 deletions(-) diff --git a/services/apps/packages_worker/src/maven/README.md b/services/apps/packages_worker/src/maven/README.md index b4569e4e87..2828f9bd7d 100644 --- a/services/apps/packages_worker/src/maven/README.md +++ b/services/apps/packages_worker/src/maven/README.md @@ -81,8 +81,8 @@ hitRate }`; the critical batch logs it once per batch under message **`POM cache The matrix below describes the **critical** path (full POM + parent resolution). 
Non-critical packages are DB-only: they receive just the universe-stat columns -(`criticality_score`, `dependent_packages_count`, `dependent_repos_count`, -`downloads_last_month`) plus `purl`/`namespace`/`name`/`registry_url`/`last_synced_at`; +(`criticality_score`, `dependent_packages_count`, `dependent_repos_count`) +plus `purl`/`namespace`/`name`/`registry_url`/`last_synced_at`; all POM-derived columns stay null for them. ### packages diff --git a/services/apps/packages_worker/src/maven/runMavenEnrichmentLoop.ts b/services/apps/packages_worker/src/maven/runMavenEnrichmentLoop.ts index 607bd05c07..d20f1e0cd6 100644 --- a/services/apps/packages_worker/src/maven/runMavenEnrichmentLoop.ts +++ b/services/apps/packages_worker/src/maven/runMavenEnrichmentLoop.ts @@ -110,7 +110,6 @@ async function processNonCriticalPackage(qx: QueryExecutor, pkg: PackageRow): Pr criticalityScore: pkg.criticalityScore, dependentPackagesCount: pkg.dependentPackagesCount, dependentReposCount: pkg.dependentReposCount, - downloadsLastMonth: pkg.downloads30d, }) } @@ -151,7 +150,6 @@ async function processCriticalPackage( criticalityScore: pkg.criticalityScore, dependentPackagesCount: pkg.dependentPackagesCount, dependentReposCount: pkg.dependentReposCount, - downloadsLastMonth: pkg.downloads30d, }) log.warn({ groupId, artifactId }, 'Not on Maven Central — writing minimal record') return { status: 'skipped', hopLimitReached: false } @@ -188,7 +186,6 @@ async function processCriticalPackage( criticalityScore: pkg.criticalityScore, dependentPackagesCount: pkg.dependentPackagesCount, dependentReposCount: pkg.dependentReposCount, - downloadsLastMonth: pkg.downloads30d, }) log.warn({ groupId, artifactId }, 'No release version in metadata — writing minimal record') return { status: 'skipped', hopLimitReached: false } @@ -200,7 +197,6 @@ async function processCriticalPackage( criticalityScore: pkg.criticalityScore, dependentPackagesCount: pkg.dependentPackagesCount, dependentReposCount: 
pkg.dependentReposCount, - downloadsLastMonth: pkg.downloads30d, }) log.debug({ groupId, artifactId, version }, 'Version unchanged — skipping POM extraction') return { status: 'unchanged', hopLimitReached: false } @@ -229,7 +225,6 @@ async function processCriticalPackage( criticalityScore: pkg.criticalityScore, dependentPackagesCount: pkg.dependentPackagesCount, dependentReposCount: pkg.dependentReposCount, - downloadsLastMonth: pkg.downloads30d, }) return { status: 'error', hopLimitReached: false } } @@ -273,7 +268,6 @@ async function processCriticalPackage( criticalityScore: pkg.criticalityScore, dependentPackagesCount: pkg.dependentPackagesCount, dependentReposCount: pkg.dependentReposCount, - downloadsLastMonth: pkg.downloads30d, }) pkgChanged.forEach((f) => changed.add(f)) diff --git a/services/libs/data-access-layer/src/osspckgs/packages.ts b/services/libs/data-access-layer/src/osspckgs/packages.ts index 995a6139b7..3afefee55a 100644 --- a/services/libs/data-access-layer/src/osspckgs/packages.ts +++ b/services/libs/data-access-layer/src/osspckgs/packages.ts @@ -27,7 +27,6 @@ export type MavenPackageToSync = Pick< | 'criticalityScore' | 'dependentPackagesCount' | 'dependentReposCount' - | 'downloads30d' > & { purl: string latestVersion: string | null @@ -74,7 +73,6 @@ export async function listMavenPackagesToSync( p.criticality_score AS "criticalityScore", p.dependent_count AS "dependentPackagesCount", p.dependent_repos_count AS "dependentReposCount", - p.downloads_last_month AS "downloads30d", p.latest_version AS "latestVersion" FROM packages p WHERE @@ -106,7 +104,6 @@ export async function listMavenPackagesToSync( pu.criticality_score AS "criticalityScore", pu.dependent_count AS "dependentPackagesCount", pu.dependent_repos_count AS "dependentReposCount", - pu.downloads_30d AS "downloads30d", p.latest_version AS "latestVersion" FROM packages_universe pu LEFT JOIN packages p ON p.purl = pu.purl @@ -142,7 +139,6 @@ export async function touchPackageSyncedAt( 
criticalityScore: number | null | undefined dependentPackagesCount: number | null | undefined dependentReposCount: number | null | undefined - downloadsLastMonth: bigint | null | undefined }, ): Promise { await qx.result( @@ -151,8 +147,7 @@ export async function touchPackageSyncedAt( last_synced_at = NOW(), criticality_score = COALESCE($(criticalityScore), criticality_score), dependent_count = COALESCE($(dependentPackagesCount), dependent_count), - dependent_repos_count = COALESCE($(dependentReposCount), dependent_repos_count), - downloads_last_month = COALESCE($(downloadsLastMonth), downloads_last_month) + dependent_repos_count = COALESCE($(dependentReposCount), dependent_repos_count) WHERE purl = $(purl) `, { @@ -160,8 +155,7 @@ export async function touchPackageSyncedAt( criticalityScore: metrics.criticalityScore ?? null, dependentPackagesCount: metrics.dependentPackagesCount ?? null, dependentReposCount: metrics.dependentReposCount ?? null, - downloadsLastMonth: metrics.downloadsLastMonth ?? 
null, - }, + }, ) } @@ -202,13 +196,13 @@ export async function upsertPackage( purl, ecosystem, namespace, name, description, homepage, registry_url, declared_repository_url, repository_url, licenses, licenses_raw, latest_version, versions_count, latest_release_at, - criticality_score, dependent_count, dependent_repos_count, downloads_last_month, + criticality_score, dependent_count, dependent_repos_count, ingestion_source, last_synced_at ) VALUES ( $(purl), $(ecosystem), $(namespace), $(name), $(description), $(homepage), $(registryUrl), $(declaredRepositoryUrl), $(repositoryUrl), $(licenses)::text[], $(licensesRaw), $(latestVersion), $(versionsCount), $(latestReleaseAt), - $(criticalityScore), $(dependentPackagesCount), $(dependentReposCount), $(downloadsLastMonth), + $(criticalityScore), $(dependentPackagesCount), $(dependentReposCount), $(ingestionSource), NOW() ) ON CONFLICT (purl) DO UPDATE SET @@ -225,7 +219,6 @@ export async function upsertPackage( criticality_score = COALESCE(EXCLUDED.criticality_score, packages.criticality_score), dependent_count = COALESCE(EXCLUDED.dependent_count, packages.dependent_count), dependent_repos_count = COALESCE(EXCLUDED.dependent_repos_count, packages.dependent_repos_count), - downloads_last_month = COALESCE(EXCLUDED.downloads_last_month, packages.downloads_last_month), ingestion_source = EXCLUDED.ingestion_source, last_synced_at = NOW() RETURNING id, description, homepage, registry_url, declared_repository_url, repository_url, @@ -256,7 +249,6 @@ export async function upsertPackage( criticalityScore: item.criticalityScore ?? null, dependentPackagesCount: item.dependentPackagesCount ?? null, dependentReposCount: item.dependentReposCount ?? null, - downloadsLastMonth: item.downloadsLastMonth ?? 
null, }, ) return { id: row.id as number, changedFields: row.changed_fields as string[] } diff --git a/services/libs/data-access-layer/src/osspckgs/types.ts b/services/libs/data-access-layer/src/osspckgs/types.ts index 6b62d91f88..2a2545f697 100644 --- a/services/libs/data-access-layer/src/osspckgs/types.ts +++ b/services/libs/data-access-layer/src/osspckgs/types.ts @@ -33,7 +33,6 @@ export type IDbPackageUpsert = { criticalityScore?: number | null dependentPackagesCount?: number | null dependentReposCount?: number | null - downloadsLastMonth?: bigint | null registryUrl?: string | null repositoryUrl?: string | null } From 8a690ce1ad63327d4ecb5cfd37e2000e8ba28d93 Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Thu, 4 Jun 2026 13:49:28 +0200 Subject: [PATCH 20/22] fix: lint Signed-off-by: Umberto Sgueglia --- services/libs/data-access-layer/src/osspckgs/packages.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/libs/data-access-layer/src/osspckgs/packages.ts b/services/libs/data-access-layer/src/osspckgs/packages.ts index 3afefee55a..344917b79e 100644 --- a/services/libs/data-access-layer/src/osspckgs/packages.ts +++ b/services/libs/data-access-layer/src/osspckgs/packages.ts @@ -155,7 +155,7 @@ export async function touchPackageSyncedAt( criticalityScore: metrics.criticalityScore ?? null, dependentPackagesCount: metrics.dependentPackagesCount ?? null, dependentReposCount: metrics.dependentReposCount ?? 
null, - }, + }, ) } From 40f8596bed618b470be50a95d1b66c02ff8ed0fa Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Thu, 4 Jun 2026 15:18:10 +0200 Subject: [PATCH 21/22] fix: review comments Signed-off-by: Umberto Sgueglia --- .../apps/packages_worker/src/maven/README.md | 373 ------------------ .../apps/packages_worker/src/maven/extract.ts | 12 +- .../src/osspckgs/versions.ts | 19 +- 3 files changed, 20 insertions(+), 384 deletions(-) delete mode 100644 services/apps/packages_worker/src/maven/README.md diff --git a/services/apps/packages_worker/src/maven/README.md b/services/apps/packages_worker/src/maven/README.md deleted file mode 100644 index 2828f9bd7d..0000000000 --- a/services/apps/packages_worker/src/maven/README.md +++ /dev/null @@ -1,373 +0,0 @@ -# Maven POM Fetcher - -Worker that syncs Maven package metadata from Maven Central into the `packages` DB. -Lives in `packages_worker` and can run in two ways: - -- **Temporal (production, incremental):** entry point `bin/packages-worker.ts` registers the - `maven-critical` Temporal schedule. Each tick runs a **single batch** as a Temporal activity, - and **skips POM extraction when the version is unchanged**. See [Scheduling](#scheduling). -- **One-shot backfill:** entry point `bin/maven-backfill.ts` (`pnpm backfill:maven`) drains the - Tier 2 critical queue once with **full extraction** (no version short-circuit), then exits. - Run it `exec`ed into the `packages-worker` container for the initial fill / periodic full - refresh. It is resumable — the DB state is the cursor, so re-running picks up where it left off. - ---- - -## Architecture: Two-Tier Fetch - -Both phases pull candidates from `packages_universe` (filtered by `is_critical`, ordered by -`rank_in_ecosystem`) and write into `packages`. A package is a candidate when it is not yet -in `packages` (`last_synced_at IS NULL`) or its `last_synced_at` is older than -`POM_FETCHER_REFRESH_DAYS`. 
The two phases run as **separate Temporal schedules** (or, in the -standalone loop, only the critical phase runs). - -### Non-Critical phase (`is_critical = false`) - -DB-only. Copies universe stats (criticality score, downloads, dependent counts) into -`packages` with `ingestion_source = 'packages_universe'`. **No HTTP.** Fast (~1000 pkg/sec). - -### Critical phase (`is_critical = true`) - -Full POM extraction from Maven Central with parent-chain resolution (max 8 hops). Populates -description, homepage, SCM/repo, licenses, maintainers and the full version list. - -Whether the version short-circuit applies is fixed per **entry point** (not a runtime flag): - -| Entry point | Mode | Behaviour | -| -------------------------------- | --------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| Standalone `bin/maven.ts` | **backfill** | Always runs full POM extraction for every selected critical package, regardless of version. Use for the initial fill / periodic full refresh. | -| Temporal `mavenCriticalWorkflow` | **incremental** | If the upstream release version equals the stored `latest_version`, skips the POM fetch and only bumps `last_synced_at` (status `unchanged`). Full extraction runs only for new packages or when the version changed. | - -This is passed as the `forceFullExtraction` argument to `processBatch` — `true` from the -standalone loop, `false` from the Temporal activity. There is no env variable for it. - -**Why two tiers?** Parent POM resolution is the expensive part — multiple HTTP requests per -package (up to 8 extra fetches). Running it on millions of non-critical packages is not -feasible; for them the DB-only universe stats are enough. The extra cost is reserved for -critical packages, where data quality matters. 
- -### Parent POM cache - -Parent POMs are shared across many artifacts of the same namespace (`org.apache:apache`, -`org.springframework.boot:spring-boot-starter-parent`, `com.google.cloud:google-cloud-shared-config`, …). -The batch is **selected** by criticality (`rank_in_ecosystem` / `criticality_score` via the SQL -`LIMIT`), but that ordering does **not** group same-namespace siblings — so before processing, -`processPackages` re-sorts each critical batch by `(namespace, name)`. That sort is what puts the -siblings adjacent, so a parent fetched for one is still cached when the next arrives. A -module-level, coordinate-keyed in-process cache in `extract.ts` then collapses the repeated parent -fetches into a **single** HTTP request, and also removes the redundant second fetch of each -artifact's own POM (`extractArtifact` fetches the leaf, then `resolveWithInheritance` would fetch -it again at depth 0). This is the **single biggest lever against Maven Central rate limiting** — and -it works _because_ of the namespace sort, so re-shuffling the batch would be counter-productive. - -- **Only successful fetches are cached.** `fetchPom` returns `null` for both a real 404 and a - transient failure (throttle/timeout), so caching `null` would poison the cache — it is never - done. Missing/failed POMs are simply re-fetched on the next pass. -- **No TTL.** Maven coordinates are immutable, so a cached POM never goes stale. The cache is - bounded by an LRU size cap (`POM_CACHE_MAX_ENTRIES`, default 5000) purely to cap memory. -- **Request coalescing.** Concurrent fetches for the same coordinates share a single in-flight - request instead of issuing duplicates. -- **Observability.** `getPomCacheStats()` returns `{ size, hits, coalesced, misses, evictions, -hitRate }`; the critical batch logs it once per batch under message **`POM cache`**, so you - can watch the hit rate climb as the cache warms. - -> The cache lives for the lifetime of the worker process. 
Under Temporal it persists **across -> batches/ticks** (same process), so the hit rate keeps improving across the run; in the -> standalone loop it persists across passes until the process is restarted. - ---- - -## Coverage Matrix (critical packages — full POM extraction) - -The matrix below describes the **critical** path (full POM + parent resolution). -Non-critical packages are DB-only: they receive just the universe-stat columns -(`criticality_score`, `dependent_packages_count`, `dependent_repos_count`) -plus `purl`/`namespace`/`name`/`registry_url`/`last_synced_at`; -all POM-derived columns stay null for them. - -### packages - -| Column | Source | Coverage | -| ------------------------ | --------------------------------------------------- | --------------------------------------- | -| purl | packages_universe | ✅ all | -| ecosystem | hardcoded `'maven'` | ✅ all | -| namespace | packages_universe.namespace (= groupId) | ✅ all | -| name | packages_universe.name (= artifactId) | ✅ all | -| registry_url | `https://central.sonatype.com/artifact/{ns}/{name}` | ✅ all | -| latest_version | maven-metadata.xml `` | ✅ all | -| ingestion_source | see table below | ✅ all | -| last_synced_at | NOW() | ✅ all | -| description | POM `` | ✅ best-effort¹ | -| homepage | POM `` | ✅ best-effort¹ | -| declared_repository_url | POM `` raw | ✅ best-effort¹ | -| repository_url | normalized from declared_repository_url | ✅ best-effort¹ | -| licenses / licenses_raw | POM `` | ✅ best-effort¹ / ✅ full for critical² | -| status | Sonatype: deprecated flag | 🔜 Sonatype | -| versions_count | Sonatype: COUNT of releases | 🔜 Sonatype | -| first_release_at | Sonatype: MIN release timestamp | 🔜 Sonatype | -| latest_release_at | Sonatype: MAX release timestamp | 🔜 Sonatype | -| keywords | not in Maven POM | ❌ | -| dist*tags*\* | N/A — Maven ecosystem | ❌ | -| dependent_packages_count | not in Maven registry API | ❌ | -| dependent_repos_count | not in Maven registry API | ❌ | -| 
criticality_score | set by ranking function | ❌ | -| is_critical | set by ranking function | ❌ | -| last_rank_pass_at | set by ranking function | ❌ | - -### versions - -| Column | Source | Coverage | -| -------------- | ------------------------------------------------------------------------------------ | --------------- | -| package_id | FK from packages upsert | ✅ all | -| ecosystem | hardcoded `'maven'` | ✅ all | -| number | maven-metadata.xml `` | ✅ all | -| is_latest | `number === ` | ✅ all | -| is_prerelease | regex on version string³ | ✅ all | -| last_synced_at | NOW() | ✅ all | -| licenses | package-level license applied to all versions⁴ (stored as a single-element `text[]`) | ✅ best-effort¹ | -| published_at | Sonatype: release timestamp | 🔜 Sonatype | -| is_yanked | no yank mechanism in Maven | ❌ | -| download_count | no public per-version API | ❌ | - -### maintainers / package_maintainers - -| Column | Source | Coverage | -| ------------ | -------------------------------------------------------------------- | --------------- | -| ecosystem | hardcoded `'maven'` | ✅ all | -| username | POM `` | ✅ best-effort¹ | -| display_name | POM `` | ✅ best-effort¹ | -| email_hash | SHA-256(``) — GDPR | ✅ best-effort¹ | -| url | POM `` | ✅ best-effort¹ | -| role | `'author'` from ``, `'maintainer'` from `` | ✅ best-effort¹ | -| github_login | requires identity resolution | ❌ | - -### repos / package_repos - -| Column | Source | Coverage | -| ------------------------------------- | -------------------------------------------------------------- | ------------------------------- | -| repos.url | `repository_url` (normalized from POM ``) | ✅ best-effort¹ | -| repos.host | derived from URL (`github` / `gitlab` / `bitbucket` / `other`) | ✅ best-effort¹ | -| repos.owner | URL path segment | ✅ best-effort¹ | -| repos.name | URL path segment | ✅ best-effort¹ | -| repos.description / stars / forks / … | GitHub enricher | filled by github-repos-enricher | -| package_repos.source 
| `'declared'` (from POM ``) | ✅ best-effort¹ | -| package_repos.confidence | `0.80` | ✅ best-effort¹ | - -The POM fetcher seeds `repos` with URL-derivable fields only. The GitHub enricher then fills the rest (description, stars, forks, language, topics, etc.) because the repo row already exists. On conflict the `repos` upsert uses `COALESCE` — richer data from other enrichers is never overwritten. - -`package_repos.confidence` is updated with `GREATEST(new, existing)` so a higher-confidence link from deps.dev (`0.90`) is never downgraded by our `0.80` write. - -### Not supported (no Maven source) - -`package_funding_links` — no funding concept in Maven POM. -`package_name_history` — Maven coordinates are immutable; rename history does not exist. -`downloads_daily` — no public per-day download API from Maven Central. -`downloads_last_30d` — 🔜 Sonatype. - ---- - -**Notes:** - -> ¹ **best-effort**: field is populated only when present in the resolved POM chain. Non-critical packages skip POM fetching entirely (DB-only), so these columns are always null for them. -> -> ² **full resolution for critical**: parent chain is followed (max 8 hops), so inherited fields are resolved correctly. -> -> ³ **prerelease regex**: matches `-SNAPSHOT`, `-alpha`, `-beta`, `-rc`, `-M[0-9]+` (case-insensitive). -> -> ⁴ **license per version**: the package-level license (first entry from POM ``) is applied to all versions. Per-version POM fetches are not performed. This is an approximation — Maven licenses rarely change between versions. 
- ---- - -## `ingestion_source` Values - -| Value | Meaning | -| ---------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `maven-registry` | Critical — full POM + parent resolution succeeded | -| `packages_universe` | Non-critical (DB-only) — only universe stats copied, no POM fetch | -| `maven_not_on_central` | `maven-metadata.xml` not found on `repo1.maven.org` — artifact is hosted on a third-party repository (e.g. WSO2 Nexus, JBoss, Atlassian). Universe data came from an aggregator (deps.dev, OSV). | -| `maven_no_version` | `maven-metadata.xml` found but `` is empty — artifact has no stable release | -| `maven_error` | `maven-metadata.xml` has a release version but the `.pom` file for that version is a 404. Typical cause: partial deploy to Maven Central (metadata updated, artifact not uploaded) or Eclipse P2 feature artifacts that don't publish a standard POM. | - -> On a 403/429 rate-limit or a transient network error, **no sentinel record is written**: -> the batch counts the package as an error and it is simply retried on the next tick/pass. - ---- - -## Known Exceptions - -### WSO2 (`org.wso2.*`) - -WSO2 publishes some artifacts exclusively to their own Nexus at `maven.wso2.org`. A subset of their artifacts appear in `packages_universe` (sourced from deps.dev/OSV which aggregates all Maven repositories) but are **not** available on `repo1.maven.org/maven2`. - -Affected pattern: `org.wso2.carbon.*` — specifically `.feature` Eclipse P2 artifacts and `.stub` artifacts. These are written with `ingestion_source = 'maven_not_on_central'` and are not retried until the next `nonCriticalPomRefreshDays` window. - -### Eclipse P2 Feature Artifacts (`*.feature`) - -Eclipse/OSGi feature artifacts (e.g. 
`org.wso2.carbon.identity.xacml.server.feature`) are packaged as `.zip` files, not `.jar`. Some publishers update `maven-metadata.xml` on Central without uploading the corresponding `.pom`. These land in `maven_error`. No fix is possible without the publisher correcting their CI/CD pipeline. - -### Maven Central 403 rate limiting - -Maven Central (`repo1.maven.org`) returns 403 as a throttling mechanism in addition to the canonical 429. The behaviour is handled at two levels: - -1. **Exponential backoff retry** — 403 and 429 are retried up to 3 times (2s base, ×2 per attempt). Handled in `getWithRetry` (extract.ts) and `resolveVersionsList` (metadata.ts). - -2. **Retry on the next pass** — if all retries are exhausted, the batch counts the package as an error (no sentinel record is written to `packages`) and picks it up again on the next tick/pass, when the IP has cooled down. - -**Root cause of persistent 403s:** `packages_universe` is ordered by `rank_in_ecosystem`, so packages from the same namespace (e.g. `com.google.apis`, `org.wso2`, `software.amazon.awssdk`) cluster together in the batch and hit the same Maven Central CDN node in rapid succession. The rate limit consistently kicks in after ~150–200 packages processed. - -**Mitigations applied, in order of effectiveness:** - -1. **In-process parent POM cache** (see [Parent POM cache](#parent-pom-cache)) — leverages namespace clustering to collapse fetches of shared parents and the double fetch of the leaf POM, reducing the **total volume** of requests. This is the main lever: throttling is volume-based per IP, so fewer requests = fewer 403s. -2. A configurable delay between concurrent groups (`POM_FETCHER_GROUP_DELAY_MS`) + low `POM_FETCHER_CONCURRENCY` (≤5) → lower the instantaneous rate. -3. Retry backoff with jitter (`±500ms`, see `extract.ts` / `metadata.ts`) → avoids synchronized retries. 
- -> Note: **shuffling the batch does not help** — it reorders the same N requests within the same time window (same volume → same throttle) and would additionally break the locality that makes the parent cache effective. - -Namespaces known to trigger the rate limit due to their high artifact density: `com.google.apis`, `software.amazon.awssdk`, `org.wso2.*`. - -**Hot IP during local testing:** repeated runs on the same machine accumulate request history on the IP. Maven Central uses long throttle windows (1–4 hours), so even at concurrency=3 + delay=400ms the IP can stay throttled for the entire test session. This does not happen in production because runs are spaced 24 hours apart and the IP is always cold between passes. To check whether the IP is throttled: `curl -I https://repo1.maven.org/maven2/org/wso2/carbon/identity/framework/application-mgt/maven-metadata.xml` — an immediate 403 response confirms the throttle. - -### Partial Maven Central Deploys - -Occasionally a publisher's CI/CD updates `` in `maven-metadata.xml` before the `.pom` is fully propagated to all Central mirrors. These appear as `maven_error` on the first pass and usually resolve on the next periodic refresh. - ---- - -## Configuration Reference - -**All variables are required** — `getMavenConfig()` (`config.ts`) calls `requireEnv` for each, -so the worker throws on startup if any is missing. Suggested values shown. 
- -| Env var | Suggested | Description | -| -------------------------------------- | ----------- | ------------------------------------------------------------------------------------------------------------ | -| `POM_FETCHER_BATCH_SIZE` | `50` | Packages per batch — critical phase | -| `POM_FETCHER_CONCURRENCY` | `5` | Concurrent fetches — critical phase | -| `POM_FETCHER_NON_CRITICAL_BATCH_SIZE` | `500` | Packages per batch — non-critical phase | -| `POM_FETCHER_NON_CRITICAL_CONCURRENCY` | `20` | Concurrent writes — non-critical DB-only phase | -| `POM_FETCHER_REFRESH_DAYS` | `1` | Staleness window — re-sync a package once its `last_synced_at` is older than N days (applies to both phases) | -| `POM_FETCHER_GROUP_DELAY_MS` | `200`–`400` | Delay between concurrent groups in the critical phase (rate-limit mitigation) | - -**Concurrency guidance:** Maven Central handles 10–15 concurrent requests per IP without throttling. Retry logic with exponential backoff handles 429/403s. Keep `POM_FETCHER_CONCURRENCY` ≤ 5 locally — repeated local runs heat the IP (see [Known Exceptions](#maven-central-403-rate-limiting)). 
- ---- - -## Performance - -Observed on ~2K packages (local dev, Maven Central over the network): - -| Phase | Mode | Throughput | Notes | -| ------------ | ---------- | -------------- | ------------------------------------------------------- | -| Non-critical | DB-only | ~1000 pkg/sec | Pure DB writes, no HTTP | -| Non-critical | direct-pom | ~25 pkg/sec | 2 HTTP requests/pkg: metadata.xml + POM | -| Critical | full-pom | ~15–25 pkg/sec | Faster when packages share parent POMs (CDN cache warm) | - -**Estimated time for ~800K packages (≈18% critical):** - -| Phase | Packages | Estimated time | -| ------------------------------------- | -------- | -------------- | -| Non-critical (DB-only) | ~670K | ~12 min | -| Critical (full POM, first extraction) | ~150K | several hours | - -The first critical extraction is the expensive part — run it with the standalone backfill -loop. Afterwards the Temporal schedules keep things incremental: non-critical re-syncs cheaply -every `POM_FETCHER_REFRESH_DAYS`, and critical packages are re-fetched only when a new release -is published (the Temporal path skips unchanged versions) -or once their refresh window elapses. - -Under Temporal **each tick processes one batch** within its workflow timeout (15 min critical / -5 min non-critical); the backlog is drained over many ticks, not in one long pass. To go -faster, raise `POM_FETCHER_BATCH_SIZE` / `POM_FETCHER_CONCURRENCY` (keep concurrency ≤ 15 to -avoid Maven Central throttling) or trigger the schedule manually. 
- ---- - -## Scheduling - -Two Temporal schedules are registered on startup of `bin/packages-worker.ts` -(see `maven/schedule.ts`): - -| Schedule ID | Cron | Workflow | Activity | Workflow timeout | -| -------------------- | ----------------------------- | -------------------------- | ------------------------------------------------------- | ---------------- | -| `maven-critical` | `*/5 * * * *` (every 5 min) | `mavenCriticalWorkflow` | `processMavenCriticalBatch` → one critical batch | 15 min | -| `maven-non-critical` | `*/10 * * * *` (every 10 min) | `mavenNonCriticalWorkflow` | `processMavenNonCriticalBatch` → one non-critical batch | 5 min | - -Both: overlap policy `SKIP` (a tick is dropped if the previous run is still active), -catchup window 1 hour, retry 3× (30s initial, 2× backoff). - -**Each tick processes a single batch** (`POM_FETCHER_BATCH_SIZE` / -`POM_FETCHER_NON_CRITICAL_BATCH_SIZE`), not a full pass — the queue is drained incrementally -across ticks. - -To run a batch on demand instead of waiting for the cron, trigger the schedule from the -Temporal UI (the schedule's **Trigger** button) or the CLI: - -```bash -temporal schedule trigger --schedule-id maven-critical -``` - ---- - -## Known Data Anomalies - -### High version counts - -Maven packages released via automated CI/CD pipelines (every commit or every day) accumulate thousands of versions on Central. Observed examples on a 10K sample: - -| Package | Versions | -| --------------------------------- | --------- | -| io.joern/x2cpg_3 | ~2 166 | -| org.cdk8s/cdk8s | ~1 749 | -| io.joern/semanticcpg_3 | ~2 077 | -| org.janusgraph/\* (×15 artifacts) | ~795 each | - -`maven-metadata.xml` `` lists **every version ever published**, including each snapshot, alpha, RC, and automated patch. On a 10K package run this produced ~3.8M rows in the `versions` table (~1 375 versions/package on average). - -This is correct data, not a bug. 
The high cardinality is expected given Maven's publishing model and is useful for `versions_count`, `first_release_at`, and `is_prerelease` derivation. `published_at` (pending Sonatype) will complete the picture. - -To inspect the distribution: - -```sql -SELECT - width_bucket(cnt, 0, 3000, 10) AS bucket, - min(cnt) AS min_versions, - max(cnt) AS max_versions, - count(*) AS packages -FROM (SELECT package_id, count(*) AS cnt FROM versions GROUP BY package_id) t -GROUP BY bucket ORDER BY bucket; -``` - -### Low repo coverage for non-critical packages - -On the same 10K sample only ~3–4% of packages produced a `repos` row. The root cause is that `` is frequently absent from the direct POM and inherited from a parent POM instead (Apache parent, Spring parent, Google parent, etc.). Since non-critical packages use direct-POM fetch without parent resolution, those SCM URLs are null and no repo row is written. - -Coverage breakdown by ingestion source: - -```sql -SELECT p.ingestion_source, count(p.id) AS packages, count(pr.id) AS with_repo -FROM packages p -LEFT JOIN package_repos pr ON pr.package_id = p.id -WHERE p.ecosystem = 'maven' -GROUP BY p.ingestion_source -ORDER BY packages DESC; -``` - -Expected behaviour: - -- `maven` (critical, full resolution) → high repo coverage -- `maven_direct` (non-critical, no parent resolution) → low repo coverage -- `maven_not_on_central` / `maven_error` → no repo (no POM data) - -Repo coverage will grow naturally as the critical package set expands and as non-critical packages hit their 30-day POM refresh window. 
- ---- - -## Pending: Sonatype Integration - -The following fields require data from the Sonatype API and are not yet populated: - -- `packages.status` — deprecated flag -- `packages.versions_count` — count of published releases -- `packages.first_release_at` — timestamp of first release -- `packages.latest_release_at` — timestamp of most recent release -- `versions.published_at` — per-version release timestamp -- `downloads_last_30d` — 30-day rolling download count diff --git a/services/apps/packages_worker/src/maven/extract.ts b/services/apps/packages_worker/src/maven/extract.ts index 5c9c5f4a47..66485b558c 100644 --- a/services/apps/packages_worker/src/maven/extract.ts +++ b/services/apps/packages_worker/src/maven/extract.ts @@ -141,8 +141,9 @@ export async function fetchPom( // // Parent POMs are heavily shared across artifacts of the same namespace // (e.g. org.apache:apache, org.springframework.boot:spring-boot-starter-parent), -// and the sync queue is ordered by rank_in_ecosystem, so those siblings are -// processed close together. A module-level, coordinate-keyed in-process cache +// and the critical batch is re-sorted by namespace before processing (see +// runMavenEnrichmentLoop), so those siblings are processed close together. A +// module-level, coordinate-keyed in-process cache // collapses those repeated parent fetches into a single HTTP request — the single // biggest lever against Maven Central rate limiting. It also removes the redundant // second fetch of each artifact's own POM (extractArtifact fetches the leaf, then @@ -318,9 +319,10 @@ async function resolveWithInheritance( // ─── Public entry points ────────────────────────────────────────────────────── /** - * Fetches only the root POM without following the parent chain. - * Faster than extractArtifact — use for non-critical packages where inherited - * fields (licenses, SCM) may be missing but throughput matters more. 
+ * Fetches only the root POM without following the parent chain — faster than + * extractArtifact, but inherited fields (licenses, SCM) may be missing. + * Currently unused: kept as a lightweight option for high-throughput paths that + * don't need parent inheritance. */ export async function extractArtifactDirect( groupId: string, diff --git a/services/libs/data-access-layer/src/osspckgs/versions.ts b/services/libs/data-access-layer/src/osspckgs/versions.ts index cdbaa9c40d..6ea6e46f51 100644 --- a/services/libs/data-access-layer/src/osspckgs/versions.ts +++ b/services/libs/data-access-layer/src/osspckgs/versions.ts @@ -3,7 +3,8 @@ import { QueryExecutor } from '../queryExecutor' import { IDbVersionUpsert } from './types' /** - * Bulk-upserts a list of versions for a single package. + * Bulk-upserts a list of versions for a single package. All elements must share + * the same packageId — throws otherwise (the change-detection logic assumes it). * Uses UNNEST arrays to avoid N individual round-trips. * On conflict (package_id, number) updates is_latest, is_prerelease, and * licenses (never overwrites an existing licenses array with NULL). @@ -17,6 +18,14 @@ export async function upsertVersionsBatch( ): Promise { if (versions.length === 0) return [] + // This function operates on a single package: `old` reads by a scalar packageId and + // the changed-fields join keys on `number` alone, so mixing packageIds would silently + // corrupt the result. Enforce the invariant rather than rely on the caller. 
+ const packageId = versions[0].packageId + if (versions.some((v) => v.packageId !== packageId)) { + throw new Error('upsertVersionsBatch: all versions must belong to the same package') + } + // maven-metadata.xml sometimes contains duplicate version strings — deduplicate // by number before inserting to avoid "ON CONFLICT DO UPDATE command cannot affect // row a second time" from PostgreSQL @@ -37,11 +46,10 @@ export async function upsertVersionsBatch( ins AS ( INSERT INTO versions (package_id, ecosystem, namespace, name, number, is_latest, is_prerelease, licenses, last_synced_at) SELECT - t.package_id, t.ecosystem, t.namespace, t.name, t.number, t.is_latest, t.is_prerelease, + $(packageId)::bigint, t.ecosystem, t.namespace, t.name, t.number, t.is_latest, t.is_prerelease, CASE WHEN t.license IS NULL THEN NULL ELSE ARRAY[t.license] END, NOW() FROM UNNEST( - $(packageIds)::bigint[], $(ecosystems)::text[], $(namespaces)::text[], $(names)::text[], @@ -49,7 +57,7 @@ export async function upsertVersionsBatch( $(isLatests)::bool[], $(isPreleases)::bool[], $(licenses)::text[] - ) AS t(package_id, ecosystem, namespace, name, number, is_latest, is_prerelease, license) + ) AS t(ecosystem, namespace, name, number, is_latest, is_prerelease, license) ON CONFLICT (package_id, number) DO UPDATE SET namespace = COALESCE(EXCLUDED.namespace, versions.namespace), is_latest = EXCLUDED.is_latest, @@ -67,8 +75,7 @@ export async function upsertVersionsBatch( FROM ins LEFT JOIN old o ON o.number = ins.number `, { - packageId: versions[0].packageId, - packageIds: versions.map((v) => v.packageId), + packageId, ecosystems: versions.map((v) => v.ecosystem), namespaces: versions.map((v) => v.namespace), names: versions.map((v) => v.name), From 9229257064802680d5077d8a1b5b9041c8f19609 Mon Sep 17 00:00:00 2001 From: Umberto Sgueglia Date: Thu, 4 Jun 2026 16:32:00 +0200 Subject: [PATCH 22/22] fix: builder Signed-off-by: Umberto Sgueglia --- scripts/builders/packages-worker.env | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/builders/packages-worker.env b/scripts/builders/packages-worker.env index 9c8d44032c..038dcb5575 100644 --- a/scripts/builders/packages-worker.env +++ b/scripts/builders/packages-worker.env @@ -1,4 +1,4 @@ -DOCKERFILE="./services/docker/Dockerfile.packages-worker" +DOCKERFILE="./services/docker/Dockerfile.packages" CONTEXT="../" REPO="sjc.ocir.io/axbydjxa5zuh/packages-worker" SERVICES="maven-worker"