diff --git a/vulnerabilities/improvers/__init__.py b/vulnerabilities/improvers/__init__.py
index 97c18e6f9..45bc0b70b 100644
--- a/vulnerabilities/improvers/__init__.py
+++ b/vulnerabilities/improvers/__init__.py
@@ -31,6 +31,7 @@
     enhance_with_metasploit as enhance_with_metasploit_v2,
 )
 from vulnerabilities.pipelines.v2_improvers import flag_ghost_packages as flag_ghost_packages_v2
+from vulnerabilities.pipelines.v2_improvers import reference_collect_commits
 from vulnerabilities.pipelines.v2_improvers import relate_severities
 from vulnerabilities.pipelines.v2_improvers import unfurl_version_range as unfurl_version_range_v2
 from vulnerabilities.utils import create_registry
@@ -74,5 +75,6 @@
         compute_advisory_todo.ComputeToDo,
         collect_ssvc_trees.CollectSSVCPipeline,
         relate_severities.RelateSeveritiesPipeline,
+        reference_collect_commits.CollectReferencesFixCommitsPipeline,
     ]
 )
diff --git a/vulnerabilities/pipelines/v2_improvers/reference_collect_commits.py
b/vulnerabilities/pipelines/v2_improvers/reference_collect_commits.py
new file mode 100644
index 000000000..275025797
--- /dev/null
+++ b/vulnerabilities/pipelines/v2_improvers/reference_collect_commits.py
@@ -0,0 +1,172 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# VulnerableCode is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/aboutcode-org/vulnerablecode for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+#
+
+from collections import defaultdict
+
+from aboutcode.pipeline import LoopProgress
+from django.db.models import Prefetch
+from packageurl.contrib.purl2url import purl2url
+from packageurl.contrib.url2purl import url2purl
+
+from aboutcode.federated import get_core_purl
+from vulnerabilities.models import AdvisoryReference
+from vulnerabilities.models import AdvisoryV2
+from vulnerabilities.models import ImpactedPackage
+from vulnerabilities.models import PackageCommitPatch
+from vulnerabilities.models import Patch
+from vulnerabilities.pipelines import VulnerableCodePipeline
+from vulnerabilities.utils import is_commit
+
+
+class CollectReferencesFixCommitsPipeline(VulnerableCodePipeline):
+    """
+    Improver pipeline to scout References/Patch and create PackageCommitPatch entries.
+    """
+
+    pipeline_id = "collect_ref_fix_commits_v2"
+
+    @classmethod
+    def steps(cls):
+        return (cls.collect_and_store_fix_commits,)
+
+    def get_vcs_data(self, url):
+        """
+        Return a (base_purl, vcs_url, commit_hash) tuple extracted from ``url``, or
+        None when ``url`` does not point to a commit or cannot be processed.
+
+        >> self.get_vcs_data('https://github.com/aboutcode-org/vulnerablecode/commit/98e516011d6e096e25247b82fc5f196bbeecff10')
+        ("pkg:github/aboutcode-org/vulnerablecode", 'https://github.com/aboutcode-org/vulnerablecode', '98e516011d6e096e25247b82fc5f196bbeecff10')
+        >> self.get_vcs_data('https://github.com/aboutcode-org/vulnerablecode/pull/1974')
+        None
+        """
+        try:
+            purl = url2purl(url)
+            if not purl:
+                return
+
+            version = purl.version
+            if not version or not is_commit(version):
+                return
+
+            base_purl = get_core_purl(purl)
+            if not base_purl:
+                # Check before dereferencing so that a missing core purl is a
+                # plain "no data" result rather than a logged "Invalid URL" error.
+                return
+
+            vcs_url = purl2url(base_purl.to_string())
+            if vcs_url:
+                return base_purl, vcs_url, version
+        except Exception as e:
+            self.log(f"Invalid URL: url:{url} error:{e}")
+
+    def collect_and_store_fix_commits(self):
+        """
+        Scan the reference and patch URLs of every advisory for commit URLs and
+        store them in batches with bulk_commit_batch_update().
+        """
+        advisories = AdvisoryV2.objects.only("id").prefetch_related(
+            Prefetch("references", queryset=AdvisoryReference.objects.only("url")),
+            Prefetch("patches", queryset=Patch.objects.only("patch_url")),
+        )
+
+        progress = LoopProgress(total_iterations=advisories.count(), logger=self.log)
+
+        commit_batch = []
+        updated_pkg_patch_commit_count = 0
+        batch_size = 10000
+        for adv in progress.iter(advisories.paginated(per_page=batch_size)):
+            urls = {r.url for r in adv.references.all()} | {p.patch_url for p in adv.patches.all()}
+
+            for url in urls:
+                vcs_data = self.get_vcs_data(url)
+                if not vcs_data:
+                    continue
+                base_purl, vcs_url, commit_hash = vcs_data
+                commit_batch.append((str(base_purl), vcs_url, commit_hash, adv.id))
+
+            if len(commit_batch) >= batch_size:
+                updated_pkg_patch_commit_count += self.bulk_commit_batch_update(commit_batch)
+                commit_batch.clear()
+
+        if commit_batch:
+            updated_pkg_patch_commit_count += self.bulk_commit_batch_update(commit_batch)
+            commit_batch.clear()
+
+        self.log(f"Successfully processed pkg patch commit {updated_pkg_patch_commit_count:,d}")
+
+    def bulk_commit_batch_update(self, vcs_data_table):
+        """
+        Store a batch of (base_purl, vcs_url, commit_hash, advisory_id) rows from
+        ``vcs_data_table`` and return the number of rows processed.
+
+        Create the missing ImpactedPackage and PackageCommitPatch entries, then
+        relate each commit patch to the impacted packages it fixes.
+        """
+        if not vcs_data_table:
+            return 0
+
+        impact_data = {(row[0], row[3]) for row in vcs_data_table}  # base_purl, adv_id
+        commit_data = {(row[1], row[2]) for row in vcs_data_table}  # vcs_url, commit_hash
+
+        adv_ids = {adv_id for _, adv_id in impact_data}
+        commit_hashes = {commit_hash for _, commit_hash in commit_data}
+
+        # Fetch plain tuples: model instances are not needed to compare pairs.
+        existing_impact_pairs = set(
+            ImpactedPackage.objects.filter(advisory_id__in=adv_ids).values_list(
+                "base_purl", "advisory_id"
+            )
+        )
+
+        if new_impacts := impact_data - existing_impact_pairs:
+            ImpactedPackage.objects.bulk_create(
+                [
+                    ImpactedPackage(base_purl=base_purl, advisory_id=adv_id)
+                    for base_purl, adv_id in new_impacts
+                ]
+            )
+
+        # ignore_conflicts skips rows rejected by a database constraint, presumably
+        # already stored (vcs_url, commit_hash) pairs -- confirm against the model.
+        PackageCommitPatch.objects.bulk_create(
+            [
+                PackageCommitPatch(vcs_url=vcs_url, commit_hash=commit_hash)
+                for vcs_url, commit_hash in commit_data
+            ],
+            ignore_conflicts=True,
+        )
+
+        # Re-fetch from the database: the bulk-created objects above are not
+        # guaranteed to carry primary keys, which the relation update needs.
+        fetched_impacts = {
+            (impacted_pkg.base_purl, impacted_pkg.advisory_id): impacted_pkg
+            for impacted_pkg in ImpactedPackage.objects.filter(advisory_id__in=adv_ids).only(
+                "base_purl", "advisory_id"
+            )
+        }
+
+        fetched_pkg_commits = {
+            (pkg_commit_patch.vcs_url, pkg_commit_patch.commit_hash): pkg_commit_patch
+            for pkg_commit_patch in PackageCommitPatch.objects.filter(
+                commit_hash__in=commit_hashes
+            ).only("vcs_url", "commit_hash")
+        }
+
+        pkg_commit_add_impact_pkg = defaultdict(list)
+        for base_purl, vcs_url, commit_hash, adv_id in vcs_data_table:
+            impacted_pkg_obj = fetched_impacts.get((base_purl, adv_id))
+            pkg_commit_obj = fetched_pkg_commits.get((vcs_url, commit_hash))
+            if impacted_pkg_obj and pkg_commit_obj:
+                pkg_commit_add_impact_pkg[pkg_commit_obj].append(impacted_pkg_obj)
+
+        for pkg_commit_obj, impact_pkgs in pkg_commit_add_impact_pkg.items():
+            pkg_commit_obj.fixed_in_impacts.add(*impact_pkgs)
+
+        return len(vcs_data_table)
diff --git a/vulnerabilities/tests/pipelines/v2_improvers/test_reference_collect_commits_v2.py b/vulnerabilities/tests/pipelines/v2_improvers/test_reference_collect_commits_v2.py
new file mode 100644
index
000000000..cb26f04f2
--- /dev/null
+++ b/vulnerabilities/tests/pipelines/v2_improvers/test_reference_collect_commits_v2.py
@@ -0,0 +1,68 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# VulnerableCode is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/aboutcode-org/vulnerablecode for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+
+import pytest
+from django.utils import timezone
+
+from vulnerabilities.models import AdvisoryReference
+from vulnerabilities.models import AdvisoryV2
+from vulnerabilities.models import ImpactedPackage
+from vulnerabilities.models import PackageCommitPatch
+from vulnerabilities.pipelines.v2_improvers.reference_collect_commits import (
+    CollectReferencesFixCommitsPipeline,
+)
+
+
+def create_advisory_with_reference(advisory_id, reference_url):
+    """Return a new test AdvisoryV2 named ``advisory_id`` referencing ``reference_url``."""
+    advisory = AdvisoryV2.objects.create(
+        advisory_id=advisory_id,
+        datasource_id="test-ds",
+        avid=f"test-ds/{advisory_id}",
+        url=f"https://example.com/advisory/{advisory_id}",
+        unique_content_id="11111",
+        # Timezone-aware: a naive datetime.now() triggers a Django RuntimeWarning
+        # when time zone support (USE_TZ) is active.
+        date_collected=timezone.now(),
+    )
+    advisory.references.add(AdvisoryReference.objects.create(url=reference_url))
+    return advisory
+
+
+@pytest.mark.django_db
+def test_collect_fix_commits_pipeline_creates_entry():
+    advisory = create_advisory_with_reference(
+        "CVE-2025-1000",
+        "https://github.com/test/testpkg/commit/6bd301819f8f69331a55ae2336c8b111fc933f3d",
+    )
+
+    pipeline = CollectReferencesFixCommitsPipeline()
+    pipeline.collect_and_store_fix_commits()
+
+    package_commit_patch = PackageCommitPatch.objects.all()
+    impacted_packages = advisory.impacted_packages.all()
+
+    assert package_commit_patch.count() == 1
+    assert impacted_packages.count() == 1
+
+    fix = package_commit_patch.first()
+    assert fix.commit_hash == "6bd301819f8f69331a55ae2336c8b111fc933f3d"
+    assert fix.vcs_url == "https://github.com/test/testpkg"
+    assert impacted_packages.first().fixed_by_package_commit_patches.count() == 1
+
+
+@pytest.mark.django_db
+def test_collect_fix_commits_pipeline_skips_non_commit_urls():
+    # An issue URL carries no commit hash, so nothing must be created for it.
+    create_advisory_with_reference("CVE-2025-2000", "https://github.com/test/testpkg/issues/12")
+
+    pipeline = CollectReferencesFixCommitsPipeline()
+    pipeline.collect_and_store_fix_commits()
+
+    assert PackageCommitPatch.objects.count() == 0
+    assert ImpactedPackage.objects.count() == 0