88#
99
1010import gzip
11+ import hashlib
1112import io
1213import os
1314from collections import namedtuple
@@ -574,6 +575,7 @@ def __init__(
574575 index_location = None ,
575576 index_properties_location = None ,
576577 last_incremental = None ,
578+ last_processed_purl = None ,
577579 logger = None ,
578580 ):
579581 if index_location and last_incremental :
@@ -584,6 +586,7 @@ def __init__(
584586 )
585587
586588 self .downloads = []
589+ self .last_processed_purl = last_processed_purl
587590
588591 if index_properties_location :
589592 self .index_properties_location = index_properties_location
@@ -615,6 +618,25 @@ def __init__(
615618 self .index_location = index_download .path
616619 self .index_increment_locations = []
617620
621+ self .index_checksum = self ._compute_index_checksum ()
622+
def _compute_index_checksum(self):
    """
    Return the SHA-256 hex digest of the index file(s), or None.

    The files considered are ``self.index_location`` (when set) plus every
    path in ``self.index_increment_locations``. Their bytes are fed into a
    single hash in sorted path order. None is returned when there is no
    file to hash.
    """
    paths = [self.index_location] if self.index_location else []
    paths += self.index_increment_locations

    if not paths:
        return None

    # `paths` is a fresh local list, so sorting in place leaves the
    # instance attributes untouched.
    paths.sort()

    digest = hashlib.sha256()
    for path in paths:
        with open(path, "rb") as stream:
            # Read in fixed-size blocks to avoid loading a whole index
            # file into memory.
            while True:
                block = stream.read(8192)
                if not block:
                    break
                digest.update(block)
    return digest.hexdigest()
639+
618640 def __del__ (self ):
619641 if self .downloads :
620642 for download in self .downloads :
@@ -658,8 +680,9 @@ def _fetch_index_increments(self, last_incremental):
658680 index_increment_downloads .append (index_increment )
659681 return index_increment_downloads
660682
661- def _get_packages (self , content = None ):
683+ def _get_packages (self , content = None , last_processed_purl = None ):
662684 artifacts = get_artifacts (content , worthyness = is_worthy_artifact )
685+ skipping = bool (last_processed_purl )
663686
664687 for artifact in artifacts :
665688 # we cannot do much without these
@@ -723,17 +746,27 @@ def _get_packages(self, content=None):
723746 name = artifact_id ,
724747 version = version ,
725748 )
749+
750+ if skipping :
751+ if str (current_purl ) == last_processed_purl :
752+ skipping = False
753+ continue
754+
755+ self .last_processed_purl = str (current_purl )
726756 yield current_purl , [package .purl ]
727757
def _get_packages_from_index_increments(self):
    """
    Yield one ``self._get_packages`` result per index increment.

    Each yielded item is the iterable returned by ``self._get_packages``
    for a single path of ``self.index_increment_locations``, in list order;
    the items are not flattened here.
    """
    yield from (
        self._get_packages(content=increment_location)
        for increment_location in self.index_increment_locations
    )
731761
def get_packages(self, last_processed_purl=None):
    """
    Return an iterable of packages from the maven index or index increments.

    Index increments take precedence over the full index when both are
    available. The items are whatever ``self._get_packages`` yields.

    ``last_processed_purl`` is an optional checkpoint string that is
    forwarded to ``self._get_packages`` when reading the full index. It is
    not applied when reading index increments.

    Return an empty list when neither increments nor an index is available.
    """
    if self.index_increment_locations:
        # The helper yields one iterable per increment. A plain chain() over
        # that single iterable would hand callers the per-increment
        # generators themselves, so flatten them into one stream of items.
        return chain.from_iterable(self._get_packages_from_index_increments())

    if self.index_location:
        return self._get_packages(
            content=self.index_location,
            last_processed_purl=last_processed_purl,
        )

    return []
0 commit comments