From 6d9fde946c1ec9919b32318c02db03e25052c14f Mon Sep 17 00:00:00 2001 From: SanskaarUndale21 Date: Tue, 12 May 2026 00:24:15 +0530 Subject: [PATCH] feat(exporter): skip GCS upload when object CRC32C is unchanged Before this change, the exporter uploaded every output file on every run regardless of whether the content had changed. Since all.zip and other outputs are now reproducible (#3491), unchanged files would accumulate redundant object generations in the bucket, making it harder for downstream consumers to detect real updates. The writer now calls ReadObjectAttrs before each GCS write and computes the CRC32C of the outgoing data using the Castagnoli polynomial (the same algorithm GCS uses for its stored checksums). If the checksums match, the upload is skipped and an info log is emitted. New objects (ErrNotFound) and any transient attr-read errors fall through to the normal upload path so the exporter remains correct under all conditions. Tests verify four cases: same content is skipped, changed content is uploaded, brand-new objects are always created, and multiple unchanged objects in a single run are all skipped. Fixes #3513 --- go/cmd/exporter/writer.go | 29 ++++++- go/cmd/exporter/writer_test.go | 134 +++++++++++++++++++++++++++++++++ 2 files changed, 162 insertions(+), 1 deletion(-) create mode 100644 go/cmd/exporter/writer_test.go diff --git a/go/cmd/exporter/writer.go b/go/cmd/exporter/writer.go index 363358a80b9..45eda36103a 100644 --- a/go/cmd/exporter/writer.go +++ b/go/cmd/exporter/writer.go @@ -2,6 +2,8 @@ package main import ( "context" + "errors" + "hash/crc32" "log/slog" "os" "path/filepath" @@ -11,6 +13,9 @@ import ( "github.com/google/osv.dev/go/osv/clients" ) +// crc32cTable uses the Castagnoli polynomial, matching GCS's own checksum algorithm. +var crc32cTable = crc32.MakeTable(crc32.Castagnoli) + // writeMsg holds the data for a file to be written. 
type writeMsg struct { path string @@ -31,7 +36,10 @@ func writer(ctx context.Context, cancel context.CancelFunc, inCh <-chan writeMsg } path := filepath.Join(pathPrefix, msg.path) if client != nil { - // Write to the bucket. + // Skip the upload if the object already has the same content. + if gcsContentUnchanged(ctx, client, path, msg.data) { + break + } err := client.WriteObject(ctx, path, msg.data, &clients.WriteOptions{ ContentType: msg.mimeType, }) @@ -62,3 +70,22 @@ func writer(ctx context.Context, cancel context.CancelFunc, inCh <-chan writeMsg } } } + +// gcsContentUnchanged returns true if the object at path already has the same +// CRC32C checksum as data, meaning the upload would be a no-op. Any error +// reading the object's attributes (other than ErrNotFound) is logged and +// treated as "content changed" so the upload proceeds. +func gcsContentUnchanged(ctx context.Context, client clients.CloudStorage, path string, data []byte) bool { + attrs, err := client.ReadObjectAttrs(ctx, path) + if err != nil { + if !errors.Is(err, clients.ErrNotFound) { + logger.WarnContext(ctx, "failed to read object attrs, proceeding with upload", slog.String("path", path), slog.Any("err", err)) + } + return false + } + if attrs.CRC32C == crc32.Checksum(data, crc32cTable) { + logger.InfoContext(ctx, "skipping upload, content unchanged", slog.String("path", path)) + return true + } + return false +} diff --git a/go/cmd/exporter/writer_test.go b/go/cmd/exporter/writer_test.go new file mode 100644 index 00000000000..c376e1fcb3e --- /dev/null +++ b/go/cmd/exporter/writer_test.go @@ -0,0 +1,134 @@ +package main + +import ( + "context" + "path/filepath" + "sync" + "testing" + + "github.com/google/osv.dev/go/osv/clients" + "github.com/google/osv.dev/go/testutils" +) + +// runWriter sends msgs to a writer goroutine and waits for it to finish. 
+func runWriter(t *testing.T, storage clients.CloudStorage, pathPrefix string, msgs []writeMsg) { + t.Helper() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + inCh := make(chan writeMsg, len(msgs)) + for _, m := range msgs { + inCh <- m + } + close(inCh) + + var wg sync.WaitGroup + wg.Add(1) + go writer(ctx, cancel, inCh, storage, pathPrefix, &wg) + wg.Wait() +} + +func TestWriter_GCS_SkipsUnchangedContent(t *testing.T) { + storage := testutils.NewMockStorage() + data := []byte(`{"id":"OSV-1"}`) + + // Pre-populate using the same path the writer will compute. + objPath := filepath.Join("out", "OSV-1.json") + if err := storage.WriteObject(t.Context(), objPath, data, nil); err != nil { + t.Fatalf("setup: %v", err) + } + attrsBefore, _ := storage.ReadObjectAttrs(t.Context(), objPath) + + runWriter(t, storage, "out", []writeMsg{ + {path: "OSV-1.json", mimeType: "application/json", data: data}, + }) + + attrsAfter, err := storage.ReadObjectAttrs(t.Context(), objPath) + if err != nil { + t.Fatalf("ReadObjectAttrs: %v", err) + } + if attrsAfter.Generation != attrsBefore.Generation { + t.Errorf("expected generation %d (skipped), got %d", attrsBefore.Generation, attrsAfter.Generation) + } +} + +func TestWriter_GCS_UploadsChangedContent(t *testing.T) { + storage := testutils.NewMockStorage() + + objPath := filepath.Join("out", "OSV-1.json") + if err := storage.WriteObject(t.Context(), objPath, []byte(`{"id":"OSV-1","old":true}`), nil); err != nil { + t.Fatalf("setup: %v", err) + } + attrsBefore, _ := storage.ReadObjectAttrs(t.Context(), objPath) + + runWriter(t, storage, "out", []writeMsg{ + {path: "OSV-1.json", mimeType: "application/json", data: []byte(`{"id":"OSV-1","old":false}`)}, + }) + + attrsAfter, err := storage.ReadObjectAttrs(t.Context(), objPath) + if err != nil { + t.Fatalf("ReadObjectAttrs: %v", err) + } + if attrsAfter.Generation <= attrsBefore.Generation { + t.Errorf("expected generation > %d (uploaded), got %d", 
attrsBefore.Generation, attrsAfter.Generation) + } +} + +func TestWriter_GCS_UploadsNewObject(t *testing.T) { + storage := testutils.NewMockStorage() + + runWriter(t, storage, "out", []writeMsg{ + {path: "OSV-1.json", mimeType: "application/json", data: []byte(`{"id":"OSV-1"}`)}, + }) + + objPath := filepath.Join("out", "OSV-1.json") + attrs, err := storage.ReadObjectAttrs(t.Context(), objPath) + if err != nil { + t.Fatalf("expected object to exist after upload: %v", err) + } + if attrs.Generation != 1 { + t.Errorf("expected generation 1 for new object, got %d", attrs.Generation) + } +} + +func TestWriter_GCS_SkipsMultipleUnchanged(t *testing.T) { + storage := testutils.NewMockStorage() + + type entry struct { + msgPath string + objPath string + data []byte + } + entries := []entry{ + {"A.json", filepath.Join("out", "A.json"), []byte(`{"id":"A"}`)}, + {"B.json", filepath.Join("out", "B.json"), []byte(`{"id":"B"}`)}, + {"C.json", filepath.Join("out", "C.json"), []byte(`{"id":"C"}`)}, + } + for _, e := range entries { + if err := storage.WriteObject(t.Context(), e.objPath, e.data, nil); err != nil { + t.Fatalf("setup %s: %v", e.objPath, err) + } + } + + gensBefore := make(map[string]int64) + for _, e := range entries { + attrs, _ := storage.ReadObjectAttrs(t.Context(), e.objPath) + gensBefore[e.objPath] = attrs.Generation + } + + msgs := make([]writeMsg, len(entries)) + for i, e := range entries { + msgs[i] = writeMsg{path: e.msgPath, mimeType: "application/json", data: e.data} + } + runWriter(t, storage, "out", msgs) + + for _, e := range entries { + attrs, err := storage.ReadObjectAttrs(t.Context(), e.objPath) + if err != nil { + t.Fatalf("ReadObjectAttrs(%s): %v", e.objPath, err) + } + if attrs.Generation != gensBefore[e.objPath] { + t.Errorf("%s: expected generation %d (skipped), got %d", e.objPath, gensBefore[e.objPath], attrs.Generation) + } + } +}