diff --git a/go/cmd/exporter/writer.go b/go/cmd/exporter/writer.go index 363358a80b9..45eda36103a 100644 --- a/go/cmd/exporter/writer.go +++ b/go/cmd/exporter/writer.go @@ -2,6 +2,8 @@ package main import ( "context" + "errors" + "hash/crc32" "log/slog" "os" "path/filepath" @@ -11,6 +13,9 @@ import ( "github.com/google/osv.dev/go/osv/clients" ) +// crc32cTable uses the Castagnoli polynomial, matching GCS's own checksum algorithm. +var crc32cTable = crc32.MakeTable(crc32.Castagnoli) + // writeMsg holds the data for a file to be written. type writeMsg struct { path string @@ -31,7 +36,10 @@ func writer(ctx context.Context, cancel context.CancelFunc, inCh <-chan writeMsg } path := filepath.Join(pathPrefix, msg.path) if client != nil { - // Write to the bucket. + // Skip the upload if the object already has the same content. + if gcsContentUnchanged(ctx, client, path, msg.data) { + break + } err := client.WriteObject(ctx, path, msg.data, &clients.WriteOptions{ ContentType: msg.mimeType, }) @@ -62,3 +70,22 @@ func writer(ctx context.Context, cancel context.CancelFunc, inCh <-chan writeMsg } } } + +// gcsContentUnchanged returns true if the object at path already has the same +// CRC32C checksum as data, meaning the upload would be a no-op. Any error +// reading the object's attributes (other than ErrNotFound) is logged and +// treated as "content changed" so the upload proceeds. +func gcsContentUnchanged(ctx context.Context, client clients.CloudStorage, path string, data []byte) bool { + attrs, err := client.ReadObjectAttrs(ctx, path) + if err != nil { + if !errors.Is(err, clients.ErrNotFound) { + logger.WarnContext(ctx, "failed to read object attrs, proceeding with upload", slog.String("path", path), slog.Any("err", err)) + } + return false + } + if attrs.CRC32C == crc32.Checksum(data, crc32cTable) { + logger.InfoContext(ctx, "skipping upload, content unchanged", slog.String("path", path)) + return true + } + return false +} diff --git a/go/cmd/exporter/writer_test.go b/go/cmd/exporter/writer_test.go new file mode 100644 index 00000000000..c376e1fcb3e --- /dev/null +++ b/go/cmd/exporter/writer_test.go @@ -0,0 +1,134 @@ +package main + +import ( + "context" + "path/filepath" + "sync" + "testing" + + "github.com/google/osv.dev/go/osv/clients" + "github.com/google/osv.dev/go/testutils" +) + +// runWriter sends msgs to a writer goroutine and waits for it to finish. +func runWriter(t *testing.T, storage clients.CloudStorage, pathPrefix string, msgs []writeMsg) { + t.Helper() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + inCh := make(chan writeMsg, len(msgs)) + for _, m := range msgs { + inCh <- m + } + close(inCh) + + var wg sync.WaitGroup + wg.Add(1) + go writer(ctx, cancel, inCh, storage, pathPrefix, &wg) + wg.Wait() +} + +func TestWriter_GCS_SkipsUnchangedContent(t *testing.T) { + storage := testutils.NewMockStorage() + data := []byte(`{"id":"OSV-1"}`) + + // Pre-populate using the same path the writer will compute. + objPath := filepath.Join("out", "OSV-1.json") + if err := storage.WriteObject(t.Context(), objPath, data, nil); err != nil { + t.Fatalf("setup: %v", err) + } + attrsBefore, _ := storage.ReadObjectAttrs(t.Context(), objPath) + + runWriter(t, storage, "out", []writeMsg{ + {path: "OSV-1.json", mimeType: "application/json", data: data}, + }) + + attrsAfter, err := storage.ReadObjectAttrs(t.Context(), objPath) + if err != nil { + t.Fatalf("ReadObjectAttrs: %v", err) + } + if attrsAfter.Generation != attrsBefore.Generation { + t.Errorf("expected generation %d (skipped), got %d", attrsBefore.Generation, attrsAfter.Generation) + } +} + +func TestWriter_GCS_UploadsChangedContent(t *testing.T) { + storage := testutils.NewMockStorage() + + objPath := filepath.Join("out", "OSV-1.json") + if err := storage.WriteObject(t.Context(), objPath, []byte(`{"id":"OSV-1","old":true}`), nil); err != nil { + t.Fatalf("setup: %v", err) + } + attrsBefore, _ := storage.ReadObjectAttrs(t.Context(), objPath) + + runWriter(t, storage, "out", []writeMsg{ + {path: "OSV-1.json", mimeType: "application/json", data: []byte(`{"id":"OSV-1","old":false}`)}, + }) + + attrsAfter, err := storage.ReadObjectAttrs(t.Context(), objPath) + if err != nil { + t.Fatalf("ReadObjectAttrs: %v", err) + } + if attrsAfter.Generation <= attrsBefore.Generation { + t.Errorf("expected generation > %d (uploaded), got %d", attrsBefore.Generation, attrsAfter.Generation) + } +} + +func TestWriter_GCS_UploadsNewObject(t *testing.T) { + storage := testutils.NewMockStorage() + + runWriter(t, storage, "out", []writeMsg{ + {path: "OSV-1.json", mimeType: "application/json", data: []byte(`{"id":"OSV-1"}`)}, + }) + + objPath := filepath.Join("out", "OSV-1.json") + attrs, err := storage.ReadObjectAttrs(t.Context(), objPath) + if err != nil { + t.Fatalf("expected object to exist after upload: %v", err) + } + if attrs.Generation != 1 { + t.Errorf("expected generation 1 for new object, got %d", attrs.Generation) + } +} + +func TestWriter_GCS_SkipsMultipleUnchanged(t *testing.T) { + storage := testutils.NewMockStorage() + + type entry struct { + msgPath string + objPath string + data []byte + } + entries := []entry{ + {"A.json", filepath.Join("out", "A.json"), []byte(`{"id":"A"}`)}, + {"B.json", filepath.Join("out", "B.json"), []byte(`{"id":"B"}`)}, + {"C.json", filepath.Join("out", "C.json"), []byte(`{"id":"C"}`)}, + } + for _, e := range entries { + if err := storage.WriteObject(t.Context(), e.objPath, e.data, nil); err != nil { + t.Fatalf("setup %s: %v", e.objPath, err) + } + } + + gensBefore := make(map[string]int64) + for _, e := range entries { + attrs, _ := storage.ReadObjectAttrs(t.Context(), e.objPath) + gensBefore[e.objPath] = attrs.Generation + } + + msgs := make([]writeMsg, len(entries)) + for i, e := range entries { + msgs[i] = writeMsg{path: e.msgPath, mimeType: "application/json", data: e.data} + } + runWriter(t, storage, "out", msgs) + + for _, e := range entries { + attrs, err := storage.ReadObjectAttrs(t.Context(), e.objPath) + if err != nil { + t.Fatalf("ReadObjectAttrs(%s): %v", e.objPath, err) + } + if attrs.Generation != gensBefore[e.objPath] { + t.Errorf("%s: expected generation %d (skipped), got %d", e.objPath, gensBefore[e.objPath], attrs.Generation) + } + } +}