From e00bb8a0890cab33f4f167e73c7a0c4ea94ef275 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Wed, 11 Mar 2026 18:46:51 +0000 Subject: [PATCH] feat: preserve OBJ_REF_DTYPE table schemas on save to_gbq --- bigframes/dataframe.py | 26 +++++++++++++++++++++++++ bigframes/dtypes.py | 7 +++++++ tests/system/small/test_dataframe_io.py | 25 ++++++++++++++++++++++++ 3 files changed, 58 insertions(+) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 25cedda8f4..c4f1eefc6a 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -4301,6 +4301,32 @@ def to_gbq( result_table = result.query_job.destination assert result_table is not None + obj_ref_dest_cols = [] + for col_id in id_overrides.keys(): + try: + if ( + export_array.get_column_type(col_id) + == bigframes.dtypes.OBJ_REF_DTYPE + ): + obj_ref_dest_cols.append(id_overrides[col_id]) + except Exception: + pass + + if obj_ref_dest_cols: + table = self._session.bqclient.get_table(result_table) + new_schema = [] + for field in table.schema: + if field.name in obj_ref_dest_cols: + field_dict = field.to_api_repr() + field_dict["description"] = "bigframes_dtype: OBJ_REF_DTYPE" + new_schema.append( + google.cloud.bigquery.SchemaField.from_api_repr(field_dict) + ) + else: + new_schema.append(field) + table.schema = new_schema + self._session.bqclient.update_table(table, ["schema"]) + if temp_table_ref: bigframes.session._io.bigquery.set_table_expiration( self._session.bqclient, diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 304428ef2f..1fdce9fe44 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -772,6 +772,13 @@ def convert_schema_field( ) -> typing.Tuple[str, Dtype]: is_repeated = field.mode == "REPEATED" if field.field_type == "RECORD": + if field.description == "bigframes_dtype: OBJ_REF_DTYPE": + bf_dtype = OBJ_REF_DTYPE # type: ignore + if is_repeated: + pa_type = pa.list_(bigframes_dtype_to_arrow_dtype(bf_dtype)) + bf_dtype = pd.ArrowDtype(pa_type) + return field.name, bf_dtype + mapped_fields = map(convert_schema_field, field.fields) fields = [] for name, dtype in mapped_fields: diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index fece679d06..33cf11096d 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -1002,6 +1002,31 @@ def test_to_gbq_timedelta_tag_ignored_when_appending(bigquery_client, dataset_id assert table.schema[0].description is None +def test_to_gbq_obj_ref(session, dataset_id: str, bigquery_client): + destination_table = f"{dataset_id}.test_to_gbq_obj_ref" + sql = """ + SELECT + 'gs://cloud-samples-data/vision/ocr/sign.jpg' AS uri_col + """ + df = session.read_gbq(sql) + df["obj_ref_col"] = df["uri_col"].str.to_blob() + df = df.drop(columns=["uri_col"]) + + # Save the dataframe to bigquery + df.to_gbq(destination_table) + + # Verify the table schema description is added + table = bigquery_client.get_table(destination_table) + obj_ref_field = next(f for f in table.schema if f.name == "obj_ref_col") + assert obj_ref_field.field_type == "RECORD" + assert obj_ref_field.description == "bigframes_dtype: OBJ_REF_DTYPE" + + # Verify reloading it correctly restores the dtype + reloaded_df = session.read_gbq(destination_table) + assert reloaded_df["obj_ref_col"].dtype == dtypes.OBJ_REF_DTYPE + assert len(reloaded_df) == 1 + + @pytest.mark.parametrize( ("index"), [True, False],