def test_multimodal_example(gcs_bucket_snippets: str) -> None:
    """Exercise the Cymbal Pets multimodal tutorial snippets end to end.

    Loads the sample ``products`` table, creates ObjectRef tables over product
    images (PNG) and manuals (PDF), then walks through Gemini-based image
    description and metadata generation, image transformation and PDF chunking
    via BigQuery Python UDFs, embedding generation, and vector search.

    Args:
        gcs_bucket_snippets: Name of a writable GCS bucket used for the
            grayscale image output.

    Requires a live BigQuery session and the ``us.cymbal_conn`` connection;
    there is no local fallback, so this test asserts nothing and only verifies
    the snippets run without raising.
    """
    BUCKET = gcs_bucket_snippets
    # [START bigquery_dataframes_multimodal_load]
    import bigframes.bigquery as bbq
    import bigframes.pandas as bpd

    bbq.load_data(
        "cymbal_pets.products",
        write_disposition="OVERWRITE",
        from_files_options={
            "format": "avro",
            "uris": [
                "gs://cloud-samples-data/bigquery/tutorials/cymbal-pets/tables/products/products_*.avro"
            ],
        },
    )
    # [END bigquery_dataframes_multimodal_load]

    # [START bigquery_dataframes_multimodal_create_images]
    bbq.create_external_table(
        "cymbal_pets.product_images",
        replace=True,
        connection_name="us.cymbal_conn",
        options={
            "object_metadata": "SIMPLE",
            "uris": [
                "gs://cloud-samples-data/bigquery/tutorials/cymbal-pets/images/*.png"
            ],
        },
    )
    # [END bigquery_dataframes_multimodal_create_images]

    # [START bigquery_dataframes_multimodal_create_manuals]
    bbq.create_external_table(
        "cymbal_pets.product_manuals",
        replace=True,
        connection_name="us.cymbal_conn",
        options={
            "object_metadata": "SIMPLE",
            "uris": [
                "gs://cloud-samples-data/bigquery/tutorials/cymbal-pets/documents/*.pdf"
            ],
        },
    )
    # [END bigquery_dataframes_multimodal_create_manuals]

    # [START bigquery_dataframes_multimodal_create_gemini]
    gemini_model = bbq.ml.create_model(
        "cymbal_pets.gemini",
        replace=True,
        connection_name="us.cymbal_conn",
        options={"endpoint": "gemini-2.5-flash"},
    )
    # [END bigquery_dataframes_multimodal_create_gemini]

    # [START bigquery_dataframes_multimodal_create_embedding]
    embedding_model = bbq.ml.create_model(
        "cymbal_pets.embedding_model",
        replace=True,
        connection_name="us.cymbal_conn",
        options={"endpoint": "multimodalembedding@001"},
    )
    # [END bigquery_dataframes_multimodal_create_embedding]

    # [START bigquery_dataframes_multimodal_create_df_products_mm]
    df_images = bpd.read_gbq("cymbal_pets.product_images")
    df_products = bpd.read_gbq("cymbal_pets.products")

    # Join structured product rows with their image ObjectRefs on the GCS URI.
    df_products_mm = df_images.merge(df_products, on="uri").drop(columns="uri")
    df_products_mm = df_products_mm.rename(columns={"ref": "image"})
    # [END bigquery_dataframes_multimodal_create_df_products_mm]

    # [START bigquery_dataframes_multimodal_show_df_products_mm]
    df_products_mm[["product_name", "image"]]
    # [END bigquery_dataframes_multimodal_show_df_products_mm]

    # [START bigquery_dataframes_multimodal_image_description]
    df_products_mm["url"] = bbq.obj.get_access_url(
        df_products_mm["image"], "R"
    ).to_frame()
    df_products_mm["prompt0"] = "Can you describe the following image?"

    # A STRUCT of (text prompt, signed image URL) is the multimodal prompt.
    df_products_mm["prompt"] = bbq.struct(df_products_mm[["prompt0", "url"]])
    df_products_mm = bbq.ai.generate_table(
        gemini_model, df_products_mm, output_schema={"image_description": "STRING"}
    )

    df_products_mm = df_products_mm[
        [
            "product_id",
            "product_name",
            "brand",
            "category",
            "subcategory",
            "animal_type",
            "search_keywords",
            "price",
            "description",
            "inventory_level",
            "supplier_id",
            "average_rating",
            "image",
            "image_description",
        ]
    ]
    # [END bigquery_dataframes_multimodal_image_description]

    # [START bigquery_dataframes_multimodal_generate_animal_type]
    df_prompt = bbq.obj.get_access_url(df_products_mm["image"], "R").to_frame()
    df_prompt[
        "prompt0"
    ] = "For the image of a pet product, concisely generate the following metadata: 1) animal_type and 2) 5 SEO search keywords, and 3) product subcategory."

    # NOTE(review): assumes get_access_url(...).to_frame() names its column
    # "image" -- confirm against the bigframes API.
    df_products_mm["prompt"] = bbq.struct(df_prompt[["prompt0", "image"]])

    # Drop the ground-truth columns so the model regenerates them.
    df_products_mm = df_products_mm.drop(
        columns=["animal_type", "search_keywords", "subcategory"]
    )
    # ARRAY columns in an output_schema string need an element type
    # (the "<STRING>" was dropped from the original by markup stripping).
    df_products_mm = bbq.ai.generate_table(
        gemini_model,
        df_products_mm,
        output_schema="animal_type STRING, search_keywords ARRAY<STRING>, subcategory STRING",
    )
    # [END bigquery_dataframes_multimodal_generate_animal_type]

    # [START bigquery_dataframes_multimodal_show_animal_type]
    df_products_mm[
        [
            "product_name",
            "image_description",
            "animal_type",
            "search_keywords",
            "subcategory",
        ]
    ]
    # [END bigquery_dataframes_multimodal_show_animal_type]

    # [START bigquery_dataframes_multimodal_brand_description]
    df_agg = df_products_mm[
        ["image", "description", "category", "subcategory", "brand"]
    ]
    df_agg["image"] = bbq.obj.get_access_url(df_products_mm["image"], "R")
    # Collapse each brand's rows into arrays so one prompt covers the brand.
    df_agg = bbq.array_agg(df_agg.groupby(by=["brand"]))

    df_agg["cnt"] = bbq.array_length(df_agg["image"])

    df_prompt = df_agg[["image", "description", "category", "subcategory"]]
    df_prompt[
        "prompt0"
    ] = "Use the images and text to give one concise brand description for a website brand page. Return the description only. "

    df_agg["prompt"] = bbq.struct(
        df_prompt[["prompt0", "image", "description", "category", "subcategory"]]
    )

    df_agg = df_agg.reset_index()

    df_agg = bbq.ai.generate_table(
        gemini_model, df_agg, output_schema={"brand_description": "STRING"}
    )
    df_agg[["brand", "brand_description", "cnt"]]
    # [END bigquery_dataframes_multimodal_brand_description]

    # [START bigquery_dataframes_multimodal_define_to_grayscale]
    @bpd.udf(
        dataset="cymbal_pets",
        name="to_grayscale",
        packages=["numpy", "opencv-python"],
        bigquery_connection="us.cymbal_conn",
        # One row per call: each invocation downloads and re-uploads a
        # full image.
        max_batching_rows=1,
    )
    def to_grayscale(src_ref: str, dst_ref: str) -> str:
        """Read the image behind src_ref, write a grayscale PNG to dst_ref.

        Both arguments are JSON-serialized ObjectRef runtime values carrying
        pre-signed read/write URLs.
        """
        import json
        from urllib.request import Request, urlopen

        import cv2 as cv
        import numpy as np

        src_json = json.loads(src_ref)
        srcUrl = src_json["access_urls"]["read_url"]

        dst_json = json.loads(dst_ref)
        dstUrl = dst_json["access_urls"]["write_url"]

        req = urlopen(srcUrl)
        arr = np.asarray(bytearray(req.read()), dtype=np.uint8)
        img = cv.imdecode(arr, -1)  # 'Load it as it is'

        # Convert the image to grayscale
        gray_image = cv.cvtColor(img, cv.COLOR_BGR2GRAY)

        # Send POST request to the URL
        _, img_encoded = cv.imencode(".png", gray_image)

        req = Request(
            url=dstUrl,
            data=img_encoded.tobytes(),
            method="PUT",
            headers={
                "Content-Type": "image/png",
            },
        )
        with urlopen(req):
            pass
        return dst_ref

    # [END bigquery_dataframes_multimodal_define_to_grayscale]

    # [START bigquery_dataframes_multimodal_apply_to_grayscale]
    df_grayscale = df_products_mm[["product_id", "product_name", "image"]]
    # Destination URI reuses the source image's basename under the bucket.
    df_grayscale[
        "gray_image_uri"
    ] = f"gs://{BUCKET}/cymbal-pets-images/grayscale/" + df_grayscale[
        "image"
    ].struct.field(
        "uri"
    ).str.extract(
        r"([^/]+)$"
    )

    df_grayscale["gray_image"] = bbq.obj.make_ref(
        df_grayscale["gray_image_uri"], "us.cymbal_conn"
    )

    # NOTE(review): access-mode case differs from the "R" used elsewhere in
    # this file ("r"/"rw" here) -- confirm both spellings are accepted.
    df_grayscale["image_url"] = bbq.to_json_string(
        bbq.obj.get_access_url(df_grayscale["image"], "r")
    )
    df_grayscale["gray_image_url"] = bbq.to_json_string(
        bbq.obj.get_access_url(df_grayscale["gray_image"], "rw")
    )

    df_grayscale[["image_url", "gray_image_url"]].apply(to_grayscale, axis=1)
    # [END bigquery_dataframes_multimodal_apply_to_grayscale]

    # [START bigquery_dataframes_multimodal_define_chunk_pdf]
    @bpd.udf(
        dataset="cymbal_pets",
        name="chunk_pdf",
        packages=["pypdf"],
        bigquery_connection="us.cymbal_conn",
        # One row per call: each invocation downloads and parses a whole PDF.
        max_batching_rows=1,
    )
    def chunk_pdf(src_ref: str, chunk_size: int, overlap_size: int) -> list[str]:
        """Split the PDF behind src_ref into overlapping text chunks.

        src_ref is a JSON-serialized ObjectRef runtime value with a pre-signed
        read URL; chunks are at most chunk_size characters, split on the last
        space before the limit, with overlap_size characters carried over.
        """
        import io
        import json
        from urllib.request import urlopen

        from pypdf import PdfReader  # type: ignore

        src_json = json.loads(src_ref)
        srcUrl = src_json["access_urls"]["read_url"]

        req = urlopen(srcUrl)
        pdf_file = io.BytesIO(bytearray(req.read()))
        reader = PdfReader(pdf_file, strict=False)

        # extract and chunk text simultaneously
        all_text_chunks = []
        curr_chunk = ""
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:
                curr_chunk += page_text
            # split the accumulated text into chunks of a specific size with overlap
            # this loop implements a sliding window approach to create chunks
            while len(curr_chunk) >= chunk_size:
                # Prefer a word boundary; fall back to a hard cut when the
                # chunk has no space at all.
                split_idx = curr_chunk.rfind(" ", 0, chunk_size)
                if split_idx == -1:
                    split_idx = chunk_size
                actual_chunk = curr_chunk[:split_idx]
                all_text_chunks.append(actual_chunk)
                overlap = curr_chunk[split_idx + 1 : split_idx + 1 + overlap_size]
                curr_chunk = overlap + curr_chunk[split_idx + 1 + overlap_size :]
        if curr_chunk:
            all_text_chunks.append(curr_chunk)

        return all_text_chunks

    # [END bigquery_dataframes_multimodal_define_chunk_pdf]

    # [START bigquery_dataframes_multimodal_apply_chunk_pdf]
    df_manuals = bpd.read_gbq("cymbal_pets.product_manuals")
    df_manuals["url"] = bbq.to_json_string(
        bbq.obj.get_access_url(df_manuals["ref"], "R")
    )

    df_manuals["chunk_size"] = 1000
    df_manuals["overlap_size"] = 100

    df_manuals["chunked"] = df_manuals[["url", "chunk_size", "overlap_size"]].apply(
        chunk_pdf, axis=1
    )
    # [END bigquery_dataframes_multimodal_apply_chunk_pdf]

    # [START bigquery_dataframes_multimodal_analyze_pdf]
    df_chunked = df_manuals["chunked"].explode().to_frame()
    df_chunked[
        "prompt0"
    ] = "Can you summarize the product manual as bullet points? Highlight the legal clauses"

    df_chunked["prompt"] = bbq.struct(df_chunked[["prompt0", "chunked"]])

    result = bbq.ai.generate_text(gemini_model, df_chunked["prompt"])
    result
    # [END bigquery_dataframes_multimodal_analyze_pdf]

    # [START bigquery_dataframes_multimodal_create_embed_table]
    df_products_mm["content"] = bbq.obj.get_access_url(df_products_mm["image"], "R")
    df_embed = bbq.ai.generate_embedding(
        embedding_model, df_products_mm[["content", "product_id"]]
    )

    df_embed.to_gbq("cymbal_pets.products_embedding", if_exists="replace")
    # [END bigquery_dataframes_multimodal_create_embed_table]

    # [START bigquery_dataframes_multimodal_vector_search]
    df_image = bpd.DataFrame(
        {
            "uri": [
                "gs://cloud-samples-data/bigquery/tutorials/cymbal-pets/images/cozy-naps-cat-scratching-post-with-condo.png"
            ]
        }
    ).cache()
    df_image["image"] = bbq.obj.make_ref(df_image["uri"], "us.cymbal_conn")
    df_search = bbq.ai.generate_embedding(
        embedding_model,
        bbq.obj.get_access_url(bbq.obj.fetch_metadata(df_image["image"]), "R"),
    )

    search_result = bbq.vector_search(
        "cymbal_pets.products_embedding", "embedding", df_search["embedding"]
    )
    search_result
    # [END bigquery_dataframes_multimodal_vector_search]

    # [START bigquery_dataframes_create_external_table_all]
    bbq.create_external_table(
        "cymbal_pets.product_manuals_all",
        replace=True,
        connection_name="us.cymbal_conn",
        options={
            "object_metadata": "SIMPLE",
            "uris": [
                "gs://cloud-samples-data/bigquery/tutorials/cymbal-pets/documents/*.pdf",
                "gs://cloud-samples-data/bigquery/tutorials/cymbal-pets/document_chunks/*.pdf",
            ],
        },
    )
    # [END bigquery_dataframes_create_external_table_all]

    # [START bigquery_dataframes_create_manual_to_chunks]
    df1 = bpd.read_gbq("cymbal_pets.product_manuals_all").sort_values("uri")
    df2 = df1.copy()
    # Pair each whole manual ("<name>.pdf") with its page chunks
    # ("<name>_pageN.pdf") by the shared base name.
    df1["name"] = df1["uri"].str.extract(r".*/([^.]*).[^/]+")
    df2["name"] = df2["uri"].str.extract(r".*/([^.]*)_page[0-9]+.[^/]+")
    df_manuals_all = df1.merge(df2, on="name")
    df_manuals_agg = (
        bbq.array_agg(df_manuals_all[["ref_x", "uri_x"]].groupby("uri_x"))["ref_x"]
        .str[0]
        .to_frame()
    )
    df_manuals_agg["chunks"] = bbq.array_agg(
        df_manuals_all[["ref_y", "uri_x"]].groupby("uri_x")
    )["ref_y"]
    # [END bigquery_dataframes_create_manual_to_chunks]

    # [START bigquery_dataframes_show_manual_to_chunks]
    df_manuals_agg
    # [END bigquery_dataframes_show_manual_to_chunks]

    # [START bigquery_dataframes_generate_pages_summary]
    df_manuals_agg["chunks_url"] = bbq.array_agg(
        bbq.obj.get_access_url(df_manuals_agg.explode("chunks")["chunks"], "R").groupby(
            "uri_x"
        )
    )
    df_manuals_agg[
        "prompt0"
    ] = "Can you provide a page by page summary for the first 3 pages of the attached manual? Only write one line for each page. The pages are provided in serial order"
    df_manuals_agg["prompt"] = bbq.struct(df_manuals_agg[["prompt0", "chunks_url"]])

    result = bbq.ai.generate_text(gemini_model, df_manuals_agg["prompt"])["result"]
    result
    # [END bigquery_dataframes_generate_pages_summary]

    # [START bigquery_dataframes_generate_each_page_summary]
    result = bbq.ai.generate_table(
        gemini_model,
        df_manuals_agg["prompt"],
        output_schema={
            "page1_summary": "STRING",
            "page2_summary": "STRING",
            "page3_summary": "STRING",
        },
    )[["page1_summary", "page2_summary", "page3_summary"]]
    result
    # [END bigquery_dataframes_generate_each_page_summary]