From d7b9d73d8a188ae30ac82cc8ef97c4e1717ba655 Mon Sep 17 00:00:00 2001 From: mengw15 <125719918+mengw15@users.noreply.github.com> Date: Mon, 9 Mar 2026 03:52:13 -0700 Subject: [PATCH 1/6] lakekeeper boostrap script --- bin/bootstrap-lakekeeper.sh | 525 ++++++++++++++++++++++++++++++++++++ bin/parse-storage-config.py | 108 ++++++++ 2 files changed, 633 insertions(+) create mode 100755 bin/bootstrap-lakekeeper.sh create mode 100755 bin/parse-storage-config.py diff --git a/bin/bootstrap-lakekeeper.sh b/bin/bootstrap-lakekeeper.sh new file mode 100755 index 00000000000..1d4adc659c2 --- /dev/null +++ b/bin/bootstrap-lakekeeper.sh @@ -0,0 +1,525 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Bootstrap script to start Lakekeeper and create warehouse (idempotent). +# This script does four things: +# 1. Starts Lakekeeper if it's not already running +# 2. Bootstraps the Lakekeeper server (creates default project, idempotent) +# 3. Checks if MinIO bucket exists (and creates it if needed) +# 4. Checks and creates the warehouse if it doesn't exist +# +# +# Usage: +# ./bin/bootstrap-lakekeeper.sh + +set -e + +# ============================================================================== +# User Configuration - Edit the values below before running this script +# ============================================================================== + +# Lakekeeper binary path +LAKEKEEPER_BINARY_PATH="" + +# Lakekeeper PostgreSQL connection URLs +#(LAKEKEEPER__PG_DATABASE_URL_READ="postgres://postgres_user:postgres_urlencoded_password@hostname:5432/texera_lakekeeper" +# LAKEKEEPER__PG_DATABASE_URL_WRITE="postgres://postgres_user:postgres_urlencoded_password@hostname:5432/texera_lakekeeper") +LAKEKEEPER__PG_DATABASE_URL_READ="" +LAKEKEEPER__PG_DATABASE_URL_WRITE="" + +# Lakekeeper encryption key +LAKEKEEPER__PG_ENCRYPTION_KEY="texera_key" + +# Lakekeeper metrics port +LAKEKEEPER__METRICS_PORT="9091" + +# ============================================================================== +# End of User Configuration +# ============================================================================== + +# Read remaining configuration from storage.conf +# Priority: user config above > storage.conf > default value + +# Find storage.conf path +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +if [ -n "$TEXERA_HOME" ]; then + STORAGE_CONF_PATH="$TEXERA_HOME/common/config/src/main/resources/storage.conf" +else + STORAGE_CONF_PATH="$SCRIPT_DIR/../common/config/src/main/resources/storage.conf" +fi + +# Extract values from storage.conf using pyhocon for proper HOCON parsing +if [ -f "$STORAGE_CONF_PATH" ]; then + # Check if pyhocon is available + if ! command -v python3 >/dev/null 2>&1; then + echo "✗ Error: python3 is required to parse storage.conf" + echo " Please install Python 3" + exit 1 + fi + + if ! python3 -c "import pyhocon" 2>/dev/null; then + echo "✗ Error: pyhocon is required to parse storage.conf" + echo " Install it with: pip install pyhocon" + exit 1 + fi + + # Use batch mode to parse all config values in a single python invocation + CONF_OUTPUT=$(python3 "$SCRIPT_DIR/parse-storage-config.py" --batch \ + REST_URI_FROM_CONF=storage.iceberg.catalog.rest.uri \ + WAREHOUSE_NAME_FROM_CONF=storage.iceberg.catalog.rest.warehouse-name \ + REST_REGION_FROM_CONF=storage.iceberg.catalog.rest.region \ + S3_BUCKET_FROM_CONF=storage.iceberg.catalog.rest.s3-bucket \ + S3_ENDPOINT_FROM_CONF=storage.s3.endpoint \ + S3_USERNAME_FROM_CONF=storage.s3.auth.username \ + S3_PASSWORD_FROM_CONF=storage.s3.auth.password \ + 2>/dev/null) || true + + # Parse the batch output (each line is VAR_NAME=value) + while IFS='=' read -r var_name var_value; do + [ -z "$var_name" ] && continue + declare "$var_name=$var_value" + done <<< "$CONF_OUTPUT" + + # Strip trailing /catalog/ from REST URI + REST_URI_FROM_CONF=$(echo "${REST_URI_FROM_CONF:-}" | sed 's|/catalog/*$||') + + echo "Configuration read from storage.conf:" + echo " REST_URI=$REST_URI_FROM_CONF" + echo " WAREHOUSE_NAME=$WAREHOUSE_NAME_FROM_CONF" + echo " REGION=$REST_REGION_FROM_CONF" + echo " S3_BUCKET=$S3_BUCKET_FROM_CONF" + echo " S3_ENDPOINT=$S3_ENDPOINT_FROM_CONF" + echo " S3_USERNAME=$S3_USERNAME_FROM_CONF" + echo " S3_PASSWORD=***" + echo "" +else + REST_URI_FROM_CONF="" + WAREHOUSE_NAME_FROM_CONF="" + REST_REGION_FROM_CONF="" + S3_BUCKET_FROM_CONF="" + S3_ENDPOINT_FROM_CONF="" + S3_USERNAME_FROM_CONF="" + S3_PASSWORD_FROM_CONF="" + echo "storage.conf not found, using environment variables or defaults" + echo "" +fi + +# Use values from storage.conf with defaults +LAKEKEEPER_BASE_URI="${REST_URI_FROM_CONF:-http://localhost:8181}" +WAREHOUSE_NAME="${WAREHOUSE_NAME_FROM_CONF:-texera}" +S3_REGION="${REST_REGION_FROM_CONF:-us-west-2}" +S3_BUCKET="${S3_BUCKET_FROM_CONF:-texera-iceberg}" +S3_ENDPOINT="${S3_ENDPOINT_FROM_CONF:-http://localhost:9000}" +S3_USERNAME="${S3_USERNAME_FROM_CONF:-texera_minio}" +S3_PASSWORD="${S3_PASSWORD_FROM_CONF:-password}" +STORAGE_PATH="s3://${S3_BUCKET}/iceberg/${WAREHOUSE_NAME}" + +echo "==========================================" +echo "Lakekeeper Bootstrap and Warehouse Setup" +echo "==========================================" +echo "Lakekeeper Base URI: $LAKEKEEPER_BASE_URI" +echo "Lakekeeper Binary: ${LAKEKEEPER_BINARY_PATH:-lakekeeper}" +echo "Warehouse Name: $WAREHOUSE_NAME" +echo "S3 Endpoint: $S3_ENDPOINT" +echo "S3 Bucket: $S3_BUCKET" +echo "Storage Path: $STORAGE_PATH" +echo "" + +# Function to check if Lakekeeper is running +check_lakekeeper_running() { + local health_url="${LAKEKEEPER_BASE_URI}/health" + if curl -s -f "$health_url" > /dev/null 2>&1; then + return 0 # Running + else + return 1 # Not running + fi +} + +# Function to bootstrap the Lakekeeper server (creates default project). +# This is idempotent - safe to call even if already bootstrapped. +# Returns: 0=success (or already bootstrapped), 1=failure +bootstrap_lakekeeper_server() { + local base_uri="$1" + local bootstrap_url="${base_uri}/management/v1/bootstrap" + + echo "Bootstrapping Lakekeeper server (creating default project)..." + echo " URL: $bootstrap_url" + + local temp_response + temp_response=$(mktemp) || { + echo "✗ Failed to create temporary file" + return 1 + } + + local http_code + http_code=$(curl -s -o "$temp_response" -w "%{http_code}" \ + -X POST \ + -H "Content-Type: application/json" \ + -d '{"accept-terms-of-use": true}' \ + "$bootstrap_url" 2>/dev/null || echo "000") + + echo " HTTP status: $http_code" + + case "$http_code" in + 000) + echo "✗ Failed to connect to Lakekeeper at $bootstrap_url" + rm -f "$temp_response" || true + return 1 + ;; + 2*) + echo "✓ Lakekeeper server bootstrapped successfully (HTTP $http_code)" + rm -f "$temp_response" || true + return 0 + ;; + *) + if grep -q "CatalogAlreadyBootstrapped" "$temp_response" 2>/dev/null; then + echo "✓ Lakekeeper server already bootstrapped (HTTP $http_code), continuing." + rm -f "$temp_response" || true + return 0 + fi + echo "✗ Failed to bootstrap Lakekeeper server (HTTP $http_code)" + echo " Response body:" + cat "$temp_response" | sed 's/^/ /' || true + rm -f "$temp_response" || true + return 1 + ;; + esac +} + +# Function to check if MinIO bucket exists (requires AWS CLI) +check_minio_bucket() { + local bucket_name="$1" + local endpoint="$2" + local username="$3" + local password="$4" + + if ! command -v aws >/dev/null 2>&1; then + echo "✗ Error: AWS CLI is required for MinIO bucket operations." + echo " Install it with: pip install awscli" + return 1 + fi + + if AWS_ACCESS_KEY_ID="$username" AWS_SECRET_ACCESS_KEY="$password" AWS_DEFAULT_REGION="us-west-2" \ + aws --endpoint-url="$endpoint" s3 ls "s3://${bucket_name}/" >/dev/null 2>&1; then + return 0 # Bucket exists + else + return 1 # Bucket doesn't exist or error + fi +} + +# Function to create MinIO bucket (requires AWS CLI) +create_minio_bucket() { + local bucket_name="$1" + local endpoint="$2" + local username="$3" + local password="$4" + + if ! command -v aws >/dev/null 2>&1; then + echo "✗ Error: AWS CLI is required for MinIO bucket operations." + echo " Install it with: pip install awscli" + return 1 + fi + + if AWS_ACCESS_KEY_ID="$username" AWS_SECRET_ACCESS_KEY="$password" AWS_DEFAULT_REGION="us-west-2" \ + aws --endpoint-url="$endpoint" s3 mb "s3://${bucket_name}" >/dev/null 2>&1; then + return 0 # Success + else + return 1 # Failed + fi +} + +# Function to start Lakekeeper +start_lakekeeper() { + export LAKEKEEPER__METRICS_PORT + export LAKEKEEPER__PG_ENCRYPTION_KEY + + echo "Starting Lakekeeper..." + + # Validate LAKEKEEPER_BINARY_PATH + if [ -z "$LAKEKEEPER_BINARY_PATH" ]; then + echo "✗ Error: LAKEKEEPER_BINARY_PATH is not set." + echo " Please set it in the User Configuration section at the top of this script." + exit 1 + fi + + if [ ! -x "$LAKEKEEPER_BINARY_PATH" ]; then + echo "✗ Error: Lakekeeper binary not found or not executable at '$LAKEKEEPER_BINARY_PATH'" + echo " Please update LAKEKEEPER_BINARY_PATH in the User Configuration section." + exit 1 + fi + + local binary_path="$LAKEKEEPER_BINARY_PATH" + + # Validate required database URLs + if [ -z "$LAKEKEEPER__PG_DATABASE_URL_READ" ] || [ -z "$LAKEKEEPER__PG_DATABASE_URL_WRITE" ]; then + echo "✗ Error: Database URLs not configured." + echo " Please set LAKEKEEPER__PG_DATABASE_URL_READ and LAKEKEEPER__PG_DATABASE_URL_WRITE" + echo " in the User Configuration section at the top of this script." + exit 1 + fi + export LAKEKEEPER__PG_DATABASE_URL_READ + export LAKEKEEPER__PG_DATABASE_URL_WRITE + + # Run migration first + echo "Running Lakekeeper migration..." + if ! "$binary_path" migrate; then + echo "✗ Failed to run Lakekeeper migration" + return 1 + fi + + # Start Lakekeeper in background + echo "Starting Lakekeeper server..." + nohup "$binary_path" serve > /tmp/lakekeeper.log 2>&1 & + local lakekeeper_pid=$! + echo "Lakekeeper started with PID: $lakekeeper_pid" + + # Wait for Lakekeeper to be ready + echo "Waiting for Lakekeeper to be ready..." + local max_attempts=30 + local attempt=1 + while [ $attempt -le $max_attempts ]; do + if check_lakekeeper_running; then + echo "✓ Lakekeeper is ready!" + return 0 + fi + if [ $attempt -eq $max_attempts ]; then + echo "✗ Lakekeeper did not become ready after $max_attempts attempts" + echo " Check logs at /tmp/lakekeeper.log" + return 1 + fi + echo " Waiting for Lakekeeper... ($attempt/$max_attempts)" + sleep 2 + attempt=$((attempt + 1)) + done +} + +# Function to check if warehouse exists +# Returns: 0=exists, 1=not found, 2=connection error +check_warehouse_exists() { + local warehouse_name="$1" + local base_uri="$2" + + local list_url="${base_uri}/management/v1/warehouse" + + echo "Checking if warehouse '$warehouse_name' exists..." + + local temp_response + temp_response=$(mktemp) || { + echo "✗ Failed to create temporary file" + return 2 + } + + local http_code + http_code=$(curl -s -o "$temp_response" -w "%{http_code}" "$list_url" 2>/dev/null || echo "000") + + if [ "$http_code" = "000" ]; then + rm -f "$temp_response" || true + echo "✗ Failed to connect to Lakekeeper at $list_url" + return 2 + fi + + if [ "$http_code" != "200" ]; then + echo "⚠ Warning: Unexpected HTTP status $http_code when listing warehouses" + cat "$temp_response" 2>/dev/null | sed 's/^/ /' || true + rm -f "$temp_response" || true + return 1 + fi + + # Check if warehouse name exists in the response + # Response format: {"warehouses":[{"name":"...",...},...]} + local found=1 + if command -v jq >/dev/null 2>&1; then + if jq -e ".warehouses[] | select(.name == \"$warehouse_name\")" "$temp_response" >/dev/null 2>&1; then + found=0 + fi + else + if grep -q "\"name\"[[:space:]]*:[[:space:]]*\"$warehouse_name\"" "$temp_response" 2>/dev/null; then + found=0 + fi + fi + + rm -f "$temp_response" 2>/dev/null || true + return $found +} + +# Function to create warehouse +# Args: warehouse_name base_uri s3_bucket s3_region s3_endpoint s3_username s3_password +# Returns: 0=success, 1=failure +create_warehouse() { + local warehouse_name="$1" + local base_uri="$2" + local bucket="$3" + local region="$4" + local endpoint="$5" + local username="$6" + local password="$7" + + local create_url="${base_uri}/management/v1/warehouse" + + local create_payload=$(cat </dev/null | sed 's/^/ /' || true + rm -f "$temp_response" || true + return 1 + ;; + esac +} + +# Step 1: Check if Lakekeeper is running, start if not +echo "Step 1: Checking Lakekeeper status..." +if check_lakekeeper_running; then + echo "✓ Lakekeeper is already running" +else + echo "Lakekeeper is not running, attempting to start..." + if start_lakekeeper; then + echo "✓ Lakekeeper started successfully" + else + echo "✗ Failed to start Lakekeeper" + exit 1 + fi +fi +echo "" + +# Step 2: Bootstrap the Lakekeeper server (creates default project) +echo "Step 2: Bootstrapping Lakekeeper server..." +if bootstrap_lakekeeper_server "$LAKEKEEPER_BASE_URI"; then + echo "✓ Lakekeeper server bootstrap completed" +else + echo "✗ Failed to bootstrap Lakekeeper server" + echo " Please check that Lakekeeper is running and accessible at $LAKEKEEPER_BASE_URI" + exit 1 +fi +echo "" + +# Step 3: Check and create MinIO bucket +echo "Step 3: Checking MinIO bucket..." +if check_minio_bucket "$S3_BUCKET" "$S3_ENDPOINT" "$S3_USERNAME" "$S3_PASSWORD"; then + echo "✓ MinIO bucket '$S3_BUCKET' already exists" +else + echo "MinIO bucket '$S3_BUCKET' does not exist, creating..." + if create_minio_bucket "$S3_BUCKET" "$S3_ENDPOINT" "$S3_USERNAME" "$S3_PASSWORD"; then + echo "✓ MinIO bucket '$S3_BUCKET' created successfully" + else + echo "✗ Failed to create MinIO bucket '$S3_BUCKET'" + echo " Please ensure MinIO is running and accessible at $S3_ENDPOINT" + exit 1 + fi +fi +echo "" + +# Step 4: Check and create warehouse +echo "Step 4: Checking and creating warehouse..." + +set +e # Temporarily disable exit on error to capture function return value +check_warehouse_exists "$WAREHOUSE_NAME" "$LAKEKEEPER_BASE_URI" +check_result=$? +set -e # Re-enable exit on error + +case $check_result in + 0) + echo "✓ Warehouse '$WAREHOUSE_NAME' already exists, skipping creation." + echo "" + echo "==========================================" + echo "✓ Bootstrap completed successfully!" + echo "==========================================" + exit 0 + ;; + 1) + echo "Warehouse '$WAREHOUSE_NAME' does not exist, will create..." + ;; + 2) + exit 1 + ;; + *) + echo "✗ Unexpected error (code: $check_result)" + exit 1 + ;; +esac + +# Create warehouse +if create_warehouse "$WAREHOUSE_NAME" "$LAKEKEEPER_BASE_URI" "$S3_BUCKET" "$S3_REGION" "$S3_ENDPOINT" "$S3_USERNAME" "$S3_PASSWORD"; then + echo "" + echo "==========================================" + echo "✓ Bootstrap completed successfully!" + echo "==========================================" + exit 0 +else + echo "" + echo "==========================================" + echo "✗ Bootstrap failed!" + echo "==========================================" + exit 1 +fi diff --git a/bin/parse-storage-config.py b/bin/parse-storage-config.py new file mode 100755 index 00000000000..262dba45add --- /dev/null +++ b/bin/parse-storage-config.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +Parse storage.conf HOCON file with environment variable resolution. +This script properly handles HOCON syntax including environment variable substitution. + +Usage: + # Single key mode (backward compatible): + python3 bin/parse-storage-config.py storage.iceberg.catalog.rest.uri + + # Batch mode (outputs VAR_NAME=value lines): + python3 bin/parse-storage-config.py --batch VAR1=key.path1 VAR2=key.path2 ... + +Examples: + python3 bin/parse-storage-config.py storage.s3.endpoint + python3 bin/parse-storage-config.py --batch \ + REST_URI=storage.iceberg.catalog.rest.uri \ + WAREHOUSE_NAME=storage.iceberg.catalog.rest.warehouse-name \ + S3_ENDPOINT=storage.s3.endpoint +""" + +import os +import sys +from pathlib import Path + +try: + from pyhocon import ConfigFactory +except ImportError: + print("Error: pyhocon is not installed. Install it with: pip install pyhocon", file=sys.stderr) + sys.exit(1) + + +def find_storage_conf(): + """Find storage.conf path.""" + texera_home = os.environ.get("TEXERA_HOME") + if texera_home: + conf_path = Path(texera_home) / "common" / "config" / "src" / "main" / "resources" / "storage.conf" + else: + script_dir = Path(__file__).parent + conf_path = script_dir.parent / "common" / "config" / "src" / "main" / "resources" / "storage.conf" + + if not conf_path.exists(): + print(f"Error: storage.conf not found at {conf_path}", file=sys.stderr) + sys.exit(1) + + return conf_path + + +def parse_storage_config(): + """Parse storage.conf with environment variable resolution.""" + conf_path = find_storage_conf() + config = ConfigFactory.parse_file(str(conf_path)) + return config + + +def get_value(config, key_path): + """Get value from config by dot-separated key path.""" + try: + return config.get_string(key_path) + except Exception: + return None + + +def main(): + if len(sys.argv) < 2: + config = parse_storage_config() + print(config.get("storage", {})) + return + + if sys.argv[1] == "--batch": + config = parse_storage_config() + for arg in sys.argv[2:]: + if "=" not in arg: + print(f"Error: batch argument must be VAR_NAME=key.path, got '{arg}'", file=sys.stderr) + sys.exit(1) + var_name, key_path = arg.split("=", 1) + value = get_value(config, key_path) + if value is None: + value = "" + print(f"{var_name}={value}") + else: + key_path = sys.argv[1] + config = parse_storage_config() + value = get_value(config, key_path) + if value is None: + print(f"Key '{key_path}' not found", file=sys.stderr) + sys.exit(1) + print(value) + + +if __name__ == "__main__": + main() From 9ef6ed2b3a4bd1766790c15676347ccffb621a3a Mon Sep 17 00:00:00 2001 From: mengw15 <125719918+mengw15@users.noreply.github.com> Date: Mon, 30 Mar 2026 14:46:07 -0700 Subject: [PATCH 2/6] resolve comment --- sql/texera_lakekeeper.sql | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 sql/texera_lakekeeper.sql diff --git a/sql/texera_lakekeeper.sql b/sql/texera_lakekeeper.sql new file mode 100644 index 00000000000..afdca6946cc --- /dev/null +++ b/sql/texera_lakekeeper.sql @@ -0,0 +1,21 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +\c postgres + +DROP DATABASE IF EXISTS texera_lakekeeper; +CREATE DATABASE texera_lakekeeper; From b4ad41265b5adfe075071489ce21205921ac12fe Mon Sep 17 00:00:00 2001 From: mengw15 <125719918+mengw15@users.noreply.github.com> Date: Tue, 7 Apr 2026 18:04:42 -0700 Subject: [PATCH 3/6] remove the parse-storage-config.py --- bin/bootstrap-lakekeeper.sh | 102 +++++++++------------------------- bin/parse-storage-config.py | 108 ------------------------------------ 2 files changed, 27 insertions(+), 183 deletions(-) delete mode 100755 bin/parse-storage-config.py diff --git a/bin/bootstrap-lakekeeper.sh b/bin/bootstrap-lakekeeper.sh index 1d4adc659c2..1c2a03d4500 100755 --- a/bin/bootstrap-lakekeeper.sh +++ b/bin/bootstrap-lakekeeper.sh @@ -32,6 +32,15 @@ set -e # ============================================================================== # User Configuration - Edit the values below before running this script # ============================================================================== +# +# IMPORTANT: This script does NOT read storage.conf. The Java/Scala backend +# reads common/config/src/main/resources/storage.conf directly, but this bash +# script only reads the variables below. If you change values in storage.conf, +# you MUST also update the matching defaults here (or export the corresponding +# STORAGE_* environment variables before running this script), otherwise the +# bootstrap will register a warehouse that does not match what the backend uses. +# +# ============================================================================== # Lakekeeper binary path LAKEKEEPER_BINARY_PATH="" @@ -48,85 +57,28 @@ LAKEKEEPER__PG_ENCRYPTION_KEY="texera_key" # Lakekeeper metrics port LAKEKEEPER__METRICS_PORT="9091" +# Storage settings — must stay in sync with storage.conf (see note above) +STORAGE_ICEBERG_CATALOG_REST_URI="${STORAGE_ICEBERG_CATALOG_REST_URI:-http://localhost:8181/catalog}" +STORAGE_ICEBERG_CATALOG_REST_WAREHOUSE_NAME="${STORAGE_ICEBERG_CATALOG_REST_WAREHOUSE_NAME:-texera}" +STORAGE_ICEBERG_CATALOG_REST_REGION="${STORAGE_ICEBERG_CATALOG_REST_REGION:-us-west-2}" +STORAGE_ICEBERG_CATALOG_REST_S3_BUCKET="${STORAGE_ICEBERG_CATALOG_REST_S3_BUCKET:-texera-iceberg}" +STORAGE_S3_ENDPOINT="${STORAGE_S3_ENDPOINT:-http://localhost:9000}" +STORAGE_S3_AUTH_USERNAME="${STORAGE_S3_AUTH_USERNAME:-texera_minio}" +STORAGE_S3_AUTH_PASSWORD="${STORAGE_S3_AUTH_PASSWORD:-password}" + # ============================================================================== # End of User Configuration # ============================================================================== -# Read remaining configuration from storage.conf -# Priority: user config above > storage.conf > default value - -# Find storage.conf path -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -if [ -n "$TEXERA_HOME" ]; then - STORAGE_CONF_PATH="$TEXERA_HOME/common/config/src/main/resources/storage.conf" -else - STORAGE_CONF_PATH="$SCRIPT_DIR/../common/config/src/main/resources/storage.conf" -fi - -# Extract values from storage.conf using pyhocon for proper HOCON parsing -if [ -f "$STORAGE_CONF_PATH" ]; then - # Check if pyhocon is available - if ! command -v python3 >/dev/null 2>&1; then - echo "✗ Error: python3 is required to parse storage.conf" - echo " Please install Python 3" - exit 1 - fi - - if ! python3 -c "import pyhocon" 2>/dev/null; then - echo "✗ Error: pyhocon is required to parse storage.conf" - echo " Install it with: pip install pyhocon" - exit 1 - fi - - # Use batch mode to parse all config values in a single python invocation - CONF_OUTPUT=$(python3 "$SCRIPT_DIR/parse-storage-config.py" --batch \ - REST_URI_FROM_CONF=storage.iceberg.catalog.rest.uri \ - WAREHOUSE_NAME_FROM_CONF=storage.iceberg.catalog.rest.warehouse-name \ - REST_REGION_FROM_CONF=storage.iceberg.catalog.rest.region \ - S3_BUCKET_FROM_CONF=storage.iceberg.catalog.rest.s3-bucket \ - S3_ENDPOINT_FROM_CONF=storage.s3.endpoint \ - S3_USERNAME_FROM_CONF=storage.s3.auth.username \ - S3_PASSWORD_FROM_CONF=storage.s3.auth.password \ - 2>/dev/null) || true - - # Parse the batch output (each line is VAR_NAME=value) - while IFS='=' read -r var_name var_value; do - [ -z "$var_name" ] && continue - declare "$var_name=$var_value" - done <<< "$CONF_OUTPUT" - - # Strip trailing /catalog/ from REST URI - REST_URI_FROM_CONF=$(echo "${REST_URI_FROM_CONF:-}" | sed 's|/catalog/*$||') - - echo "Configuration read from storage.conf:" - echo " REST_URI=$REST_URI_FROM_CONF" - echo " WAREHOUSE_NAME=$WAREHOUSE_NAME_FROM_CONF" - echo " REGION=$REST_REGION_FROM_CONF" - echo " S3_BUCKET=$S3_BUCKET_FROM_CONF" - echo " S3_ENDPOINT=$S3_ENDPOINT_FROM_CONF" - echo " S3_USERNAME=$S3_USERNAME_FROM_CONF" - echo " S3_PASSWORD=***" - echo "" -else - REST_URI_FROM_CONF="" - WAREHOUSE_NAME_FROM_CONF="" - REST_REGION_FROM_CONF="" - S3_BUCKET_FROM_CONF="" - S3_ENDPOINT_FROM_CONF="" - S3_USERNAME_FROM_CONF="" - S3_PASSWORD_FROM_CONF="" - echo "storage.conf not found, using environment variables or defaults" - echo "" -fi - -# Use values from storage.conf with defaults -LAKEKEEPER_BASE_URI="${REST_URI_FROM_CONF:-http://localhost:8181}" -WAREHOUSE_NAME="${WAREHOUSE_NAME_FROM_CONF:-texera}" -S3_REGION="${REST_REGION_FROM_CONF:-us-west-2}" -S3_BUCKET="${S3_BUCKET_FROM_CONF:-texera-iceberg}" -S3_ENDPOINT="${S3_ENDPOINT_FROM_CONF:-http://localhost:9000}" -S3_USERNAME="${S3_USERNAME_FROM_CONF:-texera_minio}" -S3_PASSWORD="${S3_PASSWORD_FROM_CONF:-password}" +# Derive bootstrap-internal values from the storage settings above. +LAKEKEEPER_BASE_URI="${STORAGE_ICEBERG_CATALOG_REST_URI%/}" +LAKEKEEPER_BASE_URI="${LAKEKEEPER_BASE_URI%/catalog}" +WAREHOUSE_NAME="$STORAGE_ICEBERG_CATALOG_REST_WAREHOUSE_NAME" +S3_REGION="$STORAGE_ICEBERG_CATALOG_REST_REGION" +S3_BUCKET="$STORAGE_ICEBERG_CATALOG_REST_S3_BUCKET" +S3_ENDPOINT="$STORAGE_S3_ENDPOINT" +S3_USERNAME="$STORAGE_S3_AUTH_USERNAME" +S3_PASSWORD="$STORAGE_S3_AUTH_PASSWORD" STORAGE_PATH="s3://${S3_BUCKET}/iceberg/${WAREHOUSE_NAME}" echo "==========================================" diff --git a/bin/parse-storage-config.py b/bin/parse-storage-config.py deleted file mode 100755 index 262dba45add..00000000000 --- a/bin/parse-storage-config.py +++ /dev/null @@ -1,108 +0,0 @@ -#!/usr/bin/env python3 -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -""" -Parse storage.conf HOCON file with environment variable resolution. -This script properly handles HOCON syntax including environment variable substitution. - -Usage: - # Single key mode (backward compatible): - python3 bin/parse-storage-config.py storage.iceberg.catalog.rest.uri - - # Batch mode (outputs VAR_NAME=value lines): - python3 bin/parse-storage-config.py --batch VAR1=key.path1 VAR2=key.path2 ... - -Examples: - python3 bin/parse-storage-config.py storage.s3.endpoint - python3 bin/parse-storage-config.py --batch \ - REST_URI=storage.iceberg.catalog.rest.uri \ - WAREHOUSE_NAME=storage.iceberg.catalog.rest.warehouse-name \ - S3_ENDPOINT=storage.s3.endpoint -""" - -import os -import sys -from pathlib import Path - -try: - from pyhocon import ConfigFactory -except ImportError: - print("Error: pyhocon is not installed. Install it with: pip install pyhocon", file=sys.stderr) - sys.exit(1) - - -def find_storage_conf(): - """Find storage.conf path.""" - texera_home = os.environ.get("TEXERA_HOME") - if texera_home: - conf_path = Path(texera_home) / "common" / "config" / "src" / "main" / "resources" / "storage.conf" - else: - script_dir = Path(__file__).parent - conf_path = script_dir.parent / "common" / "config" / "src" / "main" / "resources" / "storage.conf" - - if not conf_path.exists(): - print(f"Error: storage.conf not found at {conf_path}", file=sys.stderr) - sys.exit(1) - - return conf_path - - -def parse_storage_config(): - """Parse storage.conf with environment variable resolution.""" - conf_path = find_storage_conf() - config = ConfigFactory.parse_file(str(conf_path)) - return config - - -def get_value(config, key_path): - """Get value from config by dot-separated key path.""" - try: - return config.get_string(key_path) - except Exception: - return None - - -def main(): - if len(sys.argv) < 2: - config = parse_storage_config() - print(config.get("storage", {})) - return - - if sys.argv[1] == "--batch": - config = parse_storage_config() - for arg in sys.argv[2:]: - if "=" not in arg: - print(f"Error: batch argument must be VAR_NAME=key.path, got '{arg}'", file=sys.stderr) - sys.exit(1) - var_name, key_path = arg.split("=", 1) - value = get_value(config, key_path) - if value is None: - value = "" - print(f"{var_name}={value}") - else: - key_path = sys.argv[1] - config = parse_storage_config() - value = get_value(config, key_path) - if value is None: - print(f"Key '{key_path}' not found", file=sys.stderr) - sys.exit(1) - print(value) - - -if __name__ == "__main__": - main() From a314b4617512598d27283529fd6582c7ba639632 Mon Sep 17 00:00:00 2001 From: mengw15 <125719918+mengw15@users.noreply.github.com> Date: Tue, 7 Apr 2026 18:53:35 -0700 Subject: [PATCH 4/6] resolve comment --- bin/bootstrap-lakekeeper.sh | 39 ++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/bin/bootstrap-lakekeeper.sh b/bin/bootstrap-lakekeeper.sh index 1c2a03d4500..5aa6a8ce401 100755 --- a/bin/bootstrap-lakekeeper.sh +++ b/bin/bootstrap-lakekeeper.sh @@ -42,8 +42,19 @@ set -e # # ============================================================================== -# Lakekeeper binary path -LAKEKEEPER_BINARY_PATH="" +# Storage settings — must stay in sync with storage.conf (see note above) +STORAGE_ICEBERG_CATALOG_REST_URI="${STORAGE_ICEBERG_CATALOG_REST_URI:-http://localhost:8181/catalog}" +STORAGE_ICEBERG_CATALOG_REST_WAREHOUSE_NAME="${STORAGE_ICEBERG_CATALOG_REST_WAREHOUSE_NAME:-texera}" +STORAGE_ICEBERG_CATALOG_REST_REGION="${STORAGE_ICEBERG_CATALOG_REST_REGION:-us-west-2}" +STORAGE_ICEBERG_CATALOG_REST_S3_BUCKET="${STORAGE_ICEBERG_CATALOG_REST_S3_BUCKET:-texera-iceberg}" +STORAGE_S3_ENDPOINT="${STORAGE_S3_ENDPOINT:-http://localhost:9000}" +STORAGE_S3_AUTH_USERNAME="${STORAGE_S3_AUTH_USERNAME:-texera_minio}" +STORAGE_S3_AUTH_PASSWORD="${STORAGE_S3_AUTH_PASSWORD:-password}" + +# Lakekeeper binary — defaults to `lakekeeper` on $PATH (e.g. after +# `brew install lakekeeper`). Override by exporting LAKEKEEPER_BINARY_PATH +# or by editing the default below. +LAKEKEEPER_BINARY_PATH="${LAKEKEEPER_BINARY_PATH:-lakekeeper}" # Lakekeeper PostgreSQL connection URLs #(LAKEKEEPER__PG_DATABASE_URL_READ="postgres://postgres_user:postgres_urlencoded_password@hostname:5432/texera_lakekeeper" @@ -57,14 +68,6 @@ LAKEKEEPER__PG_ENCRYPTION_KEY="texera_key" # Lakekeeper metrics port LAKEKEEPER__METRICS_PORT="9091" -# Storage settings — must stay in sync with storage.conf (see note above) -STORAGE_ICEBERG_CATALOG_REST_URI="${STORAGE_ICEBERG_CATALOG_REST_URI:-http://localhost:8181/catalog}" -STORAGE_ICEBERG_CATALOG_REST_WAREHOUSE_NAME="${STORAGE_ICEBERG_CATALOG_REST_WAREHOUSE_NAME:-texera}" -STORAGE_ICEBERG_CATALOG_REST_REGION="${STORAGE_ICEBERG_CATALOG_REST_REGION:-us-west-2}" -STORAGE_ICEBERG_CATALOG_REST_S3_BUCKET="${STORAGE_ICEBERG_CATALOG_REST_S3_BUCKET:-texera-iceberg}" -STORAGE_S3_ENDPOINT="${STORAGE_S3_ENDPOINT:-http://localhost:9000}" -STORAGE_S3_AUTH_USERNAME="${STORAGE_S3_AUTH_USERNAME:-texera_minio}" -STORAGE_S3_AUTH_PASSWORD="${STORAGE_S3_AUTH_PASSWORD:-password}" # ============================================================================== # End of User Configuration @@ -202,16 +205,12 @@ start_lakekeeper() { echo "Starting Lakekeeper..." - # Validate LAKEKEEPER_BINARY_PATH - if [ -z "$LAKEKEEPER_BINARY_PATH" ]; then - echo "✗ Error: LAKEKEEPER_BINARY_PATH is not set." - echo " Please set it in the User Configuration section at the top of this script." - exit 1 - fi - - if [ ! -x "$LAKEKEEPER_BINARY_PATH" ]; then - echo "✗ Error: Lakekeeper binary not found or not executable at '$LAKEKEEPER_BINARY_PATH'" - echo " Please update LAKEKEEPER_BINARY_PATH in the User Configuration section." + # Validate LAKEKEEPER_BINARY_PATH — `command -v` resolves both bare names + # via $PATH lookup and absolute paths. + if ! command -v "$LAKEKEEPER_BINARY_PATH" >/dev/null 2>&1; then + echo "✗ Error: Lakekeeper binary '$LAKEKEEPER_BINARY_PATH' not found." + echo " Install it via 'brew install lakekeeper' (macOS) or set" + echo " LAKEKEEPER_BINARY_PATH to an absolute path / edit the default in this script." exit 1 fi From c0ef83fe4472472f176a2069a72c6195ed7ddc53 Mon Sep 17 00:00:00 2001 From: mengw15 <125719918+mengw15@users.noreply.github.com> Date: Wed, 8 Apr 2026 16:23:55 -0700 Subject: [PATCH 5/6] clean --- bin/bootstrap-lakekeeper.sh | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/bin/bootstrap-lakekeeper.sh b/bin/bootstrap-lakekeeper.sh index 5aa6a8ce401..e507e41a32a 100755 --- a/bin/bootstrap-lakekeeper.sh +++ b/bin/bootstrap-lakekeeper.sh @@ -16,16 +16,12 @@ # specific language governing permissions and limitations # under the License. -# Bootstrap script to start Lakekeeper and create warehouse (idempotent). -# This script does four things: -# 1. Starts Lakekeeper if it's not already running -# 2. Bootstraps the Lakekeeper server (creates default project, idempotent) -# 3. Checks if MinIO bucket exists (and creates it if needed) -# 4. Checks and creates the warehouse if it doesn't exist -# -# -# Usage: -# ./bin/bootstrap-lakekeeper.sh +# Prerequisites: +# - Already know how to setup Texera +# - macOS or Linux (Lakekeeper does not publish Windows binaries) +# - A running PostgreSQL instance (the same one Texera uses is fine) +# - An accessible S3 bucket +# - The AWS CLI (awscli) installed set -e @@ -33,16 +29,13 @@ set -e # User Configuration - Edit the values below before running this script # ============================================================================== # -# IMPORTANT: This script does NOT read storage.conf. The Java/Scala backend -# reads common/config/src/main/resources/storage.conf directly, but this bash -# script only reads the variables below. If you change values in storage.conf, -# you MUST also update the matching defaults here (or export the corresponding -# STORAGE_* environment variables before running this script), otherwise the -# bootstrap will register a warehouse that does not match what the backend uses. +# IMPORTANT: If you change values in storage.conf, you MUST also update the +# matching defaults here (or export the corresponding STORAGE_* environment +# variables before running this script) # # ============================================================================== -# Storage settings — must stay in sync with storage.conf (see note above) +# Storage settings — must stay in sync with storage.conf STORAGE_ICEBERG_CATALOG_REST_URI="${STORAGE_ICEBERG_CATALOG_REST_URI:-http://localhost:8181/catalog}" STORAGE_ICEBERG_CATALOG_REST_WAREHOUSE_NAME="${STORAGE_ICEBERG_CATALOG_REST_WAREHOUSE_NAME:-texera}" STORAGE_ICEBERG_CATALOG_REST_REGION="${STORAGE_ICEBERG_CATALOG_REST_REGION:-us-west-2}" @@ -57,8 +50,8 @@ STORAGE_S3_AUTH_PASSWORD="${STORAGE_S3_AUTH_PASSWORD:-password}" LAKEKEEPER_BINARY_PATH="${LAKEKEEPER_BINARY_PATH:-lakekeeper}" # Lakekeeper PostgreSQL connection URLs -#(LAKEKEEPER__PG_DATABASE_URL_READ="postgres://postgres_user:postgres_urlencoded_password@hostname:5432/texera_lakekeeper" -# LAKEKEEPER__PG_DATABASE_URL_WRITE="postgres://postgres_user:postgres_urlencoded_password@hostname:5432/texera_lakekeeper") +# LAKEKEEPER__PG_DATABASE_URL_READ="postgres://:@:5432/texera_lakekeeper" +# LAKEKEEPER__PG_DATABASE_URL_WRITE="postgres://:@:5432/texera_lakekeeper" LAKEKEEPER__PG_DATABASE_URL_READ="" LAKEKEEPER__PG_DATABASE_URL_WRITE="" From e29e2c8f6e535d0c2ad98a824143402fa25565de Mon Sep 17 00:00:00 2001 From: mengw15 <125719918+mengw15@users.noreply.github.com> Date: Wed, 8 Apr 2026 16:30:56 -0700 Subject: [PATCH 6/6] add comment --- bin/bootstrap-lakekeeper.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/bootstrap-lakekeeper.sh b/bin/bootstrap-lakekeeper.sh index e507e41a32a..d5b2b1cb214 100755 --- a/bin/bootstrap-lakekeeper.sh +++ b/bin/bootstrap-lakekeeper.sh @@ -36,6 +36,7 @@ set -e # ============================================================================== # Storage settings — must stay in sync with storage.conf +# if needed, update the default values after `:-` to match storage.conf STORAGE_ICEBERG_CATALOG_REST_URI="${STORAGE_ICEBERG_CATALOG_REST_URI:-http://localhost:8181/catalog}" STORAGE_ICEBERG_CATALOG_REST_WAREHOUSE_NAME="${STORAGE_ICEBERG_CATALOG_REST_WAREHOUSE_NAME:-texera}" STORAGE_ICEBERG_CATALOG_REST_REGION="${STORAGE_ICEBERG_CATALOG_REST_REGION:-us-west-2}"