Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,11 @@
export DATABRICKS_TOKEN="dapi..."
export DATABRICKS_SERVING_BASE_URL="https://<workspace>/serving-endpoints"
export DATABRICKS_MODEL="<endpoint-name>" # See databricks-model-serving
streamlit run 2-minimal-chat-app.py
streamlit run fm-minimal-chat.py
Databricks Apps Deployment:
1. Create app.yaml:
command: ["streamlit", "run", "2-minimal-chat-app.py"]
command: ["streamlit", "run", "fm-minimal-chat.py"]
env:
- name: DATABRICKS_SERVING_BASE_URL
value: "https://<workspace>/serving-endpoints"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -224,42 +224,42 @@ def check_audience_fit(client: OpenAI, text: str) -> Dict[str, Any]:
time_saved = (total_latency / 1000) - total_time
print(f"\n{'='*60}")
print(f"Time saved vs serial execution: {time_saved:.2f}s")
print(f"Speedup: {(total_latency/1000) / total_time:.1f}×")
if total_time > 0:
print(f"Speedup: {(total_latency/1000) / total_time:.1f}×")
print(f"{'='*60}")


# =============================================================================
# Production Best Practices
# =============================================================================
"""
Best practices from databricksters-check-and-pub:

1. Configurable concurrency
- Use LLM_MAX_CONCURRENCY env var (default: 5 in the production app)
- Balance throughput vs rate limits
- Too high = rate limit errors
- Too low = underutilized resources

2. Error handling
- Capture exceptions per job
- Return None for failed jobs
- Collect error messages for debugging
- Continue execution even if some jobs fail

3. Bounded execution
- Only parallelize independent checks
- Cap concurrency with an env var rather than firing unlimited requests
- Keep the job contract simple: name -> (callable, args, kwargs)

4. When to use parallel calls
- Multiple independent evaluations of same content
- Batch processing multiple documents
- A/B testing different prompts
- Multi-aspect analysis

5. When NOT to use parallel calls
- Dependent/sequential operations
- Single evaluation needed
- Rate limits are very strict
- Debugging (use serial for easier troubleshooting)
"""
#
# Best practices from databricksters-check-and-pub:
#
# 1. Configurable concurrency
# - Use LLM_MAX_CONCURRENCY env var (default: 5 in the production app)
# - Balance throughput vs rate limits
# - Too high = rate limit errors
# - Too low = underutilized resources
#
# 2. Error handling
# - Capture exceptions per job
# - Return None for failed jobs
# - Collect error messages for debugging
# - Continue execution even if some jobs fail
#
# 3. Bounded execution
# - Only parallelize independent checks
# - Cap concurrency with an env var rather than firing unlimited requests
# - Keep the job contract simple: name -> (callable, args, kwargs)
#
# 4. When to use parallel calls
# - Multiple independent evaluations of same content
# - Batch processing multiple documents
# - A/B testing different prompts
# - Multi-aspect analysis
#
# 5. When NOT to use parallel calls
# - Dependent/sequential operations
# - Single evaluation needed
# - Rate limits are very strict
# - Debugging (use serial for easier troubleshooting)
Original file line number Diff line number Diff line change
Expand Up @@ -218,8 +218,10 @@ def get_databricks_bearer_token(
access_token = payload.get("access_token")
expires_in = int(payload.get("expires_in", 300))
if not access_token:
payload_keys = sorted(payload.keys()) if isinstance(payload, dict) else []
raise DatabricksLLMConfigError(
f"Token endpoint response is missing access_token: {payload}"
"Token endpoint response is missing access_token "
f"(keys present: {payload_keys})"
)

expires_at = int(time.time()) + expires_in
Expand Down Expand Up @@ -278,8 +280,7 @@ def validate_databricks_llm_config(
if response.status_code >= 400:
raise DatabricksLLMConfigError(
f"Failed to validate DATABRICKS_MODEL={config.model!r} in workspace "
f"{config.workspace_host} (HTTP {response.status_code}). "
f"Response: {response.text[:300]}"
f"{config.workspace_host} (HTTP {response.status_code})."
)

_validation_cache[cache_key] = int(time.time()) + VALIDATION_TTL_SECONDS
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -168,10 +168,11 @@


# Query with embedding vector directly
query_vector = [0.1, 0.2, 0.3] # Replace with your real embedding (list of floats matching the index's dimension)
results = w.vector_search_indexes.query_index(
index_name="main.default.my_index",
columns=["id", "text"],
query_vector=[0.1, 0.2, 0.3, ...], # Your embedding vector
query_vector=query_vector,
num_results=10
)

Expand Down