Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
115 changes: 78 additions & 37 deletions src/pyob/core_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,50 @@ def stream_ollama(self, prompt: str, on_chunk) -> str:
logger.error(f"Ollama Error: {e}")
return response_text

def stream_github_models(self, prompt: str, on_chunk) -> str:
    """Fallback to GitHub Models API (Phi-4).

    Streams a chat completion from the GitHub Models endpoint (served via
    Azure AI Inference), accumulating the generated text and invoking
    ``on_chunk()`` once per received content delta.

    Args:
        prompt: User message, sent as a single-turn chat.
        on_chunk: Zero-argument callback fired for each non-empty content
            delta (callers use it to detect first-chunk arrival).

    Returns:
        The full generated text on success, or an ``ERROR_CODE_*`` string
        on failure (missing token, non-200 HTTP status, or a transport
        exception) — matching the error convention of the other
        ``stream_*`` methods in this class.
    """
    token = os.environ.get("GITHUB_TOKEN")
    if not token:
        return "ERROR_CODE_GITHUB_TOKEN_MISSING"

    # GitHub Models use the Azure AI Inference endpoint
    endpoint = "https://models.inference.ai.azure.com/chat/completions"
    headers = {
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json",
    }
    data = {
        "messages": [{"role": "user", "content": prompt}],
        "model": "Phi-4",
        "stream": True,
        "temperature": 0.1,
    }

    full_text = ""
    try:
        response = requests.post(
            endpoint, headers=headers, json=data, stream=True, timeout=120
        )
        if response.status_code != 200:
            return f"ERROR_CODE_{response.status_code}"

        for line in response.iter_lines():
            if not line:
                # SSE keep-alive / frame separator — skip.
                continue
            # SSE frames look like "data: {json}". Strip only the LEADING
            # prefix: a blanket .replace() would corrupt any generated
            # text that itself contains the substring "data: ".
            decoded_line = line.decode("utf-8").removeprefix("data: ")
            if decoded_line == "[DONE]":
                break
            try:
                chunk = json.loads(decoded_line)
                content = chunk["choices"][0]["delta"].get("content", "")
            except (json.JSONDecodeError, KeyError, IndexError):
                # Malformed or non-delta frame (e.g. role header, usage
                # stats): skip it without aborting the whole stream.
                continue
            if content:
                full_text += content
                on_chunk()
        return full_text
    except Exception as e:
        # Transport-level failure (DNS, timeout, connection reset):
        # surface as an error-code string rather than raising, so the
        # caller's fallback chain keeps working.
        return f"ERROR_CODE_EXCEPTION: {e}"

def _stream_single_llm(
self, prompt: str, key: str | None = None, context: str = ""
) -> str:
Expand Down Expand Up @@ -385,10 +429,12 @@ def on_chunk():
response_text = self.stream_gemini(prompt, key, on_chunk)
else:
if os.environ.get("GITHUB_ACTIONS") == "true":
first_chunk_received[0] = True
return "ERROR_CODE_CLOUD_OLLAMA_BLOCKED"

response_text = self.stream_ollama(prompt, on_chunk)
logger.info(
"☁️ Gemini limited. Pivoting to GitHub Models (Phi-4)..."
)
response_text = self.stream_github_models(prompt, on_chunk)
else:
response_text = self.stream_ollama(prompt, on_chunk)
except Exception as e:
first_chunk_received[0] = True
return f"ERROR_CODE_EXCEPTION: {e}"
Expand Down Expand Up @@ -418,54 +464,48 @@ def get_valid_llm_response(self, prompt: str, validator, context: str = "") -> s
available_keys = [
k for k, cooldown in self.key_cooldowns.items() if now > cooldown
]

if not available_keys:
if is_cloud:
wait_times = [
cooldown - now for cooldown in self.key_cooldowns.values()
]
sleep_duration = max(
10, min(min(wait_times) if wait_times else 120, 1200)
)
logger.warning(
f"⏳ CLOUD NOTICE: All keys rate-limited. Retrying Gemini in {int(sleep_duration)}s..."
)
time.sleep(sleep_duration)
continue

if not use_ollama:
logger.warning(
"🚫 All Gemini keys rate-limited. Falling back to Local Ollama."
)
use_ollama = True
# In the cloud, we don't 'use_ollama', we just try the GitHub Models fallback
# which is handled inside _stream_single_llm(key=None)
use_ollama = False
else:
if not use_ollama:
logger.warning(
"🚫 Gemini keys limited. Falling back to Local Ollama."
)
use_ollama = True
else:
use_ollama = False
key = available_keys[attempts % len(available_keys)]
logger.info(
f"Attempting Gemini API Key {attempts % len(available_keys) + 1}/{len(available_keys)}"
)

if use_ollama:
logger.info("Using Local Ollama Engine...")

response_text = self._stream_single_llm(prompt, key=key, context=context)

if is_cloud and (
response_text.startswith("ERROR_CODE_") or not response_text.strip()
):
if "429" in response_text and key:
# Handle errors/rate-limits
if response_text.startswith("ERROR_CODE_429"):
if key:
self.key_cooldowns[key] = time.time() + 1200
logger.warning("⚠️ Key rate-limited. Rotating...")
else:
logger.warning(
"⚠️ Gemini error/empty response. Sleeping 10s before retry..."
)
time.sleep(10)
attempts += 1
continue

if response_text.startswith("ERROR_CODE_429"):
if key:
self.key_cooldowns[key] = time.time() + 1200
# If Gemini fails/returns empty in the cloud, perform Smart Sleep
if is_cloud and (
response_text.startswith("ERROR_CODE_") or not response_text.strip()
):
wait_times = [
cooldown - now for cooldown in self.key_cooldowns.values()
]
sleep_duration = max(
10, min(min(wait_times) if wait_times else 60, 600)
)
logger.warning(
f"⏳ Cloud limit reached. Resuming in {int(sleep_duration)}s..."
)
time.sleep(sleep_duration)
attempts += 1
continue

Expand All @@ -474,6 +514,7 @@ def get_valid_llm_response(self, prompt: str, validator, context: str = "") -> s
continue

if validator(response_text):
# --- SUCCESS BREATHER ---
if is_cloud:
time.sleep(2)
return response_text
Expand Down