From 2a4bc9db3532a615dbaf04dc6329d9456e22120b Mon Sep 17 00:00:00 2001
From: "promptless[bot]" <promptless[bot]@users.noreply.github.com>
Date: Thu, 7 May 2026 21:00:31 +0000
Subject: [PATCH 1/3] Document --model-reference flag for serverless create

Add documentation for the new --model-reference flag that allows
attaching models when creating serverless endpoints from templates.
---
 runpodctl/reference/runpodctl-serverless.mdx | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/runpodctl/reference/runpodctl-serverless.mdx b/runpodctl/reference/runpodctl-serverless.mdx
index a09b2440..fe0cfcce 100644
--- a/runpodctl/reference/runpodctl-serverless.mdx
+++ b/runpodctl/reference/runpodctl-serverless.mdx
@@ -65,6 +65,10 @@ Create a new Serverless endpoint from a template or from a Hub repo:
 # Create from a template
 runpodctl serverless create --name "my-endpoint" --template-id "tpl_abc123"
 
+# Create from a template with model references
+runpodctl serverless create --template-id "tpl_abc123" --gpu-id ADA_24 \
+  --model-reference https://example.com/models/llama:v1
+
 # Create from a Hub repo
 runpodctl hub search vllm                                         # Find the hub ID
 runpodctl serverless create --hub-id cm8h09d9n000008jvh2rqdsmb --name "my-vllm"
@@ -159,6 +163,10 @@ Execution timeout in seconds. Jobs that exceed this duration are terminated. The
 Environment variable in `KEY=VALUE` format. Use multiple `--env` flags to set multiple variables. When deploying from `--hub-id`, these values override the Hub release defaults.
 </ResponseField>
 
+<ResponseField name="--model-reference" type="string">
+Model reference URL to attach to the endpoint. Use multiple `--model-reference` flags to attach multiple models. Only supported with `--template-id` (not `--hub-id`), and requires the GPU compute type (`--compute-type GPU`).
+</ResponseField>
+
 ### Update an endpoint
 
 Update endpoint configuration:

From 412394c9b7c676f3c8065287c0109c69097a4148 Mon Sep 17 00:00:00 2001
From: "promptless[bot]" <promptless[bot]@users.noreply.github.com>
Date: Wed, 10 Jun 2026 13:01:50 +0000
Subject: [PATCH 2/3] Add CPU endpoint and unified GraphQL flags documentation

Updates runpodctl serverless create documentation based on PR #284,
which unifies all create paths onto the GraphQL saveEndpoint mutation:

- Add --instance-id flag for CPU endpoints (default: cpu3g-4-16)
- Add CPU endpoint example to examples section
- Update --gpu-id to document both type IDs and pool IDs (auto-translation)
- Update --name to document auto-generation when omitted
- Fix flag names: --scale-by/--scale-threshold (not --scaler-type/--scaler-value)
- Fix --idle-timeout range: 1-3600 (not 5-3600)
- Document --network-volume-id and --network-volume-ids mutual exclusivity
---
 runpodctl/reference/runpodctl-serverless.mdx | 35 ++++++++++++--------
 1 file changed, 21 insertions(+), 14 deletions(-)

diff --git a/runpodctl/reference/runpodctl-serverless.mdx b/runpodctl/reference/runpodctl-serverless.mdx
index fe0cfcce..a91560e3 100644
--- a/runpodctl/reference/runpodctl-serverless.mdx
+++ b/runpodctl/reference/runpodctl-serverless.mdx
@@ -63,11 +63,14 @@ Create a new Serverless endpoint from a template or from a Hub repo:
 
 ```bash
 # Create from a template
-runpodctl serverless create --name "my-endpoint" --template-id "tpl_abc123"
+runpodctl serverless create --template-id "tpl_abc123" --gpu-id "NVIDIA GeForce RTX 4090"
 
-# Create from a template with model references
-runpodctl serverless create --template-id "tpl_abc123" --gpu-id ADA_24 \
-  --model-reference https://example.com/models/llama:v1
+# Create from a template with a model reference
+runpodctl serverless create --template-id "tpl_abc123" --gpu-id "NVIDIA GeForce RTX 4090" \
+  --model-reference https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct:main
+
+# Create a CPU endpoint
+runpodctl serverless create --template-id "tpl_abc123" --compute-type CPU
 
 # Create from a Hub repo
 runpodctl hub search vllm                                         # Find the hub ID
@@ -92,7 +95,7 @@ Each Serverless template can only be bound to one endpoint at a time. To create
 #### Create flags
 
 <ResponseField name="--name" type="string">
-Name for the endpoint.
+Name for the endpoint. If omitted, a name is auto-generated in the format `endpoint-XXXXXXXX`.
 </ResponseField>
 
 <ResponseField name="--template-id" type="string">
@@ -104,7 +107,7 @@ Hub listing ID to deploy from (alternative to `--template-id`). Use [`runpodctl
 </ResponseField>
 
 <ResponseField name="--gpu-id" type="string">
-GPU type for workers. Use [`runpodctl gpu list`](/runpodctl/reference/runpodctl-gpu) to see available GPUs.
+GPU type for workers. Accepts either a GPU type ID (e.g., `NVIDIA A40`, `NVIDIA GeForce RTX 4090`) or a GPU pool ID (e.g., `ADA_24`, `AMPERE_48`). Use [`runpodctl gpu list`](/runpodctl/reference/runpodctl-gpu) to see available GPUs.
 </ResponseField>
 
 <ResponseField name="--gpu-count" type="int" default="1">
@@ -112,7 +115,11 @@ Number of GPUs per worker.
 </ResponseField>
 
 <ResponseField name="--compute-type" type="string" default="GPU">
-Compute type (`GPU` or `CPU`).
+Compute type (`GPU` or `CPU`). For CPU endpoints, use `--instance-id` to specify the CPU instance type.
+</ResponseField>
+
+<ResponseField name="--instance-id" type="string" default="cpu3g-4-16">
+CPU instance ID for workers. Only valid with `--compute-type CPU`; if omitted, defaults to `cpu3g-4-16`.
 </ResponseField>
 
 <ResponseField name="--workers-min" type="int" default="0">
@@ -128,27 +135,27 @@ Comma-separated list of preferred datacenter IDs. Use [`runpodctl datacenter lis
 </ResponseField>
 
 <ResponseField name="--network-volume-id" type="string">
-Network volume ID to attach. Use [`runpodctl network-volume list`](/runpodctl/reference/runpodctl-network-volume) to see available network volumes.
+Network volume ID to attach for single-region deployments. Use [`runpodctl network-volume list`](/runpodctl/reference/runpodctl-network-volume) to see available network volumes. Mutually exclusive with `--network-volume-ids`.
 </ResponseField>
 
 <ResponseField name="--network-volume-ids" type="string">
-Comma-separated list of network volume IDs to attach. Use this when attaching multiple network volumes to an endpoint.
+Comma-separated list of network volume IDs to attach for multi-region deployments. Mutually exclusive with `--network-volume-id`.
 </ResponseField>
 
 <ResponseField name="--min-cuda-version" type="string">
 Minimum CUDA version required for workers (e.g., `12.4`). Workers will only be scheduled on machines that meet this CUDA version requirement.
 </ResponseField>
 
-<ResponseField name="--scaler-type" type="string" default="QUEUE_DELAY">
-Autoscaler type (`QUEUE_DELAY` or `REQUEST_COUNT`). `QUEUE_DELAY` scales based on queue wait time; `REQUEST_COUNT` scales based on concurrent requests.
+<ResponseField name="--scale-by" type="string">
+Autoscaling strategy: `delay` (scales based on queue wait time in seconds) or `requests` (scales based on pending request count).
 </ResponseField>
 
-<ResponseField name="--scaler-value" type="int">
-Scaler threshold value. For `QUEUE_DELAY`, this is the target delay in seconds. For `REQUEST_COUNT`, this is the number of concurrent requests per worker before scaling.
+<ResponseField name="--scale-threshold" type="int">
+Trigger point for the autoscaler. For `delay`, this is the target queue wait time in seconds. For `requests`, this is the pending request count that triggers scaling.
 </ResponseField>
 
 <ResponseField name="--idle-timeout" type="int">
-Idle timeout in seconds. Workers shut down after being idle for this duration. Valid range: 5-3600 seconds.
+Idle timeout in seconds. Workers shut down after being idle for this duration. Valid range: 1-3600 seconds.
 </ResponseField>
 
 <ResponseField name="--flash-boot" type="bool">

From b99b2db81bb4e26ca260a10b88da5fcdd398245f Mon Sep 17 00:00:00 2001
From: "promptless[bot]" <promptless[bot]@users.noreply.github.com>
Date: Tue, 16 Jun 2026 14:58:49 +0000
Subject: [PATCH 3/3] Document --env template-id behavior and --name minimum
 length for serverless create

---
 runpodctl/reference/runpodctl-serverless.mdx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/runpodctl/reference/runpodctl-serverless.mdx b/runpodctl/reference/runpodctl-serverless.mdx
index a91560e3..a3e82b12 100644
--- a/runpodctl/reference/runpodctl-serverless.mdx
+++ b/runpodctl/reference/runpodctl-serverless.mdx
@@ -95,7 +95,7 @@ Each Serverless template can only be bound to one endpoint at a time. To create
 #### Create flags
 
 <ResponseField name="--name" type="string">
-Name for the endpoint. If omitted, a name is auto-generated in the format `endpoint-XXXXXXXX`.
+Name for the endpoint. Must be at least 3 characters. If omitted, a name is auto-generated in the format `endpoint-XXXXXXXX`.
 </ResponseField>
 
 <ResponseField name="--template-id" type="string">
@@ -167,7 +167,7 @@ Execution timeout in seconds. Jobs that exceed this duration are terminated. The
 </ResponseField>
 
 <ResponseField name="--env" type="string">
-Environment variable in `KEY=VALUE` format. Use multiple `--env` flags to set multiple variables. When deploying from `--hub-id`, these values override the Hub release defaults.
+Environment variable in `KEY=VALUE` format. Use multiple `--env` flags to set multiple variables. These values only apply when deploying from `--hub-id`, where they override the Hub release defaults. With `--template-id`, environment variables come from the template, so `--env` is ignored and the CLI prints a note to that effect.
 </ResponseField>
 
 <ResponseField name="--model-reference" type="string">