Skip to content
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -853,10 +853,10 @@ export default {
// Set the top level Schema_UUID
this.model.Schema_UUID = this.deviceSchema.uuid

// Set the top level Instance_UUID if it doesn't already exist
if (!this.model.Instance_UUID) {
this.set('Instance_UUID', uuidv4(), this.model)
}
// Set the top level Instance_UUID to the ConfigDB object UUID.
// This ensures stability across schema changes and alignment with the
// device's identity in ConfigDB.
this.set('Instance_UUID', this.device.uuid, this.model)

},

Expand Down
2 changes: 2 additions & 0 deletions acs-configdb/sql/migrate.sql
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ BEGIN;
\ir v9.sql
\ir v10.sql
\ir v11.sql
\ir v12.sql
\ir v13.sql

\ir grant.sql

Expand Down
96 changes: 96 additions & 0 deletions acs-configdb/sql/v13.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
-- Factory+ config DB
-- DB schema v13: align Sparkplug Device UUIDs with Instance_UUIDs
-- Copyright 2026 University of Sheffield AMRC

-- When Sparkplug Device objects were registered in ConfigDB their UUIDs
-- were generated independently from the Instance_UUID recorded in their
-- DeviceInformation origin map. InfluxDB historian data is already
-- tagged with the Instance_UUID, so we change the ConfigDB object UUID
-- to match the Instance_UUID (not the reverse).

call migrate_to(13, $$
    -- Re-key every Sparkplug Device object so that its ConfigDB object
    -- UUID equals the top-level Instance_UUID stored in its
    -- DeviceInformation origin map. Runs as a single anonymous block:
    --   1. abort if two devices claim the same Instance_UUID;
    --   2. for each misaligned device, rewrite references to the old
    --      UUID in config JSON and then change object.uuid;
    --   3. rebuild the Object Registration entries.
    -- Devices without an origin map, without an Instance_UUID key, or
    -- with a JSON null Instance_UUID are left untouched.
    do $body$
    declare
        -- Sparkplug Device class UUID
        sparkplug_device_class uuid
            := '18773d6d-a70d-443a-b29a-3f1583195290';
        -- DeviceInformation app UUID
        device_info_app uuid
            := 'a98ffed5-c613-4e70-bfd3-efeee250ade5';

        -- Set by the pre-flight query when a shared Instance_UUID exists.
        dup_instance_uuid uuid;
        -- Loop variables: the device's integer id, its current object
        -- UUID, and the Instance_UUID it will be re-keyed to.
        obj_id integer;
        obj_uuid uuid;
        instance_uuid uuid;
    begin
        -- Pre-flight: check for duplicate Instance_UUIDs among
        -- Sparkplug Device objects that have an origin map.
        --
        -- A JSON null Instance_UUID satisfies the `?` key test but
        -- extracts as SQL NULL; without the explicit not-null filter
        -- all such devices would fall into one NULL group and be
        -- reported as duplicates. Devices are counted by distinct id
        -- so that a device is never counted against itself should the
        -- membership join yield it more than once.
        select (c.json->'originMap'->>'Instance_UUID')::uuid
            into dup_instance_uuid
        from all_membership m
            join object app on app.uuid = device_info_app
            join config c
                on c.object = m.id
                and c.app = app.id
        where m.class = (
                select id from object
                where uuid = sparkplug_device_class)
            and c.json->'originMap' ? 'Instance_UUID'
            and c.json->'originMap'->>'Instance_UUID' is not null
        group by (c.json->'originMap'->>'Instance_UUID')::uuid
        having count(distinct m.id) > 1
        limit 1;

        if found then
            raise exception
                'Duplicate Instance_UUID % found among Sparkplug Device '
                'objects — cannot safely align UUIDs; resolve duplicates '
                'before running this migration.',
                dup_instance_uuid;
        end if;

        -- Iterate over each Sparkplug Device whose object UUID does
        -- not already match its Instance_UUID. A NULL Instance_UUID
        -- makes the `!=` comparison NULL, so those rows are skipped.
        for obj_id, obj_uuid, instance_uuid in
            select o.id, o.uuid, (c.json->'originMap'->>'Instance_UUID')::uuid
            from all_membership m
                join object o on o.id = m.id
                join object app on app.uuid = device_info_app
                join config c
                    on c.object = m.id
                    and c.app = app.id
            where m.class = (
                    select id from object
                    where uuid = sparkplug_device_class)
                and c.json->'originMap' ? 'Instance_UUID'
                and o.uuid != (c.json->'originMap'->>'Instance_UUID')::uuid
        loop
            raise notice 'Aligning Device %: object UUID % -> Instance_UUID %',
                obj_id, obj_uuid, instance_uuid;

            -- Replace all occurrences of the old object UUID with the
            -- Instance_UUID across the entire config table. This handles
            -- JSON blobs that cross-reference this device's UUID.
            -- NOTE(review): this is a case-sensitive textual match on
            -- the uuid's text form; a reference stored with different
            -- letter case would not be rewritten -- confirm none exist.
            update config
            set json = replace(
                    json::text,
                    obj_uuid::text,
                    instance_uuid::text)::jsonb
            where json::text like '%' || obj_uuid::text || '%';

            -- Update the object UUID itself. The joins above relate
            -- config and membership rows to the object by its integer
            -- id, which does not change here, so those rows stay
            -- attached to the device without further updates.
            update object
            set uuid = instance_uuid
            where id = obj_id;
        end loop;

        -- Rebuild all Object Registration config entries to reflect
        -- the updated UUIDs.
        call update_registration(null);

        raise notice 'v13 migration complete: Sparkplug Device UUIDs aligned with Instance_UUIDs.';
    end;
    $body$;
$$);
80 changes: 80 additions & 0 deletions deploy/templates/hooks/pre-upgrade-backup.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
{{- if and .Values.configdb.enabled .Values.configdb.backup.enabled }}
# Pre-upgrade backup of the ConfigDB database.
#
# Two pre-upgrade hooks, ordered by hook weight:
#   -10  PersistentVolumeClaim "configdb-backups" (backup storage)
#    -5  Job "configdb-pre-upgrade-backup" (pg_dump plus rotation)
#
# The PVC is only rendered when it does not already exist in the
# cluster. Rendering it on every upgrade would make Helm apply the
# before-hook-creation delete policy to it, deleting the claim (and
# with it the earlier dumps) before each upgrade, so the retention
# setting could never keep more than the newest backup.
{{- if not (lookup "v1" "PersistentVolumeClaim" .Release.Namespace "configdb-backups") }}
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: configdb-backups
  namespace: {{ .Release.Namespace }}
  annotations:
    "helm.sh/hook": pre-upgrade
    "helm.sh/hook-weight": "-10"
    "helm.sh/hook-delete-policy": before-hook-creation
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: {{ .Values.configdb.backup.storageSize }}

---
{{- end }}

apiVersion: batch/v1
kind: Job
metadata:
  name: configdb-pre-upgrade-backup
  namespace: {{ .Release.Namespace }}
  annotations:
    "helm.sh/hook": pre-upgrade
    "helm.sh/hook-weight": "-5"
    # The previous run's Job is removed before this one is created.
    "helm.sh/hook-delete-policy": before-hook-creation
spec:
  ttlSecondsAfterFinished: 300
  template:
    spec:
      restartPolicy: Never
      volumes:
        - name: krb5-conf
          configMap:
            name: krb5-conf
        # Only the op1pgadmin key is projected, as /keytabs/dbadmin.
        - name: krb5-keytabs-dbinit
          secret:
            secretName: krb5-keytabs
            items:
              - path: dbadmin
                key: op1pgadmin
        - name: backups
          persistentVolumeClaim:
            claimName: configdb-backups
      containers:
        - name: pg-backup
{{ include "amrc-connectivity-stack.image" (list . .Values.configdb) | indent 10 }}
          # k5start runs the shell script below using the mounted keytab.
          command: ["/usr/bin/k5start", "-Uf", "/keytabs/dbadmin"]
          args:
            - /bin/sh
            - -c
            - |
              set -e
              TIMESTAMP=$(date -u +%Y-%m-%dT%H-%M-%SZ)
              BACKUP_FILE="/backups/configdb-backup-${TIMESTAMP}.sql"
              # Dump to a .partial name and rename only on success, so a
              # failed dump never leaves a truncated file that rotation
              # would later count as a valid backup.
              rm -f /backups/configdb-backup-*.sql.partial
              echo "Running pg_dump to ${BACKUP_FILE}..."
              pg_dump -h "$PGHOST" -U "$PGUSER" -d configdb -f "${BACKUP_FILE}.partial"
              mv "${BACKUP_FILE}.partial" "$BACKUP_FILE"
              echo "Backup complete."
              # Keep the newest RETENTION dumps; delete the rest.
              RETENTION={{ .Values.configdb.backup.retention }}
              ls -t /backups/configdb-backup-*.sql | tail -n +$((RETENTION + 1)) | xargs -r rm --
              echo "Rotation complete. Current backups:"
              ls -lh /backups/
          env:
            - name: KRB5_CONFIG
              value: /config/krb5-conf/krb5.conf
            - name: PGHOST
              value: postgres.{{ .Release.Namespace }}.svc.cluster.local
            - name: PGUSER
              value: op1pgadmin
          volumeMounts:
            - mountPath: /config/krb5-conf
              name: krb5-conf
            - mountPath: /keytabs
              name: krb5-keytabs-dbinit
            - mountPath: /backups
              name: backups
  # No retries: the Job fails on the first failed attempt.
  backoffLimit: 0
{{- end }}
7 changes: 7 additions & 0 deletions deploy/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,13 @@ configdb:
# -- The repository of the Configuration Store component
repository: acs-configdb
bodyLimit: 100kb
backup:
# -- Whether to run a pg_dump of the ConfigDB before every helm upgrade
enabled: true
# -- Number of backups to retain
retention: 5
# -- Size of the PVC for storing backups
storageSize: 2Gi

files:
enabled: true
Expand Down
85 changes: 85 additions & 0 deletions docs/plans/2026-04-08-device-uuid-alignment-design.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
# Device UUID Alignment

Date: 2026-04-08
Authors: Alex Godbehere, Ben Morrow, Zebokhon Abduraimova

## Problem

The ConfigDB object UUID and the Sparkplug Instance_UUID for a device are independently generated and never equal. The Instance_UUID is a random UUIDv4 generated by the admin UI when the origin map is first saved (`acs-admin/src/components/EdgeManager/Devices/OriginMapEditor/OriginMapEditor.vue:857-858`). The ConfigDB object UUID is generated server-side when the device is created (`NewDeviceDialog.vue:105`).

This causes two problems:

1. **Instance_UUID is destroyed on schema change.** `ChangeSchemaDialog.vue:211-215` clears the origin map to null. When the user reconfigures and saves, a fresh Instance_UUID is generated. The old UUID and all InfluxDB data tagged with it become orphaned.

2. **No stable link between ConfigDB UUID and InfluxDB data.** The Sparkplug historian tags data with the Instance_UUID as the `topLevelInstance` tag (`historian-sparkplug/lib/mqttclient.ts:220-222`). The Data Access service must maintain a runtime mapping between ConfigDB UUID and Instance_UUID by reading every device's origin map. If that mapping is stale or was regenerated, historical data becomes inaccessible.

## Decision

Make the top-level Instance_UUID equal the ConfigDB object UUID. The ConfigDB UUID changes to match the Instance_UUID (not the other way around) because InfluxDB data is already tagged with the Instance_UUID and cannot be re-tagged without a batch job.

Sub-Instance_UUIDs (nested objects within the origin map) are unaffected and continue using random UUIDv4.

## Solution

Three changes, deployed together.

### 1. Admin UI fix

**File:** `acs-admin/src/components/EdgeManager/Devices/OriginMapEditor/OriginMapEditor.vue`

Change `prepareModelForSaving()` to use the device's ConfigDB object UUID as the top-level Instance_UUID instead of generating a random UUIDv4. The device UUID is already available in the component context.

This single change fixes both problems: the regen bug disappears because the Instance_UUID is always derived from the ConfigDB object UUID regardless of origin map state, and new devices are aligned from creation.

No changes to `ChangeSchemaDialog.vue` are needed. It can continue clearing the origin map on schema change.

### 2. ConfigDB SQL migration

**File:** `acs-configdb/sql/v13.sql` (wired into `migrate.sql`)

The migration runs inside the existing migration transaction and is atomic.

**Step 1 -- Pre-flight check.** Query all Sparkplug_Device objects (class `18773d6d-a70d-443a-b29a-3f1583195290`), extract the top-level Instance_UUID from each device's DeviceInformation config (`config.json->'originMap'->'Instance_UUID'`). If any two devices share the same Instance_UUID, raise an error and abort. Devices with no origin map or no Instance_UUID are skipped.

**Step 2 -- UUID replacement.** For each device where Instance_UUID differs from the object UUID:
- Global text replace across all rows in the `config` table: `replace(json::text, old_uuid::text, new_uuid::text)::jsonb`. This catches every reference to the old UUID in every config value for every app.
- Update `object.uuid` to the Instance_UUID.

**Step 3 -- Regenerate registration.** Call `update_registration(null)` to rebuild all Object Registration entries (app=6) from the updated `object` table.

**Failure and recovery.** If the migration fails (pre-flight check, constraint violation, or any other error), the transaction rolls back and the schema version stays unchanged. The `migrate_to()` procedure will re-attempt the migration on the next pod restart or `helm upgrade`. Fix the underlying issue (e.g. resolve conflicting Instance_UUIDs), then upgrade again.

### 3. Helm pre-upgrade backup hook

**Files:**
- `deploy/templates/hooks/pre-upgrade-backup.yaml` (backup PVC and Job)
- `deploy/values.yaml` (new config entries)

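A Helm Job annotated with `helm.sh/hook: pre-upgrade` and `helm.sh/hook-weight: "-5"` runs before any chart resources are updated. The `configdb-backups` PVC it writes to is created by a separate pre-upgrade hook with `helm.sh/hook-weight: "-10"`, so the PVC is created before the Job starts.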

The Job:
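- Uses the image configured under `configdb` in `values.yaml` (resolved through the chart's `amrc-connectivity-stack.image` helper) and runs the backup script under `k5start` with the `op1pgadmin` keytab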
- Connects to `postgres.{namespace}.svc.cluster.local` using operator credentials
- Runs `pg_dump` of the configdb database to a timestamped file on a `configdb-backups` PVC
- Rotates backups to keep the last 5
- Exits non-zero on failure, which aborts the entire Helm upgrade

New `values.yaml` defaults:
- `configdb.backup.enabled: true`
- `configdb.backup.retention: 5`
- `configdb.backup.storageSize: 2Gi`

Hook delete policy: `before-hook-creation` (previous hook Job is cleaned up before a new one is created).

## What this doesn't change

- **InfluxDB data** -- existing `topLevelInstance` tags remain valid because the ConfigDB UUID moves to match the Instance_UUID, not the other way around.
- **Sub-Instance_UUIDs** -- only the top-level Instance_UUID is aligned to the ConfigDB object UUID.
- **The ingester** -- no changes to the historian or how it tags data.
- **Other services** -- the integer-based FK relationships in the ConfigDB schema are unaffected. Only `object.uuid` and JSON config values change.

## Risks

- **Conflicting Instance_UUIDs.** If two devices share an Instance_UUID (a bug, but possible), the migration pre-flight check will catch it and abort. Requires manual resolution before re-running.
- **External references to old device UUIDs.** Anything outside the ConfigDB database that references a device by its old ConfigDB UUID (e.g. Auth ACL entries, external scripts, bookmarks) will break. Ben notes he is not aware of anything that currently references Device UUIDs outside ConfigDB, but this should be verified before deployment.
- **Large config tables.** The global text replace iterates all rows in the `config` table for each device being fixed. On a large deployment this could be slow, but it runs within the migration transaction during startup so there is no concurrent access concern.
Loading
Loading