From c58d1ca069bfe5c02723a77db63bbcb986ecc4eb Mon Sep 17 00:00:00 2001 From: Val Lorentz Date: Sat, 17 Jan 2026 21:56:01 +0100 Subject: [PATCH 1/6] Update properties_to_json.py --- scripts/properties_to_json.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/scripts/properties_to_json.py b/scripts/properties_to_json.py index 0c71261..e9f45af 100644 --- a/scripts/properties_to_json.py +++ b/scripts/properties_to_json.py @@ -90,6 +90,13 @@ version not in existing_item["versions"] ), f"CodeMeta {version} has duplicated property {item}" existing_item["versions"].append(version) + + # values from newer versions of properties_description.json take precedence + # over new ones + if item["Type"]: + existing_item["Type"] = item["Type"] + if item["Description"]: + existing_item["Description"] = item["Description"] break else: # No similar item, create a new one From 0eb47647a8a1f671f3c040182d18eba1a2864117 Mon Sep 17 00:00:00 2001 From: Melissa Draper Date: Mon, 19 Jan 2026 21:42:41 -0800 Subject: [PATCH 2/6] Refactor the merging logic to work --- scripts/properties_to_json.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/scripts/properties_to_json.py b/scripts/properties_to_json.py index e9f45af..4485d54 100644 --- a/scripts/properties_to_json.py +++ b/scripts/properties_to_json.py @@ -85,19 +85,26 @@ # Look for a similar existing item from a newer CodeMeta version for existing_item in json_items: if existing_item.items() >= item.items(): - # We found an existing item, add this version to its list + # We found an identical existing item, add this version to its list assert ( version not in existing_item["versions"] ), f"CodeMeta {version} has duplicated property {item}" existing_item["versions"].append(version) - - # values from newer versions of properties_description.json take precedence - # over new ones - if item["Type"]: - existing_item["Type"] = item["Type"] - if item["Description"]: - existing_item["Description"] 
= item["Description"] + # check for existing properties that have differing types or descriptions + # values from newer versions of properties_description.json take precedence + # over new ones. + # update the versions for these here and break to avoid duplicate rows + if item["Property"] == existing_item["Property"]: + if item["Type"] != existing_item["Type"]: + item["Type"] = existing_item["Type"] + if version not in existing_item["versions"]: + existing_item["versions"].append(version) + if item["Description"] != existing_item["Description"]: + item["Description"] = existing_item["Description"] + if version not in existing_item["versions"]: + existing_item["versions"].append(version) break + else: # No similar item, create a new one item["versions"] = [version] From cb259c36e76fab4438b269a17bec07654fc3266c Mon Sep 17 00:00:00 2001 From: Melissa Draper Date: Tue, 20 Jan 2026 20:40:56 -0800 Subject: [PATCH 3/6] Further refactoring to address multiple issues: - There are multiple Identifier properties with different parents. These are now addressed independently. - A property with a change between versions for both type and description was not correctly limiting the versions. I believe this now happens correctly. --- scripts/properties_to_json.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/scripts/properties_to_json.py b/scripts/properties_to_json.py index 4485d54..2ab3b4d 100644 --- a/scripts/properties_to_json.py +++ b/scripts/properties_to_json.py @@ -58,6 +58,7 @@ import csv import json import pathlib +import re DIR = pathlib.Path(__file__).parent.parent CSV_PATH = DIR / "data/properties_description/" @@ -94,15 +95,20 @@ # values from newer versions of properties_description.json take precedence # over new ones. 
# update the versions for these here and break to avoid duplicate rows - if item["Property"] == existing_item["Property"]: - if item["Type"] != existing_item["Type"]: + if item["Property"] == existing_item["Property"] and item["Parent Type"] == existing_item["Parent Type"]: + if re.sub("\\W", "", item["Type"]).lower() != re.sub("\\W", "", existing_item["Type"]).lower(): + item["versions"] = [version] + json_items.append(item) + elif item["Type"] == existing_item["Type"]: item["Type"] = existing_item["Type"] if version not in existing_item["versions"]: existing_item["versions"].append(version) - if item["Description"] != existing_item["Description"]: + + if item["Description"] != existing_item["Description"] and item["Type"] == existing_item["Type"]: item["Description"] = existing_item["Description"] if version not in existing_item["versions"]: existing_item["versions"].append(version) + break else: From 104e3795caf9691b87d23d9d3ce091dd0a42f333 Mon Sep 17 00:00:00 2001 From: Valentin Lorentz Date: Wed, 21 Jan 2026 08:17:13 +0100 Subject: [PATCH 4/6] Deduplicate canonicalization, add comments --- scripts/properties_to_json.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/scripts/properties_to_json.py b/scripts/properties_to_json.py index 2ab3b4d..0396975 100644 --- a/scripts/properties_to_json.py +++ b/scripts/properties_to_json.py @@ -72,6 +72,10 @@ CSV_PATH.glob("*.csv"), key=lambda p: float(p.stem.lstrip("v")), reverse=True ) +def canonicalize(s): + """strips non-letters and lower-cases""" + return re.sub("\\W", "", item["Type"]).lower() + for csv_path in paths: version = csv_path.stem # header = ["Parent Type", "Property", "Type", "Description"] @@ -96,10 +100,12 @@ # over new ones. 
# update the versions for these here and break to avoid duplicate rows if item["Property"] == existing_item["Property"] and item["Parent Type"] == existing_item["Parent Type"]: - if re.sub("\\W", "", item["Type"]).lower() != re.sub("\\W", "", existing_item["Type"]).lower(): + if canonicalize(item["Type"]) != canonicalize(existing_item["Type"]): + # both types meaningfully differ item["versions"] = [version] json_items.append(item) elif item["Type"] == existing_item["Type"]: + # both types differ, but it's probably just typesetting. keep the newest one item["Type"] = existing_item["Type"] if version not in existing_item["versions"]: existing_item["versions"].append(version) From 553b7b099188235e5e6453f39f43632a6bf8679a Mon Sep 17 00:00:00 2001 From: Valentin Lorentz Date: Wed, 21 Jan 2026 08:23:01 +0100 Subject: [PATCH 5/6] Fix canonicalize() --- scripts/properties_to_json.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/properties_to_json.py b/scripts/properties_to_json.py index 0396975..3272f4a 100644 --- a/scripts/properties_to_json.py +++ b/scripts/properties_to_json.py @@ -74,7 +74,7 @@ def canonicalize(s): """strips non-letters and lower-cases""" - return re.sub("\\W", "", item["Type"]).lower() + return re.sub("\\W", "", s).lower() for csv_path in paths: version = csv_path.stem From 9f4deaf0db8097727f46692affcb73ab893aa92b Mon Sep 17 00:00:00 2001 From: Valentin Lorentz Date: Wed, 21 Jan 2026 08:24:47 +0100 Subject: [PATCH 6/6] Append version even when types have slight differences, instead of hiding one --- scripts/properties_to_json.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/properties_to_json.py b/scripts/properties_to_json.py index 3272f4a..90146cc 100644 --- a/scripts/properties_to_json.py +++ b/scripts/properties_to_json.py @@ -104,8 +104,7 @@ def canonicalize(s): # both types meaningfully differ item["versions"] = [version] json_items.append(item) - elif item["Type"] == 
existing_item["Type"]: - # both types differ, but it's probably just typesetting. keep the newest one + else: item["Type"] = existing_item["Type"] if version not in existing_item["versions"]: existing_item["versions"].append(version)