|
1 | 1 | import csv |
2 | 2 | from collections import defaultdict |
3 | | -from functools import cmp_to_key |
4 | 3 | from io import IOBase, StringIO |
5 | 4 | from typing import Dict, Iterable, Set |
6 | 5 |
|
@@ -143,7 +142,19 @@ def _generate_anchors( |
143 | 142 | for t in types_sorted: |
144 | 143 | type_ = cas.typesystem.get_type(t) |
145 | 144 | feature_structures = all_feature_structures_by_type[type_.name] |
146 | | - feature_structures.sort(key=cmp_to_key(lambda a, b: _compare_fs(type_, a, b))) |
| 145 | + # Sort deterministically using a key function instead of the |
| 146 | + # comparator-based approach. The hash is converted to a string so that |
| 147 | + # the tie-breaker element has a single type in lexicographic tuple comparisons. |
| 148 | + feature_structures.sort( |
| 149 | + key=lambda fs: ( |
| 150 | + 0, |
| 151 | + fs.begin, |
| 152 | + fs.end, |
| 153 | + str(_feature_structure_hash(type_, fs)), |
| 154 | + ) |
| 155 | + if _is_annotation_fs(fs) |
| 156 | + else (1, str(_feature_structure_hash(type_, fs))) |
| 157 | + ) |
147 | 158 |
|
148 | 159 | for fs in feature_structures: |
149 | 160 | add_index_mark = mark_indexed and fs in indexed_feature_structures |
@@ -190,57 +201,89 @@ def _compare_fs(type_: Type, a: FeatureStructure, b: FeatureStructure) -> int: |
190 | 201 | if a is b: |
191 | 202 | return 0 |
192 | 203 |
|
193 | | - # duck-typing check if something is a annotation - if yes, try sorting by offets |
194 | | - fs_a_is_annotation = _is_annotation_fs(a) |
195 | | - fs_b_is_annotation = _is_annotation_fs(b) |
196 | | - if fs_a_is_annotation != fs_b_is_annotation: |
197 | | - return -1 |
198 | | - if fs_a_is_annotation and fs_b_is_annotation: |
199 | | - begin_cmp = a.begin - b.begin |
200 | | - if begin_cmp != 0: |
201 | | - return begin_cmp |
202 | | - |
203 | | - begin_cmp = b.end - a.end |
204 | | - if begin_cmp != 0: |
205 | | - return begin_cmp |
206 | | - |
207 | | - # Alternative implementation |
208 | | - # Doing arithmetics on the hash value as we have done with the offsets does not work because the hashes do not |
209 | | - # provide a global order. Hence, we map all results to 0, -1 and 1 here. |
210 | | - fs_hash_a = _feature_structure_hash(type_, a) |
211 | | - fs_hash_b = _feature_structure_hash(type_, b) |
212 | | - if fs_hash_a == fs_hash_b: |
| 204 | + # Build stable sort keys for both feature structures and compare them. |
| 205 | + # The key is a tuple; tuples compare lexicographically which yields a |
| 206 | + # deterministic ordering. For annotations we sort by (0, begin, end, hash) |
| 207 | + # so that they appear before non-annotations and are ordered by offsets. |
| 208 | + # For non-annotations we use (1, hash). |
| 209 | + def _fs_sort_key(fs: FeatureStructure): |
| 210 | + if _is_annotation_fs(fs): |
| 211 | + # Use feature-derived hash as tie-breaker. Avoid xmiID because it may |
| 212 | + # differ between an original CAS and one deserialized from XMI. |
| 213 | + return (0, fs.begin, fs.end, _feature_structure_hash(type_, fs)) |
| 214 | + else: |
| 215 | + return (1, _feature_structure_hash(type_, fs)) |
| 216 | + |
| 217 | + key_a = _fs_sort_key(a) |
| 218 | + key_b = _fs_sort_key(b) |
| 219 | + |
| 220 | + if key_a == key_b: |
213 | 221 | return 0 |
214 | | - return -1 if fs_hash_a < fs_hash_b else 1 |
| 222 | + return -1 if key_a < key_b else 1 |
215 | 223 |
|
216 | 224 |
|
217 | 225 | def _feature_structure_hash(type_: Type, fs: FeatureStructure): |
218 | | - hash_ = 0 |
| 226 | + # For backward compatibility keep a function that returns a stable string |
| 227 | + # representation of the FS contents. This is used as a deterministic |
| 228 | + # tie-breaker when sorting. We avoid returning complex nested tuples to |
| 229 | + # keep comparisons simple and stable across original and deserialized CASes. |
| 230 | + def _render_val(v): |
| 231 | + if v is None: |
| 232 | + return "<NULL>" |
| 233 | + if type(v) in (int, float, bool, str): |
| 234 | + return str(v) |
| 235 | + if _is_array_fs(v): |
| 236 | + # Join element representations with ',' and wrap them in brackets |
| 237 | + return "[" + ",".join(_render_val(e) for e in (v.elements or [])) + "]" |
| 238 | + # Feature structure reference |
| 239 | + try: |
| 240 | + if _is_annotation_fs(v): |
| 241 | + return f"{v.type.name}@{v.begin}-{v.end}" |
| 242 | + else: |
| 243 | + return f"{v.type.name}" |
| 244 | + except Exception: |
| 245 | + return str(v) |
| 246 | + |
219 | 247 | if _is_array_fs(fs): |
220 | | - return len(fs.elements) if fs.elements else 0 |
| 248 | + return _render_val(fs.elements or []) |
221 | 249 |
|
222 | | - # Should be possible to get away with not sorting here assuming that all_features returns the features always in |
223 | | - # the same order |
| 250 | + parts = [] |
224 | 251 | for feature in type_.all_features: |
225 | 252 | if feature.name == FEATURE_BASE_NAME_SOFA: |
226 | 253 | continue |
227 | | - |
228 | | - feature_value = getattr(fs, feature.name) |
229 | | - |
230 | | - if _is_array_fs(feature_value): |
231 | | - if feature_value.elements is not None: |
232 | | - for element in feature_value.elements: |
233 | | - hash_ = _feature_value_hash(feature_value, hash_) |
234 | | - else: |
235 | | - hash_ = _feature_value_hash(feature_value, hash_) |
236 | | - return hash_ |
| 254 | + parts.append(_render_val(getattr(fs, feature.name))) |
| 255 | + return "|".join(parts) |
237 | 256 |
|
238 | 257 |
|
239 | 258 | def _feature_value_hash(feature_value: any, hash_: int): |
240 | | - # Note we do not recurse further into arrays here because that could lead to endless loops! |
241 | | - if type(feature_value) in (int, float, bool, str): |
242 | | - return hash_ + hash(feature_value) |
243 | | - else: |
244 | | - # If we get here, it is a feature structure reference... we cannot really recursively |
245 | | - # go into it to calculate a recursive hash... so we just check if the value is non-null |
246 | | - return hash_ * (-1 if feature_value is None else 1) |
| 259 | + # Deprecated: kept for backward compatibility. Prefer using |
| 260 | + # _feature_structure_hash which now returns a deterministic key. |
| 261 | + raise RuntimeError("_feature_value_hash is deprecated; use content-based keys") |
| 262 | + |
| 263 | + |
| 264 | +def _normalize_feature_value(value: any): |
| 265 | + """Return a stable, comparable representation for a feature value. |
| 266 | +
|
| 267 | + None becomes ("N",) and primitives are wrapped as ("P", value). Feature |
| 268 | + structure references become ("FS", type name), plus the begin/end offsets if |
| 269 | + the target is an annotation. Arrays become ("A", ...) followed by their normalized elements. |
| 270 | + """ |
| 271 | + # Use tagged tuples to guarantee consistent types and deterministic |
| 272 | + # ordering during comparisons. This avoids runtime TypeErrors when |
| 273 | + # different kinds of values (None, tuple, primitive) would otherwise |
| 274 | + # be compared directly. |
| 275 | + if value is None: |
| 276 | + return ("N",) |
| 277 | + if type(value) in (int, float, bool, str): |
| 278 | + return ("P", value) |
| 279 | + if _is_array_fs(value): |
| 280 | + return ("A",) + tuple(_normalize_feature_value(e) for e in (value.elements or [])) |
| 281 | + # Feature structure reference |
| 282 | + try: |
| 283 | + if _is_annotation_fs(value): |
| 284 | + return ("FS", value.type.name, value.begin, value.end) |
| 285 | + else: |
| 286 | + return ("FS", value.type.name) |
| 287 | + except Exception: |
| 288 | + # Fallback: string representation |
| 289 | + return ("FS", getattr(getattr(value, "type", None), "name", str(value))) |
0 commit comments