Skip to content

Commit 121e10f

Browse files
committed
#336 - Introduce Annotation class
- Introduced Annotation class with default begin/end 0
- Updated reference data accordingly
- Fixed several issues with sorting/ordering annotations
1 parent 9f138a7 commit 121e10f

9 files changed

Lines changed: 236 additions & 69 deletions

cassis/cas.py

Lines changed: 32 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
TYPE_NAME_FS_LIST,
2020
TYPE_NAME_SOFA,
2121
FeatureStructure,
22+
Annotation,
2223
Type,
2324
TypeCheckError,
2425
TypeSystem,
@@ -171,9 +172,15 @@ def type_index(self) -> Dict[str, SortedKeyList]:
171172
return self._indices
172173

173174
def add_annotation_to_index(self, annotation: FeatureStructure):
175+
"""Adds a feature structure to the type index for this view.
176+
177+
The index accepts both annotation-like FS (with begin/end) and
178+
arbitrary feature structures. Sorting is performed by the shared
179+
`_sort_func` which duck-types annotation instances.
180+
"""
174181
self._indices[annotation.type.name].add(annotation)
175182

176-
def get_all_annotations(self) -> List[FeatureStructure]:
183+
def get_all_annotations(self) -> List[Annotation]:
177184
"""Gets all the annotations in this view.
178185
179186
Returns:
@@ -334,6 +341,8 @@ def add(self, annotation: FeatureStructure, keep_id: Optional[bool] = True):
334341
if hasattr(annotation, "sofa"):
335342
annotation.sofa = self.get_sofa()
336343

344+
# Add to the index. The view index accepts any FeatureStructure;
345+
# `_sort_func` will duck-type annotation-like objects when sorting.
337346
self._current_view.add_annotation_to_index(annotation)
338347

339348
@deprecation.deprecated(details="Use add()")
@@ -387,7 +396,7 @@ def remove_annotation(self, annotation: FeatureStructure):
387396
self.remove(annotation)
388397

389398
@deprecation.deprecated(details="Use annotation.get_covered_text()")
390-
def get_covered_text(self, annotation: FeatureStructure) -> str:
399+
def get_covered_text(self, annotation: Annotation) -> str:
391400
"""Gets the text that is covered by `annotation`.
392401
393402
Args:
@@ -413,7 +422,7 @@ def select(self, type_: Union[Type, str]) -> List[FeatureStructure]:
413422
t = type_ if isinstance(type_, Type) else self.typesystem.get_type(type_)
414423
return self._get_feature_structures(t)
415424

416-
def select_covered(self, type_: Union[Type, str], covering_annotation: FeatureStructure) -> List[FeatureStructure]:
425+
def select_covered(self, type_: Union[Type, str], covering_annotation: Annotation) -> List[FeatureStructure]:
417426
"""Returns a list of covered annotations.
418427
419428
Return all annotations that are covered
@@ -439,7 +448,7 @@ def select_covered(self, type_: Union[Type, str], covering_annotation: FeatureSt
439448
result.append(annotation)
440449
return result
441450

442-
def select_covering(self, type_: Union[Type, str], covered_annotation: FeatureStructure) -> List[FeatureStructure]:
451+
def select_covering(self, type_: Union[Type, str], covered_annotation: Annotation) -> List[FeatureStructure]:
443452
"""Returns a list of annotations that cover the given annotation.
444453
445454
Return all annotations that are covering. This can potentially be slow.
@@ -465,7 +474,7 @@ def select_covering(self, type_: Union[Type, str], covered_annotation: FeatureSt
465474
if c_begin >= annotation.begin and c_end <= annotation.end:
466475
yield annotation
467476

468-
def select_all(self) -> List[FeatureStructure]:
477+
def select_all(self) -> List[Annotation]:
469478
"""Finds all feature structures in this Cas
470479
471480
Returns:
@@ -834,8 +843,21 @@ def _copy(self) -> "Cas":
834843

835844

836845
def _sort_func(a: FeatureStructure) -> Tuple[int, int, int]:
837-
d = a.__slots__
838-
if "begin" in d and "end" in d:
839-
return a.begin, a.end, id(a)
840-
else:
841-
return sys.maxsize, sys.maxsize, id(a)
846+
# Some runtime-generated feature structure classes do not expose their
847+
# attribute names through __slots__ in a consistent way. Detect annotation
848+
# instances by duck-typing: presence of integer `begin` and `end` attributes.
849+
try:
850+
has_begin = hasattr(a, "begin") and isinstance(getattr(a, "begin"), int)
851+
has_end = hasattr(a, "end") and isinstance(getattr(a, "end"), int)
852+
except Exception:
853+
has_begin = False
854+
has_end = False
855+
856+
if has_begin and has_end:
857+
# Use xmiID as the final tiebreaker to ensure deterministic ordering.
858+
# Previously id(a) (memory address) was used which can vary between runs
859+
# and lead to non-deterministic ordering in tests.
860+
return a.begin, a.end, a.xmiID if getattr(a, "xmiID", None) is not None else id(a)
861+
862+
# Non-annotation feature structures are sorted after annotations using large sentinels
863+
return sys.maxsize, sys.maxsize, a.xmiID if getattr(a, "xmiID", None) is not None else id(a)

cassis/typesystem.py

Lines changed: 46 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -500,6 +500,19 @@ def __repr__(self):
500500
return str(self)
501501

502502

503+
@attr.s(slots=True)
504+
class Annotation(FeatureStructure):
505+
"""Concrete base class for annotation instances.
506+
507+
Generated types that represent (subtypes of) `uima.tcas.Annotation` will
508+
inherit from this class so that static typing can rely on a nominal base
509+
providing `begin` and `end`.
510+
"""
511+
512+
begin: int = attr.ib(default=0)
513+
end: int = attr.ib(default=0)
514+
515+
503516
@attr.s(slots=True, eq=False, order=False, repr=False)
504517
class Feature:
505518
"""A feature defines one attribute of a feature structure"""
@@ -572,15 +585,44 @@ class Type:
572585
def __attrs_post_init__(self):
573586
"""Build the constructor that can create feature structures of this type"""
574587
name = _string_to_valid_classname(self.name)
575-
fields = {feature.name: attr.ib(default=None, repr=(feature.name != "sofa")) for feature in self.all_features}
588+
589+
# Determine whether this type is (transitively) a subtype of uima.tcas.Annotation
590+
def _is_annotation_type(t: "Type") -> bool:
591+
cur = t
592+
while cur is not None:
593+
if cur.name == TYPE_NAME_ANNOTATION:
594+
return True
595+
cur = cur.supertype
596+
return False
597+
598+
# When inheriting from our concrete Annotation base, do not redeclare
599+
# the 'begin' and 'end' features as fields; they are already present.
600+
fields = {}
601+
for feature in self.all_features:
602+
if feature.name in {"begin", "end"} and _is_annotation_type(self):
603+
# skip - Annotation base provides these
604+
continue
605+
fields[feature.name] = attr.ib(default=None, repr=(feature.name != "sofa"))
576606
fields["type"] = attr.ib(default=self)
577607

578608
# We assign this to a lambda to make it lazy
579609
# When creating large type systems, almost no types are used so
580610
# creating them on the fly is on average better
581-
self._constructor_fn = lambda: attr.make_class(
582-
name, fields, bases=(FeatureStructure,), slots=True, eq=False, order=False
583-
)
611+
bases = (Annotation,) if _is_annotation_type(self) else (FeatureStructure,)
612+
613+
def _make_fs_class():
614+
cls = attr.make_class(name, fields, bases=bases, slots=True, eq=False, order=False)
615+
# Ensure generated FS classes are hashable. When a class defines an
616+
# __eq__ (inherited or generated) but no __hash__, Python makes
617+
# instances unhashable. We want FeatureStructure-based instances to
618+
# be usable as dict/set keys (they are keyed by xmiID), so assign the
619+
# base FeatureStructure.__hash__ implementation to the generated
620+
# class if it doesn't already provide one.
621+
if getattr(cls, "__hash__", None) is None:
622+
cls.__hash__ = FeatureStructure.__hash__
623+
return cls
624+
625+
self._constructor_fn = _make_fs_class
584626

585627
def __call__(self, **kwargs) -> FeatureStructure:
586628
"""Creates a feature structure of this type

cassis/util.py

Lines changed: 87 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
import csv
22
from collections import defaultdict
3-
from functools import cmp_to_key
43
from io import IOBase, StringIO
54
from typing import Dict, Iterable, Set
65

@@ -143,7 +142,19 @@ def _generate_anchors(
143142
for t in types_sorted:
144143
type_ = cas.typesystem.get_type(t)
145144
feature_structures = all_feature_structures_by_type[type_.name]
146-
feature_structures.sort(key=cmp_to_key(lambda a, b: _compare_fs(type_, a, b)))
145+
# Sort deterministically using a stable key function. We avoid using
146+
# the comparator-based approach to prevent unpredictable comparisons
147+
# between mixed types during lexicographic tuple comparisons.
148+
feature_structures.sort(
149+
key=lambda fs: (
150+
0,
151+
fs.begin,
152+
fs.end,
153+
str(_feature_structure_hash(type_, fs)),
154+
)
155+
if _is_annotation_fs(fs)
156+
else (1, str(_feature_structure_hash(type_, fs)))
157+
)
147158

148159
for fs in feature_structures:
149160
add_index_mark = mark_indexed and fs in indexed_feature_structures
@@ -190,57 +201,89 @@ def _compare_fs(type_: Type, a: FeatureStructure, b: FeatureStructure) -> int:
190201
if a is b:
191202
return 0
192203

193-
# duck-typing check if something is a annotation - if yes, try sorting by offets
194-
fs_a_is_annotation = _is_annotation_fs(a)
195-
fs_b_is_annotation = _is_annotation_fs(b)
196-
if fs_a_is_annotation != fs_b_is_annotation:
197-
return -1
198-
if fs_a_is_annotation and fs_b_is_annotation:
199-
begin_cmp = a.begin - b.begin
200-
if begin_cmp != 0:
201-
return begin_cmp
202-
203-
begin_cmp = b.end - a.end
204-
if begin_cmp != 0:
205-
return begin_cmp
206-
207-
# Alternative implementation
208-
# Doing arithmetics on the hash value as we have done with the offsets does not work because the hashes do not
209-
# provide a global order. Hence, we map all results to 0, -1 and 1 here.
210-
fs_hash_a = _feature_structure_hash(type_, a)
211-
fs_hash_b = _feature_structure_hash(type_, b)
212-
if fs_hash_a == fs_hash_b:
204+
# Build stable sort keys for both feature structures and compare them.
205+
# The key is a tuple; tuples compare lexicographically which yields a
206+
# deterministic ordering. For annotations we sort by (0, begin, end, hash)
207+
# so that they appear before non-annotations and are ordered by offsets.
208+
# For non-annotations we use (1, hash).
209+
def _fs_sort_key(fs: FeatureStructure):
210+
if _is_annotation_fs(fs):
211+
# Use feature-derived hash as tie-breaker. Avoid xmiID because it may
212+
# differ between an original CAS and one deserialized from XMI.
213+
return (0, fs.begin, fs.end, _feature_structure_hash(type_, fs))
214+
else:
215+
return (1, _feature_structure_hash(type_, fs))
216+
217+
key_a = _fs_sort_key(a)
218+
key_b = _fs_sort_key(b)
219+
220+
if key_a == key_b:
213221
return 0
214-
return -1 if fs_hash_a < fs_hash_b else 1
222+
return -1 if key_a < key_b else 1
215223

216224

217225
def _feature_structure_hash(type_: Type, fs: FeatureStructure):
218-
hash_ = 0
226+
# For backward compatibility keep a function that returns a stable string
227+
# representation of the FS contents. This is used as a deterministic
228+
# tie-breaker when sorting. We avoid returning complex nested tuples to
229+
# keep comparisons simple and stable across original and deserialized CASes.
230+
def _render_val(v):
231+
if v is None:
232+
return "<NULL>"
233+
if type(v) in (int, float, bool, str):
234+
return str(v)
235+
if _is_array_fs(v):
236+
# Join element representations with ',' and wrap them in square brackets
237+
return "[" + ",".join(_render_val(e) for e in (v.elements or [])) + "]"
238+
# Feature structure reference
239+
try:
240+
if _is_annotation_fs(v):
241+
return f"{v.type.name}@{v.begin}-{v.end}"
242+
else:
243+
return f"{v.type.name}"
244+
except Exception:
245+
return str(v)
246+
219247
if _is_array_fs(fs):
220-
return len(fs.elements) if fs.elements else 0
248+
return _render_val(fs.elements or [])
221249

222-
# Should be possible to get away with not sorting here assuming that all_features returns the features always in
223-
# the same order
250+
parts = []
224251
for feature in type_.all_features:
225252
if feature.name == FEATURE_BASE_NAME_SOFA:
226253
continue
227-
228-
feature_value = getattr(fs, feature.name)
229-
230-
if _is_array_fs(feature_value):
231-
if feature_value.elements is not None:
232-
for element in feature_value.elements:
233-
hash_ = _feature_value_hash(feature_value, hash_)
234-
else:
235-
hash_ = _feature_value_hash(feature_value, hash_)
236-
return hash_
254+
parts.append(_render_val(getattr(fs, feature.name)))
255+
return "|".join(parts)
237256

238257

239258
def _feature_value_hash(feature_value: any, hash_: int):
240-
# Note we do not recurse further into arrays here because that could lead to endless loops!
241-
if type(feature_value) in (int, float, bool, str):
242-
return hash_ + hash(feature_value)
243-
else:
244-
# If we get here, it is a feature structure reference... we cannot really recursively
245-
# go into it to calculate a recursive hash... so we just check if the value is non-null
246-
return hash_ * (-1 if feature_value is None else 1)
259+
# Deprecated: only the name is retained; calling it now always raises. Prefer using
260+
# _feature_structure_hash which now returns a deterministic key.
261+
raise RuntimeError("_feature_value_hash is deprecated; use content-based keys")
262+
263+
264+
def _normalize_feature_value(value: any):
265+
"""Return a stable, comparable representation for a feature value.
266+
267+
None becomes ("N",) and primitives become ("P", value). Feature structure references are normalized
268+
to a tuple containing the referenced type name and offsets if the target
269+
is an annotation. Arrays are represented as tuples of normalized elements.
270+
"""
271+
# Use tagged tuples to guarantee consistent types and deterministic
272+
# ordering during comparisons. This avoids runtime TypeErrors when
273+
# different kinds of values (None, tuple, primitive) would otherwise
274+
# be compared directly.
275+
if value is None:
276+
return ("N",)
277+
if type(value) in (int, float, bool, str):
278+
return ("P", value)
279+
if _is_array_fs(value):
280+
return ("A",) + tuple(_normalize_feature_value(e) for e in (value.elements or []))
281+
# Feature structure reference
282+
try:
283+
if _is_annotation_fs(value):
284+
return ("FS", value.type.name, value.begin, value.end)
285+
else:
286+
return ("FS", value.type.name)
287+
except Exception:
288+
# Fallback: string representation
289+
return ("FS", getattr(getattr(value, "type", None), "name", str(value)))

cassis/xmi.py

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -619,13 +619,23 @@ def _serialize_feature_structure(self, cas: Cas, root: etree.Element, fs: Featur
619619
continue
620620

621621
# Map back from offsets in Unicode codepoints to UIMA UTF-16 based offsets
622-
if (
623-
ts.is_instance_of(fs.type.name, TYPE_NAME_ANNOTATION)
624-
and feature_name == FEATURE_BASE_NAME_BEGIN
625-
or feature_name == FEATURE_BASE_NAME_END
622+
# Ensure we only convert begin/end for annotation instances. Parentheses are
623+
# required because `and` has higher precedence than `or` and we must not
624+
# attempt conversion for the END feature on non-annotations.
625+
if ts.is_instance_of(fs.type.name, TYPE_NAME_ANNOTATION) and (
626+
feature_name == FEATURE_BASE_NAME_BEGIN or feature_name == FEATURE_BASE_NAME_END
626627
):
627-
sofa: Sofa = fs.sofa
628-
value = sofa._offset_converter.python_to_external(value)
628+
# Be defensive: only perform offset conversion if the sofa and its
629+
# offset converter have been initialized. In some workflows (e.g. a
630+
# freshly constructed CAS without sofa strings) the converter may
631+
# not exist yet and conversion is not possible.
632+
sofa = getattr(fs, "sofa", None)
633+
if sofa is not None and getattr(sofa, "_offset_converter", None) is not None:
634+
value = sofa._offset_converter.python_to_external(value)
635+
636+
# If the offset is the default 0, still emit it. We do not track
637+
# original attribute presence; test fixtures should reflect the
638+
# desired serialized form.
629639

630640
if ts.is_instance_of(feature.rangeType, TYPE_NAME_STRING_ARRAY) and not feature.multipleReferencesAllowed:
631641
if value.elements is not None: # Compare to none as not to skip if elements is empty!

tests/test_cas.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -597,10 +597,43 @@ def test_covered_text_on_non_annotation():
597597
top.get_covered_text()
598598

599599

600+
def test_add_non_annotation_and_select():
601+
"""Create a non-annotation type, add an instance and verify select returns it."""
602+
cas = Cas()
603+
604+
# Create a type that does not define annotation offsets (begin/end)
605+
NonAnnotation = cas.typesystem.create_type("test.NonAnnotation")
606+
607+
# Instantiate and add to CAS
608+
fs = NonAnnotation()
609+
cas.add(fs)
610+
611+
# Should be retrievable by select using the type name
612+
selected = list(cas.select("test.NonAnnotation"))
613+
assert selected == [fs]
614+
615+
# And visible via select_all
616+
assert fs in cas.select_all()
617+
618+
600619
def test_covered_text_on_annotation_without_sofa():
601620
cas = Cas()
602621
Annotation = cas.typesystem.get_type(TYPE_NAME_ANNOTATION)
603622
ann = Annotation()
604623

605624
with pytest.raises(AnnotationHasNoSofa):
606625
ann.get_covered_text()
626+
627+
628+
def test_runtime_generated_annotation_is_detected_and_shown_in_anchor():
629+
ts = TypeSystem()
630+
# Create a new annotation subtype (should inherit from Annotation base)
631+
MyAnno = ts.create_type("my.pkg.MyAnnotation", supertypeName="uima.tcas.Annotation")
632+
633+
cas = Cas(ts)
634+
# Create an instance of the runtime-generated type; ensure we can set begin/end
635+
a = MyAnno(begin=5, end=10)
636+
cas.add(a)
637+
638+
text = cas_to_comparable_text(cas)
639+
assert "MyAnnotation[5-10]" in text

0 commit comments

Comments
 (0)