Skip to content

Commit b0bcf33

Browse files
authored
Merge pull request #12 from UIUCLibrary/copilot/sub-pr-8
Filter non-creator agents from indexing (exclude system users, donors)
2 parents db949e2 + f264373 commit b0bcf33

2 files changed

Lines changed: 153 additions & 14 deletions

File tree

README.md

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,12 +44,30 @@ ArcFlow now generates standalone creator documents in addition to collection rec
4444
- Are marked with `is_creator: true` to distinguish from collections
4545
- Must be fed into a Solr instance with fields to match their specific facets (See: Configure Solr Schema below)
4646

47+
### Agent Filtering
48+
49+
**ArcFlow automatically filters agents to include only legitimate creators** of archival materials. The following agent types are **excluded** from indexing:
50+
51+
- **System users** - ArchivesSpace software users (identified by `is_user` field)
52+
- **System-generated agents** - Auto-created for users (identified by `system_generated` field)
53+
- **Software agents** - Excluded by not querying the `/agents/software` endpoint
54+
- **Repository agents** - Corporate entities representing the repository itself (identified by `is_repo_agent` field)
55+
- **Donor-only agents** - Agents whose only linked role is 'donor' (no 'creator' role)
56+
57+
**Agents that are not excluded as system users, system-generated agents, or repository agents are included if they meet any of these criteria:**
58+
59+
- ✓ Have the **'creator' role** in linked_agent_roles
60+
- ✓ Are **linked to published records** (and not excluded by filters above)
61+
62+
This filtering ensures that only legitimate archival creators are discoverable in ArcLight, while protecting privacy and security by excluding system users and donors.
63+
4764
### How Creator Records Work
4865

4966
1. **Extraction**: `get_all_agents()` fetches all agents from ArchivesSpace
50-
2. **Processing**: `task_agent()` generates an EAC-CPF XML document for each agent with bioghist notes
51-
3. **Linking**: Handled via Solr using the persistent_id field (agents and collections linked through bioghist references)
52-
4. **Indexing**: Creator XML files are indexed to Solr using `traject_config_eac_cpf.rb`
67+
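2. **Filtering**: `is_target_agent()` filters out system users, system-generated agents, repository agents, donor-only agents, and agents that are neither creators nor linked to published records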
68+
3. **Processing**: `task_agent()` generates an EAC-CPF XML document for each target agent with bioghist notes
69+
4. **Linking**: Handled via Solr using the persistent_id field (agents and collections linked through bioghist references)
70+
5. **Indexing**: Creator XML files are indexed to Solr using `traject_config_eac_cpf.rb`
5371

5472
### Creator Document Format
5573

arcflow/main.py

Lines changed: 132 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -674,26 +674,80 @@ def get_creator_bioghist(self, resource, indent_size=0):
674674
return None
675675

676676

677+
def is_target_agent(self, agent):
    """
    Determine if agent is a target creator of archival materials.

    Checks run in order and the first one that matches decides:

    1. Exclude system users (truthy ``is_user``).
    2. Exclude system-generated agents (truthy ``system_generated``).
    3. Exclude repository agents (truthy ``is_repo_agent``).
    4. Include agents with ``'creator'`` in ``linked_agent_roles``.
    5. Exclude donor-only agents (``linked_agent_roles`` contains nothing
       but ``'donor'``, however many times it is repeated).
    6. Otherwise include only if ``is_linked_to_published_record`` is truthy
       (covers cases where roles aren't populated yet).

    Note: Software agents are excluded by not querying /agents/software endpoint.

    Args:
        agent: Agent record from ArchivesSpace API (read via ``.get``)

    Returns:
        bool: True if agent should be indexed, False to exclude
    """
    # Tiers 1-3: hard exclusions. These win even over the 'creator' role.
    if agent.get('is_user'):
        return False
    if agent.get('system_generated'):
        return False
    if agent.get('is_repo_agent'):
        return False

    # Tier 4: role-based filtering.
    # `or []` also covers a record where the key is present but null;
    # a plain .get(key, []) would hand back None and break the `in` test.
    roles = agent.get('linked_agent_roles') or []

    # Include if explicitly marked as creator
    if 'creator' in roles:
        return True

    # Exclude if ONLY marked as donor. Comparing as a set keeps a repeated
    # 'donor' entry from slipping past an exact-list comparison.
    if set(roles) == {'donor'}:
        return False

    # Tier 5: default - include if linked to published records.
    # bool() guarantees the documented return type even if the field is null.
    return bool(agent.get('is_linked_to_published_record', False))
721+
677722
def get_all_agents(self, agent_types=None, modified_since=0, indent_size=0):
678723
"""
679-
Fetch ALL agents from ArchivesSpace (not just creators).
680-
Uses direct agent API endpoints for comprehensive coverage.
724+
Fetch target agents from ArchivesSpace and filter to creators only.
725+
Excludes system users, donors, and other non-creator agents.
681726
682727
Args:
683728
agent_types: List of agent types to fetch. Default: ['corporate_entities', 'people', 'families']
684729
modified_since: Unix timestamp to filter agents modified since this time (if API supports it)
685730
indent_size: Indentation size for logging
686731
687732
Returns:
688-
set: Set of agent URIs (e.g., '/agents/corporate_entities/123')
733+
list: List of filtered agent URIs (e.g., '/agents/corporate_entities/123')
689734
"""
690735
if agent_types is None:
691736
agent_types = ['corporate_entities', 'people', 'families']
692737

693738
indent = ' ' * indent_size
694-
all_agents = set()
739+
target_agents = []
740+
stats = {
741+
'total': 0,
742+
'excluded_user': 0,
743+
'excluded_system_generated': 0,
744+
'excluded_repo_agent': 0,
745+
'excluded_donor_only': 0,
746+
'excluded_no_links': 0,
747+
'included': 0
748+
}
695749

696-
self.log.info(f'{indent}Fetching ALL agents from ArchivesSpace...')
750+
self.log.info(f'{indent}Fetching agents from ArchivesSpace and applying filters...')
697751

698752
for agent_type in agent_types:
699753
try:
@@ -705,12 +759,55 @@ def get_all_agents(self, agent_types=None, modified_since=0, indent_size=0):
705759
response = self.client.get(f'/agents/{agent_type}', params=params)
706760
agent_ids = response.json()
707761

708-
self.log.info(f'{indent}Found {len(agent_ids)} {agent_type} agents')
762+
self.log.info(f'{indent}Found {len(agent_ids)} {agent_type} agents, filtering...')
709763

710-
# Add agent URIs to set
764+
# Fetch and filter each agent
711765
for agent_id in agent_ids:
766+
stats['total'] += 1
712767
agent_uri = f'/agents/{agent_type}/{agent_id}'
713-
all_agents.add(agent_uri)
768+
769+
try:
770+
# Fetch full agent record to access filtering fields
771+
agent_response = self.client.get(agent_uri)
772+
agent = agent_response.json()
773+
774+
# Apply filtering logic
775+
if agent.get('is_user'):
776+
stats['excluded_user'] += 1
777+
continue
778+
779+
if agent.get('system_generated'):
780+
stats['excluded_system_generated'] += 1
781+
continue
782+
783+
if agent.get('is_repo_agent'):
784+
stats['excluded_repo_agent'] += 1
785+
continue
786+
787+
roles = agent.get('linked_agent_roles', [])
788+
789+
# Include creators
790+
if 'creator' in roles:
791+
stats['included'] += 1
792+
target_agents.append(agent_uri)
793+
continue
794+
795+
# Exclude donor-only agents
796+
if roles == ['donor']:
797+
stats['excluded_donor_only'] += 1
798+
continue
799+
800+
# Default: include if linked to published records
801+
if agent.get('is_linked_to_published_record', False):
802+
stats['included'] += 1
803+
target_agents.append(agent_uri)
804+
else:
805+
stats['excluded_no_links'] += 1
806+
807+
except Exception as e:
808+
self.log.warning(f'{indent}Error fetching agent {agent_uri}: {e}')
809+
# On error, include the agent (fail-open)
810+
target_agents.append(agent_uri)
714811

715812
except Exception as e:
716813
self.log.error(f'{indent}Error fetching {agent_type} agents: {e}')
@@ -721,14 +818,38 @@ def get_all_agents(self, agent_types=None, modified_since=0, indent_size=0):
721818
response = self.client.get(f'/agents/{agent_type}', params={'all_ids': True})
722819
agent_ids = response.json()
723820
self.log.info(f'{indent}Found {len(agent_ids)} {agent_type} agents (no date filter)')
821+
822+
# Re-process with filtering
724823
for agent_id in agent_ids:
824+
stats['total'] += 1
725825
agent_uri = f'/agents/{agent_type}/{agent_id}'
726-
all_agents.add(agent_uri)
826+
827+
try:
828+
agent_response = self.client.get(agent_uri)
829+
agent = agent_response.json()
830+
831+
if self.is_target_agent(agent):
832+
stats['included'] += 1
833+
target_agents.append(agent_uri)
834+
835+
except Exception as e:
836+
self.log.warning(f'{indent}Error fetching agent {agent_uri}: {e}')
837+
target_agents.append(agent_uri)
838+
727839
except Exception as e2:
728840
self.log.error(f'{indent}Failed to fetch {agent_type} agents: {e2}')
729841

730-
self.log.info(f'{indent}Found {len(all_agents)} total agents across all types.')
731-
return all_agents
842+
# Log filtering statistics
843+
self.log.info(f'{indent}Agent filtering complete:')
844+
self.log.info(f'{indent} Total agents processed: {stats["total"]}')
845+
self.log.info(f'{indent} Included (target creators): {stats["included"]}')
846+
self.log.info(f'{indent} Excluded (system users): {stats["excluded_user"]}')
847+
self.log.info(f'{indent} Excluded (system-generated): {stats["excluded_system_generated"]}')
848+
self.log.info(f'{indent} Excluded (repository agents): {stats["excluded_repo_agent"]}')
849+
self.log.info(f'{indent} Excluded (donor-only): {stats["excluded_donor_only"]}')
850+
self.log.info(f'{indent} Excluded (no published links): {stats["excluded_no_links"]}')
851+
852+
return target_agents
732853

733854

734855
def task_agent(self, agent_uri, agents_dir, repo_id=1, indent_size=0):

0 commit comments

Comments
 (0)