-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathexample_traject_config_eac_cpf.rb
More file actions
333 lines (292 loc) · 11.8 KB
/
example_traject_config_eac_cpf.rb
File metadata and controls
333 lines (292 loc) · 11.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
# Traject configuration for indexing EAC-CPF creator records to Solr
#
# This config file processes EAC-CPF (Encoded Archival Context - Corporate Bodies,
# Persons, and Families) XML documents from ArchivesSpace archival_contexts endpoint.
#
# Usage:
# bundle exec traject -u $SOLR_URL -c example_traject_config_eac_cpf.rb /path/to/agents/*.xml
#
# For production, copy this file to your arcuit gem as traject_config_eac_cpf.rb
#
# The EAC-CPF XML documents are retrieved directly from ArchivesSpace via:
# /repositories/{repo_id}/archival_contexts/{agent_type}/{id}.xml
require 'traject'
require 'traject_plus'
require 'traject_plus/macros'
require 'time'
# Use TrajectPlus macros (provides extract_xpath and other helpers)
extend TrajectPlus::Macros
# EAC-CPF namespace - used consistently throughout this config.
# Frozen so the shared prefix mapping cannot be modified by accident.
EAC_NS = { 'eac' => 'urn:isbn:1-931666-33-4' }.freeze
# Entity types - SINGLE SOURCE OF TRUTH
# Both the filename pattern below and the URI-to-ID helper are built from this list.
ENTITY_TYPES = %w[corporate_entities people families].freeze
# Pattern matching arcflow's creator file naming: creator_{entity_type}_{id}
# Anchored with \A and \z (whole string) rather than ^ and $ (per line), so a
# value that merely contains a valid ID on one of its lines is rejected.
CREATOR_ID_PATTERN = /\Acreator_(#{ENTITY_TYPES.join('|')})_\d+\z/
settings do
  # Solr target: SOLR_URL from the environment, otherwise a local core.
  provide 'solr.url', ENV.fetch('SOLR_URL', 'http://localhost:8983/solr/blacklight-core')

  # Solr writer tuning.
  provide 'solr_writer.commit_on_close', 'true'
  provide 'solr_writer.thread_pool', '8'
  provide 'solr_writer.batch_size', '100'

  # Record-mapping thread pool size.
  provide 'processing_thread_pool', '4'

  # Use NokogiriReader for XML processing
  provide 'reader_class_name', 'Traject::NokogiriReader'
end
# For each record from the reader, flag it as a creator on the clipboard.
each_record do |_record, context|
  context.clipboard.store(:is_creator, true)
end
# Solr uniqueKey - taken from the input filename, which must follow arcflow's
# creator_{entity_type}_{id} naming. Records whose filename is missing or does
# not match CREATOR_ID_PATTERN are logged as errors and skipped.
to_field 'id' do |_record, accumulator, context|
  source_path = context.source_record_id || context.input_name

  unless source_path
    context.logger.error("No source filename available for record")
    context.skip!("Missing source filename")
    next
  end

  # Drop the directory and the .xml extension; what remains is the candidate ID.
  candidate_id = File.basename(source_path, '.xml')

  unless CREATOR_ID_PATTERN.match?(candidate_id)
    context.logger.error("Filename doesn't match expected pattern 'creator_{type}_{id}': #{candidate_id}")
    context.skip!("Invalid ID format in filename")
    next
  end

  accumulator << candidate_id
  context.logger.info("Using filename-based ID: #{candidate_id}")
end
# Boolean marker field: every document produced by this config is a creator.
to_field 'is_creator' do |_record, accumulator|
  accumulator.push(true)
end
# # Record type
# to_field 'record_type' do |record, accumulator|
# accumulator << 'creator'
# end
# Entity type (corporateBody, person, family) - text of the first
# identity/entityType element; nothing is indexed when it is absent.
to_field 'entity_type_ssi' do |record, accumulator|
  entity_node = record.at_xpath('//eac:cpfDescription/eac:identity/eac:entityType', EAC_NS)
  accumulator << entity_node.text unless entity_node.nil?
end
# Title/name fields - using ArcLight dynamic field naming convention
# _tesim = text, stored, indexed, multiValued (for full-text search)
# _ssm = string, stored, multiValued (for display)
# _ssi = string, stored, indexed (for faceting/sorting)
#
# NOTE(review): the XPath selects the parts of every nameEntry, so a record
# with several name entries yields all of them joined into one value -
# confirm that is intended.
to_field 'title_tesim' do |record, accumulator|
  name_parts = record.xpath('//eac:cpfDescription/eac:identity/eac:nameEntry/eac:part', EAC_NS)
  next if name_parts.empty?

  accumulator << name_parts.map(&:text).join(' ')
end
# Display title: all name parts joined with single spaces.
to_field 'title_ssm' do |record, accumulator|
  name_parts = record.xpath('//eac:cpfDescription/eac:identity/eac:nameEntry/eac:part', EAC_NS)
  unless name_parts.empty?
    display_title = name_parts.map(&:text).join(' ')
    accumulator << display_title
  end
end
# Filing (sort) title: the joined name parts, lowercased, with a single
# leading English article ("a", "an", "the") removed.
to_field 'title_filing_ssi' do |record, accumulator|
  name_parts = record.xpath('//eac:cpfDescription/eac:identity/eac:nameEntry/eac:part', EAC_NS)
  next if name_parts.empty?

  full_name = name_parts.map(&:text).join(' ')
  # \A with sub (not ^ with gsub): only an article at the very start of the
  # name is dropped. ^ also matches after every newline, so name text that
  # spans lines would lose articles in the middle of the value.
  accumulator << full_name.sub(/\A(?:a|an|the)\s+/i, '').downcase
end
# Dates of existence - using ArcLight standard field unitdate_ssm
# (matches what ArcLight uses for collection dates)
#
# A dateRange wins when present: its first fromDate/toDate are rendered as
# "from-to", with the hyphen trimmed when one side is empty. Without a range,
# each standalone existDates/date is indexed as its own value.
to_field 'unitdate_ssm' do |record, accumulator|
  exist_dates = '//eac:cpfDescription/eac:description/eac:existDates'
  range_start = record.at_xpath("#{exist_dates}/eac:dateRange/eac:fromDate", EAC_NS)
  range_end = record.at_xpath("#{exist_dates}/eac:dateRange/eac:toDate", EAC_NS)

  if range_start.nil? && range_end.nil?
    # No range at all: fall back to single dates.
    record.xpath("#{exist_dates}/eac:date", EAC_NS).each do |date_node|
      accumulator << date_node.text
    end
  else
    from_text = range_start.nil? ? '' : range_start.text
    to_text = range_end.nil? ? '' : range_end.text
    accumulator << "#{from_text}-#{to_text}".gsub(/^-|-$/, '')
  end
end
# Biographical/historical note - using ArcLight conventions
# _tesim for searchable plain text
# _tesm for searchable HTML (text, stored, multiValued but not for display)
# _ssm for section heading display
#
# Plain-text variant: the text of every biogHist paragraph, joined with
# spaces into a single value.
to_field 'bioghist_tesim' do |record, accumulator|
  paragraphs = record.xpath('//eac:cpfDescription/eac:description/eac:biogHist//eac:p', EAC_NS)
  next if paragraphs.empty?

  accumulator << paragraphs.map(&:text).join(' ')
end
# Biographical/historical note - HTML (matches ArcLight's bioghist_html_tesm)
# Each biogHist paragraph is re-wrapped in <p>...</p> around its serialized
# children, so inline markup inside <eac:p> is preserved; paragraphs are
# joined with newlines into one value.
to_field 'bioghist_html_tesm' do |record, accumulator|
  paragraphs = record.xpath('//eac:cpfDescription/eac:description/eac:biogHist//eac:p', EAC_NS)
  unless paragraphs.empty?
    wrapped = paragraphs.map { |para| "<p>#{para.inner_html}</p>" }
    accumulator << wrapped.join("\n")
  end
end
# Section heading for the biographical note (matches ArcLight's
# bioghist_heading_ssm pattern): text of the first <head> under biogHist.
to_field 'bioghist_heading_ssm' do |record, accumulator|
  head_node = record.at_xpath('//eac:cpfDescription/eac:description/eac:biogHist//eac:head', EAC_NS)
  accumulator << head_node.text unless head_node.nil?
end
# Full-text search field: one value for the name, then one for the
# biographical note, each built by joining the matching nodes' text.
to_field 'text' do |record, accumulator|
  [
    '//eac:cpfDescription/eac:identity/eac:nameEntry/eac:part',   # title
    '//eac:cpfDescription/eac:description/eac:biogHist//eac:p'    # bioghist
  ].each do |path|
    nodes = record.xpath(path, EAC_NS)
    next if nodes.empty?

    accumulator << nodes.map(&:text).join(' ')
  end
end
# Related agents (from cpfRelation elements) for display parsing and debugging.
# Each relation whose link converts to a Solr creator ID is stored as one
# "solr_id|type" string, e.g. "creator_corporate_entities_57|associative".
# A missing cpfRelationType is written as "unknown".
to_field 'related_agents_debug_ssim' do |record, accumulator|
  record.xpath('//eac:cpfDescription/eac:relations/eac:cpfRelation', EAC_NS).each do |relation|
    target_uri = relation['href'] || relation['xlink:href']
    next unless target_uri

    creator_id = aspace_uri_to_solr_id(target_uri)
    next unless creator_id

    kind = relation['cpfRelationType'] || 'unknown'
    accumulator << "#{creator_id}|#{kind}"
  end
end
# Related agents - ASpace URIs, in parallel array to match ids and types.
# One entry per cpfRelation that carries an href (plain or xlink-prefixed).
to_field 'related_agent_uris_ssim' do |record, accumulator|
  relation_nodes = record.xpath('//eac:cpfDescription/eac:relations/eac:cpfRelation', EAC_NS)
  linked_uris = relation_nodes.map { |relation| relation['href'] || relation['xlink:href'] }
  accumulator.concat(linked_uris.compact)
end
# Related agents - Parallel array of Solr creator IDs to match relationship
# types and uris. Each relation's href is converted from an ArchivesSpace URI
# to a Solr ID; relations whose href cannot be converted contribute nothing.
# NOTE(review): such a relation still appears in the uris/types arrays, so
# the arrays can drift out of alignment - confirm this cannot happen in practice.
to_field 'related_agent_ids_ssim' do |record, accumulator|
  record.xpath('//eac:cpfDescription/eac:relations/eac:cpfRelation', EAC_NS).each do |relation|
    target_uri = relation['href'] || relation['xlink:href']
    next unless target_uri

    creator_id = aspace_uri_to_solr_id(target_uri)
    accumulator << creator_id unless creator_id.nil?
  end
end
# Related Agents - Parallel array of names to match relationship ids, uris and type.
# Indexes the text of every cpfRelation/relationEntry element.
# NOTE(review): unlike the uri/id/type fields this one is not filtered by
# href, and a relation may hold zero or several relationEntry elements, so
# this array is only parallel when every relation has an href and exactly
# one relationEntry - verify against the data.
to_field 'related_agent_names_ssim' do |record, accumulator|
  entry_nodes = record.xpath('//eac:cpfDescription/eac:relations/eac:cpfRelation/eac:relationEntry', EAC_NS)
  accumulator.concat(entry_nodes.map(&:text))
end
# Related Agents - Parallel array of relationship types to match relationship ids and uris.
# One entry per cpfRelation that has an href; a missing cpfRelationType is
# recorded as "unknown". Values are deliberately NOT de-duplicated so the
# array keeps one slot per relation.
to_field 'related_agent_relationship_types_ssim' do |record, accumulator|
  record.xpath('//eac:cpfDescription/eac:relations/eac:cpfRelation', EAC_NS).each do |relation|
    next unless relation['href'] || relation['xlink:href']

    accumulator << (relation['cpfRelationType'] || 'unknown')
  end
end
# Relationship types used for faceting: each distinct cpfRelationType is
# indexed once, in first-seen order. Relations without a type are ignored.
to_field 'relationship_types_ssim' do |record, accumulator|
  record.xpath('//eac:cpfDescription/eac:relations/eac:cpfRelation', EAC_NS).each do |relation|
    kind = relation['cpfRelationType']
    next if kind.nil? || accumulator.include?(kind)

    accumulator << kind
  end
end
# Collections this creator is responsible for - EAD IDs injected by arcflow
# into <resourceRelation resourceRelationType="creatorOf"> elements as:
# <descriptiveNote><p>ead_id:{ead_id}</p></descriptiveNote>
# Indexed as an array of EAD IDs (e.g., ["ALA.9.5.16"]) for bidirectional
# creator↔collection linking in Solr.
# Only the first descriptiveNote paragraph of each relation is inspected.
to_field 'creator_of_collection__collection_ids_ssim' do |record, accumulator|
  creator_relations = record.xpath(
    '//eac:cpfDescription/eac:relations/eac:resourceRelation[@resourceRelationType="creatorOf"]',
    EAC_NS
  )
  creator_relations.each do |relation|
    note_node = relation.at_xpath('eac:descriptiveNote/eac:p', EAC_NS)
    next unless note_node

    ead_match = /\Aead_id:(.+)\z/.match(note_node.text)
    accumulator << ead_match[1].strip if ead_match
  end
end
# Names of the collections this creator is responsible for. Uses the same
# "ead_id:" note test as the collection-ID field, so a name is emitted
# exactly for those creatorOf relations that also yield an EAD ID. The value
# is the combined text of the relation's relationEntry element(s).
to_field 'creator_of_collection__collection_name_ssim' do |record, accumulator|
  creator_relations = record.xpath(
    '//eac:cpfDescription/eac:relations/eac:resourceRelation[@resourceRelationType="creatorOf"]',
    EAC_NS
  )
  creator_relations.each do |relation|
    note_node = relation.at_xpath('eac:descriptiveNote/eac:p', EAC_NS)
    next if note_node.nil?
    next unless /\Aead_id:(.+)\z/.match?(note_node.text)

    accumulator << relation.xpath('eac:relationEntry', EAC_NS).text
  end
end
# Digital objects this agent created: the href of every creatorOf
# resourceRelation whose link contains "digital_object".
to_field 'creator_of_digital_object__do_ids_ssim' do |record, accumulator|
  relations = record.xpath(
    '//eac:cpfDescription/eac:relations/eac:resourceRelation[@resourceRelationType="creatorOf"]',
    EAC_NS
  )
  relations.each do |rel|
    href = rel['href'] || rel['xlink:href']
    # A resourceRelation may have no href at all; calling include? on nil
    # would raise NoMethodError, so such relations are simply skipped.
    next unless href&.include?('digital_object')

    accumulator << href
  end
end
# Digital objects this agent is the subject of: the href of every subjectOf
# resourceRelation whose link contains "digital_object".
to_field 'subject_of_digital_object__do_ids_ssim' do |record, accumulator|
  relations = record.xpath(
    '//eac:cpfDescription/eac:relations/eac:resourceRelation[@resourceRelationType="subjectOf"]',
    EAC_NS
  )
  relations.each do |rel|
    href = rel['href'] || rel['xlink:href']
    # A resourceRelation may have no href at all; calling include? on nil
    # would raise NoMethodError, so such relations are simply skipped.
    next unless href&.include?('digital_object')

    accumulator << href
  end
end
# Agent source URI (from original ArchivesSpace): text of the first
# control/otherRecordId element marked localType="archivesspace_uri".
to_field 'agent_uri_ssi' do |record, accumulator|
  uri_node = record.at_xpath('//eac:control/eac:otherRecordId[@localType="archivesspace_uri"]', EAC_NS)
  accumulator << uri_node.text unless uri_node.nil?
end
# Timestamp: the current time in UTC, ISO 8601 formatted, taken at the moment
# this field is mapped.
to_field 'timestamp' do |_record, accumulator|
  mapped_at = Time.now.utc
  accumulator << mapped_at.iso8601
end
# Log the EAC-CPF recordId of each record once the field mappings above have run.
# NOTE(review): this is a mapping step, so the message by itself does not
# confirm that the document was written to Solr.
each_record do |record, context|
  record_id_node = record.at_xpath('//eac:control/eac:recordId', EAC_NS)
  next if record_id_node.nil?

  context.logger.info("Indexed creator: #{record_id_node.text}")
end
# Helper to build and validate creator IDs.
#
# @param entity_type [String] one of the ENTITY_TYPES values
# @param id_number [String, Integer] numeric agent identifier
# @return [String] an ID of the form "creator_{entity_type}_{id_number}"
# @raise [ArgumentError] when the assembled ID does not match CREATOR_ID_PATTERN
def build_creator_id(entity_type, id_number)
  candidate = ['creator', entity_type, id_number].join('_')
  return candidate if CREATOR_ID_PATTERN.match?(candidate)

  raise ArgumentError, "Invalid creator ID: #{candidate} doesn't match pattern"
end
# Helper to convert an ArchivesSpace agent URI to a Solr creator ID.
# Accepts both /agents/{type}/{id} and https://.../agents/{type}/{id}.
#
# @param uri [String, nil] agent URI or path
# @return [String, nil] the creator ID from build_creator_id, or nil when
#   uri is nil or contains no agents/{type}/{id} segment with a known type
def aspace_uri_to_solr_id(uri)
  return nil unless uri

  agent_match = uri.match(/agents\/(#{ENTITY_TYPES.join('|')})\/(\d+)/)
  agent_match && build_creator_id(agent_match[1], agent_match[2])
end