Skip to content

Commit 5972a58

Browse files
committed
feat: update citation format to include workgroup names
- Modified citation format across multiple components to replace 'speaker' with 'workgroup_name' for consistency.
- Updated the constitution document to reflect the new citation format requirements.
- Enhanced citation extraction and scoring logic to accommodate the new workgroup-based citations.
- Adjusted related services and models to ensure proper handling of workgroup information in citations.
1 parent 66143b2 commit 5972a58

9 files changed

Lines changed: 82 additions & 37 deletions

File tree

.specify/memory/constitution.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ The system must ground all outputs strictly in archived meeting data.
3838

3939
Every generated statement shall be supported by verifiable citation.
4040

41-
- Required format: [meeting_id | date | speaker]
41+
- Required format: [meeting_id | date | workgroup_name]
4242
- Each answer includes retrieved text + provenance metadata
4343
- Citations are non-optional, non-negotiable
4444

@@ -93,7 +93,7 @@ All actions must be visible and reviewable.
9393
- System MUST extract structured entities from meeting records (meetings, workgroups, people, documents, agenda items, decision items, action items)
9494
- Entity storage MUST be local-first: JSON files in `entities/` directory structure (no external database dependencies)
9595
- Entity relationships MUST maintain referential integrity (foreign key validation, cascade delete behaviors)
96-
- Entity extraction MUST preserve traceability to source meeting records (meeting_id, date, speaker relationships)
96+
- Entity extraction MUST preserve traceability to source meeting records (meeting_id, date, workgroup relationships)
9797
- System MUST support dual querying: structured entity queries (quantitative counts, relationship navigation) AND vector similarity search (qualitative RAG queries)
9898
- Entity extraction MUST be deterministic: same meeting record produces identical entity structure and relationships
9999
- Entity storage operations MUST be atomic (temporary file + rename pattern for writes, backup/restore for deletes)

src/models/evaluation_case.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ class EvaluationCase(BaseModel):
1313
case_id: Unique evaluation case identifier
1414
prompt: Test query prompt
1515
ground_truth: Expected answer content
16-
expected_citations: Expected citations in format [meeting_id | date | speaker]
16+
expected_citations: Expected citations in format [meeting_id | date | workgroup_name]
1717
evaluation_metrics: Scoring results
1818
run_timestamp: Evaluation run timestamp
1919
model_version: LLM version used for evaluation
@@ -25,7 +25,7 @@ class EvaluationCase(BaseModel):
2525
ground_truth: str = Field(..., description="Expected answer content")
2626
expected_citations: List[Dict[str, Any]] = Field(
2727
...,
28-
description="Expected citations in format [meeting_id | date | speaker]"
28+
description="Expected citations in format [meeting_id | date | workgroup_name]"
2929
)
3030
evaluation_metrics: Optional[Dict[str, Any]] = Field(
3131
None,
@@ -55,7 +55,7 @@ class Config:
5555
{
5656
"meeting_id": "meeting_001",
5757
"date": "2024-03-15",
58-
"speaker": "Alice",
58+
"workgroup_name": "Budget Committee",
5959
"excerpt": "budget allocation"
6060
}
6161
],

src/models/rag_query.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,11 @@ class RetrievedChunk(BaseModel):
1515

1616

1717
class Citation(BaseModel):
18-
"""Citation in format [meeting_id | date | speaker]."""
18+
"""Citation in format [meeting_id | date | workgroup_name]."""
1919

2020
meeting_id: str = Field(..., description="Meeting identifier")
2121
date: str = Field(..., description="Meeting date")
22-
speaker: Optional[str] = Field(None, description="Speaker name")
22+
workgroup_name: Optional[str] = Field(None, description="Workgroup name")
2323
excerpt: str = Field(..., description="Cited text excerpt")
2424

2525

@@ -33,7 +33,7 @@ class RAGQuery(BaseModel):
3333
timestamp: Query execution timestamp
3434
retrieved_chunks: Retrieved document chunks with metadata
3535
output: Generated answer text
36-
citations: Verifiable citations in format [meeting_id | date | speaker]
36+
citations: Verifiable citations in format [meeting_id | date | workgroup_name]
3737
model_version: Version of LLM used for generation
3838
embedding_version: Version of embedding model used
3939
user_id: SSO user identifier (optional)
@@ -82,7 +82,7 @@ class Config:
8282
{
8383
"meeting_id": "meeting_001",
8484
"date": "2024-03-15",
85-
"speaker": "Alice",
85+
"workgroup_name": "Budget Committee",
8686
"excerpt": "The budget committee decided..."
8787
}
8888
],

src/services/audit_writer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ def write_query_audit_log(self, rag_query: RAGQuery) -> Path:
5454
{
5555
"meeting_id": citation.meeting_id,
5656
"date": citation.date,
57-
"speaker": citation.speaker,
57+
"workgroup_name": citation.workgroup_name,
5858
"excerpt": citation.excerpt
5959
}
6060
for citation in rag_query.citations

src/services/chunking.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,9 @@ def chunk_transcript(
8080
"date": meeting_record.date,
8181
"participants": meeting_record.participants,
8282
"decisions": meeting_record.decisions,
83-
"tags": meeting_record.tags
83+
"tags": meeting_record.tags,
84+
"workgroup": getattr(meeting_record, "workgroup", None),
85+
"workgroup_id": getattr(meeting_record, "workgroup_id", None)
8486
}
8587

8688
# Chunk transcript with overlap

src/services/citation_extractor.py

Lines changed: 50 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,55 @@
11
"""Citation extraction service."""
22

3-
from typing import List, Dict, Any
3+
from typing import List, Dict, Any, Optional
44
from datetime import datetime
5+
from uuid import UUID
56

67
from ..models.rag_query import Citation
78
from ..lib.citation import format_citation
89
from ..lib.logging import get_logger
10+
from ..lib.config import ENTITIES_MEETINGS_DIR, ENTITIES_WORKGROUPS_DIR
11+
from ..services.entity_storage import load_entity
12+
from ..models.meeting import Meeting
13+
from ..models.workgroup import Workgroup
914

1015
logger = get_logger(__name__)
1116

1217

18+
def _get_workgroup_name_from_meeting(meeting_id: str) -> Optional[str]:
19+
"""
20+
Get workgroup name from meeting ID.
21+
22+
Args:
23+
meeting_id: Meeting identifier (UUID string)
24+
25+
Returns:
26+
Workgroup name if found, None otherwise
27+
"""
28+
if not meeting_id or meeting_id in ("entity-storage", "quantitative-analysis", "no-evidence"):
29+
return None
30+
31+
try:
32+
meeting_uuid = UUID(meeting_id)
33+
meeting = load_entity(meeting_uuid, ENTITIES_MEETINGS_DIR, Meeting)
34+
if meeting and meeting.workgroup_id:
35+
workgroup = load_entity(meeting.workgroup_id, ENTITIES_WORKGROUPS_DIR, Workgroup)
36+
if workgroup:
37+
logger.debug("workgroup_name_found", meeting_id=meeting_id, workgroup_name=workgroup.name)
38+
return workgroup.name
39+
else:
40+
logger.debug("workgroup_not_found", meeting_id=meeting_id, workgroup_id=str(meeting.workgroup_id))
41+
else:
42+
logger.debug("meeting_or_workgroup_id_missing", meeting_id=meeting_id, has_meeting=meeting is not None)
43+
except ValueError as e:
44+
logger.debug("invalid_meeting_id_format", meeting_id=meeting_id, error=str(e))
45+
except (AttributeError, Exception) as e:
46+
logger.debug("workgroup_name_lookup_failed", meeting_id=meeting_id, error=str(e))
47+
return None
48+
49+
1350
def extract_citations(retrieved_chunks: List[Dict[str, Any]]) -> List[Citation]:
1451
"""
15-
Extract citations from retrieved chunks in format [meeting_id | date | speaker].
52+
Extract citations from retrieved chunks in format [meeting_id | date | workgroup_name].
1653
1754
Args:
1855
retrieved_chunks: List of retrieved chunk dictionaries with metadata
@@ -26,10 +63,13 @@ def extract_citations(retrieved_chunks: List[Dict[str, Any]]) -> List[Citation]:
2663
metadata = chunk.get("metadata", {})
2764
meeting_id = metadata.get("meeting_id", chunk.get("meeting_id", ""))
2865
date = metadata.get("date", "")
29-
participants = metadata.get("participants", [])
3066

31-
# Extract speaker (use first participant or leave empty)
32-
speaker = participants[0] if participants else None
67+
# Try to get workgroup name from metadata first (from FAISS index)
68+
workgroup_name = metadata.get("workgroup")
69+
70+
# If not in metadata, try to look up from entity storage
71+
if not workgroup_name and meeting_id:
72+
workgroup_name = _get_workgroup_name_from_meeting(meeting_id)
3373

3474
# Extract excerpt (chunk text)
3575
excerpt = chunk.get("text", "")
@@ -42,7 +82,7 @@ def extract_citations(retrieved_chunks: List[Dict[str, Any]]) -> List[Citation]:
4282
citation = Citation(
4383
meeting_id=meeting_id,
4484
date=date,
45-
speaker=speaker,
85+
workgroup_name=workgroup_name,
4686
excerpt=excerpt[:200] + "..." if len(excerpt) > 200 else excerpt # Truncate long excerpts
4787
)
4888

@@ -66,7 +106,7 @@ def create_no_evidence_citation(index_name: str) -> Citation:
66106
return Citation(
67107
meeting_id="no-evidence",
68108
date=datetime.utcnow().strftime("%Y-%m-%d"),
69-
speaker="System",
109+
workgroup_name=None,
70110
excerpt=f"No evidence found in retrieved chunks. RAG query searched index '{index_name}' but found no relevant results above the similarity threshold."
71111
)
72112

@@ -79,7 +119,7 @@ def format_citations_as_text(citations: List[Citation]) -> str:
79119
citations: List of Citation objects
80120
81121
Returns:
82-
Formatted citation text
122+
Formatted citation text (only citation format, no excerpt)
83123
"""
84124
if not citations:
85125
return ""
@@ -89,9 +129,9 @@ def format_citations_as_text(citations: List[Citation]) -> str:
89129
citation_str = format_citation(
90130
citation.meeting_id,
91131
citation.date,
92-
citation.speaker
132+
citation.workgroup_name
93133
)
94-
citation_lines.append(f"- {citation_str}: {citation.excerpt}")
134+
citation_lines.append(f"- {citation_str}")
95135

96136
return "\n".join(citation_lines)
97137

src/services/citation_scorer.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ def score_citation_accuracy(
3333
if validate_citation_format(format_citation(
3434
citation.meeting_id,
3535
citation.date,
36-
citation.speaker
36+
citation.workgroup_name
3737
))
3838
)
3939
return valid_count / len(actual_citations) if actual_citations else 0.0
@@ -44,13 +44,13 @@ def score_citation_accuracy(
4444
for expected in expected_citations:
4545
expected_meeting_id = expected.get("meeting_id", "")
4646
expected_date = expected.get("date", "")
47-
expected_speaker = expected.get("speaker")
47+
expected_workgroup_name = expected.get("workgroup_name")
4848

4949
# Find matching actual citation
5050
for actual in actual_citations:
5151
if (actual.meeting_id == expected_meeting_id and
5252
actual.date == expected_date and
53-
(not expected_speaker or actual.speaker == expected_speaker)):
53+
(not expected_workgroup_name or actual.workgroup_name == expected_workgroup_name)):
5454
matched_count += 1
5555
break
5656

src/services/entity_query.py

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -250,8 +250,9 @@ def get_documents_by_meeting_with_validation(self, meeting_id: UUID) -> List[Doc
250250
Raises:
251251
ValueError: If entity loading fails
252252
"""
253-
import requests
254253
from urllib.parse import urlparse
254+
from urllib.request import urlopen, Request
255+
from urllib.error import URLError, HTTPError
255256

256257
logger.info("query_documents_by_meeting_with_validation_start", meeting_id=str(meeting_id))
257258

@@ -270,17 +271,19 @@ def get_documents_by_meeting_with_validation(self, meeting_id: UUID) -> List[Doc
270271
validated_documents.append(document)
271272
continue
272273

273-
# Attempt HEAD request to validate accessibility
274+
# Attempt HEAD request to validate accessibility using standard library
274275
try:
275-
response = requests.head(str(document.link), timeout=5, allow_redirects=True)
276-
is_accessible = response.status_code < 400
277-
if not is_accessible:
278-
logger.warning("query_documents_link_inaccessible", document_id=str(document.id), link=str(document.link), status_code=response.status_code)
279-
# Still include document even if link is inaccessible (FR-012)
280-
validated_documents.append(document)
281-
except (requests.RequestException, Exception) as e:
276+
req = Request(str(document.link), method='HEAD')
277+
with urlopen(req, timeout=5) as response:
278+
is_accessible = response.status < 400
279+
if not is_accessible:
280+
logger.warning("query_documents_link_inaccessible", document_id=str(document.id), link=str(document.link), status_code=response.status)
281+
# Still include document even if link is inaccessible (FR-012)
282+
validated_documents.append(document)
283+
except (URLError, HTTPError, Exception) as e:
282284
# Link validation failed but don't block retrieval
283-
logger.warning("query_documents_link_validation_failed", document_id=str(document.id), link=str(document.link), error=str(e))
285+
status_code = getattr(e, 'code', None) if isinstance(e, HTTPError) else None
286+
logger.warning("query_documents_link_validation_failed", document_id=str(document.id), link=str(document.link), error=str(e), status_code=status_code)
284287
validated_documents.append(document)
285288

286289
except Exception as e:

src/services/query_service.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -278,7 +278,7 @@ def execute_query(
278278
citations.append(Citation(
279279
meeting_id="entity-storage",
280280
date=datetime.utcnow().strftime("%Y-%m-%d"),
281-
speaker="System",
281+
workgroup_name=None,
282282
excerpt=f"Counted {count} meetings by scanning JSON files in {source}. Method: {method}."
283283
))
284284

@@ -293,7 +293,7 @@ def execute_query(
293293
citations.append(Citation(
294294
meeting_id="quantitative-analysis",
295295
date=datetime.utcnow().strftime("%Y-%m-%d"),
296-
speaker="System",
296+
workgroup_name=None,
297297
excerpt=f"Quantitative analysis performed. Method: {method}. Source: {source}."
298298
))
299299

@@ -304,7 +304,7 @@ def execute_query(
304304
citations.append(Citation(
305305
meeting_id=cit.get("type", "quantitative"),
306306
date=datetime.utcnow().strftime("%Y-%m-%d"),
307-
speaker="System",
307+
workgroup_name=None,
308308
excerpt=cit.get("description", f"Method: {cit.get('method', method)}")
309309
))
310310
else:

0 commit comments

Comments
 (0)