-
-
Notifications
You must be signed in to change notification settings - Fork 18
Expand file tree
/
Copy pathtest_title.py
More file actions
163 lines (136 loc) · 6.55 KB
/
test_title.py
File metadata and controls
163 lines (136 loc) · 6.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import unittest
from unittest.mock import MagicMock, patch
from . import title
def make_page_dict(blocks):
    """Build a fake ``get_text("dict")`` payload from a plain list of blocks.

    ``blocks`` is an iterable in which every entry is a list of
    ``(text, font_size)`` tuples, one tuple per span. Each entry is turned
    into a ``"type": 0`` block containing a single line that carries all of
    that entry's spans, in order.

    Returns a dict of the form ``{"blocks": [...]}``.
    """
    def _as_block(spans):
        # One block -> one line -> every span of that block.
        span_dicts = [{"text": text, "size": size} for text, size in spans]
        return {"type": 0, "lines": [{"spans": span_dicts}]}

    return {"blocks": [_as_block(spans) for spans in blocks]}
def make_mock_doc(pages_data, metadata=None):
    """Build a mock fitz.Document.

    pages_data: list of block lists, one per page. Each block is a list of
        (text, size) tuples, in the format accepted by ``make_page_dict``.
    metadata: value exposed as ``doc.metadata``; any falsy value (``None``,
        ``{}``) is replaced by ``{"title": None}``.

    The returned mock supports ``len(doc)`` and ``doc[idx]``; every page is a
    mock whose ``get_text(...)`` returns that page's dict, whatever arguments
    it is called with.
    """
    def _build_page(page_blocks):
        fake_page = MagicMock()
        fake_page.get_text.return_value = make_page_dict(page_blocks)
        return fake_page

    fake_pages = [_build_page(page_blocks) for page_blocks in pages_data]

    doc = MagicMock()
    doc.metadata = metadata if metadata else {"title": None}
    # Magic methods are configured with plain functions taking ``self`` so
    # that len() and indexing behave like a real sequence of pages.
    doc.__len__ = lambda self: len(pages_data)
    doc.__getitem__ = lambda self, idx: fake_pages[idx]
    return doc
class TestGenerateTitle(unittest.TestCase):
    """Tests for ``title.generate_title`` and ``title.extract_title_by_font_size``.

    Documents are faked with ``make_mock_doc``. The tests that exercise the
    OpenAI fallback patch the service call and feed it a canned response.
    """

    # Dotted path of the OpenAI service call as referenced from the title module.
    OPENAI_PATCH_TARGET = "api.views.uploadFile.title.openAIServices.openAI"

    # Long, realistic title used by the font-size fallback tests.
    LONG_TITLE = "Advances in Mood Disorder Pharmacotherapy: Evaluating New Antipsychotics and Mood Stabilizers for Bipolar Disorder and Schizophrenia"

    # Upper bound for a generated title: the UploadFile model's title field
    # has max_length=255.
    MAX_TITLE_LENGTH = 255

    @staticmethod
    def _openai_response(content):
        """Return a mock response exposing ``choices[0].message.content``."""
        response = MagicMock()
        response.choices = [MagicMock()]
        response.choices[0].message.content = content
        return response

    def test_prefers_metadata_title_if_valid(self):
        """A metadata title that is acceptable is returned as-is."""
        doc = MagicMock()
        doc.metadata = {"title": "A Study Regarding The Efficacy of Drugs"}

        self.assertEqual(
            "A Study Regarding The Efficacy of Drugs", title.generate_title(doc))

    def test_falls_back_to_font_size_if_metadata_title_is_empty(self):
        """An empty metadata title triggers the largest-font lookup."""
        doc = make_mock_doc(
            pages_data=[[
                [("foo", 10.0)],
                [(self.LONG_TITLE, 18.0)],
                [("bar", 10.0)],
            ]],
            metadata={"title": ""},
        )

        self.assertEqual(self.LONG_TITLE, title.generate_title(doc))

    def test_falls_back_to_font_size_if_metadata_title_does_not_match_regex(self):
        """A metadata title rejected by the title regex triggers the largest-font lookup."""
        doc = make_mock_doc(
            pages_data=[[
                [("foo", 10.0)],
                [(self.LONG_TITLE, 18.0)],
                [("bar", 10.0)],
            ]],
            metadata={"title": "abcd1234"},
        )

        self.assertEqual(self.LONG_TITLE, title.generate_title(doc))

    @patch(OPENAI_PATCH_TARGET)
    def test_falls_back_to_chatgpt_if_no_title_found(self, mock_openAI):
        """With no usable metadata or text, the OpenAI service supplies the title."""
        doc = make_mock_doc(
            pages_data=[[]]  # no blocks at all
        )
        expected_title = "A Study Regarding The Efficacy of Drugs"
        mock_openAI.return_value = self._openai_response(expected_title)

        result = title.generate_title(doc)

        mock_openAI.assert_called()
        # Checking the call alone would let a broken fallback slip through;
        # the title produced by the service must also be what is returned.
        self.assertEqual(expected_title, result)

    @patch(OPENAI_PATCH_TARGET)
    def test_strips_quotes_from_openai_title(self, mock_openAI):
        """Surrounding double quotes in the OpenAI answer are removed."""
        doc = make_mock_doc(pages_data=[[]])
        mock_openAI.return_value = self._openai_response(
            '"Updated CANMAT/ISBD Guidelines for Treating Mixed Features in Bipolar Disorder"')

        result = title.generate_title(doc)

        self.assertEqual(
            "Updated CANMAT/ISBD Guidelines for Treating Mixed Features in Bipolar Disorder",
            result)

    @patch(OPENAI_PATCH_TARGET)
    def test_truncates_long_openai_title(self, mock_openAI):
        """An over-long OpenAI answer is cut down to fit the model field."""
        doc = make_mock_doc(pages_data=[[]])
        mock_openAI.return_value = self._openai_response("A" * 300)

        result = title.generate_title(doc)

        # Ensure the title is truncated to fit the UploadFile model's title
        # field (max_length=255), since OpenAI responses may exceed this limit
        self.assertLessEqual(len(result), self.MAX_TITLE_LENGTH)
        # Truncation must shorten the title, not discard it.
        self.assertGreater(len(result), 0)

    def test_font_size_joins_adjacent_spans_in_same_block(self):
        """A title split across multiple spans in the same block should be joined."""
        doc = make_mock_doc(
            pages_data=[[
                [("Author Name", 10.0)],
                [("Advances in Mood Disorder", 18.0), ("Pharmacotherapy", 18.0)],
                [("Some journal info", 10.0)],
            ]],
        )

        result = title.extract_title_by_font_size(doc)

        self.assertEqual("Advances in Mood Disorder Pharmacotherapy", result)

    def test_font_size_ignores_short_spans(self):
        """Superscript markers and other tiny spans should be filtered out."""
        doc = make_mock_doc(
            pages_data=[[
                [("Advances in Mood Disorder Pharmacotherapy", 18.0), ("*", 18.0)],
                [("Author Name et al.", 10.0)],
            ]],
        )

        # The "*" span is < 2 chars, so it should be ignored; title is just the real text
        result = title.extract_title_by_font_size(doc)

        self.assertEqual("Advances in Mood Disorder Pharmacotherapy", result)

    def test_font_size_returns_none_when_no_regex_match(self):
        """If the largest-font text doesn't match the title regex, return None."""
        doc = make_mock_doc(
            pages_data=[[
                # Only 2 words — regex requires at least 3
                [("Psychiatry Research", 18.0)],
                [("Author Name et al.", 10.0)],
            ]],
        )

        result = title.extract_title_by_font_size(doc)

        self.assertIsNone(result)

    def test_font_size_finds_title_on_later_page(self):
        """Title on page 2 should still be found if it has the largest font."""
        doc = make_mock_doc(
            pages_data=[
                [  # page 1: cover page with smaller text
                    [("Some preamble text here", 12.0)],
                ],
                [  # page 2: actual title in larger font
                    [("Advances in Mood Disorder Pharmacotherapy", 18.0)],
                    [("Author Name et al.", 10.0)],
                ],
            ],
        )

        result = title.extract_title_by_font_size(doc)

        self.assertEqual("Advances in Mood Disorder Pharmacotherapy", result)