Skip to content

Commit c60885d

Browse files
authored
fix: make entity resolution bozo-tolerant and propagate bozo flag (#69)
* fix: make entity resolution bozo-tolerant and propagate bozo flag (#64)

  Refactor `resolve_entity` to return `(String, bool)`, signaling unresolved entities. Update `read_text` to propagate bozo conditions through the return tuple. Add `read_text_str` convenience wrapper for call sites without feed access. Add 9 tests covering invalid numeric refs, malformed syntax, unknown named entities, and mixed valid/invalid entities.

* fix: use field-specific `bozo_exception` messages and fix `read_text_str` docs

  Use distinct `bozo_exception` messages per field (channel title, channel description, feed id) instead of a generic message. Fix the `read_text_str` doc comment to accurately reflect usage at entry-level call sites where bozo propagation is not yet implemented.
1 parent aa70076 commit c60885d

6 files changed

Lines changed: 300 additions & 122 deletions

File tree

CHANGELOG.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77

88
## [Unreleased]
99

10+
### Fixed
11+
- Make entity resolution bozo-tolerant: `resolve_entity` preserves malformed entities as-is instead of failing (#64)
12+
- Propagate bozo flag from `read_text` when encountering unresolvable entities in feed-level fields (#64)
13+
14+
### Added
15+
- Edge-case tests for invalid numeric refs, malformed entity syntax, unknown named entities, and mixed valid/invalid entities (#64)
16+
1017
## [0.4.4] - 2026-02-20
1118

1219
### Fixed

crates/feedparser-rs-core/src/parser/atom.rs

Lines changed: 28 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,8 @@ use quick_xml::{Reader, events::Event};
1414

1515
use super::common::{
1616
EVENT_BUFFER_CAPACITY, FromAttributes, LimitedCollectionExt, bytes_to_string, check_depth,
17-
extract_xml_base, init_feed, is_content_tag, is_dc_tag, is_media_tag, read_text, skip_element,
18-
skip_to_end,
17+
extract_xml_base, init_feed, is_content_tag, is_dc_tag, is_media_tag, read_text, read_text_str,
18+
skip_element, skip_to_end,
1919
};
2020

2121
/// Parse Atom 1.0 feed from raw bytes
@@ -153,14 +153,20 @@ fn parse_feed_element(
153153
feed.feed.set_subtitle(text);
154154
}
155155
b"id" if !is_empty => {
156-
feed.feed.id = Some(read_text(reader, &mut buf, limits)?);
156+
let (text, bozo) = read_text(reader, &mut buf, limits)?;
157+
if bozo {
158+
feed.bozo = true;
159+
feed.bozo_exception =
160+
Some("Unresolvable entity in feed id".to_string());
161+
}
162+
feed.feed.id = Some(text);
157163
}
158164
b"updated" if !is_empty => {
159-
let text = read_text(reader, &mut buf, limits)?;
165+
let text = read_text_str(reader, &mut buf, limits)?;
160166
feed.feed.updated = parse_date(&text);
161167
}
162168
b"published" if !is_empty => {
163-
let text = read_text(reader, &mut buf, limits)?;
169+
let text = read_text_str(reader, &mut buf, limits)?;
164170
feed.feed.published = parse_date(&text);
165171
}
166172
b"author" if !is_empty => {
@@ -196,11 +202,11 @@ fn parse_feed_element(
196202
feed.feed.set_generator(generator);
197203
}
198204
b"icon" if !is_empty => {
199-
let url = read_text(reader, &mut buf, limits)?;
205+
let url = read_text_str(reader, &mut buf, limits)?;
200206
feed.feed.icon = Some(base_ctx.resolve_safe(&url));
201207
}
202208
b"logo" if !is_empty => {
203-
let url = read_text(reader, &mut buf, limits)?;
209+
let url = read_text_str(reader, &mut buf, limits)?;
204210
feed.feed.logo = Some(base_ctx.resolve_safe(&url));
205211
}
206212
b"rights" if !is_empty => {
@@ -232,7 +238,7 @@ fn parse_feed_element(
232238
let handled = if let Some(dc_element) = is_dc_tag(tag) {
233239
let dc_elem = dc_element.to_string();
234240
if !is_empty {
235-
let text = read_text(reader, &mut buf, limits)?;
241+
let text = read_text_str(reader, &mut buf, limits)?;
236242
dublin_core::handle_feed_element(&dc_elem, &text, &mut feed.feed);
237243
}
238244
true
@@ -321,14 +327,14 @@ fn parse_entry(
321327
}
322328
}
323329
b"id" if !is_empty => {
324-
entry.id = Some(read_text(reader, buf, limits)?.into());
330+
entry.id = Some(read_text_str(reader, buf, limits)?.into());
325331
}
326332
b"updated" if !is_empty => {
327-
let text = read_text(reader, buf, limits)?;
333+
let text = read_text_str(reader, buf, limits)?;
328334
entry.updated = parse_date(&text);
329335
}
330336
b"published" if !is_empty => {
331-
let text = read_text(reader, buf, limits)?;
337+
let text = read_text_str(reader, buf, limits)?;
332338
entry.published = parse_date(&text);
333339
}
334340
b"summary" if !is_empty => {
@@ -377,14 +383,14 @@ fn parse_entry(
377383
let handled = if let Some(dc_element) = is_dc_tag(tag) {
378384
let dc_elem = dc_element.to_string();
379385
if !is_empty {
380-
let text = read_text(reader, buf, limits)?;
386+
let text = read_text_str(reader, buf, limits)?;
381387
dublin_core::handle_entry_element(&dc_elem, &text, &mut entry);
382388
}
383389
true
384390
} else if let Some(content_element) = is_content_tag(tag) {
385391
let content_elem = content_element.to_string();
386392
if !is_empty {
387-
let text = read_text(reader, buf, limits)?;
393+
let text = read_text_str(reader, buf, limits)?;
388394
content::handle_entry_element(&content_elem, &text, &mut entry);
389395
}
390396
true
@@ -417,7 +423,7 @@ fn parse_entry(
417423
} else {
418424
let media_elem = media_element.to_string();
419425
if !is_empty {
420-
let text = read_text(reader, buf, limits)?;
426+
let text = read_text_str(reader, buf, limits)?;
421427
media_rss::handle_entry_element(&media_elem, &text, &mut entry);
422428
}
423429
}
@@ -467,7 +473,7 @@ fn parse_text_construct(
467473
}
468474
}
469475

470-
let value = read_text(reader, buf, limits)?;
476+
let value = read_text_str(reader, buf, limits)?;
471477

472478
Ok(TextConstruct {
473479
value,
@@ -495,9 +501,9 @@ fn parse_person(
495501
check_depth(*depth, limits.max_nesting_depth)?;
496502

497503
match e.local_name().as_ref() {
498-
b"name" => name = Some(read_text(reader, buf, limits)?.into()),
499-
b"email" => email = Some(read_text(reader, buf, limits)?.into()),
500-
b"uri" => uri = Some(read_text(reader, buf, limits)?),
504+
b"name" => name = Some(read_text_str(reader, buf, limits)?.into()),
505+
b"email" => email = Some(read_text_str(reader, buf, limits)?.into()),
506+
b"uri" => uri = Some(read_text_str(reader, buf, limits)?),
501507
_ => skip_element(reader, buf, limits, *depth)?,
502508
}
503509
*depth = depth.saturating_sub(1);
@@ -540,7 +546,7 @@ fn parse_generator(
540546
}
541547

542548
Ok(Generator {
543-
value: read_text(reader, buf, limits)?,
549+
value: read_text_str(reader, buf, limits)?,
544550
uri,
545551
version,
546552
})
@@ -565,7 +571,7 @@ fn parse_content(
565571
}
566572

567573
Ok(Content {
568-
value: read_text(reader, buf, limits)?,
574+
value: read_text_str(reader, buf, limits)?,
569575
content_type,
570576
language: None,
571577
base: None,
@@ -592,7 +598,7 @@ fn parse_atom_source(
592598
let element = e.to_owned();
593599
// Use name() instead of local_name() to preserve namespace prefixes
594600
match element.name().as_ref() {
595-
b"title" => title = Some(read_text(reader, buf, limits)?),
601+
b"title" => title = Some(read_text_str(reader, buf, limits)?),
596602
b"link" => {
597603
if let Some(l) = Link::from_attributes(
598604
element.attributes().flatten(),
@@ -603,7 +609,7 @@ fn parse_atom_source(
603609
}
604610
skip_to_end(reader, buf, b"link")?;
605611
}
606-
b"id" => id = Some(read_text(reader, buf, limits)?),
612+
b"id" => id = Some(read_text_str(reader, buf, limits)?),
607613
_ => skip_element(reader, buf, limits, *depth)?,
608614
}
609615
*depth = depth.saturating_sub(1);

0 commit comments

Comments
 (0)