From ca2b8977f499cd6952b16f730916d0f0946892ba Mon Sep 17 00:00:00 2001 From: patchwright Date: Tue, 30 Jun 2026 22:30:01 +0200 Subject: [PATCH] fix: treat non-breaking space as a separator in links (#66) --- CHANGELOG.md | 5 +++++ src/domains.rs | 4 +++- src/email.rs | 2 +- src/url.rs | 2 +- tests/email.rs | 10 ++++++++++ tests/url.rs | 10 ++++++++++ 6 files changed, 30 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fdaca52..de5439e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,11 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). This project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html), with the exception that 0.x versions can break between minor versions. +## [Unreleased] +### Changed +- Non-breaking space (U+00A0) is no longer included as part of e-mail and URL + links. It is now treated as a separator, like other whitespace (#66) + ## [0.11.0] - 2026-04-12 ### Changed - Include delimiters before slashes in URLs. E.g. in `https://test.com/!/`, diff --git a/src/domains.rs b/src/domains.rs index 4585284..16c5cfd 100644 --- a/src/domains.rs +++ b/src/domains.rs @@ -49,7 +49,9 @@ pub(crate) fn find_authority_end( let can_be_last = match c { // ALPHA 'a'..='z' | 'A'..='Z' | '\u{80}'..=char::MAX => { - if !iri_parsing_enabled && c > '\u{80}' { + // Non-breaking space (U+00A0) is whitespace and must end the + // authority, even though it falls in the non-ASCII range. (#66) + if (!iri_parsing_enabled && c > '\u{80}') || c == '\u{A0}' { break; } // Can start or end a domain label, but not numeric diff --git a/src/email.rs b/src/email.rs index b7b3319..fa17fe5 100644 --- a/src/email.rs +++ b/src/email.rs @@ -89,7 +89,7 @@ impl EmailScanner { | '|' | '}' | '~' => true, - _ => c >= '\u{80}', + _ => c >= '\u{80}' && c != '\u{A0}', } } } diff --git a/src/url.rs b/src/url.rs index c1bfc9b..bd25fc3 100644 --- a/src/url.rs +++ b/src/url.rs @@ -216,7 +216,7 @@ fn find_url_end(s: &str, quote: Option, iri_parsing_enabled: bool) -> Opti for (i, c) in s.char_indices() { let can_be_last = match c { - '\u{00}'..='\u{1F}' | ' ' | '|' | '\"' | '<' | '>' | '`' | '\u{7F}'..='\u{9F}' => { + '\u{00}'..='\u{1F}' | ' ' | '|' | '\"' | '<' | '>' | '`' | '\u{7F}'..='\u{9F}' | '\u{A0}' => { // These can never be part of an URL, so stop now. See RFC 3986 and RFC 3987. // Some characters are not in the above list, even they are not in "unreserved" // or "reserved": diff --git a/tests/email.rs b/tests/email.rs index 562e637..35ba9b7 100644 --- a/tests/email.rs +++ b/tests/email.rs @@ -123,6 +123,16 @@ fn fuzz() { assert_linked("a@a.xyϸ", "|a@a.xyϸ|"); } +#[test] +fn non_breaking_space_does_not_join_email() { + // Non-breaking space (U+00A0) must not be swallowed into an e-mail link. + // https://github.com/robinst/linkify/issues/66 + assert_linked( + "this is a mail address:\u{a0}test@example.com\u{a0}surrounded by non-breaking spaces", + "this is a mail address:\u{a0}|test@example.com|\u{a0}surrounded by non-breaking spaces", + ); +} + fn assert_not_linked(s: &str) { let mut finder = LinkFinder::new(); finder.kinds(&[LinkKind::Email]); diff --git a/tests/url.rs b/tests/url.rs index 41b1650..d1378ac 100644 --- a/tests/url.rs +++ b/tests/url.rs @@ -558,6 +558,16 @@ fn fuzz() { assert_not_linked("ab:/ϸ"); } +#[test] +fn non_breaking_space_does_not_join_url() { + // Non-breaking space (U+00A0) must not be part of a URL. + // https://github.com/robinst/linkify/issues/66 + assert_linked( + "see https://example.com\u{a0}now", + "see |https://example.com|\u{a0}now", + ); +} + fn assert_not_linked(s: &str) { assert_linked(s, s); }