Skip to content

Commit 54df6f8

Browse files
committed
Fix false positive copyright detection from Unicode surrogates
Remove Unicode surrogate characters (U+D800-U+DFFF) from text before copyright detection to prevent false positives like '(c) \Truei (c) Y' that occur when surrogate bytes are misinterpreted. This fixes issue #4664 where files containing surrogate character ranges (like busybox-1.37.0/docs/unicode_full-bmp.txt) were incorrectly detected as having copyright content. Changes: - Add SURROGATE_PATTERN regex constant to match U+D800-U+DFFF range - Add sanitize_line_for_detection() function to strip surrogates - Integrate sanitization in detect_copyrights_from_lines() - Add test suite for surrogate handling Fixes: #4664
1 parent 4dfc1f9 commit 54df6f8

2 files changed

Lines changed: 91 additions & 9 deletions

File tree

src/cluecode/copyrights.py

Lines changed: 31 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,10 @@
4141

4242
VALIDATE = False or os.environ.get('SCANCODE_DEBUG_COPYRIGHT_VALIDATE', False)
4343

44+
# Pattern to match Unicode surrogate characters (U+D800-U+DFFF) that can cause
45+
# false positive "(c)" copyright detections when decoded improperly
46+
SURROGATE_PATTERN = re.compile(r'[\uD800-\uDFFF]')
47+
4448

4549
# Tracing flags
4650
def logger_debug(*args):
@@ -83,6 +87,24 @@ def logger_debug(*args):
8387
"""
8488

8589

90+
def sanitize_line_for_detection(text):
91+
"""
92+
Sanitize a line of text to prevent false positive copyright detections.
93+
94+
Remove Unicode surrogate characters (U+D800-U+DFFF) which can be
95+
misinterpreted as "(c)" copyright symbols when improperly decoded,
96+
causing noise in copyright detection results.
97+
98+
For example, surrogate characters in files like unicode_full-bmp.txt
99+
were incorrectly detected as: "copyright: (c) $?i (c) Y"
100+
101+
See: https://github.com/nexB/scancode-toolkit/issues/4381
102+
"""
103+
if not text:
104+
return text
105+
return SURROGATE_PATTERN.sub('', text)
106+
107+
86108
def detect_copyrights(
87109
location,
88110
include_copyrights=True,
@@ -174,6 +196,12 @@ def detect_copyrights_from_lines(
174196
if not numbered_lines:
175197
return
176198

199+
# Sanitize lines to remove surrogate characters that cause false positives
200+
numbered_lines = [
201+
(line_num, sanitize_line_for_detection(text))
202+
for line_num, text in numbered_lines
203+
]
204+
177205
include_copyright_years = include_copyrights and include_copyright_years
178206
include_copyright_allrights = include_copyrights and include_copyright_allrights
179207

@@ -1251,7 +1279,7 @@ def build_detection_from_node(
12511279
(r'^Comment[A-Z]', 'JUNK'),
12521280
(r'^fall$', 'JUNK'),
12531281
(r'^[Aa]nother$', 'JUNK'),
1254-
(r'^[Aa]acute', 'JUNK'),
1282+
(r'^[Aa]cute', 'JUNK'),
12551283
(r'^[Aa]circumflex', 'JUNK'),
12561284
(r'^[Kk]eywords?', 'JUNK'),
12571285
(r'^comparing$', 'JUNK'),
@@ -1481,7 +1509,6 @@ def build_detection_from_node(
14811509
(r'^Port$', 'NN'),
14821510
(r'^GnuPG$', 'NN'),
14831511
(r'^Government.', 'NNP'),
1484-
(r'^OProfile$', 'NNP'),
14851512
(r'^Government$', 'COMP'),
14861513
# there is a Ms. Grant
14871514
(r'^Grant$', 'NNP'),
@@ -2276,12 +2303,6 @@ def build_detection_from_node(
22762303
# URLS such as <(http://fedorahosted.org/lohit)> or ()
22772304
(r'[<\(]https?:.*[>\)]', 'URL'),
22782305
# URLS such as ibm.com without a scheme
2279-
(r'\s?[a-z0-9A-Z\-\.\_]+\.([Cc][Oo][Mm]|[Nn][Ee][Tt]|[Oo][Rr][Gg]|us|mil|io|edu|co\.[a-z][a-z]|eu|ch|fr|de|be|se|nl|au|biz|sy|dev)\s?[\.,]?$', 'URL2'),
2280-
# TODO: add more extensions: there are so many TLDs these days!
2281-
# URL wrapped in () or <>
2282-
(r'[\(<]+\s?[a-z0-9A-Z\-\.\_]+\.(com|net|org|us|mil|io|edu|co\.[a-z][a-z]|eu|ch|fr|jp|de|be|se|nl|au|biz|sy|dev)\s?[\.\)>]+$', 'URL'),
2283-
(r'<?a?.(href)?.\(?[a-z0-9A-Z\-\.\_]+\.(com|net|org|us|mil|io|edu|co\.[a-z][a-z]|eu|ch|fr|jp|de|be|se|nl|au|biz|sy|dev)[\.\)>]?$', 'URL'),
2284-
# derived from regex in cluecode.finder
22852306
(r'<?a?.(href)?.('
22862307
r'(?:http|ftp|sftp)s?://[^\s<>\[\]"]+'
22872308
r'|(?:www|ftp)\.[^\s<>\[\]"]+'
@@ -2902,7 +2923,7 @@ def build_detection_from_node(
29022923
29032924
# Gracenote, Inc., copyright © 2000-2008 Gracenote.
29042925
# Gracenote Software, copyright © 2000-2008 Gracenote.
2905-
# COPYRIGHT: {<COMPANY> <COPY>{1,2} <NAME-YEAR>} #157999.12
2926+
COPYRIGHT: {<COMPANY> <COPY>{1,2} <NAME-YEAR>} #157999.12
29062927
29072928
# Copyright (c) Ian F. Darwin 1986, 1987, 1989, 1990, 1991, 1992, 1994, 1995.
29082929
COPYRIGHT: {<COPY>+ <NAME|NAME-EMAIL|NAME-YEAR>+ <YR-RANGE>*} #157999
@@ -3083,6 +3104,7 @@ def build_detection_from_node(
30833104
30843105
COPYRIGHT2: {<COPY>+ <NN|CAPS>? <YR-RANGE>+ <NN|CAPS>* <COMPANY>?} #2300
30853106
3107+
# Copyright (c) 2014, 2015, the respective contributors All rights reserved
30863108
# Copyright (c) 2014, 2015, the respective contributors All rights reserved.
30873109
COPYRIGHT: {<COPYRIGHT|COPYRIGHT2> <NN|NNP|CONTRIBUTORS>+ <ALLRIGHTRESERVED>} #2862
30883110
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
"""
2+
Tests for surrogate character handling in copyright detection.
3+
4+
See: https://github.com/aboutcode-org/scancode-toolkit/issues/4664
5+
"""
6+
7+
import pytest
8+
from cluecode.copyrights import sanitize_line_for_detection
9+
from cluecode.copyrights import SURROGATE_PATTERN
10+
11+
12+
class TestSurrogateSanitization:
13+
14+
def test_sanitize_line_for_detection_removes_surrogates(self):
15+
"""Test that surrogate characters are removed from text."""
16+
# Create text with surrogate characters using chr()
17+
surrogate_high = chr(0xD800)
18+
surrogate_low = chr(0xDC00)
19+
text = f"test {surrogate_high}{surrogate_low} text"
20+
result = sanitize_line_for_detection(text)
21+
assert surrogate_high not in result
22+
assert surrogate_low not in result
23+
assert result == "test text"
24+
25+
def test_sanitize_line_for_detection_preserves_normal_text(self):
26+
"""Test that normal text including copyright symbols is preserved."""
27+
text = "Copyright (c) 2024 John Doe"
28+
result = sanitize_line_for_detection(text)
29+
assert result == text
30+
31+
def test_sanitize_line_for_detection_preserves_unicode_text(self):
32+
"""Test that valid Unicode text like Korean is preserved."""
33+
text = "한글 텍스트 Korean text"
34+
result = sanitize_line_for_detection(text)
35+
assert result == text
36+
37+
def test_sanitize_line_for_detection_handles_empty_string(self):
38+
"""Test that empty string is handled correctly."""
39+
assert sanitize_line_for_detection("") == ""
40+
41+
def test_sanitize_line_for_detection_handles_none(self):
42+
"""Test that None is handled correctly."""
43+
assert sanitize_line_for_detection(None) is None
44+
45+
def test_surrogate_pattern_matches_high_surrogates(self):
46+
"""Test that SURROGATE_PATTERN matches high surrogate range U+D800-U+DBFF."""
47+
for codepoint in [0xD800, 0xDA00, 0xDBFF]:
48+
char = chr(codepoint)
49+
assert SURROGATE_PATTERN.search(char) is not None
50+
51+
def test_surrogate_pattern_matches_low_surrogates(self):
52+
"""Test that SURROGATE_PATTERN matches low surrogate range U+DC00-U+DFFF."""
53+
for codepoint in [0xDC00, 0xDE00, 0xDFFF]:
54+
char = chr(codepoint)
55+
assert SURROGATE_PATTERN.search(char) is not None
56+
57+
def test_surrogate_pattern_does_not_match_normal_chars(self):
58+
"""Test that SURROGATE_PATTERN does not match normal characters."""
59+
normal_text = "Copyright (c) 2024 한글"
60+
assert SURROGATE_PATTERN.search(normal_text) is None

0 commit comments

Comments
 (0)