From 5b77afe733c1e67c7eaad45f75bafe1d59bc5f25 Mon Sep 17 00:00:00 2001 From: forceofcalm Date: Wed, 20 May 2026 00:24:36 -0400 Subject: [PATCH 1/9] added check and cleanup for unclosed quotes in html tags --- lib/html_cleaner.rb | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/lib/html_cleaner.rb b/lib/html_cleaner.rb index 249a0775dc2..d986066b4b3 100644 --- a/lib/html_cleaner.rb +++ b/lib/html_cleaner.rb @@ -21,8 +21,7 @@ def sanitize_field(object, fieldname, image_safety_mode: false) sanitized_field end - # yank out bad end-of-line characters and evil msword curly quotes - def fix_bad_characters(text) + def clean_html_and_characters(text) return "" if text.nil? # get the text into UTF-8 and get rid of invalid characters @@ -36,7 +35,14 @@ def fix_bad_characters(text) # argh, get rid of ____spacer____ inserts text.gsub! "____spacer____", "" - return text + # fix unclosed attribute quotes + # find all HTML opening tags; then within those, searches for any unclosed quotes with look ahead; adds the closing quote before lookahead + # pattern: ="value without closing tag before another attribute= or > + text.gsub!(/<[a-zA-Z][^>]*>/i) do |tag| + tag.gsub(/="([^"]*?)(?=\s+[a-zA-Z]+=|>)/, '="\1"') + end + + text end def sanitize_value(field, value) @@ -70,10 +76,10 @@ def sanitize_value(field, value) end # Now that we know what transformers we need, let's sanitize the unfrozen value if ArchiveConfig.FIELDS_ALLOWING_CSS.include?(field.to_s) - unfrozen_value = Sanitize.clean(add_paragraphs_to_text(fix_bad_characters(unfrozen_value)), + unfrozen_value = Sanitize.clean(add_paragraphs_to_text(clean_html_and_characters(unfrozen_value)), Sanitize::Config::CSS_ALLOWED.merge(transformers: transformers)) else - unfrozen_value = Sanitize.clean(add_paragraphs_to_text(fix_bad_characters(unfrozen_value)), + unfrozen_value = Sanitize.clean(add_paragraphs_to_text(clean_html_and_characters(unfrozen_value)), Sanitize::Config::ARCHIVE.merge(transformers: transformers)) end doc = Nokogiri::HTML5::Document.new @@ -81,7 +87,7 @@ def sanitize_value(field, value) unfrozen_value = doc.fragment(unfrozen_value).to_html else # clean out all tags - unfrozen_value = Sanitize.clean(fix_bad_characters(unfrozen_value)) + unfrozen_value = Sanitize.clean(clean_html_and_characters(unfrozen_value)) end # Plain text fields can't contain & entities: From f597acef4063f422b2d5b5e57f88bafba77e9eb0 Mon Sep 17 00:00:00 2001 From: forceofcalm Date: Wed, 20 May 2026 00:31:14 -0400 Subject: [PATCH 2/9] tests, i think?? --- spec/lib/html_cleaner_spec.rb | 148 ++++++++++++++++++++++++++++++---- 1 file changed, 134 insertions(+), 14 deletions(-) diff --git a/spec/lib/html_cleaner_spec.rb b/spec/lib/html_cleaner_spec.rb index 64e8ea33dea..a709d639ac2 100644 --- a/spec/lib/html_cleaner_spec.rb +++ b/spec/lib/html_cleaner_spec.rb @@ -512,45 +512,122 @@ def one_cell_table(content) end end - describe "fix_bad_characters" do + describe "clean_html_and_characters" do it "should not touch normal text" do - expect(fix_bad_characters("normal text")).to eq("normal text") + expect(clean_html_and_characters("normal text")).to eq("normal text") end it "should not touch normal text with valid unicode chars" do - expect(fix_bad_characters("„‚nörmäl’—téxt‘“")).to eq("„‚nörmäl’—téxt‘“") + expect(clean_html_and_characters("„‚nörmäl’—téxt‘“")).to eq("„‚nörmäl’—téxt‘“") end it "does not touch zero-width non-joiner" do string = ["A".ord, 0x200C, "A".ord] # "A[zwnj]A" - expect(fix_bad_characters(string.pack("U*")).unpack("U*")).to eq(string) + expect(clean_html_and_characters(string.pack("U*")).unpack("U*")).to eq(string) end it "does not touch zero-width joiner" do string = ["A".ord, 0x200D, "A".ord] # "A[zwj]A" - expect(fix_bad_characters(string.pack("U*")).unpack("U*")).to eq(string) + expect(clean_html_and_characters(string.pack("U*")).unpack("U*")).to eq(string) end it "does not touch word joiner" do string = ["A".ord, 0x2060, "A".ord] # "A[wj]A" - expect(fix_bad_characters(string.pack("U*")).unpack("U*")).to eq(string) + expect(clean_html_and_characters(string.pack("U*")).unpack("U*")).to eq(string) end it "should remove invalid unicode chars" do bad_string = [65, 150, 65].pack("C*") # => "A\226A" - expect(fix_bad_characters(bad_string)).to eq("AA") + expect(clean_html_and_characters(bad_string)).to eq("AA") end it "should escape <3" do - expect(fix_bad_characters("normal <3 text")).to eq("normal <3 text") + expect(clean_html_and_characters("normal <3 text")).to eq("normal <3 text") end it "should convert \\r\\n to \\n" do - expect(fix_bad_characters("normal\r\ntext")).to eq("normal\ntext") + expect(clean_html_and_characters("normal\r\ntext")).to eq("normal\ntext") end it "should remove the spacer" do - expect(fix_bad_characters("A____spacer____A")).to eq("AA") + expect(clean_html_and_characters("A____spacer____A")).to eq("AA") + end + + context "fixing unclosed quotes in tag attributes" do + it "should not touch normal text" do + expect(clean_html_and_characters("normal text")).to eq("normal text") + end + + it "should not touch tags with properly closed quotes" do + html = '

text

' + expect(clean_html_and_characters(html)).to eq(html) + end + + it "adds closing quote for missing quote before >" do + input = '

text

') + end + + it "adds closing quote when another attribute follows" do + input = '

text

' + result = clean_html_and_characters(input) + expect(result).to eq('

text

') + end + + it "preserves multi-word attribute values" do + input = '

text

' + result = clean_html_and_characters(input) + expect(result).to eq('

text

') + end + + it "handles unclosed quote with multi-word value" do + input = '

text

') + end + + it "fixes unclosed href attribute" do + input = 'link') + end + + it "handles original bug report example" do + input = '

Text to be centered

More text

') + end + + it "fixes multiple malformed attributes in same tag" do + input = '

text

') + end + + it "preserves properly closed attributes alongside malformed ones" do + input = '

text

') + end + + it "handles multiple tags with unclosed quotes" do + input = '

more' + result = clean_html_and_characters(input) + expect(result).to eq('

text

more
') + end + + it "handles unclosed quote with special characters in value" do + input = '

text

') + end + + it "handles data attributes with unclosed quotes" do + input = '
content
') + end end end @@ -1278,11 +1355,54 @@ def one_cell_table(content) result = 'Hi! img src="https://<span>" alt="<script/src=\'http://ha.ckers.org/xss.js\'>" Bye' expect(strip_images(string, keep_src: true)).to eq(result) end + end + end - it "escapes the value within the alt attribute when it contains an initial >" do - string = 'Hi! ><script src=\'http://ha.ckers.org/xss.js\'> Bye' - result = 'Hi! img src="https://example.com" alt="><script src=\'http://ha.ckers.org/xss.js\'>" Bye' - expect(strip_images(string, keep_src: true)).to eq(result) + + + describe "sanitize_value with unclosed quotes" do + [:content, :endnotes, :notes].each do |field| + context "#{field} field with unclosed attribute quotes" do + it "preserves text after malformed tag with unclosed quote" do + input = '

Date: Wed, 20 May 2026 01:34:12 -0400 Subject: [PATCH 3/9] removed blank space and extra var --- spec/lib/html_cleaner_spec.rb | 4 ---- 1 file changed, 4 deletions(-) diff --git a/spec/lib/html_cleaner_spec.rb b/spec/lib/html_cleaner_spec.rb index a709d639ac2..82763cec4f4 100644 --- a/spec/lib/html_cleaner_spec.rb +++ b/spec/lib/html_cleaner_spec.rb @@ -1358,8 +1358,6 @@ def one_cell_table(content) end end - - describe "sanitize_value with unclosed quotes" do [:content, :endnotes, :notes].each do |field| context "#{field} field with unclosed attribute quotes" do @@ -1399,8 +1397,6 @@ def one_cell_table(content) it "preserves multi-class values in CSS-allowing fields" do input = '

Date: Wed, 20 May 2026 01:39:22 -0400 Subject: [PATCH 4/9] rubocop plz... --- lib/html_cleaner.rb | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/lib/html_cleaner.rb b/lib/html_cleaner.rb index d986066b4b3..6ef5dfeb1f0 100644 --- a/lib/html_cleaner.rb +++ b/lib/html_cleaner.rb @@ -75,13 +75,12 @@ def sanitize_value(field, value) transformers << OtwSanitize::UserClassSanitizer.transformer end # Now that we know what transformers we need, let's sanitize the unfrozen value - if ArchiveConfig.FIELDS_ALLOWING_CSS.include?(field.to_s) - unfrozen_value = Sanitize.clean(add_paragraphs_to_text(clean_html_and_characters(unfrozen_value)), - Sanitize::Config::CSS_ALLOWED.merge(transformers: transformers)) - else - unfrozen_value = Sanitize.clean(add_paragraphs_to_text(clean_html_and_characters(unfrozen_value)), - Sanitize::Config::ARCHIVE.merge(transformers: transformers)) - end + config = ArchiveConfig.FIELDS_ALLOWING_CSS.include?(field.to_s) ? + Sanitize::Config::CSS_ALLOWED : + Sanitize::Config::ARCHIVE + + unfrozen_value = Sanitize.clean(add_paragraphs_to_text(clean_html_and_characters(unfrozen_value)), + config.merge(transformers: transformers)) doc = Nokogiri::HTML5::Document.new doc.encoding = "UTF-8" unfrozen_value = doc.fragment(unfrozen_value).to_html From ec48c5f24981d3e43320a3021fb16447472dc8c7 Mon Sep 17 00:00:00 2001 From: forceofcalm Date: Wed, 20 May 2026 01:44:17 -0400 Subject: [PATCH 5/9] will this appease u rubocop --- lib/html_cleaner.rb | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/lib/html_cleaner.rb b/lib/html_cleaner.rb index 6ef5dfeb1f0..673f1ae8ce7 100644 --- a/lib/html_cleaner.rb +++ b/lib/html_cleaner.rb @@ -75,10 +75,7 @@ def sanitize_value(field, value) transformers << OtwSanitize::UserClassSanitizer.transformer end # Now that we know what transformers we need, let's sanitize the unfrozen value - config = ArchiveConfig.FIELDS_ALLOWING_CSS.include?(field.to_s) ? - Sanitize::Config::CSS_ALLOWED : - Sanitize::Config::ARCHIVE - + config = ArchiveConfig.FIELDS_ALLOWING_CSS.include?(field.to_s) ? Sanitize::Config::CSS_ALLOWED : Sanitize::Config::ARCHIVE unfrozen_value = Sanitize.clean(add_paragraphs_to_text(clean_html_and_characters(unfrozen_value)), config.merge(transformers: transformers)) doc = Nokogiri::HTML5::Document.new From 2806f84f6141a30bc5611d6eb91e78c714727a6b Mon Sep 17 00:00:00 2001 From: forceofcalm Date: Wed, 20 May 2026 01:51:04 -0400 Subject: [PATCH 6/9] remove redundant tests --- spec/lib/html_cleaner_spec.rb | 33 --------------------------------- 1 file changed, 33 deletions(-) diff --git a/spec/lib/html_cleaner_spec.rb b/spec/lib/html_cleaner_spec.rb index 82763cec4f4..b8ff0e5a860 100644 --- a/spec/lib/html_cleaner_spec.rb +++ b/spec/lib/html_cleaner_spec.rb @@ -1359,39 +1359,6 @@ def one_cell_table(content) end describe "sanitize_value with unclosed quotes" do - [:content, :endnotes, :notes].each do |field| - context "#{field} field with unclosed attribute quotes" do - it "preserves text after malformed tag with unclosed quote" do - input = '

Date: Wed, 20 May 2026 01:53:04 -0400 Subject: [PATCH 7/9] removing more redundant tests --- spec/lib/html_cleaner_spec.rb | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/spec/lib/html_cleaner_spec.rb b/spec/lib/html_cleaner_spec.rb index b8ff0e5a860..363b92942db 100644 --- a/spec/lib/html_cleaner_spec.rb +++ b/spec/lib/html_cleaner_spec.rb @@ -593,30 +593,12 @@ def one_cell_table(content) expect(result).to eq('link') end - it "handles original bug report example" do - input = '

Text to be centered

More text

') - end - it "fixes multiple malformed attributes in same tag" do input = '

text

') end - it "preserves properly closed attributes alongside malformed ones" do - input = '

text

') - end - - it "handles multiple tags with unclosed quotes" do - input = '

more' - result = clean_html_and_characters(input) - expect(result).to eq('

text

more
') - end - it "handles unclosed quote with special characters in value" do input = '

Date: Wed, 20 May 2026 01:59:04 -0400 Subject: [PATCH 9/9] remove extra ends --- spec/lib/html_cleaner_spec.rb | 3 --- 1 file changed, 3 deletions(-) diff --git a/spec/lib/html_cleaner_spec.rb b/spec/lib/html_cleaner_spec.rb index e679296db0a..093081b9321 100644 --- a/spec/lib/html_cleaner_spec.rb +++ b/spec/lib/html_cleaner_spec.rb @@ -1338,8 +1338,5 @@ def one_cell_table(content) expect(strip_images(string, keep_src: true)).to eq(result) end end - end - end - end end end