diff --git a/lib/html_cleaner.rb b/lib/html_cleaner.rb index 249a0775dc..673f1ae8ce 100644 --- a/lib/html_cleaner.rb +++ b/lib/html_cleaner.rb @@ -21,8 +21,7 @@ def sanitize_field(object, fieldname, image_safety_mode: false) sanitized_field end - # yank out bad end-of-line characters and evil msword curly quotes - def fix_bad_characters(text) + def clean_html_and_characters(text) return "" if text.nil? # get the text into UTF-8 and get rid of invalid characters @@ -36,7 +35,14 @@ def fix_bad_characters(text) # argh, get rid of ____spacer____ inserts text.gsub! "____spacer____", "" - return text + # fix unclosed attribute quotes + # find all HTML opening tags; then within those, searches for any unclosed quotes with look ahead; adds the closing quote before lookahead + # pattern: ="value without closing tag before another attribute= or > + text.gsub!(/<[a-zA-Z][^>]*>/i) do |tag| + tag.gsub(/="([^"]*?)(?=\s+[a-zA-Z]+=|>)/, '="\1"') + end + + text end def sanitize_value(field, value) @@ -69,19 +75,15 @@ def sanitize_value(field, value) transformers << OtwSanitize::UserClassSanitizer.transformer end # Now that we know what transformers we need, let's sanitize the unfrozen value - if ArchiveConfig.FIELDS_ALLOWING_CSS.include?(field.to_s) - unfrozen_value = Sanitize.clean(add_paragraphs_to_text(fix_bad_characters(unfrozen_value)), - Sanitize::Config::CSS_ALLOWED.merge(transformers: transformers)) - else - unfrozen_value = Sanitize.clean(add_paragraphs_to_text(fix_bad_characters(unfrozen_value)), - Sanitize::Config::ARCHIVE.merge(transformers: transformers)) - end + config = ArchiveConfig.FIELDS_ALLOWING_CSS.include?(field.to_s) ? Sanitize::Config::CSS_ALLOWED : Sanitize::Config::ARCHIVE + unfrozen_value = Sanitize.clean(add_paragraphs_to_text(clean_html_and_characters(unfrozen_value)), + config.merge(transformers: transformers)) doc = Nokogiri::HTML5::Document.new doc.encoding = "UTF-8" unfrozen_value = doc.fragment(unfrozen_value).to_html else # clean out all tags - unfrozen_value = Sanitize.clean(fix_bad_characters(unfrozen_value)) + unfrozen_value = Sanitize.clean(clean_html_and_characters(unfrozen_value)) end # Plain text fields can't contain & entities: diff --git a/spec/lib/html_cleaner_spec.rb b/spec/lib/html_cleaner_spec.rb index 64e8ea33de..093081b932 100644 --- a/spec/lib/html_cleaner_spec.rb +++ b/spec/lib/html_cleaner_spec.rb @@ -512,45 +512,104 @@ def one_cell_table(content) end end - describe "fix_bad_characters" do + describe "clean_html_and_characters" do it "should not touch normal text" do - expect(fix_bad_characters("normal text")).to eq("normal text") + expect(clean_html_and_characters("normal text")).to eq("normal text") end it "should not touch normal text with valid unicode chars" do - expect(fix_bad_characters("„‚nörmäl’—téxt‘“")).to eq("„‚nörmäl’—téxt‘“") + expect(clean_html_and_characters("„‚nörmäl’—téxt‘“")).to eq("„‚nörmäl’—téxt‘“") end it "does not touch zero-width non-joiner" do string = ["A".ord, 0x200C, "A".ord] # "A[zwnj]A" - expect(fix_bad_characters(string.pack("U*")).unpack("U*")).to eq(string) + expect(clean_html_and_characters(string.pack("U*")).unpack("U*")).to eq(string) end it "does not touch zero-width joiner" do string = ["A".ord, 0x200D, "A".ord] # "A[zwj]A" - expect(fix_bad_characters(string.pack("U*")).unpack("U*")).to eq(string) + expect(clean_html_and_characters(string.pack("U*")).unpack("U*")).to eq(string) end it "does not touch word joiner" do string = ["A".ord, 0x2060, "A".ord] # "A[wj]A" - expect(fix_bad_characters(string.pack("U*")).unpack("U*")).to eq(string) + expect(clean_html_and_characters(string.pack("U*")).unpack("U*")).to eq(string) end it "should remove invalid unicode chars" do bad_string = [65, 150, 65].pack("C*") # => "A\226A" - expect(fix_bad_characters(bad_string)).to eq("AA") + expect(clean_html_and_characters(bad_string)).to eq("AA") end it "should escape <3" do - expect(fix_bad_characters("normal <3 text")).to eq("normal <3 text") + expect(clean_html_and_characters("normal <3 text")).to eq("normal <3 text") end it "should convert \\r\\n to \\n" do - expect(fix_bad_characters("normal\r\ntext")).to eq("normal\ntext") + expect(clean_html_and_characters("normal\r\ntext")).to eq("normal\ntext") end it "should remove the spacer" do - expect(fix_bad_characters("A____spacer____A")).to eq("AA") + expect(clean_html_and_characters("A____spacer____A")).to eq("AA") + end + + context "fixing unclosed quotes in tag attributes" do + it "should not touch normal text" do + expect(clean_html_and_characters("normal text")).to eq("normal text") + end + + it "should not touch tags with properly closed quotes" do + html = '

text

' + expect(clean_html_and_characters(html)).to eq(html) + end + + it "adds closing quote for missing quote before >" do + input = '

text

') + end + + it "adds closing quote when another attribute follows" do + input = '

text

' + result = clean_html_and_characters(input) + expect(result).to eq('

text

') + end + + it "preserves multi-word attribute values" do + input = '

text

' + result = clean_html_and_characters(input) + expect(result).to eq('

text

') + end + + it "handles unclosed quote with multi-word value" do + input = '

text

') + end + + it "fixes unclosed href attribute" do + input = 'link') + end + + it "fixes multiple malformed attributes in same tag" do + input = '

text

') + end + + it "handles unclosed quote with special characters in value" do + input = '

text

') + end + + it "handles data attributes with unclosed quotes" do + input = '
content
') + end end end @@ -1278,12 +1337,6 @@ def one_cell_table(content) result = 'Hi! img src="https://<span>" alt="<script/src=\'http://ha.ckers.org/xss.js\'>" Bye' expect(strip_images(string, keep_src: true)).to eq(result) end - - it "escapes the value within the alt attribute when it contains an initial >" do - string = 'Hi! ><script src=\'http://ha.ckers.org/xss.js\'> Bye' - result = 'Hi! img src="https://example.com" alt="><script src=\'http://ha.ckers.org/xss.js\'>" Bye' - expect(strip_images(string, keep_src: true)).to eq(result) - end end end end