Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 13 additions & 11 deletions lib/html_cleaner.rb
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,7 @@ def sanitize_field(object, fieldname, image_safety_mode: false)
sanitized_field
end

# yank out bad end-of-line characters and evil msword curly quotes
def fix_bad_characters(text)
def clean_html_and_characters(text)
return "" if text.nil?

# get the text into UTF-8 and get rid of invalid characters
Expand All @@ -36,7 +35,14 @@ def fix_bad_characters(text)
# argh, get rid of ____spacer____ inserts
text.gsub! "____spacer____", ""

return text
# fix unclosed attribute quotes
# find all HTML opening tags; then within those, searches for any unclosed quotes with look ahead; adds the closing quote before lookahead
# pattern: ="value without closing tag before another attribute= or >
text.gsub!(/<[a-zA-Z][^>]*>/i) do |tag|
tag.gsub(/="([^"]*?)(?=\s+[a-zA-Z]+=|>)/, '="\1"')
end

text
end

def sanitize_value(field, value)
Expand Down Expand Up @@ -69,19 +75,15 @@ def sanitize_value(field, value)
transformers << OtwSanitize::UserClassSanitizer.transformer
end
# Now that we know what transformers we need, let's sanitize the unfrozen value
if ArchiveConfig.FIELDS_ALLOWING_CSS.include?(field.to_s)
unfrozen_value = Sanitize.clean(add_paragraphs_to_text(fix_bad_characters(unfrozen_value)),
Sanitize::Config::CSS_ALLOWED.merge(transformers: transformers))
else
unfrozen_value = Sanitize.clean(add_paragraphs_to_text(fix_bad_characters(unfrozen_value)),
Sanitize::Config::ARCHIVE.merge(transformers: transformers))
end
config = ArchiveConfig.FIELDS_ALLOWING_CSS.include?(field.to_s) ? Sanitize::Config::CSS_ALLOWED : Sanitize::Config::ARCHIVE
unfrozen_value = Sanitize.clean(add_paragraphs_to_text(clean_html_and_characters(unfrozen_value)),
config.merge(transformers: transformers))
doc = Nokogiri::HTML5::Document.new
doc.encoding = "UTF-8"
unfrozen_value = doc.fragment(unfrozen_value).to_html
else
# clean out all tags
unfrozen_value = Sanitize.clean(fix_bad_characters(unfrozen_value))
unfrozen_value = Sanitize.clean(clean_html_and_characters(unfrozen_value))
end

# Plain text fields can't contain &amp; entities:
Expand Down
85 changes: 69 additions & 16 deletions spec/lib/html_cleaner_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -512,45 +512,104 @@ def one_cell_table(content)
end
end

describe "fix_bad_characters" do
describe "clean_html_and_characters" do
it "should not touch normal text" do
expect(fix_bad_characters("normal text")).to eq("normal text")
expect(clean_html_and_characters("normal text")).to eq("normal text")
end

it "should not touch normal text with valid unicode chars" do
expect(fix_bad_characters("„‚nörmäl’—téxt‘“")).to eq("„‚nörmäl’—téxt‘“")
expect(clean_html_and_characters("„‚nörmäl’—téxt‘“")).to eq("„‚nörmäl’—téxt‘“")
end

it "does not touch zero-width non-joiner" do
string = ["A".ord, 0x200C, "A".ord] # "A[zwnj]A"
expect(fix_bad_characters(string.pack("U*")).unpack("U*")).to eq(string)
expect(clean_html_and_characters(string.pack("U*")).unpack("U*")).to eq(string)
end

it "does not touch zero-width joiner" do
string = ["A".ord, 0x200D, "A".ord] # "A[zwj]A"
expect(fix_bad_characters(string.pack("U*")).unpack("U*")).to eq(string)
expect(clean_html_and_characters(string.pack("U*")).unpack("U*")).to eq(string)
end

it "does not touch word joiner" do
string = ["A".ord, 0x2060, "A".ord] # "A[wj]A"
expect(fix_bad_characters(string.pack("U*")).unpack("U*")).to eq(string)
expect(clean_html_and_characters(string.pack("U*")).unpack("U*")).to eq(string)
end

it "should remove invalid unicode chars" do
bad_string = [65, 150, 65].pack("C*") # => "A\226A"
expect(fix_bad_characters(bad_string)).to eq("AA")
expect(clean_html_and_characters(bad_string)).to eq("AA")
end

it "should escape <3" do
expect(fix_bad_characters("normal <3 text")).to eq("normal &lt;3 text")
expect(clean_html_and_characters("normal <3 text")).to eq("normal &lt;3 text")
end

it "should convert \\r\\n to \\n" do
expect(fix_bad_characters("normal\r\ntext")).to eq("normal\ntext")
expect(clean_html_and_characters("normal\r\ntext")).to eq("normal\ntext")
end

it "should remove the spacer" do
expect(fix_bad_characters("A____spacer____A")).to eq("AA")
expect(clean_html_and_characters("A____spacer____A")).to eq("AA")
end

context "fixing unclosed quotes in tag attributes" do
it "should not touch normal text" do
expect(clean_html_and_characters("normal text")).to eq("normal text")
end

it "should not touch tags with properly closed quotes" do
html = '<p class="already-closed">text</p>'
expect(clean_html_and_characters(html)).to eq(html)
end

it "adds closing quote for missing quote before >" do
input = '<p class="unclosed>text</p>'
result = clean_html_and_characters(input)
expect(result).to eq('<p class="unclosed">text</p>')
end

it "adds closing quote when another attribute follows" do
input = '<p class="unclosed id="main">text</p>'
result = clean_html_and_characters(input)
expect(result).to eq('<p class="unclosed" id="main">text</p>')
end

it "preserves multi-word attribute values" do
input = '<p class="foo bar baz">text</p>'
result = clean_html_and_characters(input)
expect(result).to eq('<p class="foo bar baz">text</p>')
end

it "handles unclosed quote with multi-word value" do
input = '<p class="align center style>text</p>'
result = clean_html_and_characters(input)
expect(result).to eq('<p class="align center style">text</p>')
end

it "fixes unclosed href attribute" do
input = '<a href="http://example.com>link</a>'
result = clean_html_and_characters(input)
expect(result).to eq('<a href="http://example.com">link</a>')
end

it "fixes multiple malformed attributes in same tag" do
input = '<p class="unclosed id="main" style="color>text</p>'
result = clean_html_and_characters(input)
expect(result).to eq('<p class="unclosed" id="main" style="color">text</p>')
end

it "handles unclosed quote with special characters in value" do
input = '<p class="align-left bold>text</p>'
result = clean_html_and_characters(input)
expect(result).to eq('<p class="align-left bold">text</p>')
end

it "handles data attributes with unclosed quotes" do
input = '<div data-value="something>content</div>'
result = clean_html_and_characters(input)
expect(result).to eq('<div data-value="something">content</div>')
end
end
end

Expand Down Expand Up @@ -1278,12 +1337,6 @@ def one_cell_table(content)
result = 'Hi! img src="https://&lt;span&gt;" alt="&lt;script/src=\'http://ha.ckers.org/xss.js\'&gt;" Bye'
expect(strip_images(string, keep_src: true)).to eq(result)
end

it "escapes the value within the alt attribute when it contains an initial >" do
string = 'Hi! <img src="https://example.com" alt="><script src=\'http://ha.ckers.org/xss.js\'>"> Bye'
result = 'Hi! img src="https://example.com" alt="&gt;&lt;script src=\'http://ha.ckers.org/xss.js\'&gt;" Bye'
expect(strip_images(string, keep_src: true)).to eq(result)
end
end
end
end
Loading