diff --git a/Readability.js b/Readability.js index 5cff4540..96ac3b02 100644 --- a/Readability.js +++ b/Readability.js @@ -64,6 +64,17 @@ function Readability(doc, options) { this._disableJSONLD = !!options.disableJSONLD; this._allowedVideoRegex = options.allowedVideoRegex || this.REGEXPS.videos; this._linkDensityModifier = options.linkDensityModifier || 0; + /** + * If true, keep the first in-article H1/H2 that duplicates the article title + * and leave H1 tags in the extracted content. Defaults to false (strip the + * duplicate title header and normalize remaining H1 elements to H2). + * When true, also prepend clones of document `h1` nodes that lie outside the + * extracted subtree and precede the grabbed content in document order (for example + * hero headings); snapshots are taken before `_grabArticle` because extraction + * mutates the DOM. Those clones are inserted before `_postProcessContent` so they + * receive URI fixes and class cleanup. + */ + this._keepOriginalTitleHeaders = !!options.keepOriginalTitleHeaders; // Start with all flags set this._flags = @@ -835,11 +846,13 @@ Readability.prototype = { this._cleanConditionally(articleContent, "ul"); this._cleanConditionally(articleContent, "div"); - // replace H1 with H2 as H1 should be only title that is displayed separately - this._replaceNodeTags( - this._getAllNodesWithTag(articleContent, ["h1"]), - "h2" - ); + if (!this._keepOriginalTitleHeaders) { + // replace H1 with H2 as H1 should be only title that is displayed separately + this._replaceNodeTags( + this._getAllNodesWithTag(articleContent, ["h1"]), + "h2" + ); + } // Remove extra paragraphs this._removeNodes( @@ -1064,7 +1077,7 @@ Readability.prototype = { var elementsToScore = []; var node = this._doc.documentElement; - let shouldRemoveTitleHeader = true; + let shouldRemoveTitleHeader = !this._keepOriginalTitleHeaders; while (node) { if (node.tagName === "HTML") { @@ -2709,6 +2722,76 @@ Readability.prototype = { return 
this._textSimilarity(this._articleTitle, heading) > 0.75; }, + /** + * Store a depth-first preorder index for `root` and every element beneath it in + * `this._elementPreorderIndex`, so document order can still be compared after `_grabArticle` has mutated the tree. Missing or non-element roots are ignored. + * + * @param {Element} root root of the subtree to index. + * @param {{ i: number }} counterHolder shared mutable counter; `i` is the next index to assign and is incremented once per element. + */ + _documentPreorderWalk(root, counterHolder) { + if (!root || root.nodeType !== this.ELEMENT_NODE) { + return; + } + this._elementPreorderIndex.set(root, counterHolder.i++); + var child = root.firstElementChild; + while (child) { + this._documentPreorderWalk(child, counterHolder); /* recursion depth follows element nesting depth */ + child = child.nextElementSibling; + } + }, + + /** + * Prepend clones of `h1` elements that sat outside the extracted content and before it in + * document order (hero headings, etc.). Each snapshot pairs an original node with its clone because + * `_grabArticle` may remove or move originals. A clone is kept only if its original is not inside `articleContent`, passes `_isProbablyVisible`, and has a preorder index + * strictly before the earliest recorded preorder among nodes inside `articleContent` + * ("before grabbed content"). If no node inside `articleContent` has a recorded index, nothing is prepended. + * + * @param {Element} articleContent root returned by `_grabArticle`. + * @param {Array<{original: Element, clone: Element, preorder?: number}>} snapshots taken before the grab. 
+ */ + _prependExternalH1HeadingsBeforePostProcess(articleContent, snapshots) { + if (!snapshots || !snapshots.length) { + return; + } + + var minPreorderInGrabbed = Infinity; /* stays Infinity when no grabbed element has a recorded index */ + var descendants = articleContent.querySelectorAll("*"); + for (var j = 0; j < descendants.length; j++) { + var grabbedPo = this._elementPreorderIndex.get(descendants[j]); + if (grabbedPo !== undefined) { /* elements without a recorded index are ignored */ + minPreorderInGrabbed = Math.min(minPreorderInGrabbed, grabbedPo); + } + } + + var fragment = this._doc.createDocumentFragment(); + + for (var i = 0; i < snapshots.length; i++) { + var entry = snapshots[i]; + if (articleContent.contains(entry.original)) { /* original is already part of the extracted content */ + continue; + } + if (!this._isProbablyVisible(entry.original)) { + continue; + } + if ( + entry.preorder === undefined || + minPreorderInGrabbed === Infinity || + entry.preorder >= minPreorderInGrabbed /* heading does not precede the grabbed content */ + ) { + continue; + } + fragment.appendChild(entry.clone); /* insert the pre-grab clone, never the original node */ + } + + if (!fragment.childNodes.length) { + return; + } + + articleContent.insertBefore(fragment, articleContent.firstChild); + }, + _flagIsActive(flag) { return (this._flags & flag) > 0; }, @@ -2770,6 +2853,24 @@ Readability.prototype = { this._metadata = metadata; this._articleTitle = metadata.title; + var prefgrabH1Snapshots = null; + if (this._keepOriginalTitleHeaders) { + this._elementPreorderIndex = new WeakMap(); /* element -> preorder index, filled by _documentPreorderWalk */ + var preorderCounter = { i: 0 }; + this._documentPreorderWalk(this._doc.documentElement, preorderCounter); + + prefgrabH1Snapshots = Array.from( + this._doc.getElementsByTagName("h1"), + function (h) { + return { + original: h, + clone: h.cloneNode(true), /* deep copy made before _grabArticle runs */ + preorder: this._elementPreorderIndex.get(h), + }; + }.bind(this) + ); + } + var articleContent = this._grabArticle(); if (!articleContent) { return null; @@ -2777,6 +2878,13 @@ Readability.prototype = { this.log("Grabbed: " + articleContent.innerHTML); + if (prefgrabH1Snapshots) { + this._prependExternalH1HeadingsBeforePostProcess( + articleContent, + prefgrabH1Snapshots + ); + } + 
this._postProcessContent(articleContent); // If we haven't found an excerpt in the article's metadata, use the article's diff --git a/index.d.ts b/index.d.ts index 7ad8dd58..61af8845 100644 --- a/index.d.ts +++ b/index.d.ts @@ -75,6 +75,15 @@ export interface ReadabilityOptions { * Defaults to 1. */ linkDensityModifier?: number; + /** + * If `true`, the first in-article heading that closely matches the article + * title is kept, and H1 tags in the extracted content are not rewritten to H2. + * When `true`, also prepends clones of those `h1` elements that lie outside the + * extracted subtree **and** precede the grabbed content in document order (for example + * hero titles), captured before extraction so they still run through post-processing + * (relative URL fixes, etc.). Defaults to `false`. + */ + keepOriginalTitleHeaders?: boolean; } export class Readability { diff --git a/test/test-keep-original-title-headers.js b/test/test-keep-original-title-headers.js new file mode 100644 index 00000000..b063f762 --- /dev/null +++ b/test/test-keep-original-title-headers.js @@ -0,0 +1,185 @@ +/* eslint-env node, mocha */ + +var JSDOM = require("jsdom").JSDOM; +var chai = require("chai"); +var expect = chai.expect; + +var Readability = require("../index").Readability; + +function articleHtml(titleText, headingTag, headingText) { + var long = + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do " + + "eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad " + + "minim veniam, quis nostrud exercitation ullamco laboris nisi ut " + + "aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit " + + "in voluptate velit esse cillum dolore eu fugiat nulla pariatur."; + return ( + "" + + titleText + + "
" + + "<" + + headingTag + + ">" + + headingText + + "

" + + long + + "

" + + long + + "

" + ); +} + +describe("keepOriginalTitleHeaders option", function () { + this.timeout(30000); + + it("when false, removes the first heading that duplicates the title and rewrites other H1 to H2", function () { + var titleText = "Readability Title Headers Option Test 7f3a"; + var source = articleHtml(titleText, "h1", titleText); + var doc = new JSDOM(source, { url: "http://example.com/article" }).window + .document; + var result = new Readability(doc).parse(); + expect(result.content).to.not.include("

"); + expect(result.content).to.not.include("

" + titleText); + expect(result.title).to.eql(titleText); + }); + + it("when true, keeps the duplicate title header as H1 and does not rewrite it to H2", function () { + var titleText = "Readability Title Headers Option Test 7f3b"; + var source = articleHtml(titleText, "h1", titleText); + var doc = new JSDOM(source, { url: "http://example.com/article" }).window + .document; + var result = new Readability(doc, { + keepOriginalTitleHeaders: true, + }).parse(); + expect(result.content).to.include("

" + titleText + "

"); + expect(result.title).to.eql(titleText); + }); + + it("when false, rewrites a non-title H1 in the article body to H2", function () { + var titleText = "Readability Title Headers Option Test 7f3c"; + var bodyHeading = "Distinct In Article Heading 9z2q"; + var source = articleHtml(titleText, "h1", bodyHeading); + var doc = new JSDOM(source, { url: "http://example.com/article" }).window + .document; + var result = new Readability(doc).parse(); + expect(result.content).to.include("

" + bodyHeading + "

"); + expect(result.content).to.not.include("

" + bodyHeading); + }); + + it("when true, leaves a non-title H1 in the article body as H1", function () { + var titleText = "Readability Title Headers Option Test 7f3d"; + var bodyHeading = "Distinct In Article Heading 9z2r"; + var source = articleHtml(titleText, "h1", bodyHeading); + var doc = new JSDOM(source, { url: "http://example.com/article" }).window + .document; + var result = new Readability(doc, { + keepOriginalTitleHeaders: true, + }).parse(); + expect(result.content).to.include("

" + bodyHeading + "

"); + expect(result.content).to.not.include("

" + bodyHeading); + }); + + it("when true, prepends clones of document-level H1 outside the extracted subtree (before post-processing)", function () { + var titleText = "Readability External Hero H1 Title Option Test 9x4m"; + var long = + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do " + + "eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad " + + "minim veniam, quis nostrud exercitation ullamco laboris nisi ut " + + "aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit " + + "in voluptate velit esse cillum dolore eu fugiat nulla pariatur."; + var source = + "" + + titleText + + "
" + + '

' + + titleText + + "

" + + "

" + + long + + "

" + + long + + "

" + + "
"; + + var doc = new JSDOM(source, { url: "http://example.com/article" }).window + .document; + var result = new Readability(doc, { + keepOriginalTitleHeaders: true, + }).parse(); + + expect(result.content).to.include("

" + titleText + "

"); + expect(result.content.indexOf("

" + titleText)).to.be.lessThan( + result.content.indexOf('id="readability-page-1"') + ); + expect(result.title).to.eql(titleText); + }); + + it("when true, does not prepend H1 that appear after grabbed content in document order", function () { + var titleText = + "Readability Article Title After Hero Ignore Later H1 Test 9x5p"; + var sidebarHeading = "Sidebar Or Footer H1 Must Not Prepend 9x5q"; + var long = + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do " + + "eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad " + + "minim veniam, quis nostrud exercitation ullamco laboris nisi ut " + + "aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit " + + "in voluptate velit esse cillum dolore eu fugiat nulla pariatur."; + var source = + "" + + titleText + + "
" + + "

" + + long + + "

" + + long + + "

" + + "" + + "
"; + + var doc = new JSDOM(source, { url: "http://example.com/article" }).window + .document; + var result = new Readability(doc, { + keepOriginalTitleHeaders: true, + }).parse(); + + expect(result.content).to.not.include(sidebarHeading); + expect(result.title).to.eql(titleText); + }); + + it("when false, does not prepend hero H1 from outside the extracted subtree", function () { + var titleText = + "Readability External Hero H1 Absent When Option False Test 9x4n"; + var long = + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do " + + "eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad " + + "minim veniam, quis nostrud exercitation ullamco laboris nisi ut " + + "aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit " + + "in voluptate velit esse cillum dolore eu fugiat nulla pariatur."; + var source = + "" + + titleText + + "
" + + '

' + + titleText + + "

" + + "

" + + long + + "

" + + long + + "

" + + "
"; + + var doc = new JSDOM(source, { url: "http://example.com/article" }).window + .document; + var result = new Readability(doc).parse(); + + expect(result.content).to.not.include("