Skip to content

Commit 23952e7

Browse files
committed
Merge branch 'develop' into chrispy/strip-blank-lines-from-pre
2 parents 74805de + 75ab306 commit 23952e7

4 files changed

Lines changed: 27 additions & 13 deletions

File tree

README.rst

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -163,12 +163,16 @@ strip_pre
163163
``STRIP_ONE`` (one leading/trailing blank line), and ``None`` (neither).
164164
Defaults to ``STRIP``.
165165

166-
beautiful_soup_parser
167-
Specify the Beautiful Soup parser to be used for interpreting HTML markup. Parsers such
168-
as `html5lib`, `lxml` or even a custom parser as long as it is installed on the execution
169-
environment. Defaults to ``html.parser``.
170-
171-
.. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/
166+
bs4_options
167+
Specify additional configuration options for the ``BeautifulSoup`` object
168+
used to interpret the HTML markup. String and list values (such as ``lxml``
169+
or ``html5lib``) are treated as ``features`` arguments to control parser
170+
selection. Dictionary values (such as ``{"from_encoding": "iso-8859-8"}``)
171+
are treated as full kwargs to be used for the BeautifulSoup constructor,
172+
allowing specification of any parameter. For parameter details, see the
173+
`BeautifulSoup`_ documentation.
174+
175+
.. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
172176

173177
Options may be specified as kwargs to the ``markdownify`` function, or as a
174178
nested ``Options`` class in ``MarkdownConverter`` subclasses.

markdownify/__init__.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -173,7 +173,7 @@ def _next_block_content_sibling(el):
173173
class MarkdownConverter(object):
174174
class DefaultOptions:
175175
autolinks = True
176-
beautiful_soup_parser = 'html.parser'
176+
bs4_options = 'html.parser'
177177
bullets = '*+-' # An iterable of bullet types.
178178
code_language = ''
179179
code_language_callback = None
@@ -208,11 +208,15 @@ def __init__(self, **options):
208208
raise ValueError('You may specify either tags to strip or tags to'
209209
' convert, but not both.')
210210

211+
# If a string or list is passed to bs4_options, assume it is a 'features' specification
212+
if not isinstance(self.options['bs4_options'], dict):
213+
self.options['bs4_options'] = {'features': self.options['bs4_options']}
214+
211215
# Initialize the conversion function cache
212216
self.convert_fn_cache = {}
213217

214218
def convert(self, html):
215-
soup = BeautifulSoup(html, self.options['beautiful_soup_parser'])
219+
soup = BeautifulSoup(html, **self.options['bs4_options'])
216220
return self.convert_soup(soup)
217221

218222
def convert_soup(self, soup):

markdownify/main.py

100644100755
Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -70,12 +70,11 @@ def main(argv=sys.argv[1:]):
7070
parser.add_argument('-w', '--wrap', action='store_true',
7171
help="Wrap all text paragraphs at --wrap-width characters.")
7272
parser.add_argument('--wrap-width', type=int, default=80)
73-
parser.add_argument('-p', '--beautiful-soup-parser',
74-
dest='beautiful_soup_parser',
73+
parser.add_argument('--bs4-options',
7574
default='html.parser',
76-
help="Specify the Beautiful Soup parser to be used for interpreting HTML markup. Parsers such "
77-
"as html5lib, lxml or even a custom parser as long as it is installed on the execution "
78-
"environment.")
75+
help="Specifies the parser that BeautifulSoup should use to parse "
76+
"the HTML markup. Examples include 'html.parser', 'lxml', and "
77+
"'html5lib'.")
7978

8079
args = parser.parse_args(argv)
8180
print(markdownify(**vars(args)))

tests/test_args.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,3 +39,10 @@ def test_strip_pre():
3939
assert markdownify("<pre> \n \n Hello \n \n </pre>", strip_pre=STRIP) == "```\n Hello\n```"
4040
assert markdownify("<pre> \n \n Hello \n \n </pre>", strip_pre=STRIP_ONE) == "```\n \n Hello \n \n```"
4141
assert markdownify("<pre> \n \n Hello \n \n </pre>", strip_pre=None) == "```\n \n \n Hello \n \n \n```"
42+
43+
44+
def test_bs4_options():
45+
assert markdownify("<p>Hello</p>", bs4_options="html.parser") == "Hello"
46+
assert markdownify("<p>Hello</p>", bs4_options=["html.parser"]) == "Hello"
47+
assert markdownify("<p>Hello</p>", bs4_options={"features": "html.parser"}) == "Hello"
48+

0 commit comments

Comments
 (0)