File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change @@ -163,12 +163,16 @@ strip_pre
163163 ``STRIP_ONE `` (one leading/trailing blank line), and ``None `` (neither).
164164 Defaults to ``STRIP ``.
165165
166- beautiful_soup_parser
167- Specify the Beautiful Soup parser to be used for interpreting HTML markup. Parsers such
168- as `html5lib `, `lxml ` or even a custom parser as long as it is installed on the execution
169- environment. Defaults to ``html.parser ``.
170-
171- .. _BeautifulSoup : https://www.crummy.com/software/BeautifulSoup/
166+ bs4_options
167+ Specify additional configuration options for the ``BeautifulSoup `` object
168+ used to interpret the HTML markup. String and list values (such as ``lxml ``
169+ or ``html5lib ``) are treated as ``features `` arguments to control parser
170+ selection. Dictionary values (such as ``{"from_encoding": "iso-8859-8"} ``)
171+ are treated as full kwargs to be used for the BeautifulSoup constructor,
172+ allowing specification of any parameter. For parameter details, see the
173+ BeautifulSoup_ documentation.
174+
175+ .. _BeautifulSoup : https://www.crummy.com/software/BeautifulSoup/bs4/doc/
172176
173177Options may be specified as kwargs to the ``markdownify `` function, or as a
174178nested ``Options `` class in ``MarkdownConverter `` subclasses.
Original file line number Diff line number Diff line change @@ -173,7 +173,7 @@ def _next_block_content_sibling(el):
173173class MarkdownConverter (object ):
174174 class DefaultOptions :
175175 autolinks = True
176- beautiful_soup_parser = 'html.parser'
176+ bs4_options = 'html.parser'
177177 bullets = '*+-' # An iterable of bullet types.
178178 code_language = ''
179179 code_language_callback = None
@@ -208,11 +208,15 @@ def __init__(self, **options):
208208 raise ValueError ('You may specify either tags to strip or tags to'
209209 ' convert, but not both.' )
210210
211+ # If a string or list is passed to bs4_options, assume it is a 'features' specification
212+ if not isinstance (self .options ['bs4_options' ], dict ):
213+ self .options ['bs4_options' ] = {'features' : self .options ['bs4_options' ]}
214+
211215 # Initialize the conversion function cache
212216 self .convert_fn_cache = {}
213217
214218 def convert (self , html ):
215- soup = BeautifulSoup (html , self .options ['beautiful_soup_parser ' ])
219+ soup = BeautifulSoup (html , ** self .options ['bs4_options ' ])
216220 return self .convert_soup (soup )
217221
218222 def convert_soup (self , soup ):
Original file line number Diff line number Diff line change @@ -70,12 +70,11 @@ def main(argv=sys.argv[1:]):
7070 parser .add_argument ('-w' , '--wrap' , action = 'store_true' ,
7171 help = "Wrap all text paragraphs at --wrap-width characters." )
7272 parser .add_argument ('--wrap-width' , type = int , default = 80 )
73- parser .add_argument ('-p' , '--beautiful-soup-parser' ,
74- dest = 'beautiful_soup_parser' ,
73+ parser .add_argument ('--bs4-options' ,
7574 default = 'html.parser' ,
76- help = "Specify the Beautiful Soup parser to be used for interpreting HTML markup. Parsers such "
77- "as html5lib, lxml or even a custom parser as long as it is installed on the execution "
78- "environment ." )
75+ help = "Specifies the parser that BeautifulSoup should use to parse "
76+ "the HTML markup. Examples include 'html.parser', 'lxml', and "
77+ "'html5lib' ." )
7978
8079 args = parser .parse_args (argv )
8180 print (markdownify (** vars (args )))
Original file line number Diff line number Diff line change @@ -39,3 +39,10 @@ def test_strip_pre():
3939 assert markdownify ("<pre> \n \n Hello \n \n </pre>" , strip_pre = STRIP ) == "```\n Hello\n ```"
4040 assert markdownify ("<pre> \n \n Hello \n \n </pre>" , strip_pre = STRIP_ONE ) == "```\n \n Hello \n \n ```"
4141 assert markdownify ("<pre> \n \n Hello \n \n </pre>" , strip_pre = None ) == "```\n \n \n Hello \n \n \n ```"
42+
43+
44+ def test_bs4_options ():
45+ assert markdownify ("<p>Hello</p>" , bs4_options = "html.parser" ) == "Hello"
46+ assert markdownify ("<p>Hello</p>" , bs4_options = ["html.parser" ]) == "Hello"
47+ assert markdownify ("<p>Hello</p>" , bs4_options = {"features" : "html.parser" }) == "Hello"
48+
You can’t perform that action at this time.
0 commit comments