diff options
author | Leonard Richardson <leonardr@segfault.org> | 2020-05-17 08:29:57 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2020-05-17 08:29:57 -0400 |
commit | 83c8c3a029d29fd833a8137b205f4ca78a4b1c26 (patch) | |
tree | e58982e78094c5032ae105d5f3b45ea83ae49193 /bs4/builder/_htmlparser.py | |
parent | 6218e871dd247274823d7a6d5be413544bcc8f19 (diff) |
Added a keyword argument on_duplicate_attribute to the
BeautifulSoupHTMLParser constructor (used by the html.parser tree
builder) which lets you customize the handling of markup that
contains the same attribute more than once, as in:
<a href="url1" href="url2"> [bug=1878209]
Diffstat (limited to 'bs4/builder/_htmlparser.py')
-rw-r--r-- | bs4/builder/_htmlparser.py | 36 |
1 files changed, 33 insertions, 3 deletions
```diff
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index 2bb764f..476fd79 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -57,8 +57,26 @@ class BeautifulSoupHTMLParser(HTMLParser):
     listens for HTMLParser events and translates them into calls
     to Beautiful Soup's tree construction API.
     """
+
+    # Strategies for handling duplicate attributes
+    IGNORE = 'ignore'
+    REPLACE = 'replace'
 
     def __init__(self, *args, **kwargs):
+        """Constructor.
+
+        :param on_duplicate_attribute: A strategy for what to do if a
+            tag includes the same attribute more than once. Accepted
+            values are: REPLACE (replace earlier values with later
+            ones, the default), IGNORE (keep the earliest value
+            encountered), or a callable. A callable must take three
+            arguments: the dictionary of attributes already processed,
+            the name of the duplicate attribute, and the most recent value
+            encountered.
+        """
+        self.on_duplicate_attribute = kwargs.pop(
+            'on_duplicate_attribute', self.REPLACE
+        )
         HTMLParser.__init__(self, *args, **kwargs)
 
         # Keep a list of empty-element tags that were encountered
@@ -114,7 +132,19 @@ class BeautifulSoupHTMLParser(HTMLParser):
             # for consistency with the other tree builders.
             if value is None:
                 value = ''
-            attr_dict[key] = value
+            if key in attr_dict:
+                # A single attribute shows up multiple times in this
+                # tag. How to handle it depends on the
+                # on_duplicate_attribute setting.
+                on_dupe = self.on_duplicate_attribute
+                if on_dupe == self.IGNORE:
+                    pass
+                elif on_dupe in (None, self.REPLACE):
+                    attr_dict[key] = value
+                else:
+                    on_dupe(attr_dict, key, value)
+            else:
+                attr_dict[key] = value
             attrvalue = '""'
         #print "START", name
         sourceline, sourcepos = self.getpos()
@@ -273,7 +303,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
     # The html.parser knows which line number and position in the
     # original file is the source of an element.
     TRACKS_LINE_NUMBERS = True
-
+
     def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
         """Constructor.
 
@@ -293,7 +323,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
         if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
             parser_kwargs['convert_charrefs'] = False
         self.parser_args = (parser_args, parser_kwargs)
-
+
     def prepare_markup(self, markup, user_specified_encoding=None,
                        document_declared_encoding=None, exclude_encodings=None):
 
```