Added a keyword argument, on_duplicate_attribute, to the BeautifulSoupHTMLParser constructor (used by the html.parser tree builder), which lets you customize the handling of markup that contains the same attribute more than once, as in: <a href="url1" href="url2"> [bug=1878209]
author: Leonard Richardson <leonardr@segfault.org> 2020-05-17 08:29:57 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2020-05-17 08:29:57 -0400
commit: 83c8c3a029d29fd833a8137b205f4ca78a4b1c26 (patch)
tree: e58982e78094c5032ae105d5f3b45ea83ae49193 /bs4/builder/_htmlparser.py
parent: 6218e871dd247274823d7a6d5be413544bcc8f19 (diff)
1 files changed, 33 insertions, 3 deletions
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index 2bb764f..476fd79 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -57,8 +57,26 @@ class BeautifulSoupHTMLParser(HTMLParser):
     listens for HTMLParser events and translates them into calls
     to Beautiful Soup's tree construction API.
     """
+
+    # Strategies for handling duplicate attributes
+    IGNORE = 'ignore'
+    REPLACE = 'replace'
     
     def __init__(self, *args, **kwargs):
+        """Constructor.
+
+        :param on_duplicate_attribute: A strategy for what to do if a
+            tag includes the same attribute more than once. Accepted
+            values are: REPLACE (replace earlier values with later
+            ones, the default), IGNORE (keep the earliest value
+            encountered), or a callable. A callable must take three
+            arguments: the dictionary of attributes already processed,
+            the name of the duplicate attribute, and the most recent value
+            encountered.           
+        """
+        self.on_duplicate_attribute = kwargs.pop(
+            'on_duplicate_attribute', self.REPLACE
+        )
         HTMLParser.__init__(self, *args, **kwargs)
 
         # Keep a list of empty-element tags that were encountered
@@ -114,7 +132,19 @@ class BeautifulSoupHTMLParser(HTMLParser):
             # for consistency with the other tree builders.
             if value is None:
                 value = ''
-            attr_dict[key] = value
+            if key in attr_dict:
+                # A single attribute shows up multiple times in this
+                # tag. How to handle it depends on the
+                # on_duplicate_attribute setting.
+                on_dupe = self.on_duplicate_attribute
+                if on_dupe == self.IGNORE:
+                    pass
+                elif on_dupe in (None, self.REPLACE):
+                    attr_dict[key] = value
+                else:
+                    on_dupe(attr_dict, key, value)
+            else:
+                attr_dict[key] = value
             attrvalue = '""'
         #print "START", name
         sourceline, sourcepos = self.getpos()
@@ -273,7 +303,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
     # The html.parser knows which line number and position in the
     # original file is the source of an element.
     TRACKS_LINE_NUMBERS = True
-    
+
     def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
         """Constructor.
 
@@ -293,7 +323,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
         if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
             parser_kwargs['convert_charrefs'] = False
         self.parser_args = (parser_args, parser_kwargs)
-
+        
     def prepare_markup(self, markup, user_specified_encoding=None,
                        document_declared_encoding=None, exclude_encodings=None):
author	Leonard Richardson <leonardr@segfault.org>	2020-05-17 08:29:57 -0400
committer	Leonard Richardson <leonardr@segfault.org>	2020-05-17 08:29:57 -0400
commit	83c8c3a029d29fd833a8137b205f4ca78a4b1c26 (patch)
tree	e58982e78094c5032ae105d5f3b45ea83ae49193 /bs4/builder/_htmlparser.py
parent	6218e871dd247274823d7a6d5be413544bcc8f19 (diff)