Added a keyword argument 'on_duplicate_attribute' to the BeautifulSoupHTMLParser constructor (used by the html.parser tree builder), which lets you customize the handling of markup that contains the same attribute more than once, as in: <a href="url1" href="url2"> [bug=1878209]
author: Leonard Richardson <leonardr@segfault.org> 2020-05-17 08:29:57 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2020-05-17 08:29:57 -0400
commit: 83c8c3a029d29fd833a8137b205f4ca78a4b1c26 (patch)
tree: e58982e78094c5032ae105d5f3b45ea83ae49193
parent: 6218e871dd247274823d7a6d5be413544bcc8f19 (diff)
4 files changed, 78 insertions, 4 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 7e9eca8..270a771 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,5 +1,11 @@
 = 4.9.1 (unreleased)
 
+* Added a keyword argument 'on_duplicate_attribute' to the
+  BeautifulSoupHTMLParser constructor (used by the html.parser tree
+  builder) which lets you customize the handling of markup that
+  contains the same attribute more than once, as in:
+  <a href="url1" href="url2"> [bug=1878209] TODO: This needs documentation.
+
 * Added a distinct subclass, GuessedAtParserWarning, for the warning
   issued when BeautifulSoup is instantiated without a parser being
   specified. [bug=1873787]
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index 2bb764f..476fd79 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -57,8 +57,26 @@ class BeautifulSoupHTMLParser(HTMLParser):
     listens for HTMLParser events and translates them into calls
     to Beautiful Soup's tree construction API.
     """
+
+    # Strategies for handling duplicate attributes
+    IGNORE = 'ignore'
+    REPLACE = 'replace'
     
     def __init__(self, *args, **kwargs):
+        """Constructor.
+
+        :param on_duplicate_attribute: A strategy for what to do if a
+            tag includes the same attribute more than once. Accepted
+            values are: REPLACE (replace earlier values with later
+            ones, the default), IGNORE (keep the earliest value
+            encountered), or a callable. A callable must take three
+            arguments: the dictionary of attributes already processed,
+            the name of the duplicate attribute, and the most recent value
+            encountered.           
+        """
+        self.on_duplicate_attribute = kwargs.pop(
+            'on_duplicate_attribute', self.REPLACE
+        )
         HTMLParser.__init__(self, *args, **kwargs)
 
         # Keep a list of empty-element tags that were encountered
@@ -114,7 +132,19 @@ class BeautifulSoupHTMLParser(HTMLParser):
             # for consistency with the other tree builders.
             if value is None:
                 value = ''
-            attr_dict[key] = value
+            if key in attr_dict:
+                # A single attribute shows up multiple times in this
+                # tag. How to handle it depends on the
+                # on_duplicate_attribute setting.
+                on_dupe = self.on_duplicate_attribute
+                if on_dupe == self.IGNORE:
+                    pass
+                elif on_dupe in (None, self.REPLACE):
+                    attr_dict[key] = value
+                else:
+                    on_dupe(attr_dict, key, value)
+            else:
+                attr_dict[key] = value
             attrvalue = '""'
         #print "START", name
         sourceline, sourcepos = self.getpos()
@@ -273,7 +303,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
     # The html.parser knows which line number and position in the
     # original file is the source of an element.
     TRACKS_LINE_NUMBERS = True
-    
+
     def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
         """Constructor.
 
@@ -293,7 +323,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
         if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
             parser_kwargs['convert_charrefs'] = False
         self.parser_args = (parser_args, parser_kwargs)
-
+        
     def prepare_markup(self, markup, user_specified_encoding=None,
                        document_declared_encoding=None, exclude_encodings=None):
 
diff --git a/bs4/element.py b/bs4/element.py
index 1744beb..4947be9 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -1757,7 +1757,7 @@ class Tag(PageElement):
         if l:
             r = l[0]
         return r
-    findChild = find
+    findChild = find #BS2
 
     def find_all(self, name=None, attrs={}, recursive=True, text=None,
                  limit=None, **kwargs):
diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py
index 7be6493..7b06f89 100644
--- a/bs4/tests/test_htmlparser.py
+++ b/bs4/tests/test_htmlparser.py
@@ -51,7 +51,45 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
         self.assertEqual("sourceline", soup.p.sourceline.name)
         self.assertEqual("sourcepos", soup.p.sourcepos.name)
 
+    def test_on_duplicate_attribute(self):
+        # The html.parser tree builder has a variety of ways of
+        # handling a tag that contains the same attribute multiple times.
+
+        markup = '<a class="cls" href="url1" href="url2" href="url3" id="id">'
+
+        # If you don't provide any particular value for
+        # on_duplicate_attribute, later values replace earlier values.
+        soup = self.soup(markup)
+        self.assertEqual("url3", soup.a['href'])
+        self.assertEqual(["cls"], soup.a['class'])
+        self.assertEqual("id", soup.a['id'])
         
+        # You can also get this behavior explicitly.
+        def assert_attribute(on_duplicate_attribute, expected):
+            soup = self.soup(
+                markup, parser_kwargs=dict(
+                    on_duplicate_attribute=on_duplicate_attribute
+                )
+            )
+            self.assertEqual(expected, soup.a['href'])
+
+            # Verify that non-duplicate attributes are treated normally.
+            self.assertEqual(["cls"], soup.a['class'])
+            self.assertEqual("id", soup.a['id'])
+        assert_attribute(None, "url3")
+        assert_attribute(BeautifulSoupHTMLParser.REPLACE, "url3")
+
+        # You can ignore subsequent values in favor of the first.
+        assert_attribute(BeautifulSoupHTMLParser.IGNORE, "url1")
+
+        # And you can pass in a callable that does whatever you want.
+        def accumulate(attrs, key, value):
+            if not isinstance(attrs[key], list):
+                attrs[key] = [attrs[key]]
+            attrs[key].append(value)
+        assert_attribute(accumulate, ["url1", "url2", "url3"])
+
+
 class TestHTMLParserSubclass(SoupTest):
     def test_error(self):
         """Verify that our HTMLParser subclass implements error() in a way
author	Leonard Richardson <leonardr@segfault.org>	2020-05-17 08:29:57 -0400
committer	Leonard Richardson <leonardr@segfault.org>	2020-05-17 08:29:57 -0400
commit	83c8c3a029d29fd833a8137b205f4ca78a4b1c26 (patch)
tree	e58982e78094c5032ae105d5f3b45ea83ae49193
parent	6218e871dd247274823d7a6d5be413544bcc8f19 (diff)