Added a keyword argument on_duplicate_attribute to the BeautifulSoupHTMLParser constructor (used by the html.parser tree builder), which lets you customize the handling of markup that contains the same attribute more than once, as in: <a href="url1" href="url2"> [bug=1878209]
author: Leonard Richardson <leonardr@segfault.org> 2020-05-17 08:29:57 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2020-05-17 08:29:57 -0400
commit: 83c8c3a029d29fd833a8137b205f4ca78a4b1c26 (patch)
tree: e58982e78094c5032ae105d5f3b45ea83ae49193 /bs4/tests/test_htmlparser.py
parent: 6218e871dd247274823d7a6d5be413544bcc8f19 (diff)
1 files changed, 38 insertions, 0 deletions
diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py
index 7be6493..7b06f89 100644
--- a/bs4/tests/test_htmlparser.py
+++ b/bs4/tests/test_htmlparser.py
@@ -51,7 +51,45 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
         self.assertEqual("sourceline", soup.p.sourceline.name)
         self.assertEqual("sourcepos", soup.p.sourcepos.name)
 
+    def test_on_duplicate_attribute(self):
+        # The html.parser tree builder has a variety of ways of
+        # handling a tag that contains the same attribute multiple times.
+
+        markup = '<a class="cls" href="url1" href="url2" href="url3" id="id">'
+
+        # If you don't provide any particular value for
+        # on_duplicate_attribute, later values replace earlier values.
+        soup = self.soup(markup)
+        self.assertEqual("url3", soup.a['href'])
+        self.assertEqual(["cls"], soup.a['class'])
+        self.assertEqual("id", soup.a['id'])
         
+        # You can also get this behavior explicitly.
+        def assert_attribute(on_duplicate_attribute, expected):
+            soup = self.soup(
+                markup, parser_kwargs=dict(
+                    on_duplicate_attribute=on_duplicate_attribute
+                )
+            )
+            self.assertEqual(expected, soup.a['href'])
+
+            # Verify that non-duplicate attributes are treated normally.
+            self.assertEqual(["cls"], soup.a['class'])
+            self.assertEqual("id", soup.a['id'])
+        assert_attribute(None, "url3")
+        assert_attribute(BeautifulSoupHTMLParser.REPLACE, "url3")
+
+        # You can ignore subsequent values in favor of the first.
+        assert_attribute(BeautifulSoupHTMLParser.IGNORE, "url1")
+
+        # And you can pass in a callable that does whatever you want.
+        def accumulate(attrs, key, value):
+            if not isinstance(attrs[key], list):
+                attrs[key] = [attrs[key]]
+            attrs[key].append(value)
+        assert_attribute(accumulate, ["url1", "url2", "url3"])
+
+
 class TestHTMLParserSubclass(SoupTest):
     def test_error(self):
         """Verify that our HTMLParser subclass implements error() in a way
author	Leonard Richardson <leonardr@segfault.org>	2020-05-17 08:29:57 -0400
committer	Leonard Richardson <leonardr@segfault.org>	2020-05-17 08:29:57 -0400
commit	83c8c3a029d29fd833a8137b205f4ca78a4b1c26 (patch)
tree	e58982e78094c5032ae105d5f3b45ea83ae49193 /bs4/tests/test_htmlparser.py
parent	6218e871dd247274823d7a6d5be413544bcc8f19 (diff)