diff options
author | Leonard Richardson <leonardr@segfault.org> | 2020-05-17 08:29:57 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2020-05-17 08:29:57 -0400 |
commit | 83c8c3a029d29fd833a8137b205f4ca78a4b1c26 (patch) | |
tree | e58982e78094c5032ae105d5f3b45ea83ae49193 /bs4/tests/test_htmlparser.py | |
parent | 6218e871dd247274823d7a6d5be413544bcc8f19 (diff) |
Added a keyword argument on_duplicate_attribute to the
BeautifulSoupHTMLParser constructor (used by the html.parser tree
builder) which lets you customize the handling of markup that
contains the same attribute more than once, as in:
<a href="url1" href="url2"> [bug=1878209]
Diffstat (limited to 'bs4/tests/test_htmlparser.py')
-rw-r--r-- | bs4/tests/test_htmlparser.py | 38 |
1 file changed, 38 insertions, 0 deletions
```diff
diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py
index 7be6493..7b06f89 100644
--- a/bs4/tests/test_htmlparser.py
+++ b/bs4/tests/test_htmlparser.py
@@ -51,7 +51,45 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
         self.assertEqual("sourceline", soup.p.sourceline.name)
         self.assertEqual("sourcepos", soup.p.sourcepos.name)
 
+    def test_on_duplicate_attribute(self):
+        # The html.parser tree builder has a variety of ways of
+        # handling a tag that contains the same attribute multiple times.
+
+        markup = '<a class="cls" href="url1" href="url2" href="url3" id="id">'
+
+        # If you don't provide any particular value for
+        # on_duplicate_attribute, later values replace earlier values.
+        soup = self.soup(markup)
+        self.assertEquals("url3", soup.a['href'])
+        self.assertEquals(["cls"], soup.a['class'])
+        self.assertEquals("id", soup.a['id'])
+        # You can also get this behavior explicitly.
+        def assert_attribute(on_duplicate_attribute, expected):
+            soup = self.soup(
+                markup, parser_kwargs=dict(
+                    on_duplicate_attribute=on_duplicate_attribute
+                )
+            )
+            self.assertEquals(expected, soup.a['href'])
+
+            # Verify that non-duplicate attributes are treated normally.
+            self.assertEquals(["cls"], soup.a['class'])
+            self.assertEquals("id", soup.a['id'])
+        assert_attribute(None, "url3")
+        assert_attribute(BeautifulSoupHTMLParser.REPLACE, "url3")
+
+        # You can ignore subsequent values in favor of the first.
+        assert_attribute(BeautifulSoupHTMLParser.IGNORE, "url1")
+
+        # And you can pass in a callable that does whatever you want.
+        def accumulate(attrs, key, value):
+            if not isinstance(attrs[key], list):
+                attrs[key] = [attrs[key]]
+            attrs[key].append(value)
+        assert_attribute(accumulate, ["url1", "url2", "url3"])
+
+
 class TestHTMLParserSubclass(SoupTest):
     def test_error(self):
         """Verify that our HTMLParser subclass implements error() in a way
```