diff options
author | Leonard Richardson <leonardr@segfault.org> | 2020-05-17 08:29:57 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2020-05-17 08:29:57 -0400 |
commit | 83c8c3a029d29fd833a8137b205f4ca78a4b1c26 (patch) | |
tree | e58982e78094c5032ae105d5f3b45ea83ae49193 | |
parent | 6218e871dd247274823d7a6d5be413544bcc8f19 (diff) |
Added a keyword argument on_duplicate_attribute to the
BeautifulSoupHTMLParser constructor (used by the html.parser tree
builder) which lets you customize the handling of markup that
contains the same attribute more than once, as in:
<a href="url1" href="url2"> [bug=1878209]
-rw-r--r-- | CHANGELOG | 6 |
-rw-r--r-- | bs4/builder/_htmlparser.py | 36 |
-rw-r--r-- | bs4/element.py | 2 |
-rw-r--r-- | bs4/tests/test_htmlparser.py | 38 |
4 files changed, 78 insertions, 4 deletions
@@ -1,5 +1,11 @@ = 4.9.1 (unreleased) +* Added a keyword argument 'on_duplicate_attribute' to the + BeautifulSoupHTMLParser constructor (used by the html.parser tree + builder) which lets you customize the handling of markup that + contains the same attribute more than once, as in: + <a href="url1" href="url2"> [bug=1878209] TODO: This needs documentation. + * Added a distinct subclass, GuessedAtParserWarning, for the warning issued when BeautifulSoup is instantiated without a parser being specified. [bug=1873787] diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py index 2bb764f..476fd79 100644 --- a/bs4/builder/_htmlparser.py +++ b/bs4/builder/_htmlparser.py @@ -57,8 +57,26 @@ class BeautifulSoupHTMLParser(HTMLParser): listens for HTMLParser events and translates them into calls to Beautiful Soup's tree construction API. """ + + # Strategies for handling duplicate attributes + IGNORE = 'ignore' + REPLACE = 'replace' def __init__(self, *args, **kwargs): + """Constructor. + + :param on_duplicate_attribute: A strategy for what to do if a + tag includes the same attribute more than once. Accepted + values are: REPLACE (replace earlier values with later + ones, the default), IGNORE (keep the earliest value + encountered), or a callable. A callable must take three + arguments: the dictionary of attributes already processed, + the name of the duplicate attribute, and the most recent value + encountered. + """ + self.on_duplicate_attribute = kwargs.pop( + 'on_duplicate_attribute', self.REPLACE + ) HTMLParser.__init__(self, *args, **kwargs) # Keep a list of empty-element tags that were encountered @@ -114,7 +132,19 @@ class BeautifulSoupHTMLParser(HTMLParser): # for consistency with the other tree builders. if value is None: value = '' - attr_dict[key] = value + if key in attr_dict: + # A single attribute shows up multiple times in this + # tag. How to handle it depends on the + # on_duplicate_attribute setting. 
+ on_dupe = self.on_duplicate_attribute + if on_dupe == self.IGNORE: + pass + elif on_dupe in (None, self.REPLACE): + attr_dict[key] = value + else: + on_dupe(attr_dict, key, value) + else: + attr_dict[key] = value attrvalue = '""' #print "START", name sourceline, sourcepos = self.getpos() @@ -273,7 +303,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): # The html.parser knows which line number and position in the # original file is the source of an element. TRACKS_LINE_NUMBERS = True - + def __init__(self, parser_args=None, parser_kwargs=None, **kwargs): """Constructor. @@ -293,7 +323,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): if CONSTRUCTOR_TAKES_CONVERT_CHARREFS: parser_kwargs['convert_charrefs'] = False self.parser_args = (parser_args, parser_kwargs) - + def prepare_markup(self, markup, user_specified_encoding=None, document_declared_encoding=None, exclude_encodings=None): diff --git a/bs4/element.py b/bs4/element.py index 1744beb..4947be9 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -1757,7 +1757,7 @@ class Tag(PageElement): if l: r = l[0] return r - findChild = find + findChild = find #BS2 def find_all(self, name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs): diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py index 7be6493..7b06f89 100644 --- a/bs4/tests/test_htmlparser.py +++ b/bs4/tests/test_htmlparser.py @@ -51,7 +51,45 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): self.assertEqual("sourceline", soup.p.sourceline.name) self.assertEqual("sourcepos", soup.p.sourcepos.name) + def test_on_duplicate_attribute(self): + # The html.parser tree builder has a variety of ways of + # handling a tag that contains the same attribute multiple times. + + markup = '<a class="cls" href="url1" href="url2" href="url3" id="id">' + + # If you don't provide any particular value for + # on_duplicate_attribute, later values replace earlier values. 
+ soup = self.soup(markup) + self.assertEquals("url3", soup.a['href']) + self.assertEquals(["cls"], soup.a['class']) + self.assertEquals("id", soup.a['id']) + # You can also get this behavior explicitly. + def assert_attribute(on_duplicate_attribute, expected): + soup = self.soup( + markup, parser_kwargs=dict( + on_duplicate_attribute=on_duplicate_attribute + ) + ) + self.assertEquals(expected, soup.a['href']) + + # Verify that non-duplicate attributes are treated normally. + self.assertEquals(["cls"], soup.a['class']) + self.assertEquals("id", soup.a['id']) + assert_attribute(None, "url3") + assert_attribute(BeautifulSoupHTMLParser.REPLACE, "url3") + + # You can ignore subsequent values in favor of the first. + assert_attribute(BeautifulSoupHTMLParser.IGNORE, "url1") + + # And you can pass in a callable that does whatever you want. + def accumulate(attrs, key, value): + if not isinstance(attrs[key], list): + attrs[key] = [attrs[key]] + attrs[key].append(value) + assert_attribute(accumulate, ["url1", "url2", "url3"]) + + class TestHTMLParserSubclass(SoupTest): def test_error(self): """Verify that our HTMLParser subclass implements error() in a way |