summary refs log tree commit diff
diff options
context:
space:
mode:
author Leonard Richardson <leonardr@segfault.org> 2020-05-17 08:29:57 -0400
committer Leonard Richardson <leonardr@segfault.org> 2020-05-17 08:29:57 -0400
commit 83c8c3a029d29fd833a8137b205f4ca78a4b1c26 (patch)
tree e58982e78094c5032ae105d5f3b45ea83ae49193
parent 6218e871dd247274823d7a6d5be413544bcc8f19 (diff)
Added a keyword argument on_duplicate_attribute to the
BeautifulSoupHTMLParser constructor (used by the html.parser tree builder) which lets you customize the handling of markup that contains the same attribute more than once, as in: <a href="url1" href="url2"> [bug=1878209]
-rw-r--r-- CHANGELOG 6
-rw-r--r-- bs4/builder/_htmlparser.py 36
-rw-r--r-- bs4/element.py 2
-rw-r--r-- bs4/tests/test_htmlparser.py 38
4 files changed, 78 insertions, 4 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 7e9eca8..270a771 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,5 +1,11 @@
= 4.9.1 (unreleased)
+* Added a keyword argument 'on_duplicate_attribute' to the
+ BeautifulSoupHTMLParser constructor (used by the html.parser tree
+ builder) which lets you customize the handling of markup that
+ contains the same attribute more than once, as in:
+ <a href="url1" href="url2"> [bug=1878209] TODO: This needs documentation.
+
* Added a distinct subclass, GuessedAtParserWarning, for the warning
issued when BeautifulSoup is instantiated without a parser being
specified. [bug=1873787]
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index 2bb764f..476fd79 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -57,8 +57,26 @@ class BeautifulSoupHTMLParser(HTMLParser):
listens for HTMLParser events and translates them into calls
to Beautiful Soup's tree construction API.
"""
+
+ # Strategies for handling duplicate attributes
+ IGNORE = 'ignore'
+ REPLACE = 'replace'
def __init__(self, *args, **kwargs):
+ """Constructor.
+
+ :param on_duplicate_attribute: A strategy for what to do if a
+ tag includes the same attribute more than once. Accepted
+ values are: REPLACE (replace earlier values with later
+ ones, the default), IGNORE (keep the earliest value
+ encountered), or a callable. A callable must take three
+ arguments: the dictionary of attributes already processed,
+ the name of the duplicate attribute, and the most recent value
+ encountered.
+ """
+ self.on_duplicate_attribute = kwargs.pop(
+ 'on_duplicate_attribute', self.REPLACE
+ )
HTMLParser.__init__(self, *args, **kwargs)
# Keep a list of empty-element tags that were encountered
@@ -114,7 +132,19 @@ class BeautifulSoupHTMLParser(HTMLParser):
# for consistency with the other tree builders.
if value is None:
value = ''
- attr_dict[key] = value
+ if key in attr_dict:
+ # A single attribute shows up multiple times in this
+ # tag. How to handle it depends on the
+ # on_duplicate_attribute setting.
+ on_dupe = self.on_duplicate_attribute
+ if on_dupe == self.IGNORE:
+ pass
+ elif on_dupe in (None, self.REPLACE):
+ attr_dict[key] = value
+ else:
+ on_dupe(attr_dict, key, value)
+ else:
+ attr_dict[key] = value
attrvalue = '""'
#print "START", name
sourceline, sourcepos = self.getpos()
@@ -273,7 +303,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
# The html.parser knows which line number and position in the
# original file is the source of an element.
TRACKS_LINE_NUMBERS = True
-
+
def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
"""Constructor.
@@ -293,7 +323,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
parser_kwargs['convert_charrefs'] = False
self.parser_args = (parser_args, parser_kwargs)
-
+
def prepare_markup(self, markup, user_specified_encoding=None,
document_declared_encoding=None, exclude_encodings=None):
diff --git a/bs4/element.py b/bs4/element.py
index 1744beb..4947be9 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -1757,7 +1757,7 @@ class Tag(PageElement):
if l:
r = l[0]
return r
- findChild = find
+ findChild = find #BS2
def find_all(self, name=None, attrs={}, recursive=True, text=None,
limit=None, **kwargs):
diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py
index 7be6493..7b06f89 100644
--- a/bs4/tests/test_htmlparser.py
+++ b/bs4/tests/test_htmlparser.py
@@ -51,7 +51,45 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
self.assertEqual("sourceline", soup.p.sourceline.name)
self.assertEqual("sourcepos", soup.p.sourcepos.name)
+ def test_on_duplicate_attribute(self):
+ # The html.parser tree builder has a variety of ways of
+ # handling a tag that contains the same attribute multiple times.
+
+ markup = '<a class="cls" href="url1" href="url2" href="url3" id="id">'
+
+ # If you don't provide any particular value for
+ # on_duplicate_attribute, later values replace earlier values.
+ soup = self.soup(markup)
+ self.assertEquals("url3", soup.a['href'])
+ self.assertEquals(["cls"], soup.a['class'])
+ self.assertEquals("id", soup.a['id'])
+ # You can also get this behavior explicitly.
+ def assert_attribute(on_duplicate_attribute, expected):
+ soup = self.soup(
+ markup, parser_kwargs=dict(
+ on_duplicate_attribute=on_duplicate_attribute
+ )
+ )
+ self.assertEquals(expected, soup.a['href'])
+
+ # Verify that non-duplicate attributes are treated normally.
+ self.assertEquals(["cls"], soup.a['class'])
+ self.assertEquals("id", soup.a['id'])
+ assert_attribute(None, "url3")
+ assert_attribute(BeautifulSoupHTMLParser.REPLACE, "url3")
+
+ # You can ignore subsequent values in favor of the first.
+ assert_attribute(BeautifulSoupHTMLParser.IGNORE, "url1")
+
+ # And you can pass in a callable that does whatever you want.
+ def accumulate(attrs, key, value):
+ if not isinstance(attrs[key], list):
+ attrs[key] = [attrs[key]]
+ attrs[key].append(value)
+ assert_attribute(accumulate, ["url1", "url2", "url3"])
+
+
class TestHTMLParserSubclass(SoupTest):
def test_error(self):
"""Verify that our HTMLParser subclass implements error() in a way