summaryrefslogtreecommitdiff
path: root/bs4/builder/_htmlparser.py
diff options
context:
space:
mode:
Diffstat (limited to 'bs4/builder/_htmlparser.py')
-rw-r--r--bs4/builder/_htmlparser.py36
1 files changed, 33 insertions, 3 deletions
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index 2bb764f..476fd79 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -57,8 +57,26 @@ class BeautifulSoupHTMLParser(HTMLParser):
listens for HTMLParser events and translates them into calls
to Beautiful Soup's tree construction API.
"""
+
+ # Strategies for handling duplicate attributes
+ IGNORE = 'ignore'
+ REPLACE = 'replace'
def __init__(self, *args, **kwargs):
+ """Constructor.
+
+ :param on_duplicate_attribute: A strategy for what to do if a
+ tag includes the same attribute more than once. Accepted
+ values are: REPLACE (replace earlier values with later
+ ones, the default), IGNORE (keep the earliest value
+ encountered), or a callable. A callable must take three
+ arguments: the dictionary of attributes already processed,
+ the name of the duplicate attribute, and the most recent value
+ encountered.
+ """
+ self.on_duplicate_attribute = kwargs.pop(
+ 'on_duplicate_attribute', self.REPLACE
+ )
HTMLParser.__init__(self, *args, **kwargs)
# Keep a list of empty-element tags that were encountered
@@ -114,7 +132,19 @@ class BeautifulSoupHTMLParser(HTMLParser):
# for consistency with the other tree builders.
if value is None:
value = ''
- attr_dict[key] = value
+ if key in attr_dict:
+ # A single attribute shows up multiple times in this
+ # tag. How to handle it depends on the
+ # on_duplicate_attribute setting.
+ on_dupe = self.on_duplicate_attribute
+ if on_dupe == self.IGNORE:
+ pass
+ elif on_dupe in (None, self.REPLACE):
+ attr_dict[key] = value
+ else:
+ on_dupe(attr_dict, key, value)
+ else:
+ attr_dict[key] = value
attrvalue = '""'
#print "START", name
sourceline, sourcepos = self.getpos()
@@ -273,7 +303,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
# The html.parser knows which line number and position in the
# original file is the source of an element.
TRACKS_LINE_NUMBERS = True
-
+
def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
"""Constructor.
@@ -293,7 +323,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
parser_kwargs['convert_charrefs'] = False
self.parser_args = (parser_args, parser_kwargs)
-
+
def prepare_markup(self, markup, user_specified_encoding=None,
document_declared_encoding=None, exclude_encodings=None):