diff options
Diffstat (limited to 'bs4/builder')
-rw-r--r-- | bs4/builder/__init__.py | 14 | ||||
-rw-r--r-- | bs4/builder/_html5lib.py | 2 | ||||
-rw-r--r-- | bs4/builder/_htmlparser.py | 11 | ||||
-rw-r--r-- | bs4/builder/_lxml.py | 3 |
4 files changed, 19 insertions, 11 deletions
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py index 4207750..9dad920 100644 --- a/bs4/builder/__init__.py +++ b/bs4/builder/__init__.py @@ -96,11 +96,15 @@ class TreeBuilder(object): # A value for these tag/attribute combinations is a space- or # comma-separated list of CDATA, rather than a single CDATA. - cdata_list_attributes = {} + DEFAULT_CDATA_LIST_ATTRIBUTES = {} - - def __init__(self): + USE_DEFAULT = object() + + def __init__(self, cdata_list_attributes=USE_DEFAULT): self.soup = None + if cdata_list_attributes is self.USE_DEFAULT: + cdata_list_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES + self.cdata_list_attributes = cdata_list_attributes def initialize_soup(self, soup): """The BeautifulSoup object has been initialized and is now @@ -131,7 +135,7 @@ class TreeBuilder(object): if self.empty_element_tags is None: return True return tag_name in self.empty_element_tags - + def feed(self, markup): raise NotImplementedError() @@ -259,7 +263,7 @@ class HTMLTreeBuilder(TreeBuilder): # encounter one of these attributes, we will parse its value into # a list of values if possible. Upon output, the list will be # converted back into a string. - cdata_list_attributes = { + DEFAULT_CDATA_LIST_ATTRIBUTES = { "*" : ['class', 'accesskey', 'dropzone'], "a" : ['rel', 'rev'], "link" : ['rel', 'rev'], diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py index 6fa8593..6892a93 100644 --- a/bs4/builder/_html5lib.py +++ b/bs4/builder/_html5lib.py @@ -199,7 +199,7 @@ class AttrList(object): def __setitem__(self, name, value): # If this attribute is a multi-valued attribute for this element, # turn its value into a list. - list_attr = HTML5TreeBuilder.cdata_list_attributes + list_attr = self.element.cdata_list_attributes if (name in list_attr['*'] or (self.element.name in list_attr and name in list_attr[self.element.name])): diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py index ff09ca3..56b8b91 100644 --- a/bs4/builder/_htmlparser.py +++ b/bs4/builder/_htmlparser.py @@ -214,12 +214,15 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): NAME = HTMLPARSER features = [NAME, HTML, STRICT] - def __init__(self, *args, **kwargs): + def __init__(self, parser_args=None, parser_kwargs=None, **kwargs): + super(HTMLParserTreeBuilder, self).__init__(**kwargs) + parser_args = parser_args or [] + parser_kwargs = parser_kwargs or {} if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED: - kwargs['strict'] = False + parser_kwargs['strict'] = False if CONSTRUCTOR_TAKES_CONVERT_CHARREFS: - kwargs['convert_charrefs'] = False - self.parser_args = (args, kwargs) + parser_kwargs['convert_charrefs'] = False + self.parser_args = (parser_args, parser_kwargs) def prepare_markup(self, markup, user_specified_encoding=None, document_declared_encoding=None, exclude_encodings=None): diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py index b7e172c..27cadcb 100644 --- a/bs4/builder/_lxml.py +++ b/bs4/builder/_lxml.py @@ -94,7 +94,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): parser = parser(target=self, strip_cdata=False, encoding=encoding) return parser - def __init__(self, parser=None, empty_element_tags=None): + def __init__(self, parser=None, empty_element_tags=None, **kwargs): # TODO: Issue a warning if parser is present but not a # callable, since that means there's no way to create new # parsers for different encodings. @@ -103,6 +103,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): self.empty_element_tags = set(empty_element_tags) self.soup = None self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED] + super(LXMLTreeBuilderForXML, self).__init__(**kwargs) def _getNsTag(self, tag): # Split the namespace URL out of a fully-qualified lxml tag |