diff options
Diffstat (limited to 'beautifulsoup/builder')
-rw-r--r-- | beautifulsoup/builder/__init__.py | 33 | ||||
-rw-r--r-- | beautifulsoup/builder/lxml_builder.py | 10 |
2 files changed, 30 insertions, 13 deletions
diff --git a/beautifulsoup/builder/__init__.py b/beautifulsoup/builder/__init__.py index 5c275d7..deaa613 100644 --- a/beautifulsoup/builder/__init__.py +++ b/beautifulsoup/builder/__init__.py @@ -12,16 +12,37 @@ class TreeBuilder(Entities): """Turn a document into a Beautiful Soup object tree.""" assume_html = False + preserve_whitespace_tags = set() + empty_element_tags = None # A tag will be considered an empty-element + # tag when and only when it has no contents. def __init__(self): self.soup = None - def isSelfClosingTag(self, name): - return name in self.self_closing_tags - def reset(self): pass + def can_be_empty_element(self, tag_name): + """Might a tag with this name be an empty-element tag? + + The final markup may or may not actually present this tag as + self-closing. + + For instance: an HTML builder does not consider a <p> tag to + be an empty-element tag (it's not in empty_element_tags). This + means an empty <p> tag will be presented as "<p></p>", not + "<p />". + + The default builder has no opinion about which tags are + empty-element tags, so a tag will be presented as an + empty-element tag if and only if it has no contents. + "<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will + be left alone. + """ + if self.empty_element_tags is None: + return True + return tag_name in self.empty_element_tags + def feed(self, markup): raise NotImplementedError() @@ -95,14 +116,14 @@ class SAXTreeBuilder(TreeBuilder): class HTMLTreeBuilder(TreeBuilder): """This TreeBuilder knows facts about HTML. - Such as which tags are self-closing tags. + Such as which tags are empty-element tags. """ assume_html = True preserve_whitespace_tags = set(['pre', 'textarea']) - self_closing_tags = set(['br' , 'hr', 'input', 'img', 'meta', - 'spacer', 'link', 'frame', 'base']) + empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta', + 'spacer', 'link', 'frame', 'base']) # Used by set_up_substitutions to detect the charset in a META tag CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M) diff --git a/beautifulsoup/builder/lxml_builder.py b/beautifulsoup/builder/lxml_builder.py index 72e5913..e431a62 100644 --- a/beautifulsoup/builder/lxml_builder.py +++ b/beautifulsoup/builder/lxml_builder.py @@ -7,16 +7,15 @@ import types class LXMLTreeBuilderForXML(TreeBuilder): DEFAULT_PARSER_CLASS = etree.XMLParser - preserve_whitespace_tags = set() - self_closing_tags = set() - @property def default_parser(self): # This can either return a parser object or a class, which # will be instantiated with default arguments. return etree.XMLParser - def __init__(self, parser=None): + def __init__(self, parser=None, empty_element_tags=None): + if empty_element_tags is not None: + self.empty_element_tags = set(empty_element_tags) if parser is None: # Use the default parser. parser = self.default_parser @@ -53,9 +52,6 @@ class LXMLTreeBuilderForXML(TreeBuilder): def end(self, name): self.soup.endData() completed_tag = self.soup.tagStack[-1] - if len(completed_tag.contents) == 0: - completed_tag.isSelfClosing = True - self.soup.handle_endtag(name) def pi(self, target, data): |