diff options
Diffstat (limited to 'beautifulsoup/builder')
-rw-r--r-- | beautifulsoup/builder/__init__.py | 31 | ||||
-rw-r--r-- | beautifulsoup/builder/lxml_builder.py | 6 |
2 files changed, 26 insertions, 11 deletions
diff --git a/beautifulsoup/builder/__init__.py b/beautifulsoup/builder/__init__.py
index 5c275d7..37e9c8a 100644
--- a/beautifulsoup/builder/__init__.py
+++ b/beautifulsoup/builder/__init__.py
@@ -12,16 +12,37 @@ class TreeBuilder(Entities):
     """Turn a document into a Beautiful Soup object tree."""
 
     assume_html = False
+    preserve_whitespace_tags = set()
+    empty_element_tags = None # A tag will be considered an empty-element
+                              # tag when and only when it has no contents.
 
     def __init__(self):
         self.soup = None
 
-    def isSelfClosingTag(self, name):
-        return name in self.self_closing_tags
-
     def reset(self):
         pass
 
+    def can_be_empty_element(self, tag_name):
+        """Might a tag with this name be an empty-element tag?
+
+        The final markup may or may not actually present this tag as
+        self-closing.
+
+        For instance: an HTML builder does not consider a <p> tag to
+        be an empty-element tag (it's not in empty_element_tags). This
+        means an empty <p> tag will be presented as "<p></p>", not
+        "<p />".
+
+        The default builder has no opinion about which tags are
+        empty-element tags, so a tag will be presented as an
+        empty-element tag if and only if it has no contents.
+        "<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
+        be left alone.
+        """
+        if self.empty_element_tags is None:
+            return True
+        return tag_name in self.empty_element_tags
+
     def feed(self, markup):
         raise NotImplementedError()
 
@@ -101,8 +122,8 @@ class HTMLTreeBuilder(TreeBuilder):
 
     assume_html = True
     preserve_whitespace_tags = set(['pre', 'textarea'])
-    self_closing_tags = set(['br' , 'hr', 'input', 'img', 'meta',
-                             'spacer', 'link', 'frame', 'base'])
+    empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
+                              'spacer', 'link', 'frame', 'base'])
 
     # Used by set_up_substitutions to detect the charset in a META tag
     CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
diff --git a/beautifulsoup/builder/lxml_builder.py b/beautifulsoup/builder/lxml_builder.py
index 5e053b1..0cc9e51 100644
--- a/beautifulsoup/builder/lxml_builder.py
+++ b/beautifulsoup/builder/lxml_builder.py
@@ -7,9 +7,6 @@ import types
 class LXMLTreeBuilderForXML(TreeBuilder):
     DEFAULT_PARSER_CLASS = etree.XMLParser
 
-    preserve_whitespace_tags = set()
-    self_closing_tags = set()
-
     @property
     def default_parser(self):
         # This can either return a parser object or a class, which
@@ -55,9 +52,6 @@ class LXMLTreeBuilderForXML(TreeBuilder):
     def end(self, name):
         self.soup.endData()
         completed_tag = self.soup.tagStack[-1]
-        if len(completed_tag.contents) == 0:
-            completed_tag.isSelfClosing = True
-
         self.soup.handle_endtag(name)
 
     def pi(self, target, data): |