diff options
Diffstat (limited to 'beautifulsoup/builder/__init__.py')
-rw-r--r-- | beautifulsoup/builder/__init__.py | 31 |
1 files changed, 26 insertions, 5 deletions
diff --git a/beautifulsoup/builder/__init__.py b/beautifulsoup/builder/__init__.py
index 5c275d7..37e9c8a 100644
--- a/beautifulsoup/builder/__init__.py
+++ b/beautifulsoup/builder/__init__.py
@@ -12,16 +12,37 @@ class TreeBuilder(Entities):
     """Turn a document into a Beautiful Soup object tree."""
 
     assume_html = False
+    preserve_whitespace_tags = set()
+    empty_element_tags = None # A tag will be considered an empty-element
+                              # tag when and only when it has no contents.
 
     def __init__(self):
         self.soup = None
 
-    def isSelfClosingTag(self, name):
-        return name in self.self_closing_tags
-
     def reset(self):
         pass
 
+    def can_be_empty_element(self, tag_name):
+        """Might a tag with this name be an empty-element tag?
+
+        The final markup may or may not actually present this tag as
+        self-closing.
+
+        For instance: an HTML builder does not consider a <p> tag to
+        be an empty-element tag (it's not in empty_element_tags). This
+        means an empty <p> tag will be presented as "<p></p>", not
+        "<p />".
+
+        The default builder has no opinion about which tags are
+        empty-element tags, so a tag will be presented as an
+        empty-element tag if and only if it has no contents.
+        "<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
+        be left alone.
+        """
+        if self.empty_element_tags is None:
+            return True
+        return tag_name in self.empty_element_tags
+
     def feed(self, markup):
         raise NotImplementedError()
 
@@ -101,8 +122,8 @@ class HTMLTreeBuilder(TreeBuilder):
 
     assume_html = True
     preserve_whitespace_tags = set(['pre', 'textarea'])
-    self_closing_tags = set(['br' , 'hr', 'input', 'img', 'meta',
-                             'spacer', 'link', 'frame', 'base'])
+    empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
+                              'spacer', 'link', 'frame', 'base'])
 
     # Used by set_up_substitutions to detect the charset in a META tag
     CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)