summary | refs | log | tree | commit | diff
path: root/BeautifulSoup.py
diff options
context:
space:
mode:
author Leonard Richardson <leonard.richardson@canonical.com> 2009-04-08 18:09:23 -0400
committer Leonard Richardson <leonard.richardson@canonical.com> 2009-04-08 18:09:23 -0400
commit d1abb37f36b7594d284504095d83d0b11a8ee95b (patch)
tree 52573ae0d52f86e685e02f3c2161ab7ea1f27564 /BeautifulSoup.py
parent d9dc85451b92a1795b5de99cc1af7cac2c9f6468 (diff)
Moved reset_nesting_tags and nestable_tags from BeautifulSoup to the HTML builder.
Diffstat (limited to 'BeautifulSoup.py')
-rw-r--r-- BeautifulSoup.py 95
1 files changed, 48 insertions, 47 deletions
diff --git a/BeautifulSoup.py b/BeautifulSoup.py
index 5e9e443..2615edc 100644
--- a/BeautifulSoup.py
+++ b/BeautifulSoup.py
@@ -1046,6 +1046,9 @@ class XMLParserBuilder(HTMLParser, TreeBuilder):
want.
"""
+ reset_nesting_tags = {}
+ nestable_tags = {}
+
MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
lambda x: x.group(1) + ' />'),
(re.compile('<!\s+([^<>]*)>'),
@@ -1220,6 +1223,48 @@ class HTMLParserBuilder(XMLParserBuilder):
self_closing_tags = set(['br' , 'hr', 'input', 'img', 'meta',
'spacer', 'link', 'frame', 'base'])
+ #According to the HTML standard, each of these inline tags can
+ #contain another tag of the same type. Furthermore, it's common
+ #to actually use these tags this way.
+ nestable_inline_tags = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
+ 'center']
+
+ #According to the HTML standard, these block tags can contain
+ #another tag of the same type. Furthermore, it's common
+ #to actually use these tags this way.
+ nestable_block_tags = ['blockquote', 'div', 'fieldset', 'ins', 'del']
+
+ #Lists can contain other lists, but there are restrictions.
+ nestable_list_tags = { 'ol' : [],
+ 'ul' : [],
+ 'li' : ['ul', 'ol'],
+ 'dl' : [],
+ 'dd' : ['dl'],
+ 'dt' : ['dl'] }
+
+ #Tables can contain other tables, but there are restrictions.
+ nestable_table_tags = {'table' : [],
+ 'tr' : ['table', 'tbody', 'tfoot', 'thead'],
+ 'td' : ['tr'],
+ 'th' : ['tr'],
+ 'thead' : ['table'],
+ 'tbody' : ['table'],
+ 'tfoot' : ['table'],
+ }
+
+ non_nestable_block_tags = ['address', 'form', 'p', 'pre']
+
+ #If one of these tags is encountered, all tags up to the next tag of
+ #this type are popped.
+ reset_nesting_tags = buildTagMap(None, nestable_block_tags, 'noscript',
+ non_nestable_block_tags,
+ nestable_list_tags,
+ nestable_table_tags)
+
+ nestable_tags = buildTagMap([], nestable_inline_tags, nestable_block_tags,
+ nestable_list_tags, nestable_table_tags)
+
+
def __init__(self, *args, **kwargs):
if not kwargs.has_key('smartQuotesTo'):
kwargs['smartQuotesTo'] = self.HTML_ENTITIES
@@ -1251,9 +1296,6 @@ class BeautifulStoneSoup(Tag):
or when BeautifulSoup makes an assumption counter to what you were
expecting."""
- nestable_tags = {}
- reset_nesting_tags = {}
-
ROOT_TAG_NAME = u'[document]'
# Used when determining whether a text node is all whitespace and
@@ -1408,9 +1450,9 @@ class BeautifulStoneSoup(Tag):
<td><tr><td> *<td>* should pop to 'tr', not the first 'td'
"""
- nestingResetTriggers = self.nestable_tags.get(name)
+ nestingResetTriggers = self.builder.nestable_tags.get(name)
isNestable = nestingResetTriggers != None
- isResetNesting = self.reset_nesting_tags.has_key(name)
+ isResetNesting = self.builder.reset_nesting_tags.has_key(name)
popTo = None
inclusive = True
for i in range(len(self.tagStack)-1, 0, -1):
@@ -1423,7 +1465,7 @@ class BeautifulStoneSoup(Tag):
if (nestingResetTriggers != None
and p.name in nestingResetTriggers) \
or (nestingResetTriggers == None and isResetNesting
- and self.reset_nesting_tags.has_key(p.name)):
+ and self.builder.reset_nesting_tags.has_key(p.name)):
#If we encounter one of the nesting reset triggers
#peculiar to this tag, or we encounter another tag
@@ -1543,47 +1585,6 @@ class BeautifulSoup(BeautifulStoneSoup):
kwargs['isHTML'] = True
BeautifulStoneSoup.__init__(self, *args, **kwargs)
- #According to the HTML standard, each of these inline tags can
- #contain another tag of the same type. Furthermore, it's common
- #to actually use these tags this way.
- nestable_inline_tags = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
- 'center']
-
- #According to the HTML standard, these block tags can contain
- #another tag of the same type. Furthermore, it's common
- #to actually use these tags this way.
- nestable_block_tags = ['blockquote', 'div', 'fieldset', 'ins', 'del']
-
- #Lists can contain other lists, but there are restrictions.
- nestable_list_tags = { 'ol' : [],
- 'ul' : [],
- 'li' : ['ul', 'ol'],
- 'dl' : [],
- 'dd' : ['dl'],
- 'dt' : ['dl'] }
-
- #Tables can contain other tables, but there are restrictions.
- nestable_table_tags = {'table' : [],
- 'tr' : ['table', 'tbody', 'tfoot', 'thead'],
- 'td' : ['tr'],
- 'th' : ['tr'],
- 'thead' : ['table'],
- 'tbody' : ['table'],
- 'tfoot' : ['table'],
- }
-
- non_nestable_block_tags = ['address', 'form', 'p', 'pre']
-
- #If one of these tags is encountered, all tags up to the next tag of
- #this type are popped.
- reset_nesting_tags = buildTagMap(None, nestable_block_tags, 'noscript',
- non_nestable_block_tags,
- nestable_list_tags,
- nestable_table_tags)
-
- nestable_tags = buildTagMap([], nestable_inline_tags, nestable_block_tags,
- nestable_list_tags, nestable_table_tags)
-
# Used to detect the charset in a META tag; see start_meta
CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)