import markupbase
import re
from util import buildSet, isList, isString
# element has taken care of import weirdness, so import name2codepoint
# from there to avoid duplicating the weirdness.
from element import name2codepoint
from element import (
CData, Comment, Declaration, Entities, ProcessingInstruction)
from HTMLParser import HTMLParser, HTMLParseError
# Names exported by ``from <this module> import *``.
__all__ = ['TreeBuilder',
'HTMLParserXMLTreeBuilder',
'HTMLParserTreeBuilder',
'XMLTreeBuilder',
'HTMLTreeBuilder',
'ICantBelieveItsValidHTMLTreeBuilder']
# This hack makes the HTMLParser-based tree builders able to parse XML
# with namespaces: it replaces markupbase's declaration-name matcher
# with one whose character class also accepts ':' (as well as '-',
# '_' and '.') after the first letter.
# NOTE(review): this rebinds a private attribute of a shared module at
# import time, so it presumably affects every other user of markupbase
# in the process -- confirm that is acceptable.
markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match
def buildTagMap(default, *args):
    """Collapse any mix of maps, lists and scalars into one dictionary.

    Each positional argument after ``default`` contributes keys:

    * anything with an ``items`` attribute is merged key-for-key;
    * a list (that is not a string) maps each element to ``default``;
    * anything else is treated as a single key mapped to ``default``.

    Later arguments win when the same key appears more than once.
    Used to build the nestable_tags and reset_nesting_tags maps out of
    lists and partial maps.
    """
    tag_map = {}
    for piece in args:
        if hasattr(piece, 'items'):
            # A mapping: carry its key/value pairs over unchanged.
            tag_map.update(piece.items())
            continue
        if isList(piece) and not isString(piece):
            keys = piece
        else:
            # A scalar is handled as a one-element collection of keys.
            keys = [piece]
        for key in keys:
            tag_map[key] = default
    return tag_map
class TreeBuilder(Entities):
    """Base class holding the defaults shared by the tree builders.

    It provides empty tag sets, a membership test for self-closing
    tags, and no-op ``reset`` and ``feed`` hooks for subclasses to
    override.
    """

    # NOTE(review): presumably the entity set that "smart quotes" are
    # converted to -- confirm against the code that reads it.
    smartQuotesTo = Entities.XML_ENTITIES
    # NOTE(review): not read anywhere in this class; by its name, tags
    # whose whitespace must be kept -- confirm against the consumer.
    preserve_whitespace_tags = buildSet()
    # Tags whose content is treated as literal text by builders that
    # keep a quote stack.
    quote_tags = buildSet()
    # Tag names for which isSelfClosingTag() answers true.
    self_closing_tags = buildSet()
    # NOTE(review): not read in this class -- meaning to be confirmed.
    assume_html = False

    def __init__(self):
        # No soup object is attached yet.
        self.soup = None

    def isSelfClosingTag(self, name):
        """Return whether ``name`` is listed in ``self_closing_tags``."""
        return name in self.self_closing_tags

    def reset(self):
        """Do nothing; hook for subclasses."""
        pass

    def feed(self, markup=None):
        """Do nothing; hook for subclasses.

        ``markup`` is accepted (and ignored) so the base class can be
        called the same way as builders whose ``feed`` takes the
        markup to parse. It defaults to None so existing no-argument
        calls keep working.
        """
        pass
class HTMLParserXMLTreeBuilder(HTMLParser, TreeBuilder):
"""
This class defines a basic tree builder based on Python's built-in
HTMLParser. The tree builder knows nothing about tag
behavior except for the following:
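You can't close a tag without closing all the tags it encloses.
That is, "<foo><bar></foo>" actually means
"<foo><bar></bar></foo>".

By default the markup is first massaged with regular expressions
(see MARKUP_MASSAGE) that repair two common kinds of invalid markup:

 <br/> (No space between name of closing tag and tag close)
 <! --Comment--> (Extraneous whitespace in declaration)
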
You can pass in a custom list of (RE object, replace method)
tuples to get HTMLParserXMLTreeBuilder to scrub your input the way you
want.
"""
# Per-tag nesting rules. Both are empty here because this builder
# knows nothing about the behavior of individual tags.
reset_nesting_tags = {}
nestable_tags = {}
# Default (compiled regex, replacement function) pairs applied to the
# markup, in order, before it is parsed.
MARKUP_MASSAGE = [
    # "<br/>" -> "<br />": put a space before the "/>" that ends a tag.
    (re.compile(r'(<[^<>]*)/>'),
     lambda x: x.group(1) + ' />'),
    # "<! --Comment-->" -> "<!--Comment-->": drop the whitespace that
    # follows the "<!" of a declaration.
    (re.compile(r'<!\s+([^<>]*)>'),
     lambda x: '<!' + x.group(1) + '>'),
]
def __init__(self, convertEntities=None, markupMassage=True,
             selfClosingTags=None,
             smartQuotesTo=Entities.XML_ENTITIES):
    """Set up the parser state and the entity-conversion flags.

    convertEntities -- compared against HTML_ENTITIES, XHTML_ENTITIES
        and XML_ENTITIES to choose the conversion flags below. Any
        true value also switches smartQuotesTo off.
    markupMassage -- stored as given; feed() decides how to use it.
    selfClosingTags -- optional extra self-closing tag names, kept
        (via buildSet) in instanceSelfClosingTags.
    smartQuotesTo -- stored as given unless convertEntities is true.
    """
    HTMLParser.__init__(self)
    self.soup = None
    self.convertEntities = convertEntities
    self.instanceSelfClosingTags = buildSet(selfClosingTags or [])
    self.markupMassage = markupMassage
    self.quoteStack = []

    # It doesn't make sense to convert encoded characters to entities
    # even while you're converting entities to Unicode, so when
    # entities are being converted just convert it all to Unicode.
    if convertEntities:
        self.smartQuotesTo = None
    else:
        self.smartQuotesTo = smartQuotesTo

    # Set the rules for how we'll deal with the entities we
    # encounter, as (XML, HTML, escape-unrecognized) flags. The flags
    # are always assigned, so an unrecognized convertEntities value
    # behaves like "no conversion" instead of leaving them undefined.
    if convertEntities == self.HTML_ENTITIES:
        flags = (False, True, True)
    elif convertEntities == self.XHTML_ENTITIES:
        flags = (True, True, False)
    elif convertEntities == self.XML_ENTITIES:
        flags = (True, False, False)
    else:
        flags = (False, False, False)
    (self.convertXMLEntities,
     self.convertHTMLEntities,
     self.escapeUnrecognizedEntities) = flags
def feed(self, markup):
    """Massage ``markup`` if requested, then pass it to HTMLParser.

    When ``self.markupMassage`` is true, each (regex, replacement)
    pair in it is applied to the markup in order. A true value that
    is not a list (such as the default ``True``) selects the class's
    MARKUP_MASSAGE rules. ``None`` markup is ignored.
    """
    if markup is None:
        # Nothing to parse, and None must not reach HTMLParser.feed,
        # which expects string data.
        return
    if self.markupMassage:
        if not isList(self.markupMassage):
            self.markupMassage = self.MARKUP_MASSAGE
        for fix, m in self.markupMassage:
            markup = fix.sub(m, markup)
        # TODO: markupMassage used to be deleted here so that the
        # soup object could be deepcopied later on (some Python
        # installations can't copy regexes). That might not be
        # necessary now that the massage code lives in the builder,
        # so the attribute is left in place.
    HTMLParser.feed(self, markup)
def isSelfClosingTag(self, name):
    """Tell whether ``name`` is a self-closing tag for this parser.

    A tag qualifies if it is in the class-level ``self_closing_tags``
    or in the per-instance ``instanceSelfClosingTags``.
    """
    if name in self.self_closing_tags:
        return True
    return name in self.instanceSelfClosingTags
def handle_starttag(self, name, attrs):
    """Handle a start tag reported by HTMLParser.

    name -- the tag name.
    attrs -- a sequence of (attribute name, value) pairs.

    While a quote tag is open (``quoteStack`` is non-empty) the tag
    is not real markup: it is rebuilt as text and passed to
    handle_data(). Otherwise it is forwarded to the soup, and a
    self-closing tag is closed again straight away.
    """
    if self.quoteStack:
        # This is not a real tag; re-serialize it as character data.
        attr_text = ''.join(' %s="%s"' % (key, value)
                            for key, value in attrs)
        self.handle_data('<%s%s>' % (name, attr_text))
        return
    if not self.isSelfClosingTag(name):
        self.soup.endData()
        self._smartPop(name)
    tag = self.soup.handle_starttag(name, attrs)
    if tag is None:
        # The tag was filtered out by the SoupStrainer
        return
    if name in self.quote_tags:
        # Everything up to the matching end tag is literal text.
        self.quoteStack.append(name)
        self.literal = 1
    if self.isSelfClosingTag(name):
        self.soup.handle_endtag(name)
def handle_endtag(self, name):
    """Handle an end tag reported by HTMLParser.

    While a quote tag is open, an end tag that does not match the
    innermost quote tag is not real markup: it is rebuilt as the
    text ``</name>`` and passed to handle_data(). Otherwise the end
    tag is forwarded to the soup, and a matching quote tag is taken
    off ``quoteStack``.
    """
    if self.quoteStack and self.quoteStack[-1] != name:
        # This is not a real end tag; re-serialize it as character
        # data, mirroring what handle_starttag does for start tags.
        self.handle_data('</%s>' % name)
        return
    self.soup.handle_endtag(name)
    if self.quoteStack and self.quoteStack[-1] == name:
        self.quoteStack.pop()
        # Stay in literal mode only while another quote tag is open.
        self.literal = (len(self.quoteStack) > 0)
def handle_data(self, content):
    """Forward a run of character data to the soup being built."""
    soup = self.soup
    soup.handle_data(content)
def handle_pi(self, text):
    """Add a processing instruction to the tree.

    An instruction whose text starts with "xml" is replaced by a
    canonical XML declaration containing a %SOUP-ENCODING% slot into
    which an encoding will be plugged later. Any other instruction
    is stored as given. Either way the text is added as a
    ProcessingInstruction object.
    """
    is_xml_declaration = text.startswith("xml")
    if is_xml_declaration:
        text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
    self._toStringSubclass(text, ProcessingInstruction)
def handle_comment(self, text):
    """Add the text of a comment to the tree as a Comment object."""
    node_class = Comment
    self._toStringSubclass(text, node_class)
def handle_charref(self, ref):
    """Handle a numeric character reference as data.

    ref -- the text between "&#" and ";": decimal digits, or "x"/"X"
        followed by hexadecimal digits.

    When entities are being converted the reference becomes the
    corresponding Unicode character. Otherwise it is passed through
    unchanged, as "&#<ref>;".
    """
    if self.convertEntities:
        if ref[:1] in ('x', 'X'):
            # Hexadecimal form, e.g. "x41" for "&#x41;".
            codepoint = int(ref[1:], 16)
        else:
            codepoint = int(ref)
        data = unichr(codepoint)
    else:
        data = '&#%s;' % ref
    self.handle_data(data)
def handle_entityref(self, ref):
    """Handle a named entity reference as data.

    ref -- the entity name, without the leading "&" or any ";".

    Depending on the conversion flags, known HTML and/or XML entity
    references are converted to the corresponding characters.
    Anything else is passed through as text (see the comments below
    for the two pass-through forms).
    """
    data = None
    if self.convertHTMLEntities:
        codepoint = name2codepoint.get(ref)
        if codepoint is not None:
            data = unichr(codepoint)
    if not data and self.convertXMLEntities:
        data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
    if not data and self.convertHTMLEntities and \
            not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
        # TODO: We've got a problem here. We're told this is
        # an entity reference, but it's not an XML entity
        # reference or an HTML entity reference. Nonetheless,
        # the logical thing to do is to pass it through as an
        # unrecognized entity reference.
        #
        # Except: when the input is "&carol;" this function
        # will be called with input "carol". When the input is
        # "AT&T", this function will be called with input
        # "T". We have no way of knowing whether a semicolon
        # was present originally, so we don't know whether
        # this is an unknown entity or just a misplaced
        # ampersand.
        #
        # The more common case is a misplaced ampersand, so I
        # escape the ampersand and omit the trailing semicolon.
        data = "&amp;%s" % ref
    if not data:
        # This case is different from the one above, because we
        # haven't already gone through a supposedly comprehensive
        # mapping of entities to Unicode characters. We might not
        # have gone through any mapping at all. So the chances are
        # very high that this is a real entity, and not a
        # misplaced ampersand.
        data = "&%s;" % ref
    self.handle_data(data)
def handle_decl(self, data):
    """Add a DOCTYPE or similar declaration as a Declaration object."""
    node_class = Declaration
    self._toStringSubclass(data, node_class)
def _toStringSubclass(self, text, subclass):
    """Add ``text`` to the tree as an instance of ``subclass``.

    ``subclass`` is expected to be a NavigableString subclass. Any
    data already pending in the soup is ended first, so that
    ``text`` ends up as a node of its own.
    """
    soup = self.soup
    soup.endData()
    self.handle_data(text)
    soup.endData(subclass)
def _smartPop(self, name):
    """Close whatever open tags a new ``name`` start tag implies.

    We need to pop up to the previous tag of this type, unless
    one of this tag's nesting reset triggers comes between this
    tag and the previous tag of this type, OR unless this tag is a
    generic nesting trigger and another generic nesting trigger
    comes between this tag and the previous tag of this type.

    Examples (assuming the usual HTML nesting rules are configured
    in nestable_tags / reset_nesting_tags):
     <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
     <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
    """
    nestingResetTriggers = self.nestable_tags.get(name)
    isNestable = nestingResetTriggers is not None
    isResetNesting = name in self.reset_nesting_tags
    popTo = None
    inclusive = True
    # Walk the open tags from the innermost one outwards. The entry
    # at index 0 of the stack is never examined.
    for p in reversed(self.soup.tagStack[1:]):
        if (not p or p.name == name) and not isNestable:
            # Non-nestable tags get popped to the top or to their
            # last occurrence.
            popTo = name
            break
        if isNestable:
            hitsTrigger = p.name in nestingResetTriggers
        else:
            hitsTrigger = (isResetNesting
                           and p.name in self.reset_nesting_tags)
        if hitsTrigger:
            # If we encounter one of the nesting reset triggers
            # peculiar to this tag, or we encounter another tag
            # that causes nesting to reset, pop up to but not
            # including that tag.
            popTo = p.name
            inclusive = False
            break
    if popTo:
        self.soup._popToTag(popTo, inclusive)
def parse_declaration(self, i):
    """Parse the declaration starting at index ``i`` of ``rawdata``.

    Treat a CDATA declaration as a CData object. Treat a bogus SGML
    declaration (one HTMLParser rejects with HTMLParseError) as raw
    data.

    Returns the index at which parsing should resume.
    """
    if self.rawdata[i:i+9] == '<![CDATA[':
        k = self.rawdata.find(']]>', i)
        if k == -1:
            # Unterminated section: take everything up to the end of
            # the buffer as its content.
            k = len(self.rawdata)
        data = self.rawdata[i+9:k]
        # Resume just past the three-character "]]>" terminator.
        j = k+3
        self._toStringSubclass(data, CData)
    else:
        try:
            j = HTMLParser.parse_declaration(self, i)
        except HTMLParseError:
            # Not a declaration HTMLParser understands: emit the rest
            # of the buffer as plain data and consume all of it.
            toHandle = self.rawdata[i:]
            self.handle_data(toHandle)
            j = i + len(toHandle)
    return j
class HTMLParserTreeBuilder(HTMLParserXMLTreeBuilder):
"""This builder knows the following facts about HTML:
* Some tags have no closing tag and should be interpreted as being
closed as soon as they are encountered.
* The text inside some tags (ie. 'script') may contain tags which
are not really part of the document and which should be parsed
as text, not tags. If you want to parse the text as tags, you can
always fetch it and parse it explicitly.
* Tag nesting rules:
Most tags can't be nested at all. For instance, the occurance of
a tag should implicitly close the previous tag. Para1 Para2 should be transformed into: Para1 Para2 Some tags can be nested arbitrarily. For instance, the occurance of a tag should _not_ implicitly close the previoustag. Alice said:Bob said:Blah should NOT be transformed into: Alice said:Bob said:Blah Some tags can be nested, but the nesting is reset by the interposition of other tags. For instance, a |