import markupbase
import re

from util import buildSet, isList, isString
# element has taken care of import weirdness, so import name2codepoint
# from there to avoid duplicating the weirdness.
from element import name2codepoint
from element import (
    CData, Comment, Declaration, Entities, ProcessingInstruction)
from HTMLParser import HTMLParser, HTMLParseError

__all__ = ['TreeBuilder',
           'HTMLParserXMLTreeBuilder',
           'HTMLParserTreeBuilder',
           'XMLTreeBuilder',
           'HTMLTreeBuilder',
           'ICantBelieveItsValidHTMLTreeBuilder']

#This hack makes the HTMLParser-based tree builders able to parse XML
#with namespaces.
markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match


def buildTagMap(default, *args):
    """Turns a list of maps, lists, or scalars into a single map.
    Used to build the nestable_tags and reset_nesting_tags maps out
    of lists and partial maps."""
    built = {}
    for portion in args:
        if hasattr(portion, 'items'):
            #It's a map. Merge it.
            for k, v in portion.items():
                built[k] = v
        elif isList(portion) and not isString(portion):
            #It's a list. Map each item to the default.
            for k in portion:
                built[k] = default
        else:
            #It's a scalar. Map it to the default.
            built[portion] = default
    return built
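
# A quick illustration of what buildTagMap produces (hypothetical values,
# shown here for documentation only):
#
#   buildTagMap(None, 'noscript', ['p', 'form'], {'li': ['ul', 'ol']})
#   => {'noscript': None, 'p': None, 'form': None, 'li': ['ul', 'ol']}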

class TreeBuilder(Entities):
    smartQuotesTo = Entities.XML_ENTITIES
    preserve_whitespace_tags = buildSet()
    quote_tags = buildSet()
    self_closing_tags = buildSet()
    assume_html = False

    def __init__(self):
        self.soup = None

    def isSelfClosingTag(self, name):
        return name in self.self_closing_tags

    def reset(self):
        pass

    def feed(self):
        pass


class HTMLParserXMLTreeBuilder(HTMLParser, TreeBuilder):
    """This class defines a basic tree builder based on Python's built-in
    HTMLParser. The tree builder knows nothing about tag behavior except
    for the following:

      You can't close a tag without closing all the tags it encloses.
      That is, "<foo><bar></foo>" actually means
      "<foo><bar></bar></foo>".

    [Another possible explanation is "<foo></foo><bar></bar>", but unless
    you specify 'bar' in self_closing_tags, this class will never use
    that explanation.]

    This class is useful for parsing XML or made-up markup languages, or
    when BeautifulSoup makes an assumption counter to what you were
    expecting.

    HTMLParser will process most bad HTML, and the BeautifulSoup class
    has some tricks for dealing with some HTML that kills HTMLParser, but
    Beautiful Soup can nonetheless choke or lose data if your data uses
    self-closing tags or declarations incorrectly. This class uses
    regexes to sanitize input, avoiding the vast majority of these
    problems. If the problems don't apply to you, pass in False for
    markupMassage, and you'll get better performance.

    The default parser massage techniques fix the two most common
    instances of invalid HTML that choke HTMLParser:

     <br/> (No space between name of closing tag and tag close)
     <! --Comment--> (Extraneous whitespace in declaration)

    You can pass in a custom list of (RE object, replace method) tuples
    to get HTMLParserXMLTreeBuilder to scrub your input the way you
    want."""

    reset_nesting_tags = {}
    nestable_tags = {}

    MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
                       lambda x: x.group(1) + ' />'),
                      (re.compile('<!\s+([^<>]*)>'),
                       lambda x: '<!' + x.group(1) + '>')
                      ]
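
    # Callers can extend or replace these fixes. A hypothetical sketch
    # (the <wbr>-stripping rule below is illustrative, not part of this
    # module):
    #
    #   my_massage = HTMLParserXMLTreeBuilder.MARKUP_MASSAGE + [
    #       (re.compile('<wbr ?/?>'), lambda match: '')]
    #   builder = HTMLParserXMLTreeBuilder(markupMassage=my_massage)
    #
    # feed() runs each regex's sub() over the markup before parsing.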

    def __init__(self, convertEntities=None, markupMassage=True,
                 selfClosingTags=None,
                 smartQuotesTo=Entities.XML_ENTITIES):
        HTMLParser.__init__(self)
        self.soup = None
        self.convertEntities = convertEntities
        self.instanceSelfClosingTags = buildSet(selfClosingTags or [])
        self.markupMassage = markupMassage
        self.smartQuotesTo = smartQuotesTo
        self.quoteStack = []

        # Set the rules for how we'll deal with the entities we
        # encounter
        if self.convertEntities:
            # It doesn't make sense to convert encoded characters to
            # entities even while you're converting entities to Unicode.
            # Just convert it all to Unicode.
            self.smartQuotesTo = None
            if convertEntities == self.HTML_ENTITIES:
                self.convertXMLEntities = False
                self.convertHTMLEntities = True
                self.escapeUnrecognizedEntities = True
            elif convertEntities == self.XHTML_ENTITIES:
                self.convertXMLEntities = True
                self.convertHTMLEntities = True
                self.escapeUnrecognizedEntities = False
            elif convertEntities == self.XML_ENTITIES:
                self.convertXMLEntities = True
                self.convertHTMLEntities = False
                self.escapeUnrecognizedEntities = False
        else:
            self.convertXMLEntities = False
            self.convertHTMLEntities = False
            self.escapeUnrecognizedEntities = False

    def feed(self, markup):
        if markup is not None:
            if self.markupMassage:
                if not isList(self.markupMassage):
                    self.markupMassage = self.MARKUP_MASSAGE
                for fix, m in self.markupMassage:
                    markup = fix.sub(m, markup)
                # TODO: We get rid of markupMassage so that the
                # soup object can be deepcopied later on. Some
                # Python installations can't copy regexes. If anyone
                # was relying on the existence of markupMassage, this
                # might cause problems.
                #
                # XXX: This might not be necessary now that we've moved
                # the massage code into the builder.
                #del(self.markupMassage)
        HTMLParser.feed(self, markup)

    def isSelfClosingTag(self, name):
        """Returns true iff the given string is the name of a
        self-closing tag according to this parser."""
        return (name in self.self_closing_tags
                or name in self.instanceSelfClosingTags)

    def handle_starttag(self, name, attrs):
        if len(self.quoteStack) > 0:
            #This is not a real tag.
            #print "<%s> is not real!" % name
            attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
            self.handle_data('<%s%s>' % (name, attrs))
            return
        if not self.isSelfClosingTag(name):
            self.soup.endData()
            self._smartPop(name)
        tag = self.soup.handle_starttag(name, attrs)
        if tag is None:
            # The tag was filtered out by the SoupStrainer
            return
        if name in self.quote_tags:
            #print "Beginning quote (%s)" % name
            self.quoteStack.append(name)
            self.literal = 1
        if self.isSelfClosingTag(name):
            self.soup.handle_endtag(name)

    def handle_endtag(self, name):
        if self.quoteStack and self.quoteStack[-1] != name:
            #This is not a real end tag.
            #print "</%s> is not real!" % name
            self.handle_data('</%s>' % name)
            return
        self.soup.handle_endtag(name)
        if self.quoteStack and self.quoteStack[-1] == name:
            self.quoteStack.pop()
            self.literal = (len(self.quoteStack) > 0)

    def handle_data(self, content):
        #print "Handling data " + content
        self.soup.handle_data(content)

    def handle_pi(self, text):
        """Handle a processing instruction as a ProcessingInstruction
        object, possibly one with a %SOUP-ENCODING% slot into which an
        encoding will be plugged later."""
        if text[:3] == "xml":
            text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
        self._toStringSubclass(text, ProcessingInstruction)

    def handle_comment(self, text):
        "Handle comments as Comment objects."
        self._toStringSubclass(text, Comment)

    def handle_charref(self, ref):
        "Handle character references as data."
        if self.convertEntities:
            data = unichr(int(ref))
        else:
            data = '&#%s;' % ref
        self.handle_data(data)

    def handle_entityref(self, ref):
        """Handle entity references as data, possibly converting known
        HTML and/or XML entity references to the corresponding Unicode
        characters."""
        data = None
        if self.convertHTMLEntities:
            try:
                data = unichr(name2codepoint[ref])
            except KeyError:
                pass

        if not data and self.convertXMLEntities:
            data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)

        if not data and self.convertHTMLEntities and \
            not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
            # TODO: We've got a problem here. We're told this is
            # an entity reference, but it's not an XML entity
            # reference or an HTML entity reference. Nonetheless,
            # the logical thing to do is to pass it through as an
            # unrecognized entity reference.
            #
            # Except: when the input is "&carol;" this function
            # will be called with input "carol". When the input is
            # "AT&T", this function will be called with input
            # "T". We have no way of knowing whether a semicolon
            # was present originally, so we don't know whether
            # this is an unknown entity or just a misplaced
            # ampersand.
            #
            # The more common case is a misplaced ampersand, so I
            # escape the ampersand and omit the trailing semicolon.
            data = "&amp;%s" % ref
        if not data:
            # This case is different from the one above, because we
            # haven't already gone through a supposedly comprehensive
            # mapping of entities to Unicode characters. We might not
            # have gone through any mapping at all. So the chances are
            # very high that this is a real entity, and not a
            # misplaced ampersand.
            data = "&%s;" % ref
        self.handle_data(data)
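
    # How the convertEntities switches play out in practice (illustrative
    # inputs, for documentation only): with convertEntities=HTML_ENTITIES,
    # an "&eacute;" reference in the markup reaches the tree as u'\xe9';
    # with entity conversion turned off it is passed through unchanged as
    # the literal string "&eacute;".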


    def handle_decl(self, data):
        "Handle DOCTYPEs and the like as Declaration objects."
        self._toStringSubclass(data, Declaration)

    def _toStringSubclass(self, text, subclass):
        """Adds a certain piece of text to the tree as a NavigableString
        subclass."""
        self.soup.endData()
        self.handle_data(text)
        self.soup.endData(subclass)

    def _smartPop(self, name):
        """We need to pop up to the previous tag of this type, unless
        one of this tag's nesting reset triggers comes between this
        tag and the previous tag of this type, OR unless this tag is a
        generic nesting trigger and another generic nesting trigger
        comes between this tag and the previous tag of this type.

        Examples:
         <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
         <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
         <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.

         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
         <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
        """
        nestingResetTriggers = self.nestable_tags.get(name)
        isNestable = nestingResetTriggers != None
        isResetNesting = self.reset_nesting_tags.has_key(name)
        popTo = None
        inclusive = True
        for i in range(len(self.soup.tagStack)-1, 0, -1):
            p = self.soup.tagStack[i]
            if (not p or p.name == name) and not isNestable:
                #Non-nestable tags get popped to the top or to their
                #last occurrence.
                popTo = name
                break
            if (nestingResetTriggers != None
                and p.name in nestingResetTriggers) \
                or (nestingResetTriggers == None and isResetNesting
                    and self.reset_nesting_tags.has_key(p.name)):
                #If we encounter one of the nesting reset triggers
                #peculiar to this tag, or we encounter another tag
                #that causes nesting to reset, pop up to but not
                #including that tag.
                popTo = p.name
                inclusive = False
                break
            p = p.parent
        if popTo:
            self.soup._popToTag(popTo, inclusive)

    def parse_declaration(self, i):
        """Treat a bogus SGML declaration as raw data. Treat a CDATA
        declaration as a CData object."""
        j = None
        if self.rawdata[i:i+9] == '<![CDATA[':
            k = self.rawdata.find(']]>', i)
            if k == -1:
                k = len(self.rawdata)
            data = self.rawdata[i+9:k]
            j = k + 3
            self._toStringSubclass(data, CData)
        else:
            try:
                j = HTMLParser.parse_declaration(self, i)
            except HTMLParseError:
                toHandle = self.rawdata[i:]
                self.handle_data(toHandle)
                j = i + len(toHandle)
        return j
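
    # Note on CDATA handling (illustrative input, for documentation only):
    # a '<![CDATA[x < y]]>' section reaches parse_declaration() via
    # HTMLParser, and the enclosed text 'x < y' is handed to
    # _toStringSubclass() so it ends up in the tree as a CData node
    # rather than being parsed as markup.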


class HTMLParserTreeBuilder(HTMLParserXMLTreeBuilder):
    """This builder knows the following facts about HTML:

    * Some tags have no closing tag and should be interpreted as being
      closed as soon as they are encountered.

    * The text inside some tags (i.e. 'script') may contain tags which
      are not really part of the document and which should be parsed
      as text, not tags. If you want to parse the text as tags, you can
      always fetch it and parse it explicitly.

    * Tag nesting rules:

      Most tags can't be nested at all. For instance, the occurrence of
      a <p> tag should implicitly close the previous <p> tag.

       <p>Para1<p>Para2
        should be transformed into:
       <p>Para1</p><p>Para2

      Some tags can be nested arbitrarily. For instance, the occurrence
      of a <blockquote> tag should _not_ implicitly close the previous
      <blockquote> tag.

       Alice said: <blockquote>Bob said: <blockquote>Blah
        should NOT be transformed into:
       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah

      Some tags can be nested, but the nesting is reset by the
      interposition of other tags. For instance, a <tr> tag should
      implicitly close the previous <tr> tag within the same <table>,
      but not close a <tr> tag in another table.

       <table><tr>Blah<tr>Blah
        should be transformed into:
       <table><tr>Blah</tr><tr>Blah
        but,
       <tr>Blah<table><tr>Blah
        should NOT be transformed into
       <tr>Blah<table></tr><tr>Blah

    Differing assumptions about tag nesting rules are a major source of
    problems with the BeautifulSoup class. If BeautifulSoup is not
    treating as nestable a tag your page author treats as nestable, try
    subclassing this tree builder or using another parser's tree
    builder."""

    assume_html = True

    preserve_whitespace_tags = buildSet(['pre', 'textarea'])
    quote_tags = buildSet(['script', 'textarea'])
    self_closing_tags = buildSet(['br', 'hr', 'input', 'img', 'meta',
                                  'spacer', 'link', 'frame', 'base'])

    #According to the HTML standard, each of these inline tags can
    #contain another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    nestable_inline_tags = ['span', 'font', 'q', 'object', 'bdo', 'sub',
                            'sup', 'center']

    #According to the HTML standard, these block tags can contain
    #another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    nestable_block_tags = ['blockquote', 'div', 'fieldset', 'ins', 'del']

    #Lists can contain other lists, but there are restrictions.
    nestable_list_tags = {'ol': [],
                          'ul': [],
                          'li': ['ul', 'ol'],
                          'dl': [],
                          'dd': ['dl'],
                          'dt': ['dl']}

    #Tables can contain other tables, but there are restrictions.
    nestable_table_tags = {'table': [],
                           'tr': ['table', 'tbody', 'tfoot', 'thead'],
                           'td': ['tr'],
                           'th': ['tr'],
                           'thead': ['table'],
                           'tbody': ['table'],
                           'tfoot': ['table'],
                           }

    non_nestable_block_tags = ['address', 'form', 'p', 'pre']

    #If one of these tags is encountered, all tags up to the next tag of
    #this type are popped.
    reset_nesting_tags = buildTagMap(None, nestable_block_tags, 'noscript',
                                     non_nestable_block_tags,
                                     nestable_list_tags,
                                     nestable_table_tags)

    nestable_tags = buildTagMap([], nestable_inline_tags,
                                nestable_block_tags, nestable_list_tags,
                                nestable_table_tags)

    def __init__(self, *args, **kwargs):
        if not kwargs.has_key('smartQuotesTo'):
            kwargs['smartQuotesTo'] = self.HTML_ENTITIES
        HTMLParserXMLTreeBuilder.__init__(self, *args, **kwargs)


# Some aliases to use if you don't care about the underlying
# implementation.
XMLTreeBuilder = HTMLParserXMLTreeBuilder
HTMLTreeBuilder = HTMLParserTreeBuilder
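
# A minimal usage sketch (hypothetical; assumes a BeautifulSoup-style
# parser class that accepts a tree builder):
#
#   soup = BeautifulSoup('<p>One<p>Two', builder=HTMLTreeBuilder())
#
# HTMLTreeBuilder closes the first <p> before opening the second, because
# 'p' is in non_nestable_block_tags; XMLTreeBuilder, which knows nothing
# about HTML nesting rules, would instead nest the second <p> inside the
# first.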

class ICantBelieveItsValidHTMLTreeBuilder(HTMLParserTreeBuilder):
    """The BeautifulSoup class is oriented towards skipping over common
    HTML errors like unclosed tags. However, sometimes it makes errors
    of its own. For instance, consider this fragment:

     <b>Foo<b>Bar</b></b>

    This is perfectly valid (if bizarre) HTML. However, the
    BeautifulSoup class will implicitly close the first b tag when it
    encounters the second 'b'. It will think the author wrote
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because there's
    no real-world reason to bold something that's already bold. When it
    encounters '</b></b>' it will close two more 'b' tags, for a grand
    total of three tags closed instead of two. This can throw off the
    rest of your document structure. The same is true of a number of
    other tags, listed below.

    It's much more common for someone to forget to close a 'b' tag than
    to actually use nested 'b' tags, and the BeautifulSoup class handles
    the common case. This class handles the not-so-common case: where
    you can't believe someone wrote what they did, but it's valid HTML
    and BeautifulSoup screwed up by assuming it wouldn't be."""

    i_cant_believe_theyre_nestable_inline_tags = \
        ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
         'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
         'big']

    i_cant_believe_theyre_nestable_block_tags = ['noscript']

    nestable_tags = buildTagMap([],
                                HTMLParserTreeBuilder.nestable_tags,
                                i_cant_believe_theyre_nestable_block_tags,
                                i_cant_believe_theyre_nestable_inline_tags)
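
# When markup really does nest tags like 'b' (as in the fragment above),
# this builder can be swapped in for HTMLTreeBuilder (hypothetical sketch,
# same BeautifulSoup-style assumption as in the earlier usage note):
#
#   soup = BeautifulSoup(markup,
#                        builder=ICantBelieveItsValidHTMLTreeBuilder())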