diff options
Diffstat (limited to 'src/beautifulsoup/builder.py')
-rw-r--r-- | src/beautifulsoup/builder.py | 484 |
1 files changed, 484 insertions, 0 deletions
diff --git a/src/beautifulsoup/builder.py b/src/beautifulsoup/builder.py new file mode 100644 index 0000000..aeae1e8 --- /dev/null +++ b/src/beautifulsoup/builder.py @@ -0,0 +1,484 @@ +import markupbase +import re +from util import buildSet, isList, isString +# element has taken care of import weirdness, so import name2codepoint +# from there to avoid duplicating the weirdness. +from element import name2codepoint +from element import ( + CData, Comment, Declaration, Entities, ProcessingInstruction) +from HTMLParser import HTMLParser, HTMLParseError + +__all__ = ['TreeBuilder', + 'HTMLParserXMLTreeBuilder', + 'HTMLParserTreeBuilder', + 'XMLTreeBuilder', + 'HTMLTreeBuilder', + 'ICantBelieveItsValidHTMLTreeBuilder'] + +#This hack makes the HTMLParser-based tree builders able to parse XML +#with namespaces. +markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match + +def buildTagMap(default, *args): + """Turns a list of maps, lists, or scalars into a single map. + Used to build the nestable_tags and reset_nesting_tags maps out of + lists and partial maps.""" + built = {} + for portion in args: + if hasattr(portion, 'items'): + #It's a map. Merge it. + for k,v in portion.items(): + built[k] = v + elif isList(portion) and not isString(portion): + #It's a list. Map each item to the default. + for k in portion: + built[k] = default + else: + #It's a scalar. Map it to the default. + built[portion] = default + return built + + +class TreeBuilder(Entities): + + smartQuotesTo = Entities.XML_ENTITIES + preserve_whitespace_tags = buildSet() + quote_tags = buildSet() + self_closing_tags = buildSet() + assume_html = False + + def __init__(self): + self.soup = None + + def isSelfClosingTag(self, name): + return name in self.self_closing_tags + + def reset(self): + pass + + def feed(self): + pass + + +class HTMLParserXMLTreeBuilder(HTMLParser, TreeBuilder): + + """ + This class defines a basic tree builder based on Python's built-in + HTMLParser. The tree builder knows nothing about tag + behavior except for the following: + + You can't close a tag without closing all the tags it encloses. + That is, "<foo><bar></foo>" actually means + "<foo><bar></bar></foo>". + + [Another possible explanation is "<foo><bar /></foo>", but unless + you specify 'bar' in self_closing_tags, this class will never use + that explanation.] + + This class is useful for parsing XML or made-up markup languages, + or when BeautifulSoup makes an assumption counter to what you were + expecting. + + + HTMLParser will process most bad HTML, and the BeautifulSoup class + has some tricks for dealing with some HTML that kills HTMLParser, + but Beautiful Soup can nonetheless choke or lose data if your data + uses self-closing tags or declarations incorrectly. + + This class uses regexes to sanitize input, avoiding the vast + majority of these problems. If the problems don't apply to you, + pass in False for markupMassage, and you'll get better + performance. + + The default parser massage techniques fix the two most common + instances of invalid HTML that choke HTMLParser: + + <br/> (No space between name of closing tag and tag close) + <! --Comment--> (Extraneous whitespace in declaration) + + You can pass in a custom list of (RE object, replace method) + tuples to get HTMLParserXMLTreeBuilder to scrub your input the way you + want. + """ + reset_nesting_tags = {} + nestable_tags = {} + + MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'), + lambda x: x.group(1) + ' />'), + (re.compile('<!\s+([^<>]*)>'), + lambda x: '<!' + x.group(1) + '>') + ] + + def __init__(self, convertEntities=None, markupMassage=True, + selfClosingTags=None, + smartQuotesTo=Entities.XML_ENTITIES): + HTMLParser.__init__(self) + self.soup = None + self.convertEntities = convertEntities + self.instanceSelfClosingTags = buildSet(selfClosingTags or []) + self.markupMassage = markupMassage + self.smartQuotesTo = smartQuotesTo + self.quoteStack = [] + + # Set the rules for how we'll deal with the entities we + # encounter + if self.convertEntities: + # It doesn't make sense to convert encoded characters to + # entities even while you're converting entities to Unicode. + # Just convert it all to Unicode. + self.smartQuotesTo = None + if convertEntities == self.HTML_ENTITIES: + self.convertXMLEntities = False + self.convertHTMLEntities = True + self.escapeUnrecognizedEntities = True + elif convertEntities == self.XHTML_ENTITIES: + self.convertXMLEntities = True + self.convertHTMLEntities = True + self.escapeUnrecognizedEntities = False + elif convertEntities == self.XML_ENTITIES: + self.convertXMLEntities = True + self.convertHTMLEntities = False + self.escapeUnrecognizedEntities = False + else: + self.convertXMLEntities = False + self.convertHTMLEntities = False + self.escapeUnrecognizedEntities = False + + def feed(self, markup): + if markup is not None: + if self.markupMassage: + if not isList(self.markupMassage): + self.markupMassage = self.MARKUP_MASSAGE + for fix, m in self.markupMassage: + markup = fix.sub(m, markup) + # TODO: We get rid of markupMassage so that the + # soup object can be deepcopied later on. Some + # Python installations can't copy regexes. If anyone + # was relying on the existence of markupMassage, this + # might cause problems. + # XXX: This might not be necessary now that we've moved + # the massage code into the builder. + #del(self.markupMassage) + HTMLParser.feed(self, markup) + + def isSelfClosingTag(self, name): + """Returns true iff the given string is the name of a + self-closing tag according to this parser.""" + return (name in self.self_closing_tags + or name in self.instanceSelfClosingTags) + + def handle_starttag(self, name, attrs): + if len(self.quoteStack) > 0: + #This is not a real tag. + #print "<%s> is not real!" % name + attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs)) + self.handle_data('<%s%s>' % (name, attrs)) + return + if not self.isSelfClosingTag(name): + self.soup.endData() + self._smartPop(name) + tag = self.soup.handle_starttag(name, attrs) + if tag is None: + # The tag was filtered out by the SoupStrainer + return + if name in self.quote_tags: + #print "Beginning quote (%s)" % name + self.quoteStack.append(name) + self.literal = 1 + if self.isSelfClosingTag(name): + self.soup.handle_endtag(name) + + def handle_endtag(self, name): + if self.quoteStack and self.quoteStack[-1] != name: + #This is not a real end tag. + #print "</%s> is not real!" % name + self.handle_data('</%s>' % name) + return + self.soup.handle_endtag(name) + if self.quoteStack and self.quoteStack[-1] == name: + self.quoteStack.pop() + self.literal = (len(self.quoteStack) > 0) + + def handle_data(self, content): + #print "Handling data " + content + self.soup.handle_data(content) + + def handle_pi(self, text): + """Handle a processing instruction as a ProcessingInstruction + object, possibly one with a %SOUP-ENCODING% slot into which an + encoding will be plugged later.""" + if text[:3] == "xml": + text = u"xml version='1.0' encoding='%SOUP-ENCODING%'" + self._toStringSubclass(text, ProcessingInstruction) + + def handle_comment(self, text): + "Handle comments as Comment objects." + self._toStringSubclass(text, Comment) + + def handle_charref(self, ref): + "Handle character references as data." + if self.convertEntities: + data = unichr(int(ref)) + else: + data = '&#%s;' % ref + self.handle_data(data) + + def handle_entityref(self, ref): + """Handle entity references as data, possibly converting known + HTML and/or XML entity references to the corresponding Unicode + characters.""" + data = None + if self.convertHTMLEntities: + try: + data = unichr(name2codepoint[ref]) + except KeyError: + pass + + if not data and self.convertXMLEntities: + data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref) + + if not data and self.convertHTMLEntities and \ + not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref): + # TODO: We've got a problem here. We're told this is + # an entity reference, but it's not an XML entity + # reference or an HTML entity reference. Nonetheless, + # the logical thing to do is to pass it through as an + # unrecognized entity reference. + # + # Except: when the input is "&carol;" this function + # will be called with input "carol". When the input is + # "AT&T", this function will be called with input + # "T". We have no way of knowing whether a semicolon + # was present originally, so we don't know whether + # this is an unknown entity or just a misplaced + # ampersand. + # + # The more common case is a misplaced ampersand, so I + # escape the ampersand and omit the trailing semicolon. + data = "&%s" % ref + if not data: + # This case is different from the one above, because we + # haven't already gone through a supposedly comprehensive + # mapping of entities to Unicode characters. We might not + # have gone through any mapping at all. So the chances are + # very high that this is a real entity, and not a + # misplaced ampersand. + data = "&%s;" % ref + self.handle_data(data) + + def handle_decl(self, data): + "Handle DOCTYPEs and the like as Declaration objects." + self._toStringSubclass(data, Declaration) + + def _toStringSubclass(self, text, subclass): + """Adds a certain piece of text to the tree as a NavigableString + subclass.""" + self.soup.endData() + self.handle_data(text) + self.soup.endData(subclass) + + def _smartPop(self, name): + + """We need to pop up to the previous tag of this type, unless + one of this tag's nesting reset triggers comes between this + tag and the previous tag of this type, OR unless this tag is a + generic nesting trigger and another generic nesting trigger + comes between this tag and the previous tag of this type. + + Examples: + <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'. + <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'. + <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'. + + <li><ul><li> *<li>* should pop to 'ul', not the first 'li'. + <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr' + <td><tr><td> *<td>* should pop to 'tr', not the first 'td' + """ + + nestingResetTriggers = self.nestable_tags.get(name) + isNestable = nestingResetTriggers != None + isResetNesting = self.reset_nesting_tags.has_key(name) + popTo = None + inclusive = True + for i in range(len(self.soup.tagStack)-1, 0, -1): + p = self.soup.tagStack[i] + if (not p or p.name == name) and not isNestable: + #Non-nestable tags get popped to the top or to their + #last occurance. + popTo = name + break + if (nestingResetTriggers != None + and p.name in nestingResetTriggers) \ + or (nestingResetTriggers == None and isResetNesting + and self.reset_nesting_tags.has_key(p.name)): + + #If we encounter one of the nesting reset triggers + #peculiar to this tag, or we encounter another tag + #that causes nesting to reset, pop up to but not + #including that tag. + popTo = p.name + inclusive = False + break + p = p.parent + if popTo: + self.soup._popToTag(popTo, inclusive) + + def parse_declaration(self, i): + """Treat a bogus SGML declaration as raw data. Treat a CDATA + declaration as a CData object.""" + j = None + if self.rawdata[i:i+9] == '<![CDATA[': + k = self.rawdata.find(']]>', i) + if k == -1: + k = len(self.rawdata) + data = self.rawdata[i+9:k] + j = k+3 + self._toStringSubclass(data, CData) + else: + try: + j = HTMLParser.parse_declaration(self, i) + except HTMLParseError: + toHandle = self.rawdata[i:] + self.handle_data(toHandle) + j = i + len(toHandle) + return j + + +class HTMLParserTreeBuilder(HTMLParserXMLTreeBuilder): + """This builder knows the following facts about HTML: + + * Some tags have no closing tag and should be interpreted as being + closed as soon as they are encountered. + + * The text inside some tags (ie. 'script') may contain tags which + are not really part of the document and which should be parsed + as text, not tags. If you want to parse the text as tags, you can + always fetch it and parse it explicitly. + + * Tag nesting rules: + + Most tags can't be nested at all. For instance, the occurance of + a <p> tag should implicitly close the previous <p> tag. + + <p>Para1<p>Para2 + should be transformed into: + <p>Para1</p><p>Para2 + + Some tags can be nested arbitrarily. For instance, the occurance + of a <blockquote> tag should _not_ implicitly close the previous + <blockquote> tag. + + Alice said: <blockquote>Bob said: <blockquote>Blah + should NOT be transformed into: + Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah + + Some tags can be nested, but the nesting is reset by the + interposition of other tags. For instance, a <tr> tag should + implicitly close the previous <tr> tag within the same <table>, + but not close a <tr> tag in another table. + + <table><tr>Blah<tr>Blah + should be transformed into: + <table><tr>Blah</tr><tr>Blah + but, + <tr>Blah<table><tr>Blah + should NOT be transformed into + <tr>Blah<table></tr><tr>Blah + + Differing assumptions about tag nesting rules are a major source + of problems with the BeautifulSoup class. If BeautifulSoup is not + treating as nestable a tag your page author treats as nestable, + try subclassing this tree builder or using another parser's tree + builder.""" + + assume_html = True + preserve_whitespace_tags = buildSet(['pre', 'textarea']) + quote_tags = buildSet(['script', 'textarea']) + self_closing_tags = buildSet(['br' , 'hr', 'input', 'img', 'meta', + 'spacer', 'link', 'frame', 'base']) + + #According to the HTML standard, each of these inline tags can + #contain another tag of the same type. Furthermore, it's common + #to actually use these tags this way. + nestable_inline_tags = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup', + 'center'] + + #According to the HTML standard, these block tags can contain + #another tag of the same type. Furthermore, it's common + #to actually use these tags this way. + nestable_block_tags = ['blockquote', 'div', 'fieldset', 'ins', 'del'] + + #Lists can contain other lists, but there are restrictions. + nestable_list_tags = { 'ol' : [], + 'ul' : [], + 'li' : ['ul', 'ol'], + 'dl' : [], + 'dd' : ['dl'], + 'dt' : ['dl'] } + + #Tables can contain other tables, but there are restrictions. + nestable_table_tags = {'table' : [], + 'tr' : ['table', 'tbody', 'tfoot', 'thead'], + 'td' : ['tr'], + 'th' : ['tr'], + 'thead' : ['table'], + 'tbody' : ['table'], + 'tfoot' : ['table'], + } + + non_nestable_block_tags = ['address', 'form', 'p', 'pre'] + + #If one of these tags is encountered, all tags up to the next tag of + #this type are popped. + reset_nesting_tags = buildTagMap(None, nestable_block_tags, 'noscript', + non_nestable_block_tags, + nestable_list_tags, + nestable_table_tags) + + nestable_tags = buildTagMap([], nestable_inline_tags, nestable_block_tags, + nestable_list_tags, nestable_table_tags) + + + def __init__(self, *args, **kwargs): + if not kwargs.has_key('smartQuotesTo'): + kwargs['smartQuotesTo'] = self.HTML_ENTITIES + HTMLParserXMLTreeBuilder.__init__(self, *args, **kwargs) + +# Some aliases to use if you don't care about the underlying +# implementation. +XMLTreeBuilder = HTMLParserXMLTreeBuilder +HTMLTreeBuilder = HTMLParserTreeBuilder + +class ICantBelieveItsValidHTMLTreeBuilder(HTMLParserTreeBuilder): + """The is oriented towards skipping over + common HTML errors like unclosed tags. However, sometimes it makes + errors of its own. For instance, consider this fragment: + + <b>Foo<b>Bar</b></b> + + This is perfectly valid (if bizarre) HTML. However, the + BeautifulSoup class will implicitly close the first b tag when it + encounters the second 'b'. It will think the author wrote + "<b>Foo<b>Bar", and didn't close the first 'b' tag, because + there's no real-world reason to bold something that's already + bold. When it encounters '</b></b>' it will close two more 'b' + tags, for a grand total of three tags closed instead of two. This + can throw off the rest of your document structure. The same is + true of a number of other tags, listed below. + + It's much more common for someone to forget to close a 'b' tag + than to actually use nested 'b' tags, and the BeautifulSoup class + handles the common case. This class handles the not-co-common + case: where you can't believe someone wrote what they did, but + it's valid HTML and BeautifulSoup screwed up by assuming it + wouldn't be.""" + i_cant_believe_theyre_nestable_inline_tags = \ + ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong', + 'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b', + 'big'] + + i_cant_believe_theyre_nestable_block_tags = ['noscript'] + + nestable_tags = buildTagMap([], HTMLParserTreeBuilder.nestable_tags, + i_cant_believe_theyre_nestable_block_tags, + i_cant_believe_theyre_nestable_inline_tags) |