diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2010-12-28 11:12:20 -0500 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2010-12-28 11:12:20 -0500 |
commit | 30cb1ad76ee23fb89f0611db3f7b9a26cbbce06e (patch) | |
tree | 2ae8cd9f786b9b101075b9ae59f5979019260e89 | |
parent | d575daed6b0ac6fffc691c44e256a3e8de8e8cdc (diff) |
Removed the HTMLParser tree builder to give me some room to maneuver.
-rw-r--r-- | src/beautifulsoup/__init__.py | 20 | ||||
-rw-r--r-- | src/beautifulsoup/builder/__init__.py | 474 | ||||
-rw-r--r-- | src/beautifulsoup/builder/lxml_builder.py | 12 | ||||
-rw-r--r-- | src/beautifulsoup/tests/helpers.py | 1 | ||||
-rw-r--r-- | src/beautifulsoup/tests/test_soup.py | 5 |
5 files changed, 30 insertions, 482 deletions
diff --git a/src/beautifulsoup/__init__.py b/src/beautifulsoup/__init__.py index 0999c07..e4a8ca4 100644 --- a/src/beautifulsoup/__init__.py +++ b/src/beautifulsoup/__init__.py @@ -83,10 +83,6 @@ __all__ = ['BeautifulSoup', # Stuff imported from other packages 'Entities', - 'HTMLParserXMLTreeBuilder', - 'HTMLParserTreeBuilder', - 'HTMLTreeBuilder', - 'XMLTreeBuilder', 'BeautifulStoneSoup', 'ICantBelieveItsBeautifulSoup'] @@ -95,9 +91,6 @@ import re from util import isList, isString, buildSet from dammit import UnicodeDammit -from builder import ( - HTMLParserXMLTreeBuilder, HTMLParserTreeBuilder, HTMLTreeBuilder, - ICantBelieveItsValidHTMLTreeBuilder, XMLTreeBuilder) from element import Entities, NavigableString, Tag @@ -137,7 +130,9 @@ class BeautifulStoneSoup(Tag): STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, } def _defaultBuilder(self): - return HTMLParserXMLTreeBuilder() + from lxml import etree + from builder.lxml_builder import LXMLTreeBuilder + return LXMLTreeBuilder(parser_class=etree.XMLParser) def __init__(self, markup="", builder=None, parseOnlyThese=None, fromEncoding=None): @@ -175,7 +170,7 @@ class BeautifulStoneSoup(Tag): else: dammit = UnicodeDammit\ (markup, [self.fromEncoding, inDocumentEncoding], - smartQuotesTo=self.builder.smartQuotesTo, isHTML=isHTML) + smartQuotesTo=self.builder.smart_quotes_to, isHTML=isHTML) markup = dammit.unicode self.originalEncoding = dammit.originalEncoding self.declaredHTMLEncoding = dammit.declaredHTMLEncoding @@ -349,7 +344,12 @@ class BeautifulStoneSoup(Tag): class BeautifulSoup(BeautifulStoneSoup): """A convenience class for parsing HTML without creating a builder.""" def _defaultBuilder(self): - return HTMLParserTreeBuilder() + try: + from builder.html5_builder import HTML5TreeBuilder + return HTML5TreeBuilder() + except ImportError: + from builder.lxml_builder import LXMLTreeBuilder + return LXMLTreeBuilder() class StopParsing(Exception): diff --git a/src/beautifulsoup/builder/__init__.py b/src/beautifulsoup/builder/__init__.py index 0d2ad14..544e896 100644 --- a/src/beautifulsoup/builder/__init__.py +++ b/src/beautifulsoup/builder/__init__.py @@ -1,54 +1,19 @@ -import markupbase -import re -from HTMLParser import HTMLParser, HTMLParseError -# element has taken care of import weirdness, so import name2codepoint -# from there to avoid duplicating the weirdness. -from beautifulsoup.element import name2codepoint -from beautifulsoup.element import ( - CData, Comment, Declaration, Entities, ProcessingInstruction) -from beautifulsoup.util import buildSet, isList, isString +from beautifulsoup.element import Entities __all__ = ['TreeBuilder', - 'HTMLParserXMLTreeBuilder', - 'HTMLParserTreeBuilder', - 'XMLTreeBuilder', 'HTMLTreeBuilder', - 'ICantBelieveItsValidHTMLTreeBuilder'] - -#This hack makes the HTMLParser-based tree builders able to parse XML -#with namespaces. -markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match - -def buildTagMap(default, *args): - """Turns a list of maps, lists, or scalars into a single map. - Used to build the nestable_tags and reset_nesting_tags maps out of - lists and partial maps.""" - built = {} - for portion in args: - if hasattr(portion, 'items'): - #It's a map. Merge it. - for k,v in portion.items(): - built[k] = v - elif isList(portion) and not isString(portion): - #It's a list. Map each item to the default. - for k in portion: - built[k] = default - else: - #It's a scalar. Map it to the default. - built[portion] = default - return built - + ] class TreeBuilder(Entities): + """Turn a document into a Beautiful Soup object tree.""" - smartQuotesTo = Entities.XML_ENTITIES - preserve_whitespace_tags = buildSet() - quote_tags = buildSet() - self_closing_tags = buildSet() assume_html = False + smart_quotes_to = Entities.XML_ENTITIES def __init__(self): self.soup = None + self.self_closing_tags = set() + self.preserve_whitespace_tags = set() def isSelfClosingTag(self, name): return name in self.self_closing_tags @@ -56,429 +21,20 @@ class TreeBuilder(Entities): def reset(self): pass - def feed(self): - pass - - -class HTMLParserXMLTreeBuilder(HTMLParser, TreeBuilder): - - """ - This class defines a basic tree builder based on Python's built-in - HTMLParser. The tree builder knows nothing about tag - behavior except for the following: - - You can't close a tag without closing all the tags it encloses. - That is, "<foo><bar></foo>" actually means - "<foo><bar></bar></foo>". - - [Another possible explanation is "<foo><bar /></foo>", but unless - you specify 'bar' in self_closing_tags, this class will never use - that explanation.] - - This class is useful for parsing XML or made-up markup languages, - or when BeautifulSoup makes an assumption counter to what you were - expecting. - - - HTMLParser will process most bad HTML, and the BeautifulSoup class - has some tricks for dealing with some HTML that kills HTMLParser, - but Beautiful Soup can nonetheless choke or lose data if your data - uses self-closing tags or declarations incorrectly. - - This class uses regexes to sanitize input, avoiding the vast - majority of these problems. If the problems don't apply to you, - pass in False for markupMassage, and you'll get better - performance. - - The default parser massage techniques fix the two most common - instances of invalid HTML that choke HTMLParser: - - <br/> (No space between name of closing tag and tag close) - <! --Comment--> (Extraneous whitespace in declaration) - - You can pass in a custom list of (RE object, replace method) - tuples to get HTMLParserXMLTreeBuilder to scrub your input the way you - want. - """ - reset_nesting_tags = {} - nestable_tags = {} - - MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'), - lambda x: x.group(1) + ' />'), - (re.compile('<!\s+([^<>]*)>'), - lambda x: '<!' + x.group(1) + '>') - ] - - def __init__(self, convertEntities=None, markupMassage=True, - selfClosingTags=None, - smartQuotesTo=Entities.XML_ENTITIES): - HTMLParser.__init__(self) - self.soup = None - self.convertEntities = convertEntities - self.instanceSelfClosingTags = buildSet(selfClosingTags or []) - self.markupMassage = markupMassage - self.smartQuotesTo = smartQuotesTo - self.quoteStack = [] - - # Set the rules for how we'll deal with the entities we - # encounter - if self.convertEntities: - # It doesn't make sense to convert encoded characters to - # entities even while you're converting entities to Unicode. - # Just convert it all to Unicode. - self.smartQuotesTo = None - if convertEntities == self.HTML_ENTITIES: - self.convertXMLEntities = False - self.convertHTMLEntities = True - self.escapeUnrecognizedEntities = True - elif convertEntities == self.XHTML_ENTITIES: - self.convertXMLEntities = True - self.convertHTMLEntities = True - self.escapeUnrecognizedEntities = False - elif convertEntities == self.XML_ENTITIES: - self.convertXMLEntities = True - self.convertHTMLEntities = False - self.escapeUnrecognizedEntities = False - else: - self.convertXMLEntities = False - self.convertHTMLEntities = False - self.escapeUnrecognizedEntities = False - def feed(self, markup): - if markup is not None: - if self.markupMassage: - if not isList(self.markupMassage): - self.markupMassage = self.MARKUP_MASSAGE - for fix, m in self.markupMassage: - markup = fix.sub(m, markup) - # TODO: We get rid of markupMassage so that the - # soup object can be deepcopied later on. Some - # Python installations can't copy regexes. If anyone - # was relying on the existence of markupMassage, this - # might cause problems. - # XXX: This might not be necessary now that we've moved - # the massage code into the builder. - #del(self.markupMassage) - HTMLParser.feed(self, markup) - - def isSelfClosingTag(self, name): - """Returns true iff the given string is the name of a - self-closing tag according to this parser.""" - return (name in self.self_closing_tags - or name in self.instanceSelfClosingTags) - - def handle_starttag(self, name, attrs): - if len(self.quoteStack) > 0: - #This is not a real tag. - #print "<%s> is not real!" % name - attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs)) - self.handle_data('<%s%s>' % (name, attrs)) - return - if not self.isSelfClosingTag(name): - self.soup.endData() - self._smartPop(name) - tag = self.soup.handle_starttag(name, attrs) - if tag is None: - # The tag was filtered out by the SoupStrainer - return - if name in self.quote_tags: - #print "Beginning quote (%s)" % name - self.quoteStack.append(name) - self.literal = 1 - if self.isSelfClosingTag(name): - self.soup.handle_endtag(name) - - def handle_endtag(self, name): - if self.quoteStack and self.quoteStack[-1] != name: - #This is not a real end tag. - #print "</%s> is not real!" % name - self.handle_data('</%s>' % name) - return - self.soup.handle_endtag(name) - if self.quoteStack and self.quoteStack[-1] == name: - self.quoteStack.pop() - self.literal = (len(self.quoteStack) > 0) - - def handle_data(self, content): - #print "Handling data " + content - self.soup.handle_data(content) - - def handle_pi(self, text): - """Handle a processing instruction as a ProcessingInstruction - object, possibly one with a %SOUP-ENCODING% slot into which an - encoding will be plugged later.""" - if text[:3] == "xml": - text = u"xml version='1.0' encoding='%SOUP-ENCODING%'" - self._toStringSubclass(text, ProcessingInstruction) - - def handle_comment(self, text): - "Handle comments as Comment objects." - self._toStringSubclass(text, Comment) - - def handle_charref(self, ref): - "Handle character references as data." - if self.convertEntities: - data = unichr(int(ref)) - else: - data = '&#%s;' % ref - self.handle_data(data) - - def handle_entityref(self, ref): - """Handle entity references as data, possibly converting known - HTML and/or XML entity references to the corresponding Unicode - characters.""" - data = None - if self.convertHTMLEntities: - try: - data = unichr(name2codepoint[ref]) - except KeyError: - pass - - if not data and self.convertXMLEntities: - data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref) - - if not data and self.convertHTMLEntities and \ - not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref): - # TODO: We've got a problem here. We're told this is - # an entity reference, but it's not an XML entity - # reference or an HTML entity reference. Nonetheless, - # the logical thing to do is to pass it through as an - # unrecognized entity reference. - # - # Except: when the input is "&carol;" this function - # will be called with input "carol". When the input is - # "AT&T", this function will be called with input - # "T". We have no way of knowing whether a semicolon - # was present originally, so we don't know whether - # this is an unknown entity or just a misplaced - # ampersand. - # - # The more common case is a misplaced ampersand, so I - # escape the ampersand and omit the trailing semicolon. - data = "&%s" % ref - if not data: - # This case is different from the one above, because we - # haven't already gone through a supposedly comprehensive - # mapping of entities to Unicode characters. We might not - # have gone through any mapping at all. So the chances are - # very high that this is a real entity, and not a - # misplaced ampersand. - data = "&%s;" % ref - self.handle_data(data) + raise NotImplementedError() - def handle_decl(self, data): - "Handle DOCTYPEs and the like as Declaration objects." - self._toStringSubclass(data, Declaration) - def _toStringSubclass(self, text, subclass): - """Adds a certain piece of text to the tree as a NavigableString - subclass.""" - self.soup.endData() - self.handle_data(text) - self.soup.endData(subclass) +class HTMLTreeBuilder(TreeBuilder): + """This TreeBuilder knows facts about HTML. - def _smartPop(self, name): - - """We need to pop up to the previous tag of this type, unless - one of this tag's nesting reset triggers comes between this - tag and the previous tag of this type, OR unless this tag is a - generic nesting trigger and another generic nesting trigger - comes between this tag and the previous tag of this type. - - Examples: - <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'. - <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'. - <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'. - - <li><ul><li> *<li>* should pop to 'ul', not the first 'li'. - <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr' - <td><tr><td> *<td>* should pop to 'tr', not the first 'td' - """ - - nestingResetTriggers = self.nestable_tags.get(name) - isNestable = nestingResetTriggers != None - isResetNesting = self.reset_nesting_tags.has_key(name) - popTo = None - inclusive = True - for i in range(len(self.soup.tagStack)-1, 0, -1): - p = self.soup.tagStack[i] - if (not p or p.name == name) and not isNestable: - #Non-nestable tags get popped to the top or to their - #last occurance. - popTo = name - break - if (nestingResetTriggers != None - and p.name in nestingResetTriggers) \ - or (nestingResetTriggers == None and isResetNesting - and self.reset_nesting_tags.has_key(p.name)): - - #If we encounter one of the nesting reset triggers - #peculiar to this tag, or we encounter another tag - #that causes nesting to reset, pop up to but not - #including that tag. - popTo = p.name - inclusive = False - break - p = p.parent - if popTo: - self.soup._popToTag(popTo, inclusive) - - def parse_declaration(self, i): - """Treat a bogus SGML declaration as raw data. Treat a CDATA - declaration as a CData object.""" - j = None - if self.rawdata[i:i+9] == '<![CDATA[': - k = self.rawdata.find(']]>', i) - if k == -1: - k = len(self.rawdata) - data = self.rawdata[i+9:k] - j = k+3 - self._toStringSubclass(data, CData) - else: - try: - j = HTMLParser.parse_declaration(self, i) - except HTMLParseError: - toHandle = self.rawdata[i:] - self.handle_data(toHandle) - j = i + len(toHandle) - return j - - -class HTMLParserTreeBuilder(HTMLParserXMLTreeBuilder): - """This builder knows the following facts about HTML: - - * Some tags have no closing tag and should be interpreted as being - closed as soon as they are encountered. - - * The text inside some tags (ie. 'script') may contain tags which - are not really part of the document and which should be parsed - as text, not tags. If you want to parse the text as tags, you can - always fetch it and parse it explicitly. - - * Tag nesting rules: - - Most tags can't be nested at all. For instance, the occurance of - a <p> tag should implicitly close the previous <p> tag. - - <p>Para1<p>Para2 - should be transformed into: - <p>Para1</p><p>Para2 - - Some tags can be nested arbitrarily. For instance, the occurance - of a <blockquote> tag should _not_ implicitly close the previous - <blockquote> tag. - - Alice said: <blockquote>Bob said: <blockquote>Blah - should NOT be transformed into: - Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah - - Some tags can be nested, but the nesting is reset by the - interposition of other tags. For instance, a <tr> tag should - implicitly close the previous <tr> tag within the same <table>, - but not close a <tr> tag in another table. - - <table><tr>Blah<tr>Blah - should be transformed into: - <table><tr>Blah</tr><tr>Blah - but, - <tr>Blah<table><tr>Blah - should NOT be transformed into - <tr>Blah<table></tr><tr>Blah - - Differing assumptions about tag nesting rules are a major source - of problems with the BeautifulSoup class. If BeautifulSoup is not - treating as nestable a tag your page author treats as nestable, - try subclassing this tree builder or using another parser's tree - builder.""" + Such as which tags are self-closing tags. + """ assume_html = True - preserve_whitespace_tags = buildSet(['pre', 'textarea']) - quote_tags = buildSet(['script', 'textarea']) - self_closing_tags = buildSet(['br' , 'hr', 'input', 'img', 'meta', - 'spacer', 'link', 'frame', 'base']) - - #According to the HTML standard, each of these inline tags can - #contain another tag of the same type. Furthermore, it's common - #to actually use these tags this way. - nestable_inline_tags = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup', - 'center'] - - #According to the HTML standard, these block tags can contain - #another tag of the same type. Furthermore, it's common - #to actually use these tags this way. - nestable_block_tags = ['blockquote', 'div', 'fieldset', 'ins', 'del'] - - #Lists can contain other lists, but there are restrictions. - nestable_list_tags = { 'ol' : [], - 'ul' : [], - 'li' : ['ul', 'ol'], - 'dl' : [], - 'dd' : ['dl'], - 'dt' : ['dl'] } - - #Tables can contain other tables, but there are restrictions. - nestable_table_tags = {'table' : [], - 'tr' : ['table', 'tbody', 'tfoot', 'thead'], - 'td' : ['tr'], - 'th' : ['tr'], - 'thead' : ['table'], - 'tbody' : ['table'], - 'tfoot' : ['table'], - } - - non_nestable_block_tags = ['address', 'form', 'p', 'pre'] - - #If one of these tags is encountered, all tags up to the next tag of - #this type are popped. - reset_nesting_tags = buildTagMap(None, nestable_block_tags, 'noscript', - non_nestable_block_tags, - nestable_list_tags, - nestable_table_tags) - - nestable_tags = buildTagMap([], nestable_inline_tags, nestable_block_tags, - nestable_list_tags, nestable_table_tags) - - - def __init__(self, *args, **kwargs): - if not kwargs.has_key('smartQuotesTo'): - kwargs['smartQuotesTo'] = self.HTML_ENTITIES - HTMLParserXMLTreeBuilder.__init__(self, *args, **kwargs) - -# Some aliases to use if you don't care about the underlying -# implementation. -XMLTreeBuilder = HTMLParserXMLTreeBuilder -HTMLTreeBuilder = HTMLParserTreeBuilder - -class ICantBelieveItsValidHTMLTreeBuilder(HTMLParserTreeBuilder): - """The is oriented towards skipping over - common HTML errors like unclosed tags. However, sometimes it makes - errors of its own. For instance, consider this fragment: - - <b>Foo<b>Bar</b></b> - - This is perfectly valid (if bizarre) HTML. However, the - BeautifulSoup class will implicitly close the first b tag when it - encounters the second 'b'. It will think the author wrote - "<b>Foo<b>Bar", and didn't close the first 'b' tag, because - there's no real-world reason to bold something that's already - bold. When it encounters '</b></b>' it will close two more 'b' - tags, for a grand total of three tags closed instead of two. This - can throw off the rest of your document structure. The same is - true of a number of other tags, listed below. - - It's much more common for someone to forget to close a 'b' tag - than to actually use nested 'b' tags, and the BeautifulSoup class - handles the common case. This class handles the not-co-common - case: where you can't believe someone wrote what they did, but - it's valid HTML and BeautifulSoup screwed up by assuming it - wouldn't be.""" - i_cant_believe_theyre_nestable_inline_tags = \ - ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong', - 'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b', - 'big'] + smart_quotes_to = Entities.HTML_ENTITIES - i_cant_believe_theyre_nestable_block_tags = ['noscript'] + preserve_whitespace_tags = set(['pre', 'textarea']) + self_closing_tags = set(['br' , 'hr', 'input', 'img', 'meta', + 'spacer', 'link', 'frame', 'base']) - nestable_tags = buildTagMap([], HTMLParserTreeBuilder.nestable_tags, - i_cant_believe_theyre_nestable_block_tags, - i_cant_believe_theyre_nestable_inline_tags) diff --git a/src/beautifulsoup/builder/lxml_builder.py b/src/beautifulsoup/builder/lxml_builder.py index d8bf3f0..4949fea 100644 --- a/src/beautifulsoup/builder/lxml_builder.py +++ b/src/beautifulsoup/builder/lxml_builder.py @@ -1,19 +1,13 @@ from lxml import etree from beautifulsoup.element import Comment -from beautifulsoup.builder import HTMLParserTreeBuilder, TreeBuilder +from beautifulsoup.builder import HTMLTreeBuilder -class LXMLTreeBuilder(TreeBuilder): +class LXMLTreeBuilder(HTMLTreeBuilder): - def __init__(self, parser_class=etree.HTMLParser, self_closing_tags=None): + def __init__(self, parser_class=etree.HTMLParser): self.parser = parser_class(target=self) - if self_closing_tags is None: - self_closing_tags = HTMLParserTreeBuilder.self_closing_tags - self.self_closing_tags = self_closing_tags self.soup = None - def isSelfClosingTag(self, name): - return name in self.self_closing_tags - def feed(self, markup): self.parser.feed(markup) self.parser.close() diff --git a/src/beautifulsoup/tests/helpers.py b/src/beautifulsoup/tests/helpers.py index b9de4f2..219d95c 100644 --- a/src/beautifulsoup/tests/helpers.py +++ b/src/beautifulsoup/tests/helpers.py @@ -3,7 +3,6 @@ import unittest from beautifulsoup import BeautifulSoup from beautifulsoup.element import SoupStrainer -from test_soup import SoupTest class SoupTest(unittest.TestCase): diff --git a/src/beautifulsoup/tests/test_soup.py b/src/beautifulsoup/tests/test_soup.py index c5a02b6..90201a5 100644 --- a/src/beautifulsoup/tests/test_soup.py +++ b/src/beautifulsoup/tests/test_soup.py @@ -9,9 +9,8 @@ import re import unittest from beautifulsoup import * from beautifulsoup.element import CData, Comment, Declaration, SoupStrainer, Tag -from beautifulsoup.builder import ICantBelieveItsValidHTMLTreeBuilder from beautifulsoup.dammit import UnicodeDammit - +from beautifulsoup.builder.html5_builder import HTML5TreeBuilder def additional_tests(): return unittest.TestLoader().loadTestsFromName(__name__) @@ -19,7 +18,7 @@ def additional_tests(): class SoupTest(unittest.TestCase): - default_builder = HTMLParserXMLTreeBuilder() + default_builder = HTML5TreeBuilder() def assertSoupEquals(self, toParse, rep=None, builder=None, encoding=None): |