diff options
-rw-r--r-- | BeautifulSoup.py | 131 |
1 files changed, 73 insertions, 58 deletions
diff --git a/BeautifulSoup.py b/BeautifulSoup.py
index 53687b2..7ba4fa7 100644
--- a/BeautifulSoup.py
+++ b/BeautifulSoup.py
@@ -1136,11 +1136,22 @@ class XMLParserBuilder(HTMLParser, TreeBuilder):
             attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
             self.handle_data('<%s%s>' % (name, attrs))
             return
-        self.soup.handle_starttag(name, attrs)
+        if not self.isSelfClosingTag(name):
+            self.soup.endData()
+            self._smartPop(name)
+        tag = self.soup.handle_starttag(name, attrs)
+        if tag is None:
+            # The tag was filtered out by the SoupStrainer
+            return
         if name in self.quote_tags:
-            #print "Beginning quote (%s)" % name
+            #print "Beginning quote (%s)" % name
             self.quoteStack.append(name)
             self.literal = 1
+        if self.isSelfClosingTag(name):
+            self.soup.popTag()
+
+    def handle_startendtag(self, name, attrs):
+        self.handle_starttag(name, attrs)
 
     def handle_endtag(self, name):
         if self.quoteStack and self.quoteStack[-1] != name:
@@ -1154,6 +1165,7 @@ class XMLParserBuilder(HTMLParser, TreeBuilder):
             self.literal = (len(self.quoteStack) > 0)
 
     def handle_data(self, content):
+        #print "Handling data " + content
         self.soup.handle_data(content)
 
     def handle_pi(self, text):
@@ -1230,6 +1242,52 @@ class XMLParserBuilder(HTMLParser, TreeBuilder):
         self.handle_data(text)
         self.soup.endData(subclass)
 
+    def _smartPop(self, name):
+
+        """We need to pop up to the previous tag of this type, unless
+        one of this tag's nesting reset triggers comes between this
+        tag and the previous tag of this type, OR unless this tag is a
+        generic nesting trigger and another generic nesting trigger
+        comes between this tag and the previous tag of this type.
+
+        Examples:
+         <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
+         <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
+         <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.
+
+         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
+         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
+         <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
+        """
+
+        nestingResetTriggers = self.nestable_tags.get(name)
+        isNestable = nestingResetTriggers != None
+        isResetNesting = self.reset_nesting_tags.has_key(name)
+        popTo = None
+        inclusive = True
+        for i in range(len(self.soup.tagStack)-1, 0, -1):
+            p = self.soup.tagStack[i]
+            if (not p or p.name == name) and not isNestable:
+                #Non-nestable tags get popped to the top or to their
+                #last occurance.
+                popTo = name
+                break
+            if (nestingResetTriggers != None
+                and p.name in nestingResetTriggers) \
+                or (nestingResetTriggers == None and isResetNesting
+                    and self.reset_nesting_tags.has_key(p.name)):
+
+                #If we encounter one of the nesting reset triggers
+                #peculiar to this tag, or we encounter another tag
+                #that causes nesting to reset, pop up to but not
+                #including that tag.
+                popTo = p.name
+                inclusive = False
+                break
+            p = p.parent
+        if popTo:
+            self.soup._popToTag(popTo, inclusive)
+
     def parse_declaration(self, i):
         """Treat a bogus SGML declaration as raw data. Treat a CDATA
         declaration as a CData object."""
@@ -1361,7 +1419,7 @@ class BeautifulStoneSoup(Tag):
      feed(markup)
 
     The tree builder may call these methods from its feed() implementation:
-     handle_starttag(name, attrs, selfClosing=False)
+     handle_starttag(name, attrs) # See note about return value
      handle_endtag(name)
      handle_data(data) # Appends to the current data node
      endData(containerClass=NavigableString) # Ends the current data node
@@ -1508,68 +1566,27 @@ class BeautifulStoneSoup(Tag):
             mostRecentTag = self.popTag()
         return mostRecentTag
 
-    def _smartPop(self, name):
-
-        """We need to pop up to the previous tag of this type, unless
-        one of this tag's nesting reset triggers comes between this
-        tag and the previous tag of this type, OR unless this tag is a
-        generic nesting trigger and another generic nesting trigger
-        comes between this tag and the previous tag of this type.
-
-        Examples:
-         <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
-         <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
-         <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.
+    def handle_starttag(self, name, attrs):
+        """Push a start tag on to the stack.
 
-         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
-         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
-         <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
+        If this method returns None, the tag was rejected by the
+        SoupStrainer. You should proceed as if the tag had not occured
+        in the document. For instance, if this was a self-closing tag,
+        don't call handle_endtag.
         """
 
-        nestingResetTriggers = self.builder.nestable_tags.get(name)
-        isNestable = nestingResetTriggers != None
-        isResetNesting = self.builder.reset_nesting_tags.has_key(name)
-        popTo = None
-        inclusive = True
-        for i in range(len(self.tagStack)-1, 0, -1):
-            p = self.tagStack[i]
-            if (not p or p.name == name) and not isNestable:
-                #Non-nestable tags get popped to the top or to their
-                #last occurance.
-                popTo = name
-                break
-            if (nestingResetTriggers != None
-                and p.name in nestingResetTriggers) \
-                or (nestingResetTriggers == None and isResetNesting
-                    and self.builder.reset_nesting_tags.has_key(p.name)):
-
-                #If we encounter one of the nesting reset triggers
-                #peculiar to this tag, or we encounter another tag
-                #that causes nesting to reset, pop up to but not
-                #including that tag.
-                popTo = p.name
-                inclusive = False
-                break
-            p = p.parent
-        if popTo:
-            self._popToTag(popTo, inclusive)
-
-    def handle_starttag(self, name, attrs, selfClosing=False):
         #print "Start tag %s: %s" % (name, attrs)
         self.endData()
 
+        if (self.parseOnlyThese and len(self.tagStack) <= 1
+            and (self.parseOnlyThese.text
+                 or not self.parseOnlyThese.searchTag(name, attrs))):
+            return None
+
         containsSubstitutions = False
         if name == 'meta' and self.builder.assume_html:
             containsSubstitutions = self.handleSpecialMetaTag(attrs)
 
-        if not self.builder.isSelfClosingTag(name) and not selfClosing:
-            self._smartPop(name)
-
-        if self.parseOnlyThese and len(self.tagStack) <= 1 \
-               and (self.parseOnlyThese.text
-                    or not self.parseOnlyThese.searchTag(name, attrs)):
-            return
-
         tag = Tag(self, self.builder, name, attrs, self.currentTag,
                   self.previous)
         tag.containsSubstitutions = containsSubstitutions
@@ -1577,12 +1594,10 @@ class BeautifulStoneSoup(Tag):
             self.previous.next = tag
         self.previous = tag
         self.pushTag(tag)
-        if selfClosing or self.builder.isSelfClosingTag(name):
-            self.popTag()
         return tag
 
     def handle_endtag(self, name):
-        #print "End tag %s" % name
+        #print "End tag: " + name
         self.endData()
         self._popToTag(name)
 