diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2011-01-28 11:39:36 -0500 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2011-01-28 11:39:36 -0500 |
commit | 7bbefa1fcc9a6006953eb0a79049ece9f05985de (patch) | |
tree | 8fa1d24a0f6411b7455e76fd87b659049690077f /__init__.py | |
parent | 692fe5201d5dec15a3598578a6f403e67802de0d (diff) |
Moved everything into the top-level directory and got rid of buildout.
Diffstat (limited to '__init__.py')
-rw-r--r-- | __init__.py | 375 |
1 files changed, 375 insertions, 0 deletions
diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..8817164 --- /dev/null +++ b/__init__.py @@ -0,0 +1,375 @@ +"""Beautiful Soup +Elixir and Tonic +"The Screen-Scraper's Friend" +http://www.crummy.com/software/BeautifulSoup/ + +Beautiful Soup parses a (possibly invalid) XML or HTML document into a +tree representation. It provides methods and Pythonic idioms that make +it easy to navigate, search, and modify the tree. + +A well-formed XML/HTML document yields a well-formed data +structure. An ill-formed XML/HTML document yields a correspondingly +ill-formed data structure. If your document is only locally +well-formed, you can use this library to find and process the +well-formed part of it. + +Beautiful Soup works with Python 2.2 and up. It has no external +dependencies, but you'll have more success at converting data to UTF-8 +if you also install these three packages: + +* chardet, for auto-detecting character encodings + http://chardet.feedparser.org/ +* cjkcodecs and iconv_codec, which add more encodings to the ones supported + by stock Python. + http://cjkpython.i18n.org/ + +Beautiful Soup defines classes for two main parsing strategies: + + * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific + language that kind of looks like XML. + + * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid + or invalid. This class has web browser-like heuristics for + obtaining a sensible parse tree in the face of common HTML errors. + +For more than you ever wanted to know about Beautiful Soup, see the +documentation: +http://www.crummy.com/software/BeautifulSoup/documentation.html + +Here, have some legalese: + +Copyright (c) 2004-2009, Leonard Richardson + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of the the Beautiful Soup Consortium and All + Night Kosher Bakery nor the names of its contributors may be + used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT. + +""" +from __future__ import generators + +__author__ = "Leonard Richardson (leonardr@segfault.org)" +__version__ = "4.0.0" +__copyright__ = "Copyright (c) 2004-2009 Leonard Richardson" +__license__ = "New-style BSD" + +__all__ = ['BeautifulSoup', + + # Stuff imported from other packages + 'Entities', + + 'BeautifulStoneSoup', + 'ICantBelieveItsBeautifulSoup'] + +import re + +from util import isList, isString, buildSet +from dammit import UnicodeDammit +from element import Entities, NavigableString, Tag + + +class BeautifulStoneSoup(Tag): + """ + This class defines the basic interface called by the tree builders. + + These methods will be called by the parser: + reset() + feed(markup) + + The tree builder may call these methods from its feed() implementation: + handle_starttag(name, attrs) # See note about return value + handle_endtag(name) + handle_data(data) # Appends to the current data node + endData(containerClass=NavigableString) # Ends the current data node + + No matter how complicated the underlying parser is, you should be + able to build a tree using 'start tag' events, 'end tag' events, + 'data' events, and "done with data" events. + + If you encounter a self-closing tag, call handle_starttag and then + handle_endtag, but note that the tag will not be displayed as a + self-closing tag unless you also have your builder's + isSelfClosingTag() implementation return True when passed the tag + name. + """ + ROOT_TAG_NAME = u'[document]' + + # Used to detect the charset in a META tag; see handleSpecialMetaTag + CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M) + + # Used when determining whether a text node is all whitespace and + # can be replaced with a single space. A text node that contains + # fancy Unicode spaces (usually non-breaking) should be left + # alone. + STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, } + + @classmethod + def default_builder(self): + from lxml import etree + from builder.lxml_builder import LXMLTreeBuilder + return LXMLTreeBuilder(parser_class=etree.XMLParser) + + def __init__(self, markup="", builder=None, parseOnlyThese=None, + fromEncoding=None): + """The Soup object is initialized as the 'root tag', and the + provided markup (which can be a string or a file-like object) + is fed into the underlying parser.""" + + if builder is None: + builder = self.default_builder() + self.builder = builder + self.builder.soup = self + + self.parseOnlyThese = parseOnlyThese + self.fromEncoding = fromEncoding + + self.reset() + + if hasattr(markup, 'read'): # It's a file-type object. + markup = markup.read() + self.markup = markup + try: + self._feed(isHTML=self.builder.assume_html) + except StopParsing: + pass + self.markup = None # The markup can now be GCed. + self.builder.soup = None + self.builder = None # So can the builder. + + def _feed(self, inDocumentEncoding=None, isHTML=False): + # Convert the document to Unicode. + markup = self.markup + if isinstance(markup, unicode): + if not hasattr(self, 'originalEncoding'): + self.originalEncoding = None + else: + dammit = UnicodeDammit\ + (markup, [self.fromEncoding, inDocumentEncoding], + smartQuotesTo=self.builder.smart_quotes_to, isHTML=isHTML) + markup = dammit.unicode + self.originalEncoding = dammit.originalEncoding + self.declaredHTMLEncoding = dammit.declaredHTMLEncoding + self.builder.reset() + + self.builder.feed(markup) + # Close out any unfinished strings and close all the open tags. + self.endData() + while self.currentTag.name != self.ROOT_TAG_NAME: + self.popTag() + + def reset(self): + Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME) + self.hidden = 1 + self.builder.reset() + self.currentData = [] + self.currentTag = None + self.tagStack = [] + self.pushTag(self) + + def popTag(self): + tag = self.tagStack.pop() + # Tags with just one string-owning child get the child as a + # 'string' property, so that soup.tag.string is shorthand for + # soup.tag.contents[0] + if len(self.currentTag.contents) == 1 and \ + isinstance(self.currentTag.contents[0], NavigableString): + self.currentTag.string = self.currentTag.contents[0] + + #print "Pop", tag.name + if self.tagStack: + self.currentTag = self.tagStack[-1] + return self.currentTag + + def pushTag(self, tag): + #print "Push", tag.name + if self.currentTag: + self.currentTag.contents.append(tag) + self.tagStack.append(tag) + self.currentTag = self.tagStack[-1] + + def endData(self, containerClass=NavigableString): + if self.currentData: + currentData = u''.join(self.currentData) + if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and + not buildSet([tag.name for tag in self.tagStack]).intersection( + self.builder.preserve_whitespace_tags)): + if '\n' in currentData: + currentData = '\n' + else: + currentData = ' ' + self.currentData = [] + if self.parseOnlyThese and len(self.tagStack) <= 1 and \ + (not self.parseOnlyThese.text or \ + not self.parseOnlyThese.search(currentData)): + return + o = containerClass(currentData) + o.setup(self.currentTag, self.previous) + if self.previous: + self.previous.next = o + self.previous = o + self.currentTag.contents.append(o) + + + def _popToTag(self, name, inclusivePop=True): + """Pops the tag stack up to and including the most recent + instance of the given tag. If inclusivePop is false, pops the tag + stack up to but *not* including the most recent instqance of + the given tag.""" + #print "Popping to %s" % name + if name == self.ROOT_TAG_NAME: + return + + numPops = 0 + mostRecentTag = None + for i in range(len(self.tagStack)-1, 0, -1): + if name == self.tagStack[i].name: + numPops = len(self.tagStack)-i + break + if not inclusivePop: + numPops = numPops - 1 + + for i in range(0, numPops): + mostRecentTag = self.popTag() + return mostRecentTag + + def handle_starttag(self, name, attrs): + """Push a start tag on to the stack. + + If this method returns None, the tag was rejected by the + SoupStrainer. You should proceed as if the tag had not occured + in the document. For instance, if this was a self-closing tag, + don't call handle_endtag. + """ + + #print "Start tag %s: %s" % (name, attrs) + self.endData() + + if (self.parseOnlyThese and len(self.tagStack) <= 1 + and (self.parseOnlyThese.text + or not self.parseOnlyThese.searchTag(name, attrs))): + return None + + containsSubstitutions = False + if name == 'meta' and self.builder.assume_html: + containsSubstitutions = self.handleSpecialMetaTag(attrs) + + tag = Tag(self, self.builder, name, attrs, self.currentTag, + self.previous) + tag.containsSubstitutions = containsSubstitutions + if self.previous: + self.previous.next = tag + self.previous = tag + self.pushTag(tag) + return tag + + def handle_endtag(self, name): + #print "End tag: " + name + self.endData() + self._popToTag(name) + + def handle_data(self, data): + self.currentData.append(data) + + def handleSpecialMetaTag(self, attrs): + """Beautiful Soup can detect a charset included in a META tag, + try to convert the document to that charset, and re-parse the + document from the beginning. Neither lxml nor html5lib does + this, so the feature is still here.""" + httpEquiv = None + contentType = None + contentTypeIndex = None + tagNeedsEncodingSubstitution = False + + if isinstance(attrs, dict): + httpEquiv = attrs.get('http-equiv') + contentType = attrs.get('content') + else: + # XXX do we need this? + for i in range(0, len(attrs)): + key, value = attrs[i] + key = key.lower() + if key == 'http-equiv': + httpEquiv = value + elif key == 'content': + contentType = value + contentTypeIndex = i + + if httpEquiv and contentType: # It's an interesting meta tag. + match = self.CHARSET_RE.search(contentType) + if match: + if (self.declaredHTMLEncoding is not None or + self.originalEncoding == self.fromEncoding): + # An HTML encoding was sniffed while converting + # the document to Unicode, or an HTML encoding was + # sniffed during a previous pass through the + # document, or an encoding was specified + # explicitly and it worked. Rewrite the meta tag. + def rewrite(match): + return match.group(1) + "%SOUP-ENCODING%" + newAttr = self.CHARSET_RE.sub(rewrite, contentType) + if isinstance(attrs, dict): + attrs['content'] = newAttr + else: + attrs[contentTypeIndex] = (attrs[contentTypeIndex][0], + newAttr) + tagNeedsEncodingSubstitution = True + else: + # This is our first pass through the document. + # Go through it again with the encoding information. + newCharset = match.group(3) + if newCharset and newCharset != self.originalEncoding: + self.declaredHTMLEncoding = newCharset + self._feed(self.declaredHTMLEncoding) + raise StopParsing + pass + return tagNeedsEncodingSubstitution + + +class BeautifulSoup(BeautifulStoneSoup): + """A convenience class for parsing HTML without creating a builder.""" + + @classmethod + def default_builder(self): + try: + from builder.html5_builder import HTML5TreeBuilder + return HTML5TreeBuilder() + except ImportError: + from builder.lxml_builder import LXMLTreeBuilder + return LXMLTreeBuilder() + + +class StopParsing(Exception): + pass + + +#By default, act as an HTML pretty-printer. +if __name__ == '__main__': + import sys + soup = BeautifulSoup(sys.stdin) + print soup.prettify() |