1 files changed, 375 insertions, 0 deletions
diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..8817164
--- /dev/null
+++ b/__init__.py
@@ -0,0 +1,375 @@
+"""Beautiful Soup
+Elixir and Tonic
+"The Screen-Scraper's Friend"
+http://www.crummy.com/software/BeautifulSoup/
+
+Beautiful Soup parses a (possibly invalid) XML or HTML document into a
+tree representation. It provides methods and Pythonic idioms that make
+it easy to navigate, search, and modify the tree.
+
+A well-formed XML/HTML document yields a well-formed data
+structure. An ill-formed XML/HTML document yields a correspondingly
+ill-formed data structure. If your document is only locally
+well-formed, you can use this library to find and process the
+well-formed part of it.
+
+Beautiful Soup works with Python 2.2 and up. It has no external
+dependencies, but you'll have more success at converting data to UTF-8
+if you also install these three packages:
+
+* chardet, for auto-detecting character encodings
+  http://chardet.feedparser.org/
+* cjkcodecs and iconv_codec, which add more encodings to the ones supported
+  by stock Python.
+  http://cjkpython.i18n.org/
+
+Beautiful Soup defines classes for two main parsing strategies:
+
+ * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
+   language that kind of looks like XML.
+
+ * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
+   or invalid. This class has web browser-like heuristics for
+   obtaining a sensible parse tree in the face of common HTML errors.
+
+For more than you ever wanted to know about Beautiful Soup, see the
+documentation:
+http://www.crummy.com/software/BeautifulSoup/documentation.html
+
+Here, have some legalese:
+
+Copyright (c) 2004-2009, Leonard Richardson
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+  * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+
+  * Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following
+    disclaimer in the documentation and/or other materials provided
+    with the distribution.
+
+  * Neither the name of the the Beautiful Soup Consortium and All
+    Night Kosher Bakery nor the names of its contributors may be
+    used to endorse or promote products derived from this software
+    without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
+
+"""
+from __future__ import generators
+
+__author__ = "Leonard Richardson (leonardr@segfault.org)"
+__version__ = "4.0.0"
+__copyright__ = "Copyright (c) 2004-2009 Leonard Richardson"
+__license__ = "New-style BSD"
+
+__all__ = ['BeautifulSoup',
+
+           # Stuff imported from other packages
+           'Entities',
+
+           'BeautifulStoneSoup',
+           'ICantBelieveItsBeautifulSoup']
+
+import re
+
+from util import isList, isString, buildSet
+from dammit import UnicodeDammit
+from element import Entities, NavigableString, Tag
+
+
+class BeautifulStoneSoup(Tag):
+    """
+    This class defines the basic interface called by the tree builders.
+
+    These methods will be called by the parser:
+      reset()
+      feed(markup)
+
+    The tree builder may call these methods from its feed() implementation:
+      handle_starttag(name, attrs) # See note about return value
+      handle_endtag(name)
+      handle_data(data) # Appends to the current data node
+      endData(containerClass=NavigableString) # Ends the current data node
+
+    No matter how complicated the underlying parser is, you should be
+    able to build a tree using 'start tag' events, 'end tag' events,
+    'data' events, and "done with data" events.
+
+    If you encounter a self-closing tag, call handle_starttag and then
+    handle_endtag, but note that the tag will not be displayed as a
+    self-closing tag unless you also have your builder's
+    isSelfClosingTag() implementation return True when passed the tag
+    name.
+    """
+    ROOT_TAG_NAME = u'[document]'
+
+    # Used to detect the charset in a META tag; see handleSpecialMetaTag
+    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
+
+    # Used when determining whether a text node is all whitespace and
+    # can be replaced with a single space. A text node that contains
+    # fancy Unicode spaces (usually non-breaking) should be left
+    # alone.
+    STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }
+
+    @classmethod
+    def default_builder(self):
+        from lxml import etree
+        from builder.lxml_builder import LXMLTreeBuilder
+        return LXMLTreeBuilder(parser_class=etree.XMLParser)
+
+    def __init__(self, markup="", builder=None, parseOnlyThese=None,
+                 fromEncoding=None):
+        """The Soup object is initialized as the 'root tag', and the
+        provided markup (which can be a string or a file-like object)
+        is fed into the underlying parser."""
+
+        if builder is None:
+            builder = self.default_builder()
+        self.builder = builder
+        self.builder.soup = self
+
+        self.parseOnlyThese = parseOnlyThese
+        self.fromEncoding = fromEncoding
+
+        self.reset()
+
+        if hasattr(markup, 'read'):        # It's a file-type object.
+            markup = markup.read()
+        self.markup = markup
+        try:
+            self._feed(isHTML=self.builder.assume_html)
+        except StopParsing:
+            pass
+        self.markup = None                 # The markup can now be GCed.
+        self.builder.soup = None
+        self.builder = None                # So can the builder.
+
+    def _feed(self, inDocumentEncoding=None, isHTML=False):
+        # Convert the document to Unicode.
+        markup = self.markup
+        if isinstance(markup, unicode):
+            if not hasattr(self, 'originalEncoding'):
+                self.originalEncoding = None
+        else:
+            dammit = UnicodeDammit\
+                     (markup, [self.fromEncoding, inDocumentEncoding],
+                      smartQuotesTo=self.builder.smart_quotes_to, isHTML=isHTML)
+            markup = dammit.unicode
+            self.originalEncoding = dammit.originalEncoding
+            self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
+        self.builder.reset()
+
+        self.builder.feed(markup)
+        # Close out any unfinished strings and close all the open tags.
+        self.endData()
+        while self.currentTag.name != self.ROOT_TAG_NAME:
+            self.popTag()
+
+    def reset(self):
+        Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
+        self.hidden = 1
+        self.builder.reset()
+        self.currentData = []
+        self.currentTag = None
+        self.tagStack = []
+        self.pushTag(self)
+
+    def popTag(self):
+        tag = self.tagStack.pop()
+        # Tags with just one string-owning child get the child as a
+        # 'string' property, so that soup.tag.string is shorthand for
+        # soup.tag.contents[0]
+        if len(self.currentTag.contents) == 1 and \
+           isinstance(self.currentTag.contents[0], NavigableString):
+            self.currentTag.string = self.currentTag.contents[0]
+
+        #print "Pop", tag.name
+        if self.tagStack:
+            self.currentTag = self.tagStack[-1]
+        return self.currentTag
+
+    def pushTag(self, tag):
+        #print "Push", tag.name
+        if self.currentTag:
+            self.currentTag.contents.append(tag)
+        self.tagStack.append(tag)
+        self.currentTag = self.tagStack[-1]
+
+    def endData(self, containerClass=NavigableString):
+        if self.currentData:
+            currentData = u''.join(self.currentData)
+            if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
+                not buildSet([tag.name for tag in self.tagStack]).intersection(
+                    self.builder.preserve_whitespace_tags)):
+                if '\n' in currentData:
+                    currentData = '\n'
+                else:
+                    currentData = ' '
+            self.currentData = []
+            if self.parseOnlyThese and len(self.tagStack) <= 1 and \
+                   (not self.parseOnlyThese.text or \
+                    not self.parseOnlyThese.search(currentData)):
+                return
+            o = containerClass(currentData)
+            o.setup(self.currentTag, self.previous)
+            if self.previous:
+                self.previous.next = o
+            self.previous = o
+            self.currentTag.contents.append(o)
+
+
+    def _popToTag(self, name, inclusivePop=True):
+        """Pops the tag stack up to and including the most recent
+        instance of the given tag. If inclusivePop is false, pops the tag
+        stack up to but *not* including the most recent instqance of
+        the given tag."""
+        #print "Popping to %s" % name
+        if name == self.ROOT_TAG_NAME:
+            return
+
+        numPops = 0
+        mostRecentTag = None
+        for i in range(len(self.tagStack)-1, 0, -1):
+            if name == self.tagStack[i].name:
+                numPops = len(self.tagStack)-i
+                break
+        if not inclusivePop:
+            numPops = numPops - 1
+
+        for i in range(0, numPops):
+            mostRecentTag = self.popTag()
+        return mostRecentTag
+
+    def handle_starttag(self, name, attrs):
+        """Push a start tag on to the stack.
+
+        If this method returns None, the tag was rejected by the
+        SoupStrainer. You should proceed as if the tag had not occured
+        in the document. For instance, if this was a self-closing tag,
+        don't call handle_endtag.
+        """
+
+        #print "Start tag %s: %s" % (name, attrs)
+        self.endData()
+
+        if (self.parseOnlyThese and len(self.tagStack) <= 1
+            and (self.parseOnlyThese.text
+                 or not self.parseOnlyThese.searchTag(name, attrs))):
+            return None
+
+        containsSubstitutions = False
+        if name == 'meta' and self.builder.assume_html:
+            containsSubstitutions = self.handleSpecialMetaTag(attrs)
+
+        tag = Tag(self, self.builder, name, attrs, self.currentTag,
+                  self.previous)
+        tag.containsSubstitutions = containsSubstitutions
+        if self.previous:
+            self.previous.next = tag
+        self.previous = tag
+        self.pushTag(tag)
+        return tag
+
+    def handle_endtag(self, name):
+        #print "End tag: " + name
+        self.endData()
+        self._popToTag(name)
+
+    def handle_data(self, data):
+        self.currentData.append(data)
+
+    def handleSpecialMetaTag(self, attrs):
+        """Beautiful Soup can detect a charset included in a META tag,
+        try to convert the document to that charset, and re-parse the
+        document from the beginning. Neither lxml nor html5lib does
+        this, so the feature is still here."""
+        httpEquiv = None
+        contentType = None
+        contentTypeIndex = None
+        tagNeedsEncodingSubstitution = False
+
+        if isinstance(attrs, dict):
+            httpEquiv = attrs.get('http-equiv')
+            contentType = attrs.get('content')
+        else:
+            # XXX do we need this?
+            for i in range(0, len(attrs)):
+                key, value = attrs[i]
+                key = key.lower()
+                if key == 'http-equiv':
+                    httpEquiv = value
+                elif key == 'content':
+                    contentType = value
+                    contentTypeIndex = i
+
+        if httpEquiv and contentType: # It's an interesting meta tag.
+            match = self.CHARSET_RE.search(contentType)
+            if match:
+                if (self.declaredHTMLEncoding is not None or
+                    self.originalEncoding == self.fromEncoding):
+                    # An HTML encoding was sniffed while converting
+                    # the document to Unicode, or an HTML encoding was
+                    # sniffed during a previous pass through the
+                    # document, or an encoding was specified
+                    # explicitly and it worked. Rewrite the meta tag.
+                    def rewrite(match):
+                        return match.group(1) + "%SOUP-ENCODING%"
+                    newAttr = self.CHARSET_RE.sub(rewrite, contentType)
+                    if isinstance(attrs, dict):
+                        attrs['content'] = newAttr
+                    else:
+                        attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
+                                                   newAttr)
+                    tagNeedsEncodingSubstitution = True
+                else:
+                    # This is our first pass through the document.
+                    # Go through it again with the encoding information.
+                    newCharset = match.group(3)
+                    if newCharset and newCharset != self.originalEncoding:
+                        self.declaredHTMLEncoding = newCharset
+                        self._feed(self.declaredHTMLEncoding)
+                        raise StopParsing
+                    pass
+        return tagNeedsEncodingSubstitution
+
+
+class BeautifulSoup(BeautifulStoneSoup):
+    """A convenience class for parsing HTML without creating a builder."""
+
+    @classmethod
+    def default_builder(self):
+        try:
+            from builder.html5_builder import HTML5TreeBuilder
+            return HTML5TreeBuilder()
+        except ImportError:
+            from builder.lxml_builder import LXMLTreeBuilder
+            return LXMLTreeBuilder()
+
+
+class StopParsing(Exception):
+    pass
+
+
+#By default, act as an HTML pretty-printer.
+if __name__ == '__main__':
+    import sys
+    soup = BeautifulSoup(sys.stdin)
+    print soup.prettify()