Moved almost all the code to BeautifulStoneSoup. Fixed up docstrings.

author: Leonard Richardson <leonard.richardson@canonical.com> 2009-04-09 08:17:14 -0400
committer: Leonard Richardson <leonard.richardson@canonical.com> 2009-04-09 08:17:14 -0400
commit: eca26d328adc60a7e0abb5edfa925b2ef73dbf89 (patch)
tree: ee8ab280d45cd5d1886c3b801a11265037e4c0e5 /BeautifulSoup.py
parent: 95b59dd7b8709e8fb003bcdc4973682483006a4f (diff)
1 files changed, 68 insertions, 57 deletions
diff --git a/BeautifulSoup.py b/BeautifulSoup.py
index f05e867..79c08d9 100644
--- a/BeautifulSoup.py
+++ b/BeautifulSoup.py
@@ -1025,28 +1025,43 @@ class TreeBuilder(Entities):
 class XMLParserBuilder(HTMLParser, TreeBuilder):
 
     """
-        HTMLParser will process most bad HTML, and the BeautifulSoup
-        class has some tricks for dealing with some HTML that kills
-        HTMLParser, but Beautiful Soup can nonetheless choke or lose data
-        if your data uses self-closing tags or declarations
-        incorrectly.
-
-        By default, Beautiful Soup uses regexes to sanitize input,
-        avoiding the vast majority of these problems. If the problems
-        don't apply to you, pass in False for markupMassage, and
-        you'll get better performance.
-
-        The default parser massage techniques fix the two most common
-        instances of invalid HTML that choke HTMLParser:
-
-         <br/> (No space between name of closing tag and tag close)
-         <! --Comment--> (Extraneous whitespace in declaration)
-
-        You can pass in a custom list of (RE object, replace method)
-        tuples to get Beautiful Soup to scrub your input the way you
-        want.
-        """
+    This class defines a basic tree builder based on Python's built-in
+    HTMLParser. The tree builder knows nothing about tag
+    behavior except for the following:
+
+      You can't close a tag without closing all the tags it encloses.
+      That is, "<foo><bar></foo>" actually means
+      "<foo><bar></bar></foo>".
+
+    [Another possible explanation is "<foo><bar /></foo>", but unless
+    you specify 'bar' in self_closing_tags, this class will never use
+    that explanation.]
+
+    This class is useful for parsing XML or made-up markup languages,
+    or when BeautifulSoup makes an assumption counter to what you were
+    expecting.
+
+
+    HTMLParser will process most bad HTML, and the BeautifulSoup class
+    has some tricks for dealing with some HTML that kills HTMLParser,
+    but Beautiful Soup can nonetheless choke or lose data if your data
+    uses self-closing tags or declarations incorrectly.
+
+    This class uses regexes to sanitize input, avoiding the vast
+    majority of these problems. If the problems don't apply to you,
+    pass in False for markupMassage, and you'll get better
+    performance.
 
+    The default parser massage techniques fix the two most common
+    instances of invalid HTML that choke HTMLParser:
+
+        <br/> (No space between name of closing tag and tag close)
+        <! --Comment--> (Extraneous whitespace in declaration)
+
+    You can pass in a custom list of (RE object, replace method)
+    tuples to get XMLParserBuilder to scrub your input the way you
+    want.
+    """
     reset_nesting_tags = {}
     nestable_tags = {}
 
@@ -1318,33 +1333,32 @@ class HTMLParserBuilder(XMLParserBuilder):
         XMLParserBuilder.__init__(self, *args, **kwargs)
 
     def handle_starttag(self, name, attrs):
-        if name == 'meta':
-            self.soup.handle_metatag(attrs)
-        else:
-            self.soup.handle_starttag(name, attrs)
-
+        self.soup.handle_starttag(name, attrs)
 
 
 class BeautifulStoneSoup(Tag):
+    """
+    This class defines the basic interface called by the tree builders.
 
-    """This class contains the basic parser and search code. It defines
-    a parser that knows nothing about tag behavior except for the
-    following:
-
-      You can't close a tag without closing all the tags it encloses.
-      That is, "<foo><bar></foo>" actually means
-      "<foo><bar></bar></foo>".
-
-    [Another possible explanation is "<foo><bar /></foo>", but since
-    this class defines no self_closing_tags, it will never use that
-    explanation.]
+    These methods will be called by the parser:
+      reset()
+      feed(markup)
 
-    This class is useful for parsing XML or made-up markup languages,
-    or when BeautifulSoup makes an assumption counter to what you were
-    expecting."""
+    The tree builder may call these methods from its feed() implementation:
+      handle_starttag(name, attrs, selfClosing=False)
+      handle_endtag(name)
+      handle_data(data) # Appends to the current data node
+      endData(containerClass=NavigableString) # Ends the current data node
 
+    No matter how complicated the underlying parser is, you should be
+    able to build a tree out of 'start tag' events, 'end tag' events,
+    and 'data' events.
+    """
     ROOT_TAG_NAME = u'[document]'
 
+    # Used to detect the charset in a META tag; see handleSpecialMetaTag
+    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
+
     # Used when determining whether a text node is all whitespace and
     # can be replaced with a single space. A text node that contains
     # fancy Unicode spaces (usually non-breaking) should be left
@@ -1525,7 +1539,7 @@ class BeautifulStoneSoup(Tag):
         if popTo:
             self._popToTag(popTo, inclusive)
 
-    def handle_starttag(self, name, attrs, selfClosing=0):
+    def handle_starttag(self, name, attrs, selfClosing=False):
         #print "Start tag %s: %s" % (name, attrs)
         if self.quoteStack:
             #This is not a real tag.
@@ -1535,6 +1549,10 @@ class BeautifulStoneSoup(Tag):
             return
         self.endData()
 
+        containsSubstitutions = False
+        if name == 'meta' and self.builder.assume_html:
+            containsSubstitutions = self.handleSpecialMetaTag(attrs)
+
         if not self.builder.isSelfClosingTag(name) and not selfClosing:
             self._smartPop(name)
 
@@ -1545,6 +1563,7 @@ class BeautifulStoneSoup(Tag):
 
         tag = Tag(self, self.builder, name, attrs, self.currentTag,
                   self.previous)
+        tag.containsSubstitutions = containsSubstitutions
         if self.previous:
             self.previous.next = tag
         self.previous = tag
@@ -1573,19 +1592,7 @@ class BeautifulStoneSoup(Tag):
     def handle_data(self, data):
         self.currentData.append(data)
 
-    def handle_metatag(self, attrs):
-        self.handle_starttag('meta', attrs)
-
-
-class BeautifulSoup(BeautifulStoneSoup):
-
-    def _defaultBuilder(self):
-        return HTMLParserBuilder()
-
-    # Used to detect the charset in a META tag; see start_meta
-    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
-
-    def handle_metatag(self, attrs):
+    def handleSpecialMetaTag(self, attrs):
         """Beautiful Soup can detect a charset included in a META tag,
         try to convert the document to that charset, and re-parse the
         document from the beginning."""
@@ -1628,9 +1635,13 @@ class BeautifulSoup(BeautifulStoneSoup):
                         self._feed(self.declaredHTMLEncoding)
                         raise StopParsing
                     pass
-        tag = self.handle_starttag("meta", attrs)
-        if tag and tagNeedsEncodingSubstitution:
-            tag.containsSubstitutions = True
+        return tagNeedsEncodingSubstitution
+
+
+class BeautifulSoup(BeautifulStoneSoup):
+
+    def _defaultBuilder(self):
+        return HTMLParserBuilder()
 
 
 class StopParsing(Exception):
author	Leonard Richardson <leonard.richardson@canonical.com>	2009-04-09 08:17:14 -0400
committer	Leonard Richardson <leonard.richardson@canonical.com>	2009-04-09 08:17:14 -0400
commit	eca26d328adc60a7e0abb5edfa925b2ef73dbf89 (patch)
tree	ee8ab280d45cd5d1886c3b801a11265037e4c0e5 /BeautifulSoup.py
parent	95b59dd7b8709e8fb003bcdc4973682483006a4f (diff)