Added an LXML builder and a stupid test.

author: Leonard Richardson <leonard.richardson@canonical.com> 2009-04-08 16:30:23 -0400
committer: Leonard Richardson <leonard.richardson@canonical.com> 2009-04-08 16:30:23 -0400
commit: 6d7cfb69c5c11cb98b10a2de0900b0f9b6ada6b8 (patch)
tree: d43d98672a964b66c0620d773185ee6bca692d74
parent: cafaed172c6fd8b0668f1365bbb97331ca4fa1dc (diff)
3 files changed, 81 insertions, 18 deletions
diff --git a/BeautifulSoup.py b/BeautifulSoup.py
index 67485d1..1fb9a1b 100644
--- a/BeautifulSoup.py
+++ b/BeautifulSoup.py
@@ -104,7 +104,7 @@ DEFAULT_OUTPUT_ENCODING = "utf-8"
 
 # First, the classes that represent markup elements.
 
-class Entities:
+class KnowsEntitiesMixin:
     """Knows about XML entities."""
 
     HTML_ENTITIES = "html"
@@ -469,7 +469,7 @@ class Declaration(NavigableString):
     def decodeGivenEventualEncoding(self, eventualEncoding):
         return u'<!' + self + u'>'
 
-class Tag(PageElement, Entities):
+class Tag(PageElement, KnowsEntitiesMixin):
 
     """Represents a found HTML tag with its attributes and contents."""
 
@@ -1004,7 +1004,39 @@ def buildTagMap(default, *args):
 
 # Now, the parser classes.
 
-class XMLParserBuilder(HTMLParser, Entities):
+class TreeBuilder(KnowsEntitiesMixin):
+    """Base class for the builders that feed parser events to a soup.
+
+    Supplies empty defaults for the tag sets and no-op reset() and
+    close() methods; subclasses override whatever they need.
+
+    NOTE(review): a subclass that lists TreeBuilder ahead of another
+    parser base class finds these no-op reset()/close() first in
+    attribute lookup unless it defines its own -- confirm that is
+    intended wherever TreeBuilder is mixed in.
+    """
+
+    smartQuotesTo = KnowsEntitiesMixin.XML_ENTITIES
+    # Tag-name sets; empty here so that this base class treats no tag
+    # specially.
+    PRESERVE_WHITESPACE_TAGS = set()
+    QUOTE_TAGS = set()
+    self_closing_tags = set()
+
+    def isSelfClosingTag(self, name):
+        """Return whether name is in self_closing_tags."""
+        return name in self.self_closing_tags
+
+    def reset(self):
+        """Do nothing by default."""
+        pass
+
+    def close(self):
+        """Do nothing by default."""
+        pass
+
+
+class XMLParserBuilder(TreeBuilder, HTMLParser):
 
     """
         HTMLParser will process most bad HTML, and the BeautifulSoup
@@ -1035,13 +1052,9 @@ class XMLParserBuilder(HTMLParser, Entities):
                        lambda x: '<!' + x.group(1) + '>')
                       ]
 
-    PRESERVE_WHITESPACE_TAGS = set()
-    QUOTE_TAGS = set()
-    SELF_CLOSING_TAGS = set()
-
     def __init__(self, convertEntities=None, markupMassage=True,
                  selfClosingTags=None,
-                 smartQuotesTo=Entities.XML_ENTITIES):
+                 smartQuotesTo=KnowsEntitiesMixin.XML_ENTITIES):
         HTMLParser.__init__(self)
         self.soup = None
         self.convertEntities = convertEntities
@@ -1093,7 +1106,7 @@ class XMLParserBuilder(HTMLParser, Entities):
     def isSelfClosingTag(self, name):
         """Returns true iff the given string is the name of a
         self-closing tag according to this parser."""
-        return (name in self.SELF_CLOSING_TAGS
+        return (name in self.self_closing_tags
                 or name in self.instanceSelfClosingTags)
 
     def handle_starttag(self, name, attrs):
@@ -1105,13 +1118,6 @@ class XMLParserBuilder(HTMLParser, Entities):
     def handle_data(self, content):
         self.soup.handle_data(content)
 
-    def _toStringSubclass(self, text, subclass):
-        """Adds a certain piece of text to the tree as a NavigableString
-        subclass."""
-        self.soup.endData()
-        self.handle_data(text)
-        self.soup.endData(subclass)
-
     def handle_pi(self, text):
         """Handle a processing instruction as a ProcessingInstruction
         object, possibly one with a %SOUP-ENCODING% slot into which an
@@ -1179,6 +1185,13 @@ class XMLParserBuilder(HTMLParser, Entities):
         "Handle DOCTYPEs and the like as Declaration objects."
         self._toStringSubclass(data, Declaration)
 
+    def _toStringSubclass(self, text, subclass):
+        """Adds a certain piece of text to the tree as a NavigableString
+        subclass."""
+        self.soup.endData()
+        self.handle_data(text)
+        self.soup.endData(subclass)
+
     def parse_declaration(self, i):
         """Treat a bogus SGML declaration as raw data. Treat a CDATA
         declaration as a CData object."""
@@ -1204,7 +1217,7 @@ class HTMLParserBuilder(XMLParserBuilder):
 
     PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
     QUOTE_TAGS = set(['script', 'textarea'])
-    SELF_CLOSING_TAGS = set(['br' , 'hr', 'input', 'img', 'meta',
+    self_closing_tags = set(['br' , 'hr', 'input', 'img', 'meta',
                              'spacer', 'link', 'frame', 'base'])
 
     def __init__(self, *args, **kwargs):
@@ -1231,7 +1244,7 @@ class BeautifulStoneSoup(Tag):
       "<foo><bar></bar></foo>".
 
     [Another possible explanation is "<foo><bar /></foo>", but since
-    this class defines no SELF_CLOSING_TAGS, it will never use that
+    this class defines no self_closing_tags, it will never use that
     explanation.]
 
     This class is useful for parsing XML or made-up markup languages,
@@ -1276,6 +1289,7 @@ class BeautifulStoneSoup(Tag):
         except StopParsing:
             pass
         self.markup = None                 # The markup can now be GCed.
+        self.builder.close()
         self.builder.soup = None
         self.builder = None                # So can the builder.
 
diff --git a/lxml_builder.py b/lxml_builder.py
new file mode 100644
index 0000000..95988e4
--- /dev/null
+++ b/lxml_builder.py
@@ -0,0 +1,61 @@
+from lxml import etree
+from BeautifulSoup import Comment, TreeBuilder
+
+class LXMLBuilder(TreeBuilder):
+    """A TreeBuilder that drives a soup object from an lxml parser.
+
+    The builder hands itself to the lxml parser as the parser target,
+    so the parser calls start(), end(), data() and comment() while it
+    works through the markup. Each call is forwarded to self.soup.
+    """
+
+    def __init__(self, parser_class=etree.XMLParser, self_closing_tags=None):
+        """Create the underlying parser.
+
+        parser_class -- called as parser_class(target=self) to build
+            the parser.
+        self_closing_tags -- container of tag names that
+            isSelfClosingTag() reports as self-closing; None means
+            an empty list.
+        """
+        self.parser = parser_class(target=self)
+        # Use a fresh list per instance; a mutable default argument
+        # would be shared by every builder created without one.
+        if self_closing_tags is None:
+            self_closing_tags = []
+        self.self_closing_tags = self_closing_tags
+        # The parser callbacks below dereference self.soup, so it has
+        # to be assigned before feed() is called.
+        self.soup = None
+
+    def isSelfClosingTag(self, name):
+        """Return whether name is one of this builder's self-closing tags."""
+        return name in self.self_closing_tags
+
+    def feed(self, markup):
+        """Parse markup in one piece, then close the parser."""
+        self.parser.feed(markup)
+        self.parser.close()
+
+    def start(self, name, attrs):
+        """Parser-target callback: forward an opening tag to the soup."""
+        self.soup.handle_starttag(name, attrs)
+
+    def end(self, name):
+        """Parser-target callback: forward a closing tag to the soup."""
+        self.soup.handle_endtag(name)
+
+    def data(self, content):
+        """Parser-target callback: forward character data to the soup."""
+        self.soup.handle_data(content)
+
+    def comment(self, content):
+        "Handle comments as Comment objects."
+        self._toStringSubclass(content, Comment)
+
+    def _toStringSubclass(self, text, subclass):
+        """Adds a certain piece of text to the tree as a NavigableString
+        subclass."""
+        self.soup.endData()
+        self.data(text)
+        self.soup.endData(subclass)
diff --git a/lxml_test.py b/lxml_test.py
new file mode 100644
index 0000000..99375dd
--- /dev/null
+++ b/lxml_test.py
@@ -0,0 +1,18 @@
+from BeautifulSoup import BeautifulStoneSoup, BeautifulSoup
+from lxml_builder import LXMLBuilder
+from lxml import etree
+
+# Parse the same XML snippet with each soup class, sharing one
+# builder that wraps the default lxml parser class.
+xml_builder = LXMLBuilder()
+for soup_class in (BeautifulStoneSoup, BeautifulSoup):
+    print soup_class("<foo>bar</foo>", builder=xml_builder).prettify()
+
+# Parse unclosed HTML with a builder that wraps lxml's HTML parser
+# and treats <br> as self-closing.
+html_builder = LXMLBuilder(parser_class=etree.HTMLParser,
+                           self_closing_tags=["br"])
+html_soup = BeautifulSoup(
+    "<html><head><title>test<body><h1>page<br />title</h1>",
+    builder=html_builder)
+print html_soup.prettify()
author	Leonard Richardson <leonard.richardson@canonical.com>	2009-04-08 16:30:23 -0400
committer	Leonard Richardson <leonard.richardson@canonical.com>	2009-04-08 16:30:23 -0400
commit	6d7cfb69c5c11cb98b10a2de0900b0f9b6ada6b8 (patch)
tree	d43d98672a964b66c0620d773185ee6bca692d74
parent	cafaed172c6fd8b0668f1365bbb97331ca4fa1dc (diff)