Preliminary work for getting XML parsing to work.

author: Leonard Richardson <leonard.richardson@canonical.com> 2011-02-19 21:21:14 -0500
committer: Leonard Richardson <leonard.richardson@canonical.com> 2011-02-19 21:21:14 -0500
commit: 22e7fc268c6150f812e9af55f28dba7aeda4d053 (patch)
tree: 766662556ae441c5474e754fe9d582ffce3ff257
parent: 8249b803d9bab9c06be02a244e629cb732f4f5b1 (diff)
parent: 9a936b48fe05666780662c76d5df3b3de7b48074 (diff)
6 files changed, 42 insertions, 47 deletions
diff --git a/beautifulsoup/__init__.py b/beautifulsoup/__init__.py
index 5d66bc7..922005c 100644
--- a/beautifulsoup/__init__.py
+++ b/beautifulsoup/__init__.py
@@ -76,16 +76,10 @@ from __future__ import generators
 
 __author__ = "Leonard Richardson (leonardr@segfault.org)"
 __version__ = "4.0.0"
-__copyright__ = "Copyright (c) 2004-2009 Leonard Richardson"
+__copyright__ = "Copyright (c) 2004-2011 Leonard Richardson"
 __license__ = "New-style BSD"
 
-__all__ = ['BeautifulSoup',
-
-           # Stuff imported from other packages
-           'Entities',
-
-           'BeautifulStoneSoup',
-           'ICantBelieveItsBeautifulSoup']
+__all__ = ['BeautifulSoup']
 
 import re
 
@@ -94,7 +88,7 @@ from dammit import UnicodeDammit
 from element import Entities, NavigableString, Tag
 
 
-class BeautifulStoneSoup(Tag):
+class BeautifulSoup(Tag):
     """
     This class defines the basic interface called by the tree builders.
 
@@ -128,9 +122,12 @@ class BeautifulStoneSoup(Tag):
 
     @classmethod
     def default_builder(self):
-        from lxml import etree
-        from builder.lxml_builder import LXMLTreeBuilder
-        return LXMLTreeBuilder(parser_class=etree.XMLParser)
+        try:
+            from builder.html5_builder import HTML5TreeBuilder
+            return HTML5TreeBuilder()
+        except ImportError:
+            from builder.lxml_builder import LXMLTreeBuilder
+            return LXMLTreeBuilder()
 
     def __init__(self, markup="", builder=None, parseOnlyThese=None,
                  fromEncoding=None):
@@ -278,19 +275,6 @@ class BeautifulStoneSoup(Tag):
         self.currentData.append(data)
 
 
-class BeautifulSoup(BeautifulStoneSoup):
-    """A convenience class for parsing HTML without creating a builder."""
-
-    @classmethod
-    def default_builder(self):
-        try:
-            from builder.html5_builder import HTML5TreeBuilder
-            return HTML5TreeBuilder()
-        except ImportError:
-            from builder.lxml_builder import LXMLTreeBuilder
-            return LXMLTreeBuilder()
-
-
 class StopParsing(Exception):
     pass
 
diff --git a/beautifulsoup/builder/lxml_builder.py b/beautifulsoup/builder/lxml_builder.py
index 2c264b3..afdf760 100644
--- a/beautifulsoup/builder/lxml_builder.py
+++ b/beautifulsoup/builder/lxml_builder.py
@@ -1,15 +1,16 @@
 from lxml import etree
 from beautifulsoup.element import Comment, Doctype
-from beautifulsoup.builder import HTMLTreeBuilder
+from beautifulsoup.builder import TreeBuilder, HTMLTreeBuilder
 from beautifulsoup.dammit import UnicodeDammit
 
-class LXMLTreeBuilder(HTMLTreeBuilder):
+class LXMLTreeBuilderForXML(TreeBuilder):
+    DEFAULT_PARSER_CLASS = etree.XMLParser
 
-    def __init__(self, parser_class=etree.HTMLParser):
-        # etree.HTMLParser's constructor has an argument strip_cdata,
-        # but it does nothing. CDATA sections are always stripped when
-        # passed through HTMLParser.
-        self.parser = parser_class(target=self)
+    def __init__(self, parser_class=None):
+        # strip_cdata only has an effect on XMLParser. HTMLParser's
+        # constructor accepts strip_cdata but ignores it.
+        parser_class = parser_class or self.DEFAULT_PARSER_CLASS
+        self.parser = parser_class(target=self, strip_cdata=False)
         self.soup = None
 
     def prepare_markup(self, markup, user_specified_encoding=None,
@@ -23,8 +24,8 @@ class LXMLTreeBuilder(HTMLTreeBuilder):
 
         try_encodings = [user_specified_encoding, document_declared_encoding]
         dammit = UnicodeDammit(markup, try_encodings, isHTML=True)
-        return dammit.markup, dammit.original_encoding, dammit.declared_html_encoding
-
+        return (dammit.markup, dammit.original_encoding,
+                dammit.declared_html_encoding)
 
     def feed(self, markup):
         self.parser.feed(markup)
@@ -60,3 +61,7 @@ class LXMLTreeBuilder(HTMLTreeBuilder):
         """See `TreeBuilder`."""
         return u'<html><body>%s</body></html>' % fragment
 
+
+class LXMLTreeBuilder(LXMLTreeBuilderForXML, HTMLTreeBuilder):
+
+    DEFAULT_PARSER_CLASS = etree.HTMLParser
diff --git a/beautifulsoup/testing.py b/beautifulsoup/testing.py
index 74937d9..9b1e858 100644
--- a/beautifulsoup/testing.py
+++ b/beautifulsoup/testing.py
@@ -13,7 +13,8 @@ class SoupTest(unittest.TestCase):
 
     def soup(self, markup, **kwargs):
         """Build a Beautiful Soup object from markup."""
-        return BeautifulSoup(markup, builder=self.default_builder, **kwargs)
+        builder = kwargs.pop('builder', self.default_builder)
+        return BeautifulSoup(markup, builder=builder, **kwargs)
 
     def document_for(self, markup):
         """Turn an HTML fragment into a document.
diff --git a/tests/test_html5lib.py b/tests/test_html5lib.py
index 3045b02..336f9a5 100644
--- a/tests/test_html5lib.py
+++ b/tests/test_html5lib.py
@@ -40,6 +40,12 @@ class TestHTML5Builder(TestLXMLBuilder):
             "<tbody><tr><td>Bar</td></tr></tbody>"
             "<tfoot><tr><td>Baz</td></tr></tfoot></table>")
 
+    def test_literal_in_textarea(self):
+        markup = '<textarea>Junk like <b> tags and <&<&amp;</textarea>'
+        soup = self.soup(markup)
+        self.assertEquals(
+            soup.textarea.contents, ["Junk like <b> tags and <&<&"])
+
     def test_collapsed_whitespace(self):
         """Whitespace is preserved even in tags that don't require it."""
         self.assertSoupEquals("<p>   </p>")
diff --git a/tests/test_lxml.py b/tests/test_lxml.py
index 7e15dcf..8670806 100644
--- a/tests/test_lxml.py
+++ b/tests/test_lxml.py
@@ -126,12 +126,11 @@ class TestLXMLBuilder(SoupTest):
 
     def test_literal_in_textarea(self):
         # Anything inside a <textarea> is supposed to be treated as
-        # the literal value of the field, (XXX citation needed).
-        #
-        # But, both lxml and html5lib do their best to parse the
-        # contents of a <textarea> as HTML.
+        # the literal value of the field, (XXX citation
+        # needed). html5lib does this correctly. But, lxml does its
+        # best to parse the contents of a <textarea> as HTML.
         text = '<textarea>Junk like <b> tags and <&<&amp;</textarea>'
-        soup = BeautifulSoup(text)
+        soup = self.soup(text)
         self.assertEquals(len(soup.textarea.contents), 2)
         self.assertEquals(soup.textarea.contents[0], u"Junk like ")
         self.assertEquals(soup.textarea.contents[1].name, 'b')
@@ -141,7 +140,7 @@ class TestLXMLBuilder(SoupTest):
         # The contents of a <script> tag are treated as a literal string,
         # even if that string contains HTML.
         javascript = 'if (i < 2) { alert("<b>foo</b>"); }'
-        soup = BeautifulSoup('<script>%s</script>' % javascript)
+        soup = self.soup('<script>%s</script>' % javascript)
         self.assertEquals(soup.script.string, javascript)
 
     def test_naked_ampersands(self):
@@ -300,7 +299,7 @@ class TestLXMLBuilder(SoupTest):
     def test_entities_converted_on_the_way_out(self):
         text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
         expected = u"&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;".encode("utf-8")
-        soup = BeautifulSoup(text)
+        soup = self.soup(text)
         str = soup.p.string
         #self.assertEquals(str.encode("utf-8"), expected)
 
diff --git a/tests/test_tree.py b/tests/test_tree.py
index 02efead..233cb3c 100644
--- a/tests/test_tree.py
+++ b/tests/test_tree.py
@@ -524,7 +524,7 @@ class TestTreeModification(SoupTest):
 
     def test_new_tag_creation(self):
         builder = BeautifulSoup.default_builder()
-        soup = BeautifulSoup("", builder=builder)
+        soup = self.soup("", builder=builder)
         a = Tag(soup, builder, 'a')
         ol = Tag(soup, builder, 'ol')
         a['href'] = 'http://foo.com/'
@@ -553,7 +553,7 @@ class TestTreeModification(SoupTest):
 
     def test_replace_tag_with_itself(self):
         text = "<a><b></b><c>Foo<d></d></c></a><a><e></e></a>"
-        soup = BeautifulSoup(text)
+        soup = self.soup(text)
         c = soup.c
         soup.c.replaceWith(c)
         self.assertEquals(soup.decode(), self.document_for(text))
@@ -592,7 +592,7 @@ class TestTreeModification(SoupTest):
 
     def test_insert_tag(self):
         builder = self.default_builder
-        soup = BeautifulSoup(
+        soup = self.soup(
             "<a><b>Find</b><c>lady!</c><d></d></a>", builder=builder)
         magic_tag = Tag(soup, builder, 'magictag')
         magic_tag.insert(0, "the")
@@ -636,7 +636,7 @@ class TestTreeModification(SoupTest):
         self.assertEquals(no.nextSibling, " business")
 
     def test_nested_tag_replace_with(self):
-        soup = BeautifulSoup(
+        soup = self.soup(
             """<a>We<b>reserve<c>the</c><d>right</d></b></a><e>to<f>refuse</f><g>service</g></e>""")
 
         # Replace the entire <b> tag and its contents ("reserve the
@@ -853,7 +853,7 @@ class TestSubstitutions(SoupTest):
         # meta tag got filtered out by the strainer. This test makes
         # sure that doesn't happen.
         strainer = SoupStrainer('pre')
-        soup = BeautifulSoup(markup, parseOnlyThese=strainer)
+        soup = self.soup(markup, parseOnlyThese=strainer)
         self.assertEquals(soup.contents[0].name, 'pre')
author	Leonard Richardson <leonard.richardson@canonical.com>	2011-02-19 21:21:14 -0500
committer	Leonard Richardson <leonard.richardson@canonical.com>	2011-02-19 21:21:14 -0500
commit	22e7fc268c6150f812e9af55f28dba7aeda4d053 (patch)
tree	766662556ae441c5474e754fe9d582ffce3ff257
parent	8249b803d9bab9c06be02a244e629cb732f4f5b1 (diff)
parent	9a936b48fe05666780662c76d5df3b3de7b48074 (diff)