diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-19 21:21:14 -0500 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-19 21:21:14 -0500 |
commit | 22e7fc268c6150f812e9af55f28dba7aeda4d053 (patch) | |
tree | 766662556ae441c5474e754fe9d582ffce3ff257 | |
parent | 8249b803d9bab9c06be02a244e629cb732f4f5b1 (diff) | |
parent | 9a936b48fe05666780662c76d5df3b3de7b48074 (diff) | |
Preliminary work for getting XML parsing to work.
-rw-r--r-- | beautifulsoup/__init__.py | 34 | ||||
-rw-r--r-- | beautifulsoup/builder/lxml_builder.py | 23 | ||||
-rw-r--r-- | beautifulsoup/testing.py | 3 | ||||
-rw-r--r-- | tests/test_html5lib.py | 6 | ||||
-rw-r--r-- | tests/test_lxml.py | 13 | ||||
-rw-r--r-- | tests/test_tree.py | 10 |
6 files changed, 42 insertions, 47 deletions
diff --git a/beautifulsoup/__init__.py b/beautifulsoup/__init__.py index 5d66bc7..922005c 100644 --- a/beautifulsoup/__init__.py +++ b/beautifulsoup/__init__.py @@ -76,16 +76,10 @@ from __future__ import generators __author__ = "Leonard Richardson (leonardr@segfault.org)" __version__ = "4.0.0" -__copyright__ = "Copyright (c) 2004-2009 Leonard Richardson" +__copyright__ = "Copyright (c) 2004-2011 Leonard Richardson" __license__ = "New-style BSD" -__all__ = ['BeautifulSoup', - - # Stuff imported from other packages - 'Entities', - - 'BeautifulStoneSoup', - 'ICantBelieveItsBeautifulSoup'] +__all__ = ['BeautifulSoup'] import re @@ -94,7 +88,7 @@ from dammit import UnicodeDammit from element import Entities, NavigableString, Tag -class BeautifulStoneSoup(Tag): +class BeautifulSoup(Tag): """ This class defines the basic interface called by the tree builders. @@ -128,9 +122,12 @@ class BeautifulStoneSoup(Tag): @classmethod def default_builder(self): - from lxml import etree - from builder.lxml_builder import LXMLTreeBuilder - return LXMLTreeBuilder(parser_class=etree.XMLParser) + try: + from builder.html5_builder import HTML5TreeBuilder + return HTML5TreeBuilder() + except ImportError: + from builder.lxml_builder import LXMLTreeBuilder + return LXMLTreeBuilder() def __init__(self, markup="", builder=None, parseOnlyThese=None, fromEncoding=None): @@ -278,19 +275,6 @@ class BeautifulStoneSoup(Tag): self.currentData.append(data) -class BeautifulSoup(BeautifulStoneSoup): - """A convenience class for parsing HTML without creating a builder.""" - - @classmethod - def default_builder(self): - try: - from builder.html5_builder import HTML5TreeBuilder - return HTML5TreeBuilder() - except ImportError: - from builder.lxml_builder import LXMLTreeBuilder - return LXMLTreeBuilder() - - class StopParsing(Exception): pass diff --git a/beautifulsoup/builder/lxml_builder.py b/beautifulsoup/builder/lxml_builder.py index 2c264b3..afdf760 100644 --- a/beautifulsoup/builder/lxml_builder.py +++ 
b/beautifulsoup/builder/lxml_builder.py @@ -1,15 +1,16 @@ from lxml import etree from beautifulsoup.element import Comment, Doctype -from beautifulsoup.builder import HTMLTreeBuilder +from beautifulsoup.builder import TreeBuilder, HTMLTreeBuilder from beautifulsoup.dammit import UnicodeDammit -class LXMLTreeBuilder(HTMLTreeBuilder): +class LXMLTreeBuilderForXML(TreeBuilder): + DEFAULT_PARSER_CLASS = etree.XMLParser - def __init__(self, parser_class=etree.HTMLParser): - # etree.HTMLParser's constructor has an argument strip_cdata, - # but it does nothing. CDATA sections are always stripped when - # passed through HTMLParser. - self.parser = parser_class(target=self) + def __init__(self, parser_class=None): + # strip_cdata only has an effect on XMLParser. HTMLParser's + # constructor accepts strip_cdata but ignores it. + parser_class = parser_class or self.DEFAULT_PARSER_CLASS + self.parser = parser_class(target=self, strip_cdata=False) self.soup = None def prepare_markup(self, markup, user_specified_encoding=None, @@ -23,8 +24,8 @@ class LXMLTreeBuilder(HTMLTreeBuilder): try_encodings = [user_specified_encoding, document_declared_encoding] dammit = UnicodeDammit(markup, try_encodings, isHTML=True) - return dammit.markup, dammit.original_encoding, dammit.declared_html_encoding - + return (dammit.markup, dammit.original_encoding, + dammit.declared_html_encoding) def feed(self, markup): self.parser.feed(markup) @@ -60,3 +61,7 @@ class LXMLTreeBuilder(HTMLTreeBuilder): """See `TreeBuilder`.""" return u'<html><body>%s</body></html>' % fragment + +class LXMLTreeBuilder(LXMLTreeBuilderForXML, HTMLTreeBuilder): + + DEFAULT_PARSER_CLASS = etree.HTMLParser diff --git a/beautifulsoup/testing.py b/beautifulsoup/testing.py index 74937d9..9b1e858 100644 --- a/beautifulsoup/testing.py +++ b/beautifulsoup/testing.py @@ -13,7 +13,8 @@ class SoupTest(unittest.TestCase): def soup(self, markup, **kwargs): """Build a Beautiful Soup object from markup.""" - return BeautifulSoup(markup, 
builder=self.default_builder, **kwargs) + builder = kwargs.pop('builder', self.default_builder) + return BeautifulSoup(markup, builder=builder, **kwargs) def document_for(self, markup): """Turn an HTML fragment into a document. diff --git a/tests/test_html5lib.py b/tests/test_html5lib.py index 3045b02..336f9a5 100644 --- a/tests/test_html5lib.py +++ b/tests/test_html5lib.py @@ -40,6 +40,12 @@ class TestHTML5Builder(TestLXMLBuilder): "<tbody><tr><td>Bar</td></tr></tbody>" "<tfoot><tr><td>Baz</td></tr></tfoot></table>") + def test_literal_in_textarea(self): + markup = '<textarea>Junk like <b> tags and <&<&</textarea>' + soup = self.soup(markup) + self.assertEquals( + soup.textarea.contents, ["Junk like <b> tags and <&<&"]) + def test_collapsed_whitespace(self): """Whitespace is preserved even in tags that don't require it.""" self.assertSoupEquals("<p> </p>") diff --git a/tests/test_lxml.py b/tests/test_lxml.py index 7e15dcf..8670806 100644 --- a/tests/test_lxml.py +++ b/tests/test_lxml.py @@ -126,12 +126,11 @@ class TestLXMLBuilder(SoupTest): def test_literal_in_textarea(self): # Anything inside a <textarea> is supposed to be treated as - # the literal value of the field, (XXX citation needed). - # - # But, both lxml and html5lib do their best to parse the - # contents of a <textarea> as HTML. + # the literal value of the field, (XXX citation + # needed). html5lib does this correctly. But, lxml does its + # best to parse the contents of a <textarea> as HTML. text = '<textarea>Junk like <b> tags and <&<&</textarea>' - soup = BeautifulSoup(text) + soup = self.soup(text) self.assertEquals(len(soup.textarea.contents), 2) self.assertEquals(soup.textarea.contents[0], u"Junk like ") self.assertEquals(soup.textarea.contents[1].name, 'b') @@ -141,7 +140,7 @@ class TestLXMLBuilder(SoupTest): # The contents of a <script> tag are treated as a literal string, # even if that string contains HTML. 
javascript = 'if (i < 2) { alert("<b>foo</b>"); }' - soup = BeautifulSoup('<script>%s</script>' % javascript) + soup = self.soup('<script>%s</script>' % javascript) self.assertEquals(soup.script.string, javascript) def test_naked_ampersands(self): @@ -300,7 +299,7 @@ class TestLXMLBuilder(SoupTest): def test_entities_converted_on_the_way_out(self): text = "<p><<sacré bleu!>></p>" expected = u"<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>".encode("utf-8") - soup = BeautifulSoup(text) + soup = self.soup(text) str = soup.p.string #self.assertEquals(str.encode("utf-8"), expected) diff --git a/tests/test_tree.py b/tests/test_tree.py index 02efead..233cb3c 100644 --- a/tests/test_tree.py +++ b/tests/test_tree.py @@ -524,7 +524,7 @@ class TestTreeModification(SoupTest): def test_new_tag_creation(self): builder = BeautifulSoup.default_builder() - soup = BeautifulSoup("", builder=builder) + soup = self.soup("", builder=builder) a = Tag(soup, builder, 'a') ol = Tag(soup, builder, 'ol') a['href'] = 'http://foo.com/' @@ -553,7 +553,7 @@ class TestTreeModification(SoupTest): def test_replace_tag_with_itself(self): text = "<a><b></b><c>Foo<d></d></c></a><a><e></e></a>" - soup = BeautifulSoup(text) + soup = self.soup(text) c = soup.c soup.c.replaceWith(c) self.assertEquals(soup.decode(), self.document_for(text)) @@ -592,7 +592,7 @@ class TestTreeModification(SoupTest): def test_insert_tag(self): builder = self.default_builder - soup = BeautifulSoup( + soup = self.soup( "<a><b>Find</b><c>lady!</c><d></d></a>", builder=builder) magic_tag = Tag(soup, builder, 'magictag') magic_tag.insert(0, "the") @@ -636,7 +636,7 @@ class TestTreeModification(SoupTest): self.assertEquals(no.nextSibling, " business") def test_nested_tag_replace_with(self): - soup = BeautifulSoup( + soup = self.soup( """<a>We<b>reserve<c>the</c><d>right</d></b></a><e>to<f>refuse</f><g>service</g></e>""") # Replace the entire <b> tag and its contents ("reserve the @@ -853,7 +853,7 @@ class 
TestSubstitutions(SoupTest): # meta tag got filtered out by the strainer. This test makes # sure that doesn't happen. strainer = SoupStrainer('pre') - soup = BeautifulSoup(markup, parseOnlyThese=strainer) + soup = self.soup(markup, parseOnlyThese=strainer) self.assertEquals(soup.contents[0].name, 'pre') |