diff options
-rw-r--r-- | beautifulsoup/__init__.py | 14 | ||||
-rw-r--r-- | beautifulsoup/builder/lxml_builder.py | 4 | ||||
-rw-r--r-- | beautifulsoup/element.py | 10 | ||||
-rw-r--r-- | tests/test_lxml.py | 33 |
4 files changed, 50 insertions, 11 deletions
diff --git a/beautifulsoup/__init__.py b/beautifulsoup/__init__.py index 4a7e18b..ddf51f9 100644 --- a/beautifulsoup/__init__.py +++ b/beautifulsoup/__init__.py @@ -222,11 +222,15 @@ class BeautifulStoneSoup(Tag): not self.parseOnlyThese.search(currentData)): return o = containerClass(currentData) - o.setup(self.currentTag, self.previous) - if self.previous: - self.previous.next = o - self.previous = o - self.currentTag.contents.append(o) + self.object_was_parsed(o) + + def object_was_parsed(self, o): + """Add an object to the parse tree.""" + o.setup(self.currentTag, self.previous) + if self.previous: + self.previous.next = o + self.previous = o + self.currentTag.contents.append(o) def _popToTag(self, name, inclusivePop=True): diff --git a/beautifulsoup/builder/lxml_builder.py b/beautifulsoup/builder/lxml_builder.py index 86ac183..9ced9f0 100644 --- a/beautifulsoup/builder/lxml_builder.py +++ b/beautifulsoup/builder/lxml_builder.py @@ -32,8 +32,8 @@ class LXMLTreeBuilder(HTMLTreeBuilder): def doctype(self, name, pubid, system): self.soup.endData() - self.soup.handle_data(name) - self.soup.endData(Doctype) + doctype = Doctype.for_name_and_ids(name, pubid, system) + self.soup.object_was_parsed(doctype) def comment(self, content): "Handle comments as Comment objects." 
diff --git a/beautifulsoup/element.py b/beautifulsoup/element.py index b2e0e12..8749114 100644 --- a/beautifulsoup/element.py +++ b/beautifulsoup/element.py @@ -372,6 +372,16 @@ class Declaration(NavigableString): class Doctype(NavigableString): + @classmethod + def for_name_and_ids(cls, name, pub_id, system_id): + value = name + if pub_id is not None: + value += ' PUBLIC "%s"' % pub_id + if system_id is not None: + value += ' SYSTEM "%s"' % system_id + + return Doctype(value) + def decodeGivenEventualEncoding(self, eventualEncoding): return u'<!DOCTYPE ' + self + u'>' diff --git a/tests/test_lxml.py b/tests/test_lxml.py index 8f36b41..9f002cb 100644 --- a/tests/test_lxml.py +++ b/tests/test_lxml.py @@ -4,7 +4,7 @@ import re from beautifulsoup import BeautifulSoup from beautifulsoup.builder.lxml_builder import LXMLTreeBuilder -from beautifulsoup.element import Comment +from beautifulsoup.element import Comment, Doctype from beautifulsoup.testing import SoupTest @@ -201,11 +201,33 @@ class TestLXMLBuilder(SoupTest): markup = "<svg><![CDATA[foobar]]>" self.assertSoupEquals(markup, "<svg></svg>") + def test_namespaced_system_doctype(self): + doctype_str = '<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd">' + markup = doctype_str + '<p>foo</p>' + soup = BeautifulSoup(markup) + doctype = soup.contents[0] + self.assertEquals(doctype.__class__, Doctype) + self.assertEquals(doctype, 'xsl:stylesheet SYSTEM "htmlent.dtd"') + self.assertEquals(str(soup)[:len(doctype_str)], doctype_str) + self.assertEquals(soup.p.contents[0], 'foo') + + def test_namespaced_public_doctype(self): + doctype_str = '<!DOCTYPE xsl:stylesheet PUBLIC "htmlent.dtd">' + markup = doctype_str + '<p>foo</p>' + soup = BeautifulSoup(markup) + doctype = soup.contents[0] + self.assertEquals(doctype.__class__, Doctype) + self.assertEquals(doctype, 'xsl:stylesheet PUBLIC "htmlent.dtd"') + self.assertEquals(str(soup)[:len(doctype_str)], doctype_str) + self.assertEquals(soup.p.contents[0], 'foo') + # Tests below this 
line need work. - #def test_doctype(self): - # xml = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"><html>foo</html></p>' - # self.assertSoupEquals(xml) + + def test_doctype(self): + doctype_str = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">' + markup = doctype_str + '<p>foo</p>' + self.assertSoupEquals(markup) def test_entities_converted_on_the_way_out(self): text = "<p><<sacré bleu!>></p>" @@ -274,6 +296,9 @@ class TestLXMLBuilderInvalidMarkup(SoupTest): markup = "<p>one<!DOCTYPE foobar>two</p>" self.assertSoupEquals(markup) + #def testJunkInDeclaration(self): + # self.assertSoupEquals('<! Foo = -8>a', '<!Foo = -8>a') + def test_cdata_where_it_doesnt_belong(self): #CDATA sections are ignored. markup = "<div><![CDATA[foo]]>" |