summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--beautifulsoup/__init__.py14
-rw-r--r--beautifulsoup/builder/lxml_builder.py4
-rw-r--r--beautifulsoup/element.py10
-rw-r--r--tests/test_lxml.py33
4 files changed, 50 insertions, 11 deletions
diff --git a/beautifulsoup/__init__.py b/beautifulsoup/__init__.py
index 4a7e18b..ddf51f9 100644
--- a/beautifulsoup/__init__.py
+++ b/beautifulsoup/__init__.py
@@ -222,11 +222,15 @@ class BeautifulStoneSoup(Tag):
not self.parseOnlyThese.search(currentData)):
return
o = containerClass(currentData)
- o.setup(self.currentTag, self.previous)
- if self.previous:
- self.previous.next = o
- self.previous = o
- self.currentTag.contents.append(o)
+ self.object_was_parsed(o)
+
+    def object_was_parsed(self, o):
+        """Link a freshly parsed object into the parse tree.
+
+        The object is set up with the current tag and the previously
+        parsed object, chained after that previous object when there is
+        one, recorded as the new ``self.previous``, and appended to the
+        current tag's contents.
+        """
+        parent, predecessor = self.currentTag, self.previous
+        o.setup(parent, predecessor)
+        if predecessor:
+            predecessor.next = o
+        self.previous = o
+        parent.contents.append(o)
def _popToTag(self, name, inclusivePop=True):
diff --git a/beautifulsoup/builder/lxml_builder.py b/beautifulsoup/builder/lxml_builder.py
index 86ac183..9ced9f0 100644
--- a/beautifulsoup/builder/lxml_builder.py
+++ b/beautifulsoup/builder/lxml_builder.py
@@ -32,8 +32,8 @@ class LXMLTreeBuilder(HTMLTreeBuilder):
def doctype(self, name, pubid, system):
self.soup.endData()
- self.soup.handle_data(name)
- self.soup.endData(Doctype)
+ doctype = Doctype.for_name_and_ids(name, pubid, system)
+ self.soup.object_was_parsed(doctype)
def comment(self, content):
"Handle comments as Comment objects."
diff --git a/beautifulsoup/element.py b/beautifulsoup/element.py
index b2e0e12..8749114 100644
--- a/beautifulsoup/element.py
+++ b/beautifulsoup/element.py
@@ -372,6 +372,16 @@ class Declaration(NavigableString):
class Doctype(NavigableString):
+    @classmethod
+    def for_name_and_ids(cls, name, pub_id, system_id):
+        """Build a doctype node from a name and optional external IDs.
+
+        The node's text is the name, followed by ``PUBLIC "<pub_id>"``
+        (plus the quoted system ID when both IDs are given), or by
+        ``SYSTEM "<system_id>"`` when only a system ID is given. An ID
+        that is None is left out; a missing name is treated as empty.
+        """
+        # Guard against a None name so the concatenations below cannot
+        # fail with a TypeError.
+        value = name or ''
+        if pub_id is not None:
+            value += ' PUBLIC "%s"' % pub_id
+            if system_id is not None:
+                # After a public ID the system literal follows bare; a
+                # second SYSTEM keyword is not valid doctype syntax.
+                value += ' "%s"' % system_id
+        elif system_id is not None:
+            value += ' SYSTEM "%s"' % system_id
+
+        # Instantiate through cls so a subclass gets its own type back.
+        return cls(value)
+
def decodeGivenEventualEncoding(self, eventualEncoding):
return u'<!DOCTYPE ' + self + u'>'
diff --git a/tests/test_lxml.py b/tests/test_lxml.py
index 8f36b41..9f002cb 100644
--- a/tests/test_lxml.py
+++ b/tests/test_lxml.py
@@ -4,7 +4,7 @@ import re
from beautifulsoup import BeautifulSoup
from beautifulsoup.builder.lxml_builder import LXMLTreeBuilder
-from beautifulsoup.element import Comment
+from beautifulsoup.element import Comment, Doctype
from beautifulsoup.testing import SoupTest
@@ -201,11 +201,33 @@ class TestLXMLBuilder(SoupTest):
markup = "<svg><![CDATA[foobar]]>"
self.assertSoupEquals(markup, "<svg></svg>")
+    def test_namespaced_system_doctype(self):
+        """A namespaced doctype with a SYSTEM ID is parsed as a Doctype."""
+        declaration = '<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd">'
+        soup = BeautifulSoup(declaration + '<p>foo</p>')
+        first_node = soup.contents[0]
+        self.assertEquals(first_node.__class__, Doctype)
+        self.assertEquals(first_node, 'xsl:stylesheet SYSTEM "htmlent.dtd"')
+        # The serialized soup must start with the original declaration.
+        rendered = str(soup)
+        self.assertEquals(rendered[:len(declaration)], declaration)
+        self.assertEquals(soup.p.contents[0], 'foo')
+
+    def test_namespaced_public_doctype(self):
+        """A namespaced doctype with a PUBLIC ID is parsed as a Doctype."""
+        declaration = '<!DOCTYPE xsl:stylesheet PUBLIC "htmlent.dtd">'
+        soup = BeautifulSoup(declaration + '<p>foo</p>')
+        first_node = soup.contents[0]
+        self.assertEquals(first_node.__class__, Doctype)
+        self.assertEquals(first_node, 'xsl:stylesheet PUBLIC "htmlent.dtd"')
+        # The serialized soup must start with the original declaration.
+        rendered = str(soup)
+        self.assertEquals(rendered[:len(declaration)], declaration)
+        self.assertEquals(soup.p.contents[0], 'foo')
+
# Tests below this line need work.
- #def test_doctype(self):
- # xml = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"><html>foo</html></p>'
- # self.assertSoupEquals(xml)
+
+    def test_doctype(self):
+        """Run assertSoupEquals on a PUBLIC doctype followed by a paragraph."""
+        doctype_str = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">'
+        markup = doctype_str + '<p>foo</p>'
+        # Pass the markup built above; the name `xml` is not defined here.
+        self.assertSoupEquals(markup)
def test_entities_converted_on_the_way_out(self):
text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
@@ -274,6 +296,9 @@ class TestLXMLBuilderInvalidMarkup(SoupTest):
markup = "<p>one<!DOCTYPE foobar>two</p>"
self.assertSoupEquals(markup)
+ #def testJunkInDeclaration(self):
+ # self.assertSoupEquals('<! Foo = -8>a', '<!Foo = -8>a')
+
def test_cdata_where_it_doesnt_belong(self):
#CDATA sections are ignored.
markup = "<div><![CDATA[foo]]>"