diff options
-rw-r--r-- | NEWS.txt | 12 | ||||
-rw-r--r-- | bs4/__init__.py | 23 | ||||
-rw-r--r-- | bs4/builder/_html5lib.py | 18 | ||||
-rw-r--r-- | bs4/builder/_htmlparser.py | 3 | ||||
-rw-r--r-- | bs4/builder/_lxml.py | 62 | ||||
-rw-r--r-- | bs4/element.py | 37 | ||||
-rw-r--r-- | bs4/testing.py | 60 | ||||
-rw-r--r-- | bs4/tests/test_html5lib.py | 6 | ||||
-rw-r--r-- | bs4/tests/test_lxml.py | 12 | ||||
-rw-r--r-- | bs4/tests/test_soup.py | 38 | ||||
-rw-r--r-- | bs4/tests/test_tree.py | 21 | ||||
-rw-r--r-- | setup.py | 2 |
12 files changed, 246 insertions, 48 deletions
@@ -1,3 +1,15 @@ += 4.0.0b8 () = + +* All tree builders now preserve namespace information in the + documents they parse. + + However, there is no special support for namespace-oriented + searching or tree manipulation. When you search the tree, you need + to use namespace prefixes exactly as they're used in the original + document. + +* The string representation of a DOCTYPE always ends in a newline. + = 4.0.0b7 (20110223) = * Upon decoding to string, any characters that can't be represented in diff --git a/bs4/__init__.py b/bs4/__init__.py index 13dac85..9b5c155 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -17,7 +17,7 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/ """ __author__ = "Leonard Richardson (leonardr@segfault.org)" -__version__ = "4.0.0b7" +__version__ = "4.0.0b8" __copyright__ = "Copyright (c) 2004-2012 Leonard Richardson" __license__ = "MIT" @@ -193,9 +193,9 @@ class BeautifulSoup(Tag): self.tagStack = [] self.pushTag(self) - def new_tag(self, name, **attrs): + def new_tag(self, name, namespace=None, nsprefix=None, **attrs): """Create a new tag associated with this soup.""" - return Tag(None, self.builder, name, attrs) + return Tag(None, self.builder, name, namespace, nsprefix, attrs) def new_string(self, s): """Create a new NavigableString associated with this soup.""" @@ -249,7 +249,7 @@ class BeautifulSoup(Tag): self.previous_element = o self.currentTag.contents.append(o) - def _popToTag(self, name, inclusivePop=True): + def _popToTag(self, name, nsprefix=None, inclusivePop=True): """Pops the tag stack up to and including the most recent instance of the given tag. If inclusivePop is false, pops the tag stack up to but *not* including the most recent instqance of @@ -262,7 +262,8 @@ class BeautifulSoup(Tag): mostRecentTag = None for i in range(len(self.tagStack) - 1, 0, -1): - if name == self.tagStack[i].name: + if (name == self.tagStack[i].name + and nsprefix == self.tagStack[i].nsprefix == nsprefix): numPops = len(self.tagStack) - i break if not inclusivePop: @@ -272,7 +273,7 @@ class BeautifulSoup(Tag): mostRecentTag = self.popTag() return mostRecentTag - def handle_starttag(self, name, attrs): + def handle_starttag(self, name, namespace, nsprefix, attrs): """Push a start tag on to the stack. If this method returns None, the tag was rejected by the @@ -281,7 +282,7 @@ class BeautifulSoup(Tag): don't call handle_endtag. """ - #print "Start tag %s: %s" % (name, attrs) + # print "Start tag %s: %s" % (name, attrs) self.endData() if (self.parse_only and len(self.tagStack) <= 1 @@ -289,8 +290,8 @@ class BeautifulSoup(Tag): or not self.parse_only.search_tag(name, attrs))): return None - tag = Tag(self, self.builder, name, attrs, self.currentTag, - self.previous_element) + tag = Tag(self, self.builder, name, namespace, nsprefix, attrs, + self.currentTag, self.previous_element) if tag is None: return tag if self.previous_element: @@ -299,10 +300,10 @@ class BeautifulSoup(Tag): self.pushTag(tag) return tag - def handle_endtag(self, name): + def handle_endtag(self, name, nsprefix=None): #print "End tag: " + name self.endData() - self._popToTag(name) + self._popToTag(name, nsprefix) def handle_data(self, data): self.currentData.append(data) diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py index 0d7a1a9..26b1773 100644 --- a/bs4/builder/_html5lib.py +++ b/bs4/builder/_html5lib.py @@ -8,12 +8,9 @@ from bs4.builder import ( HTML_5, HTMLTreeBuilder, ) +from bs4.element import NamespacedAttribute import html5lib -from html5lib.constants import ( - DataLossWarning, - namespaces, - ) -import warnings +from html5lib.constants import namespaces from bs4.element import ( Comment, Doctype, @@ -58,9 +55,6 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder): def __init__(self, soup, namespaceHTMLElements): self.soup = soup - if namespaceHTMLElements: - warnings.warn("namespaceHTMLElements not supported yet", - DataLossWarning) super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements) def documentClass(self): @@ -76,9 +70,7 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder): self.soup.object_was_parsed(doctype) def elementClass(self, name, namespace): - if namespace is not None: - warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning) - tag = self.soup.new_tag(name) + tag = self.soup.new_tag(name, namespace) return Element(tag, self.soup, namespace) def commentClass(self, data): @@ -144,6 +136,8 @@ class Element(html5lib.treebuilders._base.Node): def setAttributes(self, attributes): if attributes is not None and attributes != {}: for name, value in list(attributes.items()): + if isinstance(name, tuple): + name = NamespacedAttribute(*name) self.element[name] = value # The attributes may contain variables that need substitution. # Call set_up_substitutions manually. @@ -189,7 +183,7 @@ class Element(html5lib.treebuilders._base.Node): TextNode(child, self.soup)) def cloneNode(self): - tag = self.soup.new_tag(self.element.name) + tag = self.soup.new_tag(self.element.name, self.namespace) node = Element(tag, self.soup, self.namespace) for key,value in self.attributes: node.attributes[key] = value diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py index 62473cf..c307ff8 100644 --- a/bs4/builder/_htmlparser.py +++ b/bs4/builder/_htmlparser.py @@ -40,7 +40,8 @@ HTMLPARSER = 'html.parser' class BeautifulSoupHTMLParser(HTMLParser): def handle_starttag(self, name, attrs): - self.soup.handle_starttag(name, dict(attrs)) + # XXX namespace + self.soup.handle_starttag(name, None, None, dict(attrs)) def handle_endtag(self, name): self.soup.handle_endtag(name) diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py index cc3cb86..e5e30d4 100644 --- a/bs4/builder/_lxml.py +++ b/bs4/builder/_lxml.py @@ -5,7 +5,7 @@ __all__ = [ import collections from lxml import etree -from bs4.element import Comment, Doctype +from bs4.element import Comment, Doctype, NamespacedAttribute from bs4.builder import ( FAST, HTML, @@ -42,6 +42,15 @@ class LXMLTreeBuilderForXML(TreeBuilder): parser = parser(target=self, strip_cdata=False) self.parser = parser self.soup = None + self.nsmaps = None + + def _getNsTag(self, tag): + # Split the namespace URL out of a fully-qualified lxml tag + # name. Copied from lxml's src/lxml/sax.py. + if tag[0] == '{': + return tuple(tag[1:].split('}', 1)) + else: + return (None, tag) def prepare_markup(self, markup, user_specified_encoding=None, document_declared_encoding=None): @@ -63,15 +72,56 @@ class LXMLTreeBuilderForXML(TreeBuilder): self.parser.close() def close(self): - pass - - def start(self, name, attrs): - self.soup.handle_starttag(name, attrs) + self.nsmaps = None + + def start(self, name, attrs, nsmap={}): + nsprefix = None + # Invert each namespace map as it comes in. + if len(nsmap) == 0 and self.nsmaps != None: + # There are no new namespaces for this tag, but namespaces + # are in play, so we need a separate tag stack to know + # when they end. + self.nsmaps.append(None) + elif len(nsmap) > 0: + # A new namespace mapping has come into play. + if self.nsmaps is None: + self.nsmaps = [] + inverted_nsmap = dict((value, key) for key, value in nsmap.items()) + self.nsmaps.append(inverted_nsmap) + # Also treat the namespace mapping as a set of attributes on the + # tag, so we can recreate it later. + attrs = attrs.copy() + for prefix, namespace in nsmap.items(): + attribute = NamespacedAttribute( + "xmlns", prefix, "http://www.w3.org/2000/xmlns/") + attrs[attribute] = namespace + namespace, name = self._getNsTag(name) + if namespace is not None: + for inverted_nsmap in reversed(self.nsmaps): + if inverted_nsmap is not None and namespace in inverted_nsmap: + nsprefix = inverted_nsmap[namespace] + break + self.soup.handle_starttag(name, namespace, nsprefix, attrs) def end(self, name): self.soup.endData() completed_tag = self.soup.tagStack[-1] - self.soup.handle_endtag(name) + namespace, name = self._getNsTag(name) + nsprefix = None + if namespace is not None: + for inverted_nsmap in reversed(self.nsmaps): + if inverted_nsmap is not None and namespace in inverted_nsmap: + nsprefix = inverted_nsmap[namespace] + break + self.soup.handle_endtag(name, nsprefix) + if self.nsmaps != None: + # This tag, or one of its parents, introduced a namespace + # mapping, so pop it off the stack. + self.nsmaps.pop() + if len(self.nsmaps) == 0: + # Namespaces are no longer in play, so don't bother keeping + # track of the namespace stack. + self.nsmaps = None def pi(self, target, data): pass diff --git a/bs4/element.py b/bs4/element.py index 997378a..efc6ec7 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -22,6 +22,19 @@ def _alias(attr): return alias +class NamespacedAttribute(unicode): + + def __new__(cls, prefix, name, namespace=None): + if name is None: + obj = unicode.__new__(cls, prefix) + else: + obj = unicode.__new__(cls, prefix + ":" + name) + obj.prefix = prefix + obj.name = name + obj.namespace = namespace + return obj + + class PageElement(object): """Contains the navigational information for some part of the page (either a tag or a piece of text)""" @@ -500,15 +513,15 @@ class Doctype(NavigableString): return Doctype(value) PREFIX = u'<!DOCTYPE ' - SUFFIX = u'>' + SUFFIX = u'>\n' class Tag(PageElement): """Represents a found HTML tag with its attributes and contents.""" - def __init__(self, parser=None, builder=None, name=None, attrs=None, - parent=None, previous=None): + def __init__(self, parser=None, builder=None, name=None, namespace=None, + nsprefix=None, attrs=None, parent=None, previous=None): "Basic constructor." if parser is None: @@ -520,6 +533,8 @@ class Tag(PageElement): if name is None: raise ValueError("No value provided for new tag's name.") self.name = name + self.namespace = namespace + self.nsprefix = nsprefix if attrs is None: attrs = {} else: @@ -659,6 +674,9 @@ class Tag(PageElement): def has_attr(self, key): return key in self.attrs + def __hash__(self): + return str(self).__hash__() + def __getitem__(self, key): """tag[key] returns the value of the 'key' attribute for the tag, and throws an exception if it's not there.""" @@ -779,7 +797,7 @@ class Tag(PageElement): and '%SOUP-ENCODING%' in val): val = self.substitute_encoding(val, eventual_encoding) - decoded = (key + '=' + decoded = (str(key) + '=' + EntitySubstitution.substitute_xml(val, True)) attrs.append(decoded) close = '' @@ -789,6 +807,10 @@ class Tag(PageElement): else: closeTag = '</%s>' % self.name + prefix = '' + if self.nsprefix: + prefix = self.nsprefix + ":" + pretty_print = (indent_level is not None) if pretty_print: space = (' ' * (indent_level - 1)) @@ -809,7 +831,8 @@ class Tag(PageElement): attribute_string = ' ' + ' '.join(attrs) if pretty_print: s.append(space) - s.append('<%s%s%s>' % (self.name, attribute_string, close)) + s.append('<%s%s%s%s>' % ( + prefix, self.name, attribute_string, close)) if pretty_print: s.append("\n") s.append(contents) @@ -986,7 +1009,7 @@ class SoupStrainer(object): searchTag = search_tag def search(self, markup): - #print 'looking for %s in %s' % (self, markup) + # print 'looking for %s in %s' % (self, markup) found = None # If given a list of items, scan it for a text element that # matches. @@ -1012,7 +1035,7 @@ class SoupStrainer(object): return found def _matches(self, markup, match_against): - #print "Matching %s against %s" % (markup, match_against) + # print "Matching %s against %s" % (markup, match_against) result = False if isinstance(markup, list) or isinstance(markup, tuple): diff --git a/bs4/testing.py b/bs4/testing.py index dc20812..d7b01aa 100644 --- a/bs4/testing.py +++ b/bs4/testing.py @@ -358,6 +358,66 @@ class HTMLTreeBuilderSmokeTest(object): # For the rest of the story, see TestSubstitutions in # test_tree.py. +class XMLTreeBuilderSmokeTest(object): + + def test_docstring_generated(self): + soup = self.soup("<root/>") + self.assertEqual( + soup.encode(), b'<?xml version="1.0" encoding="utf-8">\n<root/>') + + def test_docstring_includes_correct_encoding(self): + soup = self.soup("<root/>") + self.assertEqual( + soup.encode("latin1"), + b'<?xml version="1.0" encoding="latin1">\n<root/>') + + def test_real_xhtml_document(self): + """A real XHTML document should come out the same as it went in.""" + markup = b"""<?xml version="1.0" encoding="utf-8"> +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"> +<html xmlns="http://www.w3.org/1999/xhtml"> +<head><title>Hello.</title></head> +<body>Goodbye.</body> +</html>""" + soup = self.soup(markup) + self.assertEqual(soup.encode("utf-8"), markup) + + + def test_tags_are_empty_element_if_and_only_if_they_are_empty(self): + self.assertSoupEquals("<p>", "<p/>") + self.assertSoupEquals("<p>foo</p>") + + def test_namespaces_are_preserved(self): + markup = '<root xmlns:a="http://example.com/" xmlns:b="http://example.net/"><a:foo>This tag is in the a namespace</a:foo><b:foo>This tag is in the b namespace</b:foo></root>' + soup = self.soup(markup) + root = soup.root + self.assertEqual("http://example.com/", root['xmlns:a']) + self.assertEqual("http://example.net/", root['xmlns:b']) + + +class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest): + """Smoke test for a tree builder that supports HTML5.""" + + def test_html_tags_have_namespace(self): + markup = "<a>" + soup = self.soup(markup) + self.assertEqual("http://www.w3.org/1999/xhtml", soup.a.namespace) + + def test_svg_tags_have_namespace(self): + markup = '<svg><circle/></svg>' + soup = self.soup(markup) + namespace = "http://www.w3.org/2000/svg" + self.assertEqual(namespace, soup.svg.namespace) + self.assertEqual(namespace, soup.circle.namespace) + + + def test_mathml_tags_have_namespace(self): + markup = '<math><msqrt>5</msqrt></math>' + soup = self.soup(markup) + namespace = 'http://www.w3.org/1998/Math/MathML' + self.assertEqual(namespace, soup.math.namespace) + self.assertEqual(namespace, soup.msqrt.namespace) + def skipIf(condition, reason): def nothing(test, *args, **kwargs): diff --git a/bs4/tests/test_html5lib.py b/bs4/tests/test_html5lib.py index f1edddf..0828cfd 100644 --- a/bs4/tests/test_html5lib.py +++ b/bs4/tests/test_html5lib.py @@ -7,7 +7,7 @@ except ImportError, e: HTML5LIB_PRESENT = False from bs4.element import SoupStrainer from bs4.testing import ( - HTMLTreeBuilderSmokeTest, + HTML5TreeBuilderSmokeTest, SoupTest, skipIf, ) @@ -15,8 +15,8 @@ from bs4.testing import ( @skipIf( not HTML5LIB_PRESENT, "html5lib seems not to be present, not testing its tree builder.") -class HTML5LibBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): - """See ``HTMLTreeBuilderSmokeTest``.""" +class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest): + """See ``HTML5TreeBuilderSmokeTest``.""" @property def default_builder(self): diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py index 92b7389..27ec570 100644 --- a/bs4/tests/test_lxml.py +++ b/bs4/tests/test_lxml.py @@ -14,6 +14,7 @@ from bs4.testing import skipIf from bs4.tests import test_htmlparser from bs4.testing import ( HTMLTreeBuilderSmokeTest, + XMLTreeBuilderSmokeTest, SoupTest, skipIf, ) @@ -35,3 +36,14 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): "<p>foo�bar</p>", "<p>foobar</p>") self.assertSoupEquals( "<p>foo�bar</p>", "<p>foobar</p>") + +@skipIf( + not LXML_PRESENT, + "lxml seems not to be present, not testing its XML tree builder.") +class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest): + """See ``HTMLTreeBuilderSmokeTest``.""" + + @property + def default_builder(self): + return LXMLTreeBuilderForXML() + diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py index 2b7c003..33ab0fa 100644 --- a/bs4/tests/test_soup.py +++ b/bs4/tests/test_soup.py @@ -3,7 +3,10 @@ import unittest from bs4 import BeautifulSoup -from bs4.element import SoupStrainer +from bs4.element import ( + SoupStrainer, + NamespacedAttribute, + ) from bs4.dammit import EntitySubstitution, UnicodeDammit from bs4.testing import SoupTest import warnings @@ -16,7 +19,7 @@ class TestDeprecatedConstructorArguments(SoupTest): msg = str(w[0].message) self.assertTrue("parseOnlyThese" in msg) self.assertTrue("parse_only" in msg) - self.assertEquals(b"<b></b>", soup.encode()) + self.assertEqual(b"<b></b>", soup.encode()) def test_fromEncoding_renamed_to_from_encoding(self): with warnings.catch_warnings(record=True) as w: @@ -25,7 +28,7 @@ class TestDeprecatedConstructorArguments(SoupTest): msg = str(w[0].message) self.assertTrue("fromEncoding" in msg) self.assertTrue("from_encoding" in msg) - self.assertEquals("utf8", soup.original_encoding) + self.assertEqual("utf8", soup.original_encoding) def test_unrecognized_keyword_argument(self): self.assertRaises( @@ -206,7 +209,7 @@ class TestUnicodeDammit(unittest.TestCase): b"<html><meta charset=euc-jp /></html>", b"<html><meta charset=euc-jp/></html>"): dammit = UnicodeDammit(data, is_html=True) - self.assertEquals( + self.assertEqual( "euc-jp", dammit.original_encoding) def test_last_ditch_entity_replacement(self): @@ -233,3 +236,30 @@ class TestUnicodeDammit(unittest.TestCase): msg = w[0].message self.assertTrue(isinstance(msg, UnicodeWarning)) self.assertTrue("Some characters could not be decoded" in str(msg)) + + +class TestNamedspacedAttribute(SoupTest): + + def test_name_may_be_none(self): + a = NamespacedAttribute("xmlns", None) + self.assertEqual(a, "xmlns") + + def test_attribute_is_equivalent_to_colon_separated_string(self): + a = NamespacedAttribute("a", "b") + self.assertEqual("a:b", a) + + def test_attributes_are_equivalent_if_prefix_and_name_identical(self): + a = NamespacedAttribute("a", "b", "c") + b = NamespacedAttribute("a", "b", "c") + self.assertEqual(a, b) + + # The actual namespace is not considered. + c = NamespacedAttribute("a", "b", None) + self.assertEqual(a, c) + + # But name and prefix are important. + d = NamespacedAttribute("a", "z", "c") + self.assertNotEqual(a, d) + + e = NamespacedAttribute("z", "b", "c") + self.assertNotEqual(a, e) diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index 6aa02cb..c75b561 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -18,7 +18,13 @@ from bs4.builder import ( builder_registry, HTMLParserTreeBuilder, ) -from bs4.element import CData, NavigableString, SoupStrainer, Tag +from bs4.element import ( + CData, + Doctype, + NavigableString, + SoupStrainer, + Tag, +) from bs4.testing import ( SoupTest, skipIf, @@ -97,8 +103,8 @@ class TestFindAllBasicNamespaces(TreeTest): def test_find_by_namespaced_name(self): soup = self.soup('<mathml:msqrt>4</mathml:msqrt><a svg:fill="red">') - self.assertEquals("4", soup.find("mathml:msqrt").string) - self.assertEquals("a", soup.find(attrs= { "svg:fill" : "red" }).name) + self.assertEqual("4", soup.find("mathml:msqrt").string) + self.assertEqual("a", soup.find(attrs= { "svg:fill" : "red" }).name) class TestFindAllByName(TreeTest): @@ -1277,3 +1283,12 @@ class TestNavigableStringSubclasses(SoupTest): self.assertEqual(str(soup), "<![CDATA[foo]]>") self.assertEqual(soup.find(text="foo"), "foo") self.assertEqual(soup.contents[0], "foo") + + def test_doctype_ends_in_newline(self): + # Unlike other NavigableString subclasses, a DOCTYPE always ends + # in a newline. + doctype = Doctype("foo") + soup = self.soup("") + soup.insert(1, doctype) + self.assertEqual(soup.encode(), b"<!DOCTYPE foo>\n") + @@ -7,7 +7,7 @@ except ImportError: from distutils.command.build_py import build_py setup(name="beautifulsoup4", - version = "4.0.0b7", + version = "4.0.0b8", author="Leonard Richardson", author_email='leonardr@segfault.org', url="http://www.crummy.com/software/BeautifulSoup/bs4/", |