diff options
Diffstat (limited to 'bs4')
-rw-r--r-- | bs4/__init__.py | 12 | ||||
-rw-r--r-- | bs4/builder/_html5lib.py | 12 | ||||
-rw-r--r-- | bs4/builder/_htmlparser.py | 3 | ||||
-rw-r--r-- | bs4/builder/_lxml.py | 53 | ||||
-rw-r--r-- | bs4/element.py | 33 | ||||
-rw-r--r-- | bs4/testing.py | 15 | ||||
-rw-r--r-- | bs4/tests/test_htmlparser.py | 1 | ||||
-rw-r--r-- | bs4/tests/test_tree.py | 9 |
8 files changed, 112 insertions, 26 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py index 13dac85..2dd0521 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -193,9 +193,9 @@ class BeautifulSoup(Tag): self.tagStack = [] self.pushTag(self) - def new_tag(self, name, **attrs): + def new_tag(self, name, namespace=None, nsprefix=None, **attrs): """Create a new tag associated with this soup.""" - return Tag(None, self.builder, name, attrs) + return Tag(None, self.builder, name, namespace, nsprefix, attrs) def new_string(self, s): """Create a new NavigableString associated with this soup.""" @@ -272,7 +272,7 @@ class BeautifulSoup(Tag): mostRecentTag = self.popTag() return mostRecentTag - def handle_starttag(self, name, attrs): + def handle_starttag(self, name, namespace, nsprefix, attrs): """Push a start tag on to the stack. If this method returns None, the tag was rejected by the @@ -281,7 +281,7 @@ class BeautifulSoup(Tag): don't call handle_endtag. """ - #print "Start tag %s: %s" % (name, attrs) + # print "Start tag %s: %s" % (name, attrs) self.endData() if (self.parse_only and len(self.tagStack) <= 1 @@ -289,8 +289,8 @@ class BeautifulSoup(Tag): or not self.parse_only.search_tag(name, attrs))): return None - tag = Tag(self, self.builder, name, attrs, self.currentTag, - self.previous_element) + tag = Tag(self, self.builder, name, namespace, nsprefix, attrs, + self.currentTag, self.previous_element) if tag is None: return tag if self.previous_element: diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py index 0d7a1a9..7ce69aa 100644 --- a/bs4/builder/_html5lib.py +++ b/bs4/builder/_html5lib.py @@ -8,6 +8,7 @@ from bs4.builder import ( HTML_5, HTMLTreeBuilder, ) +from bs4.element import NamespacedAttribute import html5lib from html5lib.constants import ( DataLossWarning, @@ -58,9 +59,6 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder): def __init__(self, soup, namespaceHTMLElements): self.soup = soup - if namespaceHTMLElements: - warnings.warn("namespaceHTMLElements not supported yet", - DataLossWarning) super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements) def documentClass(self): @@ -76,9 +74,7 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder): self.soup.object_was_parsed(doctype) def elementClass(self, name, namespace): - if namespace is not None: - warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning) - tag = self.soup.new_tag(name) + tag = self.soup.new_tag(name, namespace) return Element(tag, self.soup, namespace) def commentClass(self, data): @@ -144,6 +140,8 @@ class Element(html5lib.treebuilders._base.Node): def setAttributes(self, attributes): if attributes is not None and attributes != {}: for name, value in list(attributes.items()): + if isinstance(name, tuple): + name = NamespacedAttribute(*name) self.element[name] = value # The attributes may contain variables that need substitution. # Call set_up_substitutions manually. @@ -189,7 +187,7 @@ class Element(html5lib.treebuilders._base.Node): TextNode(child, self.soup)) def cloneNode(self): - tag = self.soup.new_tag(self.element.name) + tag = self.soup.new_tag(self.element.name, self.namespace) node = Element(tag, self.soup, self.namespace) for key,value in self.attributes: node.attributes[key] = value diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py index 62473cf..c307ff8 100644 --- a/bs4/builder/_htmlparser.py +++ b/bs4/builder/_htmlparser.py @@ -40,7 +40,8 @@ HTMLPARSER = 'html.parser' class BeautifulSoupHTMLParser(HTMLParser): def handle_starttag(self, name, attrs): - self.soup.handle_starttag(name, dict(attrs)) + # XXX namespace + self.soup.handle_starttag(name, None, None, dict(attrs)) def handle_endtag(self, name): self.soup.handle_endtag(name) diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py index cc3cb86..870d59e 100644 --- a/bs4/builder/_lxml.py +++ b/bs4/builder/_lxml.py @@ -5,7 +5,7 @@ __all__ = [ import collections from lxml import etree -from bs4.element import Comment, Doctype +from bs4.element import Comment, Doctype, NamespacedAttribute from bs4.builder import ( FAST, HTML, @@ -42,6 +42,15 @@ class LXMLTreeBuilderForXML(TreeBuilder): parser = parser(target=self, strip_cdata=False) self.parser = parser self.soup = None + self.nsmaps = None + + def _getNsTag(self, tag): + # Split the namespace URL out of a fully-qualified lxml tag + # name. Copied from lxml's src/lxml/sax.py. + if tag[0] == '{': + return tuple(tag[1:].split('}', 1)) + else: + return (None, tag) def prepare_markup(self, markup, user_specified_encoding=None, document_declared_encoding=None): @@ -63,15 +72,49 @@ class LXMLTreeBuilderForXML(TreeBuilder): self.parser.close() def close(self): - pass - - def start(self, name, attrs): - self.soup.handle_starttag(name, attrs) + self.nsmaps = None + + def start(self, name, attrs, nsmap={}): + nsprefix = None + # Invert each namespace map as it comes in. + if len(nsmap) == 0 and self.nsmaps != None: + # There are no new namespaces for this tag, but namespaces + # are in play, so we need a separate tag stack to know + # when they end. + self.nsmaps.append(None) + elif len(nsmap) > 0: + # A new namespace mapping has come into play. + if self.nsmaps is None: + self.nsmaps = [] + inverted_nsmap = dict((value, key) for key, value in nsmap.items()) + self.nsmaps.append(inverted_nsmap) + # Also treat the namespace mapping as a set of attributes on the + # tag, so we can recreate it later. + attrs = attrs.copy() + for prefix, namespace in nsmap.items(): + attribute = NamespacedAttribute( + "xmlns", prefix, "http://www.w3.org/2000/xmlns/") + attrs[attribute] = namespace + namespace, name = self._getNsTag(name) + if namespace is not None: + for inverted_nsmap in reversed(self.nsmaps): + if inverted_nsmap is not None and namespace in inverted_nsmap: + nsprefix = inverted_nsmap[namespace] + break + self.soup.handle_starttag(name, namespace, nsprefix, attrs) def end(self, name): self.soup.endData() completed_tag = self.soup.tagStack[-1] self.soup.handle_endtag(name) + if self.nsmaps != None: + # This tag, or one of its parents, introduced a namespace + # mapping, so pop it off the stack. + self.nsmaps.pop() + if len(self.nsmaps) == 0: + # Namespaces are no longer in play, so don't bother keeping + # track of the namespace stack. + self.nsmaps = None def pi(self, target, data): pass diff --git a/bs4/element.py b/bs4/element.py index 997378a..fdb90e0 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -22,6 +22,20 @@ def _alias(attr): return alias +class NamespacedAttribute(object): + + def __init__(self, namespace_abbreviation, name, namespace): + self.namespace_abbreviation = namespace_abbreviation + self.name = name + self.namespace = namespace + + def __str__(self): + name = self.name + if self.namespace_abbreviation: + name = self.namespace_abbreviation + ":" + name + return name + + class PageElement(object): """Contains the navigational information for some part of the page (either a tag or a piece of text)""" @@ -507,8 +521,8 @@ class Tag(PageElement): """Represents a found HTML tag with its attributes and contents.""" - def __init__(self, parser=None, builder=None, name=None, attrs=None, - parent=None, previous=None): + def __init__(self, parser=None, builder=None, name=None, namespace=None, + nsprefix=None, attrs=None, parent=None, previous=None): "Basic constructor." if parser is None: @@ -520,6 +534,8 @@ class Tag(PageElement): if name is None: raise ValueError("No value provided for new tag's name.") self.name = name + self.namespace = namespace + self.nsprefix = nsprefix if attrs is None: attrs = {} else: @@ -779,7 +795,7 @@ class Tag(PageElement): and '%SOUP-ENCODING%' in val): val = self.substitute_encoding(val, eventual_encoding) - decoded = (key + '=' + decoded = (str(key) + '=' + EntitySubstitution.substitute_xml(val, True)) attrs.append(decoded) close = '' @@ -789,6 +805,10 @@ class Tag(PageElement): else: closeTag = '</%s>' % self.name + prefix = '' + if self.nsprefix: + prefix = self.nsprefix + ":" + pretty_print = (indent_level is not None) if pretty_print: space = (' ' * (indent_level - 1)) @@ -809,7 +829,8 @@ class Tag(PageElement): attribute_string = ' ' + ' '.join(attrs) if pretty_print: s.append(space) - s.append('<%s%s%s>' % (self.name, attribute_string, close)) + s.append('<%s%s%s%s>' % ( + prefix, self.name, attribute_string, close)) if pretty_print: s.append("\n") s.append(contents) @@ -986,7 +1007,7 @@ class SoupStrainer(object): searchTag = search_tag def search(self, markup): - #print 'looking for %s in %s' % (self, markup) + # print 'looking for %s in %s' % (self, markup) found = None # If given a list of items, scan it for a text element that # matches. @@ -1012,7 +1033,7 @@ class SoupStrainer(object): return found def _matches(self, markup, match_against): - #print "Matching %s against %s" % (markup, match_against) + # print "Matching %s against %s" % (markup, match_against) result = False if isinstance(markup, list) or isinstance(markup, tuple): diff --git a/bs4/testing.py b/bs4/testing.py index cc30e17..dc20812 100644 --- a/bs4/testing.py +++ b/bs4/testing.py @@ -198,6 +198,21 @@ class HTMLTreeBuilderSmokeTest(object): self.assertSoupEquals("�", expect) self.assertSoupEquals("�", expect) + def test_basic_namespaces(self): + """Parsers don't need to *understand* namespaces, but at the + very least they should not choke on namespaces or lose + data.""" + + markup = b'<html xmlns="http://www.w3.org/1999/xhtml" xmlns:mathml="http://www.w3.org/1998/Math/MathML" xmlns:svg="http://www.w3.org/2000/svg"><head></head><body><mathml:msqrt>4</mathml:msqrt><b svg:fill="red"></b></body></html>' + soup = self.soup(markup) + self.assertEquals(markup, soup.encode()) + html = soup.html + self.assertEquals('http://www.w3.org/1999/xhtml', soup.html['xmlns']) + self.assertEquals( + 'http://www.w3.org/1998/Math/MathML', soup.html['xmlns:mathml']) + self.assertEquals( + 'http://www.w3.org/2000/svg', soup.html['xmlns:svg']) + # # Generally speaking, tests below this point are more tests of # Beautiful Soup than tests of the tree builders. But parsers are diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py index bcb5ed2..6215185 100644 --- a/bs4/tests/test_htmlparser.py +++ b/bs4/tests/test_htmlparser.py @@ -17,3 +17,4 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): def test_namespaced_public_doctype(self): # html.parser can't handle namespaced doctypes, so skip this one. pass + diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index f39826a..6aa02cb 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -63,7 +63,6 @@ class TestFind(TreeTest): soup = self.soup(u'<h1>Räksmörgås</h1>') self.assertEqual(soup.find(text=u'Räksmörgås'), u'Räksmörgås') - class TestFindAll(TreeTest): """Basic tests of the find_all() method.""" @@ -94,6 +93,14 @@ class TestFindAll(TreeTest): self.assertSelects( soup.find_all('a', limit=0), ["1", "2", "3", "4", "5"]) +class TestFindAllBasicNamespaces(TreeTest): + + def test_find_by_namespaced_name(self): + soup = self.soup('<mathml:msqrt>4</mathml:msqrt><a svg:fill="red">') + self.assertEquals("4", soup.find("mathml:msqrt").string) + self.assertEquals("a", soup.find(attrs= { "svg:fill" : "red" }).name) + + class TestFindAllByName(TreeTest): """Test ways of finding tags by tag name.""" |