summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--README.txt11
-rw-r--r--bs4/builder/_html5lib.py233
-rw-r--r--bs4/builder/_lxml.py108
-rw-r--r--bs4/element.py855
-rw-r--r--bs4/testing.py37
-rw-r--r--tests/test_builder_registry.py4
-rw-r--r--tests/test_html5lib.py4
-rw-r--r--tests/test_lxml.py8
-rw-r--r--tests/test_soup.py6
-rw-r--r--tests/test_tree.py8
10 files changed, 1256 insertions, 18 deletions
diff --git a/README.txt b/README.txt
index ff83212..6e789c2 100644
--- a/README.txt
+++ b/README.txt
@@ -1,7 +1,12 @@
-Introduction
-============
+= About Beautiful Soup 4 =
- >>> from beautifulsoup import BeautifulSoup
+Earlier versions of Beautiful Soup included a custom HTML
+parser. Beautiful Soup 4 does not include a parser. You'll need to
+install either lxml or html5lib.
+
+= Introduction =
+
+ >>> from bs4 import BeautifulSoup
>>> soup = BeautifulSoup("<p>Some<b>bad<i>HTML")
>>> print soup.prettify()
<html>
diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py
new file mode 100644
index 0000000..d74c4b0
--- /dev/null
+++ b/bs4/builder/_html5lib.py
@@ -0,0 +1,233 @@
+__all__ = [
+ 'HTML5TreeBuilder',
+ ]
+
+from bs4.builder import (
+ PERMISSIVE,
+ HTML,
+ HTML_5,
+ HTMLTreeBuilder,
+ )
+import html5lib
+from html5lib.constants import DataLossWarning
+import warnings
+from bs4.element import (
+ Comment,
+ Doctype,
+ NavigableString,
+ Tag,
+ )
+
class HTML5TreeBuilder(HTMLTreeBuilder):
    """Tree builder that hands the actual parsing over to html5lib."""

    features = ['html5lib', PERMISSIVE, HTML_5, HTML]

    def prepare_markup(self, markup, user_specified_encoding):
        """Remember the caller's encoding and pass the markup through.

        :return: A 3-tuple whose first item is the untouched markup and
         whose other two items are always None.
        """
        # feed() passes this value to html5lib later on.
        self.user_specified_encoding = user_specified_encoding
        return (markup, None, None)

    # These methods are defined by Beautiful Soup.
    def feed(self, markup):
        """Parse `markup` with html5lib and record the detected encoding
        on the resulting document."""
        html5_parser = html5lib.HTMLParser(tree=self.create_treebuilder)
        document = html5_parser.parse(
            markup, encoding=self.user_specified_encoding)

        # html5lib sets charEncoding to UTF-8 if it gets Unicode input,
        # so the tokenizer's value is only used for non-Unicode markup.
        if not isinstance(markup, unicode):
            detected = html5_parser.tokenizer.stream.charEncoding[0]
        else:
            detected = None
        document.original_encoding = detected

    def create_treebuilder(self, namespaceHTMLElements):
        """Build, remember and return the html5lib-facing tree builder."""
        builder = TreeBuilderForHtml5lib(self.soup, namespaceHTMLElements)
        self.underlying_builder = builder
        return builder

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        template = u'<html><head></head><body>%s</body></html>'
        return template % fragment
+
+
class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
    """html5lib tree builder whose nodes wrap Beautiful Soup objects."""

    def __init__(self, soup, namespaceHTMLElements):
        """Remember the soup being built and initialise the html5lib base.

        A DataLossWarning is issued when `namespaceHTMLElements` is true,
        because namespaced elements are not supported yet.
        """
        self.soup = soup
        if namespaceHTMLElements:
            warnings.warn("namespaceHTMLElements not supported yet",
                          DataLossWarning)
        super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)

    def documentClass(self):
        """Reset the soup and return it wrapped as the document node."""
        self.soup.reset()
        return Element(self.soup, self.soup, None)

    def insertDoctype(self, token):
        """Turn a doctype token into a Doctype object in the soup."""
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]

        doctype = Doctype.for_name_and_ids(name, publicId, systemId)
        self.soup.object_was_parsed(doctype)

    def elementClass(self, name, namespace):
        """Create a new Tag called `name`, wrapped in an Element.

        A DataLossWarning is issued when `namespace` is not None.
        """
        if namespace is not None:
            warnings.warn(
                "BeautifulSoup cannot represent elements in any namespace",
                DataLossWarning)
        tag = Tag(self.soup, self.soup.builder, name)
        return Element(tag, self.soup, namespace)

    def commentClass(self, data):
        """Wrap `data` in a Comment inside a TextNode."""
        return TextNode(Comment(data), self.soup)

    def fragmentClass(self):
        """Replace the soup with an empty one that stands in for a
        document fragment, and return it wrapped in an Element."""
        # Imported here because this module never imports BeautifulSoup at
        # the top; without the import this method raised NameError.
        from bs4 import BeautifulSoup
        self.soup = BeautifulSoup("")
        self.soup.name = "[document_fragment]"
        return Element(self.soup, self.soup, None)

    def appendChild(self, node):
        """Append the object wrapped by `node` to the end of the soup."""
        self.soup.insert(len(self.soup.contents), node.element)

    def testSerializer(self, element):
        # NOTE(review): no module-level testSerializer function is defined
        # or imported in the visible part of this module, so this call
        # looks like it raises NameError -- confirm and supply one.
        return testSerializer(element)

    def getDocument(self):
        """Return the soup object that has been built."""
        return self.soup

    def getFragment(self):
        """Return the object wrapped by the base class's fragment node."""
        fragment = html5lib.treebuilders._base.TreeBuilder.getFragment(self)
        return fragment.element
+
class AttrList(object):
    """Dictionary-like view of a tag's attributes for html5lib.

    The attributes are copied into a private dict when the view is
    created. Reads are served from that copy; writes go to the wrapped
    element and to the copy, so the view agrees with what it just wrote.
    """

    def __init__(self, element):
        self.element = element
        # Snapshot of the element's attributes at construction time.
        self.attrs = dict(self.element.attrs)

    def __iter__(self):
        """Iterate over (name, value) pairs."""
        return iter(self.attrs.items())

    def __setitem__(self, name, value):
        """Set the attribute on the wrapped element and in the snapshot."""
        self.element[name] = value
        # Keep the snapshot in step; otherwise a later lookup through
        # this same view would not see the value that was just set.
        self.attrs[name] = value

    def items(self):
        return self.attrs.items()

    def keys(self):
        return self.attrs.keys()

    def __getitem__(self, name):
        return self.attrs[name]

    def __contains__(self, name):
        # Test the dict directly instead of building a list of keys.
        return name in self.attrs
+
+
class Element(html5lib.treebuilders._base.Node):
    """html5lib node that wraps a Beautiful Soup object.

    `element` is the wrapped object, `soup` the soup being built and
    `namespace` the namespace html5lib supplied for the element.
    """

    def __init__(self, element, soup, namespace):
        html5lib.treebuilders._base.Node.__init__(self, element.name)
        self.element = element
        self.soup = soup
        self.namespace = namespace

    def _nodeIndex(self, node, refNode):
        """Return the index of `refNode`'s object among this element's
        contents, or None when it is not there.

        `node` is accepted for the callers' convenience but not used.
        """
        # Finds a node by identity rather than equality
        for index, child in enumerate(self.element.contents):
            if child is refNode.element:
                return index
        return None

    def appendChild(self, node):
        """Append `node`, merging it into a trailing text node if both
        the new node and the current last child are plain strings."""
        contents = self.element.contents
        if (node.element.__class__ == NavigableString and contents
            and contents[-1].__class__ == NavigableString):
            # Concatenate new text onto old text node
            # (TODO: This has O(n^2) performance, for input like
            # "a</a>a</a>a</a>...")
            oldElement = contents[-1]
            newStr = NavigableString(oldElement + node.element)

            # Remove the old text node
            # (Can't simply use .extract() by itself, because it fails if
            # an equal text node exists within the parent node)
            del contents[-1]
            oldElement.parent = None
            oldElement.extract()

            self.element.insert(len(contents), newStr)
        else:
            self.element.insert(len(contents), node.element)
            node.parent = self

    def getAttributes(self):
        """Return a fresh AttrList view of the wrapped tag's attributes."""
        return AttrList(self.element)

    def setAttributes(self, attributes):
        """Copy every item of `attributes` onto the wrapped tag."""
        if attributes is not None and attributes != {}:
            for name, value in attributes.items():
                self.element[name] = value
            # The attributes may contain variables that need substitution.
            # Call set_up_substitutions manually.
            # The Tag constructor calls this method automatically,
            # but html5lib creates a Tag object before setting up
            # the attributes.
            self.element.contains_substitutions = (
                self.soup.builder.set_up_substitutions(
                    self.element))
    attributes = property(getAttributes, setAttributes)

    def insertText(self, data, insertBefore=None):
        """Insert `data` as text, before `insertBefore` when one is given
        and at the end otherwise."""
        text = TextNode(NavigableString(data), self.soup)
        if insertBefore:
            self.insertBefore(text, insertBefore)
        else:
            self.appendChild(text)

    def insertBefore(self, node, refNode):
        """Insert `node` immediately before `refNode`.

        Raises ValueError if `refNode` is not a child of this element.
        """
        index = self._nodeIndex(node, refNode)
        if index is None:
            raise ValueError("refNode is not a child of this node")
        contents = self.element.contents
        # Only look at the preceding sibling when there is one: with
        # index 0, contents[index-1] would be the *last* child.
        if (node.element.__class__ == NavigableString and index > 0
            and contents[index-1].__class__ == NavigableString):
            # Merge the two strings by hand instead of calling .extract()
            # on its own, which fails if an equal text node exists within
            # the parent node.
            oldNode = contents[index-1]
            newStr = NavigableString(oldNode + node.element)
            del contents[index-1]
            oldNode.parent = None
            oldNode.extract()

            self.element.insert(index-1, newStr)
        else:
            self.element.insert(index, node.element)
            node.parent = self

    def removeChild(self, node):
        """Detach `node` from its parent and from the soup."""
        index = self._nodeIndex(node.parent, node)
        del node.parent.element.contents[index]
        node.element.parent = None
        node.element.extract()
        node.parent = None

    def reparentChildren(self, newParent):
        """Move every child of this element to the end of `newParent`."""
        # This module does not import `namespaces` at the top, so bring it
        # into scope here; otherwise the lookup below raised NameError.
        from html5lib.constants import namespaces
        while self.element.contents:
            child = self.element.contents[0]
            child.extract()
            if isinstance(child, Tag):
                wrapper = Element(child, self.soup, namespaces["html"])
            else:
                wrapper = TextNode(child, self.soup)
            newParent.appendChild(wrapper)

    def cloneNode(self):
        """Return a new Element with the same name, namespace and
        attributes, and no children."""
        tag = Tag(self.soup, self.soup.builder, self.element.name)
        node = Element(tag, self.soup, self.namespace)
        for key, value in self.attributes:
            node.attributes[key] = value
        return node

    def hasContent(self):
        """Return the wrapped object's contents list, which is true only
        when it is non-empty."""
        return self.element.contents

    def getNameTuple(self):
        """Return (namespace, name), using the HTML namespace when this
        element has none."""
        # See reparentChildren for why this import is local.
        from html5lib.constants import namespaces
        if self.namespace is None:
            return namespaces["html"], self.name
        else:
            return self.namespace, self.name

    nameTuple = property(getNameTuple)
+
class TextNode(Element):
    """Element wrapper for a string-like object rather than a Tag.

    The html5lib node is initialised with a name of None, and no
    namespace attribute is set.
    """

    def __init__(self, element, soup):
        base_node = html5lib.treebuilders._base.Node
        base_node.__init__(self, None)
        self.soup = soup
        self.element = element

    def cloneNode(self):
        """Cloning a text node is not supported."""
        raise NotImplementedError
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
new file mode 100644
index 0000000..5c9bc57
--- /dev/null
+++ b/bs4/builder/_lxml.py
@@ -0,0 +1,108 @@
+__all__ = [
+ 'LXMLTreeBuilderForXML',
+ 'LXMLTreeBuilder',
+ ]
+
+from lxml import etree
+from bs4.element import Comment, Doctype
+from bs4.builder import (
+ FAST,
+ HTML,
+ HTMLTreeBuilder,
+ PERMISSIVE,
+ TreeBuilder,
+ XML)
+from bs4.dammit import UnicodeDammit
+import types
+
+LXML = 'lxml'
+
class LXMLTreeBuilderForXML(TreeBuilder):
    """Tree builder that uses lxml's XML parser.

    The builder passes itself to lxml as the parser target, so lxml calls
    the start/end/data/comment/doctype/pi/close methods defined here.
    """

    DEFAULT_PARSER_CLASS = etree.XMLParser

    is_xml = True

    # Well, it's permissive by XML parser standards.
    features = [LXML, XML, FAST, PERMISSIVE]

    @property
    def default_parser(self):
        # This can either return a parser object or a class, which
        # will be instantiated with default arguments.
        return etree.XMLParser(target=self, strip_cdata=False, recover=True)

    def __init__(self, parser=None, empty_element_tags=None):
        """Set up the lxml parser this builder will feed.

        :param parser: A parser object, or a callable that is invoked
         with this builder as its target. Defaults to `default_parser`.
        :param empty_element_tags: Optional iterable of tag names, stored
         as a set on the builder when given.
        """
        if empty_element_tags is not None:
            self.empty_element_tags = set(empty_element_tags)
        if parser is None:
            # Use the default parser.
            parser = self.default_parser
        if callable(parser):
            # Instantiate the parser with default arguments
            parser = parser(target=self, strip_cdata=False)
        self.parser = parser
        self.soup = None

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        """
        :return: A 3-tuple (markup, original encoding, encoding
        declared within markup).
        """
        # Unicode markup needs no conversion.
        if isinstance(markup, unicode):
            return markup, None, None

        try_encodings = [user_specified_encoding, document_declared_encoding]
        dammit = UnicodeDammit(markup, try_encodings, isHTML=True)
        return (dammit.markup, dammit.original_encoding,
                dammit.declared_html_encoding)

    def feed(self, markup):
        """Run the whole of `markup` through the parser and close it."""
        self.parser.feed(markup)
        self.parser.close()

    def close(self):
        """Parser-target callback; nothing to do."""
        pass

    def start(self, name, attrs):
        """Parser-target callback for an opening tag."""
        self.soup.handle_starttag(name, attrs)

    def end(self, name):
        """Parser-target callback for a closing tag."""
        # Flush any pending text before the tag is closed.
        self.soup.endData()
        self.soup.handle_endtag(name)

    def pi(self, target, data):
        """Parser-target callback; processing instructions are ignored."""
        pass

    def data(self, content):
        """Parser-target callback for character data."""
        self.soup.handle_data(content)

    def doctype(self, name, pubid, system):
        """Parser-target callback for a doctype declaration."""
        self.soup.endData()
        doctype = Doctype.for_name_and_ids(name, pubid, system)
        self.soup.object_was_parsed(doctype)

    def comment(self, content):
        "Handle comments as Comment objects."
        self.soup.endData()
        self.soup.handle_data(content)
        self.soup.endData(Comment)

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        # An XML declaration is closed by "?>", not by a bare ">".
        return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
+
+
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
    """Tree builder that uses lxml's HTML parser instead of its XML one."""

    is_xml = False
    features = [LXML, HTML, FAST]

    @property
    def default_parser(self):
        # The class itself is returned, not an instance.
        parser_class = etree.HTMLParser
        return parser_class

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        wrapper = u'<html><body>%s</body></html>'
        return wrapper % fragment
diff --git a/bs4/element.py b/bs4/element.py
new file mode 100644
index 0000000..315ee63
--- /dev/null
+++ b/bs4/element.py
@@ -0,0 +1,855 @@
+import re
+import types
+try:
+ from htmlentitydefs import name2codepoint
+except ImportError:
+ name2codepoint = {}
+from bs4.dammit import EntitySubstitution
+
+from util import isList
+
+DEFAULT_OUTPUT_ENCODING = "utf-8"
+
+
class PageElement(object):
    """Contains the navigational information for some part of the page
    (either a tag or a piece of text)"""

    @staticmethod
    def _index_by_identity(contents, element):
        """Return the position of `element` in `contents`, or None.

        Elements are compared with `is` rather than `==`, so a sibling
        that merely compares equal is never mistaken for `element`.
        """
        for index, candidate in enumerate(contents):
            if candidate is element:
                return index
        return None

    def setup(self, parent=None, previous=None):
        """Sets up the initial relations between this element and
        other elements."""
        self.parent = parent
        self.previous = previous
        self.next = None
        self.previousSibling = None
        self.nextSibling = None
        if self.parent is not None and self.parent.contents:
            self.previousSibling = self.parent.contents[-1]
            self.previousSibling.nextSibling = self

    def replaceWith(self, replaceWith):
        """Put `replaceWith` where this element is and remove this
        element from the tree.

        Raises ValueError if this element is not among its parent's
        contents.
        """
        oldParent = self.parent
        myIndex = self._index_by_identity(oldParent.contents, self)
        if myIndex is None:
            raise ValueError("element is not among its parent's contents")
        self.extract()
        # insert() itself compensates when the replacement is a sibling
        # that currently sits before the target position, so no index
        # adjustment is made here.
        oldParent.insert(myIndex, replaceWith)

    def extract(self):
        """Destructively rips this element out of the tree."""
        parent = self.parent
        if parent is not None:
            # Remove this very object, not the first sibling that
            # happens to compare equal to it.
            index = self._index_by_identity(parent.contents, self)
            if index is not None:
                del parent.contents[index]

        #Find the two elements that would be next to each other if
        #this element (and any children) hadn't been parsed. Connect
        #the two.
        lastChild = self._lastRecursiveChild()
        nextElement = lastChild.next

        if self.previous is not None:
            self.previous.next = nextElement
        if nextElement is not None:
            nextElement.previous = self.previous
        self.previous = None
        lastChild.next = None

        self.parent = None
        if self.previousSibling is not None:
            self.previousSibling.nextSibling = self.nextSibling
        if self.nextSibling is not None:
            self.nextSibling.previousSibling = self.previousSibling
        self.previousSibling = self.nextSibling = None
        return self

    def _lastRecursiveChild(self):
        "Finds the last element beneath this object to be parsed."
        lastChild = self
        while hasattr(lastChild, 'contents') and lastChild.contents:
            lastChild = lastChild.contents[-1]
        return lastChild

    def insert(self, position, newChild):
        """Insert `newChild` into this element's contents at `position`.

        A plain string is wrapped in a NavigableString first. A child
        that already has a parent is extracted from it before being
        inserted. `position` is capped at the current number of children.
        """
        if (isinstance(newChild, basestring)
            and not isinstance(newChild, NavigableString)):
            newChild = NavigableString(newChild)

        position = min(position, len(self.contents))
        if getattr(newChild, 'parent', None) is not None:
            # We're 'inserting' an element that's already one
            # of this object's children.
            if newChild.parent is self:
                index = self._index_by_identity(self.contents, newChild)
                if index is not None and index < position:
                    # Furthermore we're moving it further down the
                    # list of this object's children. That means that
                    # when we extract this element, our target index
                    # will jump down one.
                    position = position - 1
            newChild.extract()

        newChild.parent = self
        previousChild = None
        if position == 0:
            newChild.previousSibling = None
            newChild.previous = self
        else:
            previousChild = self.contents[position-1]
            newChild.previousSibling = previousChild
            newChild.previousSibling.nextSibling = newChild
            newChild.previous = previousChild._lastRecursiveChild()
        # Nodes are compared with None explicitly from here on: an empty
        # string node is falsy but still needs its links maintained.
        if newChild.previous is not None:
            newChild.previous.next = newChild

        newChildsLastElement = newChild._lastRecursiveChild()

        if position >= len(self.contents):
            newChild.nextSibling = None

            # Walk up the tree until some ancestor has a next sibling;
            # that sibling is what follows the new child.
            parent = self
            parentsNextSibling = None
            while parentsNextSibling is None:
                parentsNextSibling = parent.nextSibling
                parent = parent.parent
                if parent is None: # This is the last element in the document.
                    break
            newChildsLastElement.next = parentsNextSibling
        else:
            nextChild = self.contents[position]
            newChild.nextSibling = nextChild
            if newChild.nextSibling is not None:
                newChild.nextSibling.previousSibling = newChild
            newChildsLastElement.next = nextChild

        if newChildsLastElement.next is not None:
            newChildsLastElement.next.previous = newChildsLastElement
        self.contents.insert(position, newChild)

    def append(self, tag):
        """Appends the given tag to the contents of this tag."""
        self.insert(len(self.contents), tag)

    def find_next(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears after this Tag in the document."""
        return self._findOne(self.find_all_next, name, attrs, text, **kwargs)
    findNext = find_next  # BS3

    def find_all_next(self, name=None, attrs={}, text=None, limit=None,
                      **kwargs):
        """Returns all items that match the given criteria and appear
        after this Tag in the document."""
        return self._find_all(name, attrs, text, limit, self.next_elements,
                              **kwargs)
    findAllNext = find_all_next  # BS3

    def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears after this Tag in the document."""
        return self._findOne(self.find_next_siblings, name, attrs, text,
                             **kwargs)
    findNextSibling = find_next_sibling  # BS3

    def find_next_siblings(self, name=None, attrs={}, text=None, limit=None,
                           **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear after this Tag in the document."""
        return self._find_all(name, attrs, text, limit,
                              self.next_siblings, **kwargs)
    findNextSiblings = find_next_siblings   # BS3
    fetchNextSiblings = find_next_siblings  # BS2

    def find_previous(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears before this Tag in the document."""
        return self._findOne(
            self.find_all_previous, name, attrs, text, **kwargs)
    findPrevious = find_previous  # BS3

    def find_all_previous(self, name=None, attrs={}, text=None, limit=None,
                          **kwargs):
        """Returns all items that match the given criteria and appear
        before this Tag in the document."""
        return self._find_all(name, attrs, text, limit, self.previous_elements,
                              **kwargs)
    findAllPrevious = find_all_previous  # BS3
    fetchPrevious = find_all_previous    # BS2

    def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears before this Tag in the document."""
        return self._findOne(self.find_previous_siblings, name, attrs, text,
                             **kwargs)
    findPreviousSibling = find_previous_sibling  # BS3

    def find_previous_siblings(self, name=None, attrs={}, text=None,
                               limit=None, **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear before this Tag in the document."""
        return self._find_all(name, attrs, text, limit,
                              self.previous_siblings, **kwargs)
    findPreviousSiblings = find_previous_siblings   # BS3
    fetchPreviousSiblings = find_previous_siblings  # BS2

    def find_parent(self, name=None, attrs={}, **kwargs):
        """Returns the closest parent of this Tag that matches the given
        criteria."""
        # NOTE: We can't use _findOne because findParents takes a different
        # set of arguments.
        r = None
        # Keyword criteria are forwarded too; they used to be dropped.
        l = self.find_parents(name, attrs, 1, **kwargs)
        if l:
            r = l[0]
        return r
    findParent = find_parent  # BS3

    def find_parents(self, name=None, attrs={}, limit=None, **kwargs):
        """Returns the parents of this Tag that match the given
        criteria."""

        return self._find_all(name, attrs, None, limit, self.parents,
                              **kwargs)
    findParents = find_parents   # BS3
    fetchParents = find_parents  # BS2

    #These methods do the real heavy lifting.

    def _findOne(self, method, name, attrs, text, **kwargs):
        """Call `method` with a limit of 1 and return its first result,
        or None when it found nothing."""
        r = None
        l = method(name, attrs, text, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def _find_all(self, name, attrs, text, limit, generator, **kwargs):
        "Iterates over a generator looking for things that match."

        if isinstance(name, SoupStrainer):
            strainer = name
        else:
            # Build a SoupStrainer
            strainer = SoupStrainer(name, attrs, text, **kwargs)
        results = ResultSet(strainer)
        # A plain for loop consumes the generator without relying on the
        # Python 2-only .next() method.
        for i in generator:
            if i:
                found = strainer.search(i)
                if found:
                    results.append(found)
                    if limit and len(results) >= limit:
                        break
        return results

    #These generators can be used to navigate starting from both
    #NavigableStrings and Tags.
    #
    #Each one stops when it reaches None, and never yields None itself.
    #Elements are compared with None rather than tested for truth so
    #that an empty string does not end the walk early.
    @property
    def next_elements(self):
        i = self.next
        while i is not None:
            yield i
            i = i.next

    @property
    def next_siblings(self):
        i = self.nextSibling
        while i is not None:
            yield i
            i = i.nextSibling

    @property
    def previous_elements(self):
        i = self.previous
        while i is not None:
            yield i
            i = i.previous

    @property
    def previous_siblings(self):
        i = self.previousSibling
        while i is not None:
            yield i
            i = i.previousSibling

    @property
    def parents(self):
        i = self.parent
        while i is not None:
            yield i
            i = i.parent

    # Old non-property versions of the generators, for backwards
    # compatibility with BS3.
    def nextGenerator(self):
        return self.next_elements

    def nextSiblingGenerator(self):
        return self.next_siblings

    def previousGenerator(self):
        return self.previous_elements

    def previousSiblingGenerator(self):
        return self.previous_siblings

    def parentGenerator(self):
        return self.parents

    # Utility methods
    def substituteEncoding(self, str, encoding=None):
        """Replace every "%SOUP-ENCODING%" marker in `str` with
        `encoding`, or with "utf-8" when no encoding is given."""
        encoding = encoding or "utf-8"
        return str.replace("%SOUP-ENCODING%", encoding)

    def toEncoding(self, s, encoding=None):
        """Encodes an object to a string in some encoding, or to Unicode.

        With an encoding, the result is `s` (or `str(s)` for objects that
        are not strings) encoded to it. Without one, a Unicode string is
        returned unchanged and anything else is passed to `unicode`.
        """
        if isinstance(s, unicode):
            if encoding:
                s = s.encode(encoding)
        elif isinstance(s, str):
            if encoding:
                s = s.encode(encoding)
            else:
                s = unicode(s)
        else:
            if encoding:
                s = self.toEncoding(str(s), encoding)
            else:
                s = unicode(s)
        return s
+
class NavigableString(unicode, PageElement):
    """A Unicode string that is also part of the parse tree.

    PREFIX and SUFFIX are placed around the text by `output_ready`;
    subclasses override them to add markup such as comment delimiters.
    """

    PREFIX = ''
    SUFFIX = ''

    def __new__(cls, value):
        """Create a new NavigableString.

        When unpickling a NavigableString, this method is called with
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
        passed in to the superclass's __new__ or the superclass won't know
        how to handle non-ASCII characters.
        """
        if isinstance(value, unicode):
            return unicode.__new__(cls, value)
        return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)

    def __getnewargs__(self):
        """Return the constructor arguments used when unpickling."""
        return (unicode(self),)

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper."""
        if attr == 'string':
            return self
        # The call form of raise replaces the old "raise E, msg" statement.
        raise AttributeError(
            "'%s' object has no attribute '%s'"
            % (self.__class__.__name__, attr))

    def output_ready(self, substitute_html_entities=False):
        """Return the text wrapped in PREFIX and SUFFIX.

        :param substitute_html_entities: When true, the text is first run
         through EntitySubstitution.substitute_html.
        """
        if substitute_html_entities:
            output = EntitySubstitution.substitute_html(self)
        else:
            output = self
        return self.PREFIX + output + self.SUFFIX
+
+
class CData(NavigableString):
    """A string that is output between CDATA section delimiters."""

    PREFIX, SUFFIX = u'<![CDATA[', u']]>'
+
+
class ProcessingInstruction(NavigableString):
    """A string that is output between "<?" and "?>"."""

    PREFIX, SUFFIX = u'<?', u'?>'
+
+
class Comment(NavigableString):
    """A string that is output between comment delimiters."""

    PREFIX, SUFFIX = u'<!--', u'-->'
+
class Declaration(NavigableString):
    """A string that is output between "<!" and "!>"."""

    # NOTE(review): the "!>" suffix looks unusual for a declaration --
    # confirm it is intended.
    PREFIX, SUFFIX = u'<!', u'!>'
+
+
class Doctype(NavigableString):
    """A string that is output as a <!DOCTYPE ...> declaration."""

    @classmethod
    def for_name_and_ids(cls, name, pub_id, system_id):
        """Build a doctype from a name and optional identifiers.

        :param name: The doctype name; None is treated as empty.
        :param pub_id: Public identifier, or None.
        :param system_id: System identifier, or None.
        :return: An instance of the class the method was called on.
        """
        value = name or ''
        if pub_id is not None:
            value += ' PUBLIC "%s"' % pub_id
            # After a public identifier the system identifier follows
            # directly, without a SYSTEM keyword.
            if system_id is not None:
                value += ' "%s"' % system_id
        elif system_id is not None:
            value += ' SYSTEM "%s"' % system_id

        return cls(value)

    PREFIX = u'<!DOCTYPE '
    SUFFIX = u'>'
+
+
+class Tag(PageElement):
+
+ """Represents a found HTML tag with its attributes and contents."""
+
+ def __init__(self, parser, builder, name, attrs=None, parent=None,
+ previous=None):
+ "Basic constructor."
+
+ # We don't actually store the parser object: that lets extracted
+ # chunks be garbage-collected.
+ self.parserClass = parser.__class__
+ self.name = name
+ if attrs == None:
+ attrs = {}
+ else:
+ attrs = dict(attrs)
+ self.attrs = attrs
+ self.contents = []
+ self.setup(parent, previous)
+ self.hidden = False
+
+ # Set up any substitutions, such as the charset in a META tag.
+ self.contains_substitutions = builder.set_up_substitutions(self)
+
+ self.can_be_empty_element = builder.can_be_empty_element(name)
+
+ @property
+ def is_empty_element(self):
+ """Is this tag an empty-element tag? (aka a self-closing tag)
+
+ A tag that has contents is never an empty-element tag.
+
+ A tag that has no contents may or may not be an empty-element
+ tag. It depends on the builder used to create the tag. If the
+ builder has a designated list of empty-element tags, then only
+ a tag whose name shows up in that list is considered an
+ empty-element tag.
+
+ If the builder has no designated list of empty-element tags,
+ then any tag with no contents is an empty-element tag.
+ """
+ return len(self.contents) == 0 and self.can_be_empty_element
+ isSelfClosing = is_empty_element # BS3
+
+
+ @property
+ def string(self):
+ """Convenience property to get the single string within this tag.
+
+ :Return: If this tag has a single string child, return value
+ is that string. If this tag has no children, or more than one
+ child, return value is None. If this tag has one child tag,
+ return value is the 'string' attribute of the child tag,
+ recursively.
+ """
+ if len(self.contents) != 1:
+ return None
+ child = self.contents[0]
+ if isinstance(child, NavigableString):
+ return child
+ return child.string
+
+ def get(self, key, default=None):
+ """Returns the value of the 'key' attribute for the tag, or
+ the value given for 'default' if it doesn't have that
+ attribute."""
+ return self.attrs.get(key, default)
+
+ def has_key(self, key):
+ return self.attrs.has_key(key)
+
+ def __getitem__(self, key):
+ """tag[key] returns the value of the 'key' attribute for the tag,
+ and throws an exception if it's not there."""
+ return self.attrs[key]
+
+ def __iter__(self):
+ "Iterating over a tag iterates over its contents."
+ return iter(self.contents)
+
+ def __len__(self):
+ "The length of a tag is the length of its list of contents."
+ return len(self.contents)
+
+ def __contains__(self, x):
+ return x in self.contents
+
+ def __nonzero__(self):
+ "A tag is non-None even if it has no contents."
+ return True
+
+ def __setitem__(self, key, value):
+ """Setting tag[key] sets the value of the 'key' attribute for the
+ tag."""
+ self.attrs[key] = value
+
+ def __delitem__(self, key):
+ "Deleting tag[key] deletes all 'key' attributes for the tag."
+ if self.attrs.has_key(key):
+ del self.attrs[key]
+
+ def __call__(self, *args, **kwargs):
+ """Calling a tag like a function is the same as calling its
+ find_all() method. Eg. tag('a') returns a list of all the A tags
+ found within this tag."""
+ return apply(self.find_all, args, kwargs)
+
+ def __getattr__(self, tag):
+ #print "Getattr %s.%s" % (self.__class__, tag)
+ if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
+ return self.find(tag[:-3])
+ elif tag.find('__') != 0:
+ return self.find(tag)
+ raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag)
+
+ def __eq__(self, other):
+ """Returns true iff this tag has the same name, the same attributes,
+ and the same contents (recursively) as the given tag.
+
+ XXX: right now this will return false if two tags have the
+ same attributes in a different order. Should this be fixed?"""
+ if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
+ return False
+ for i in range(0, len(self.contents)):
+ if self.contents[i] != other.contents[i]:
+ return False
+ return True
+
+ def __ne__(self, other):
+ """Returns true iff this tag is not identical to the other tag,
+ as defined in __eq__."""
+ return not self == other
+
+ def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
+ """Renders this tag as a string."""
+ return self.encode(encoding)
+
+ def __unicode__(self):
+ return self.decode()
+
+ def __str__(self):
+ return self.encode()
+
+ def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
+ indent_level=None, substitute_html_entities=False):
+ return self.decode(indent_level, encoding,
+ substitute_html_entities).encode(encoding)
+
+ def decode(self, indent_level=None,
+ eventual_encoding=DEFAULT_OUTPUT_ENCODING,
+ substitute_html_entities=False):
+ """Returns a Unicode representation of this tag and its contents.
+
+ :param eventual_encoding: The tag is destined to be
+ encoded into this encoding. This method is _not_
+ responsible for performing that encoding. This information
+ is passed in so that it can be substituted in if the
+ document contains a <META> tag that mentions the document's
+ encoding.
+ """
+ attrs = []
+ if self.attrs:
+ for key, val in sorted(self.attrs.items()):
+ if val is None:
+ decoded = key
+ else:
+ if not isinstance(val, basestring):
+ val = str(val)
+ if (self.contains_substitutions
+ and eventual_encoding is not None
+ and '%SOUP-ENCODING%' in val):
+ val = self.substituteEncoding(val, eventual_encoding)
+
+ decoded = (key + '='
+ + EntitySubstitution.substitute_xml(val, True))
+ attrs.append(decoded)
+ close = ''
+ closeTag = ''
+ if self.is_empty_element:
+ close = ' /'
+ else:
+ closeTag = '</%s>' % self.name
+
+ pretty_print = (indent_level is not None)
+ if pretty_print:
+ space = (' ' * (indent_level-1))
+ indent_contents = indent_level + 1
+ else:
+ space = ''
+ indent_contents = None
+ contents = self.decode_contents(
+ indent_contents, eventual_encoding, substitute_html_entities)
+
+ if self.hidden:
+ # This is the 'document root' object.
+ s = contents
+ else:
+ s = []
+ attributeString = ''
+ if attrs:
+ attributeString = ' ' + ' '.join(attrs)
+ if pretty_print:
+ s.append(space)
+ s.append('<%s%s%s>' % (self.name, attributeString, close))
+ if pretty_print:
+ s.append("\n")
+ s.append(contents)
+ if pretty_print and contents and contents[-1] != "\n":
+ s.append("\n")
+ if pretty_print and closeTag:
+ s.append(space)
+ s.append(closeTag)
+ if pretty_print and closeTag and self.nextSibling:
+ s.append("\n")
+ s = ''.join(s)
+ return s
+
+ def decompose(self):
+ """Recursively destroys the contents of this tree."""
+ contents = [i for i in self.contents]
+ for i in contents:
+ if isinstance(i, Tag):
+ i.decompose()
+ else:
+ i.extract()
+ self.extract()
+
+ def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
+ return self.encode(encoding, True)
+
+ def decode_contents(self, indent_level=None,
+ eventual_encoding=DEFAULT_OUTPUT_ENCODING,
+ substitute_html_entities=False):
+ """Renders the contents of this tag as a Unicode string.
+
+ :param eventual_encoding: The tag is destined to be
+ encoded into this encoding. This method is _not_
+ responsible for performing that encoding. This information
+ is passed in so that it can be substituted in if the
+ document contains a <META> tag that mentions the document's
+ encoding.
+ """
+ pretty_print = (indent_level is not None)
+ s=[]
+ for c in self:
+ text = None
+ if isinstance(c, NavigableString):
+ text = c.output_ready(substitute_html_entities)
+ elif isinstance(c, Tag):
+ s.append(c.decode(indent_level, eventual_encoding,
+ substitute_html_entities))
+ if text and indent_level:
+ text = text.strip()
+ if text:
+ if pretty_print:
+ s.append(" " * (indent_level-1))
+ s.append(text)
+ if pretty_print:
+ s.append("\n")
+ return ''.join(s)
+
+ #Soup methods
+
+ def find(self, name=None, attrs={}, recursive=True, text=None,
+ **kwargs):
+ """Return only the first child of this Tag matching the given
+ criteria."""
+ r = None
+ l = self.find_all(name, attrs, recursive, text, 1, **kwargs)
+ if l:
+ r = l[0]
+ return r
+ findChild = find
+
+ def find_all(self, name=None, attrs={}, recursive=True, text=None,
+ limit=None, **kwargs):
+ """Extracts a list of Tag objects that match the given
+ criteria. You can specify the name of the Tag and any
+ attributes you want the Tag to have.
+
+ The value of a key-value pair in the 'attrs' map can be a
+ string, a list of strings, a regular expression object, or a
+ callable that takes a string and returns whether or not the
+ string matches for some custom definition of 'matches'. The
+ same is true of the tag name."""
+ generator = self.recursive_children
+ if not recursive:
+ generator = self.children
+ return self._find_all(name, attrs, text, limit, generator, **kwargs)
+ findAll = find_all # BS3
+ findChildren = find_all # BS2
+
+ #Generator methods
+    @property
+    def children(self):
+        """Generator over the entries of self.contents, in order."""
+        # Iterate the list directly and let the generator finish by
+        # falling off the end. An explicit `raise StopIteration` inside
+        # a generator is redundant, and is turned into a RuntimeError
+        # under PEP 479. Direct iteration also avoids an IndexError when
+        # self.contents shrinks while the caller is looping.
+        for child in self.contents:
+            yield child
+
+    @property
+    def recursive_children(self):
+        """Generator that starts at self.contents[0] and follows the
+        .next links, stopping just before the node that comes after
+        self._lastRecursiveChild().
+        """
+        if not len(self.contents):
+            # Nothing to walk. A bare return ends the generator cleanly;
+            # `raise StopIteration` here would become a RuntimeError
+            # under PEP 479.
+            return
+        stopNode = self._lastRecursiveChild().next
+        current = self.contents[0]
+        while current is not stopNode:
+            yield current
+            current = current.next
+
+ # Old names for backwards compatibility
+    def childGenerator(self):
+        """Old method-style name for the `children` property."""
+        direct_children = self.children
+        return direct_children
+
+    def recursiveChildGenerator(self):
+        """Old method-style name for the `recursive_children` property."""
+        all_descendants = self.recursive_children
+        return all_descendants
+
+
+# Next, a couple classes to represent queries and their results.
+class SoupStrainer(object):
+    """Encapsulates a number of ways of matching a markup element (tag or
+    text)."""
+
+    def __init__(self, name=None, attrs={}, text=None, **kwargs):
+        """Store the criteria to match against.
+
+        :param name: Criterion for the tag name.
+        :param attrs: Map of attribute name to criterion. A plain
+         string is treated as a criterion for the 'class' attribute.
+        :param text: Criterion for text elements.
+        :param kwargs: Extra attribute criteria, merged on top of
+         `attrs`.
+        """
+        self.name = name
+        if isinstance(attrs, basestring):
+            kwargs['class'] = attrs
+            attrs = None
+        if kwargs:
+            if attrs:
+                # Copy so the caller's dictionary is never modified.
+                attrs = attrs.copy()
+                attrs.update(kwargs)
+            else:
+                attrs = kwargs
+        elif not attrs:
+            # Use a fresh dictionary rather than None (searchTag calls
+            # self.attrs.items()) or the shared default-argument object.
+            attrs = {}
+        self.attrs = attrs
+        self.text = text
+
+    def __str__(self):
+        if self.text:
+            return self.text
+        else:
+            return "%s|%s" % (self.name, self.attrs)
+
+    def searchTag(self, markupName=None, markupAttrs={}):
+        """Check a tag against the name and attribute criteria.
+
+        :param markupName: Either a Tag object or a tag name.
+        :param markupAttrs: The tag's attributes, as an object with a
+         get() method or an iterable of (key, value) pairs. Ignored
+         when `markupName` is a Tag, which then supplies its own
+         attributes.
+        :return: `markupName` if it matches, otherwise None.
+        """
+        found = None
+        markup = None
+        if isinstance(markupName, Tag):
+            markup = markupName
+            markupAttrs = markup
+        # A callable name criterion is called with the raw tag data
+        # only when we were not handed a Tag object.
+        callFunctionWithTagData = callable(self.name) \
+                                and not isinstance(markupName, Tag)
+
+        if (not self.name) \
+               or callFunctionWithTagData \
+               or (markup and self._matches(markup, self.name)) \
+               or (not markup and self._matches(markupName, self.name)):
+            if callFunctionWithTagData:
+                match = self.name(markupName, markupAttrs)
+            else:
+                match = True
+                markupAttrMap = None
+                for attr, matchAgainst in self.attrs.items():
+                    # Build the lookup map lazily, and only once: test
+                    # against None so that an empty map is not rebuilt
+                    # on every pass through the loop.
+                    if markupAttrMap is None:
+                        if hasattr(markupAttrs, 'get'):
+                            markupAttrMap = markupAttrs
+                        else:
+                            markupAttrMap = {}
+                            for k, v in markupAttrs:
+                                markupAttrMap[k] = v
+                    attrValue = markupAttrMap.get(attr)
+                    if not self._matches(attrValue, matchAgainst):
+                        match = False
+                        break
+            if match:
+                if markup:
+                    found = markup
+                else:
+                    found = markupName
+        return found
+
+    def search(self, markup):
+        """Return the part of `markup` that matches, or None.
+
+        :param markup: A list of elements, a Tag, a NavigableString
+         or a plain string.
+        :raise Exception: If `markup` is none of the above.
+        """
+        found = None
+        # If given a list of items, scan it for a text element that
+        # matches.
+        if isList(markup) and not isinstance(markup, Tag):
+            for element in markup:
+                if isinstance(element, NavigableString) \
+                       and self.search(element):
+                    found = element
+                    break
+        # If it's a Tag, make sure its name or attributes match.
+        # Don't bother with Tags if we're searching for text.
+        elif isinstance(markup, Tag):
+            if not self.text:
+                found = self.searchTag(markup)
+        # If it's text, make sure the text matches.
+        elif isinstance(markup, NavigableString) or \
+                 isinstance(markup, basestring):
+            if self._matches(markup, self.text):
+                found = markup
+        else:
+            # The call form of raise is valid on Python 2 and Python 3.
+            raise Exception("I don't know how to match against a %s"
+                            % markup.__class__)
+        return found
+
+    def _matches(self, markup, matchAgainst):
+        """Test one piece of markup against one criterion.
+
+        :param markup: A Tag, a string, None, or another value (which
+         is converted to unicode before comparison).
+        :param matchAgainst: The literal True, a callable, a regular
+         expression object, a list, a mapping, or a value compared
+         for equality.
+        :return: A value that is true when the markup matches.
+        """
+        result = False
+        if matchAgainst is True:
+            # The literal True means "any value, as long as there is one".
+            result = markup is not None
+        elif callable(matchAgainst):
+            result = matchAgainst(markup)
+        else:
+            #Custom match methods take the tag as an argument, but all
+            #other ways of matching match the tag name as a string.
+            if isinstance(markup, Tag):
+                markup = markup.name
+            if markup is not None and not isinstance(markup, basestring):
+                markup = unicode(markup)
+            #Now we know that markup is either a string, or None.
+            if hasattr(matchAgainst, 'match'):
+                # It's a regexp object.
+                result = markup and matchAgainst.search(markup)
+            elif (isList(matchAgainst)
+                  and (markup is not None
+                       or not isinstance(matchAgainst, basestring))):
+                result = markup in matchAgainst
+            elif hasattr(matchAgainst, 'items'):
+                # It's a mapping: match if the markup is one of its
+                # keys. (This used to call has_key() on the markup,
+                # which is a string or None and has no such method.)
+                result = markup in matchAgainst
+            elif matchAgainst and isinstance(markup, basestring):
+                # Coerce the criterion to the same string type as the
+                # markup before the equality test below.
+                if isinstance(markup, unicode):
+                    matchAgainst = unicode(matchAgainst)
+                else:
+                    matchAgainst = str(matchAgainst)
+
+            if not result:
+                result = matchAgainst == markup
+        return result
+
+
+class ResultSet(list):
+    """A ResultSet is just a list that keeps track of the SoupStrainer
+    that created it."""
+    def __init__(self, source):
+        """Create an empty result list.
+
+        :param source: The object that produced these results; stored
+         unchanged as self.source.
+        """
+        # Initialise this instance. The previous call, list.__init__([]),
+        # initialised a throwaway list instead of self.
+        list.__init__(self)
+        self.source = source
diff --git a/bs4/testing.py b/bs4/testing.py
new file mode 100644
index 0000000..9d9c26a
--- /dev/null
+++ b/bs4/testing.py
@@ -0,0 +1,37 @@
+"""Helper classes for tests."""
+
+import unittest
+from bs4 import BeautifulSoup
+from bs4.element import Comment, SoupStrainer
+from bs4.builder import LXMLTreeBuilder
+
+class SoupTest(unittest.TestCase):
+    """Base TestCase with helpers for parsing markup and comparing the
+    parsed result against an expected document."""
+
+    @property
+    def default_builder(self):
+        """Return a new LXMLTreeBuilder each time it is accessed."""
+        return LXMLTreeBuilder()
+
+    def soup(self, markup, **kwargs):
+        """Build a Beautiful Soup object from markup.
+
+        A 'builder' keyword argument overrides the default builder;
+        any other keyword arguments are passed to BeautifulSoup.
+        """
+        builder = kwargs.pop('builder', self.default_builder)
+        return BeautifulSoup(markup, builder=builder, **kwargs)
+
+    def document_for(self, markup):
+        """Turn an HTML fragment into a document.
+
+        The details depend on the builder.
+        """
+        return self.default_builder.test_fragment_to_document(markup)
+
+    def assertSoupEquals(self, to_parse, compare_parsed_to=None):
+        """Parse `to_parse` and assert that it decodes to the document
+        built from `compare_parsed_to`.
+
+        When `compare_parsed_to` is None the markup is expected to
+        come through parsing unchanged.
+        """
+        # Go through the shared helper so the builder is chosen in one
+        # place.
+        obj = self.soup(to_parse)
+        if compare_parsed_to is None:
+            compare_parsed_to = to_parse
+
+        # assertEqual is the supported name; assertEquals is a
+        # deprecated alias.
+        self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))
+
+
+
+
+
diff --git a/tests/test_builder_registry.py b/tests/test_builder_registry.py
index 17e3fb1..ee5b2da 100644
--- a/tests/test_builder_registry.py
+++ b/tests/test_builder_registry.py
@@ -2,8 +2,8 @@
import unittest
-from beautifulsoup import BeautifulSoup
-from beautifulsoup.builder import (
+from bs4 import BeautifulSoup
+from bs4.builder import (
builder_registry as registry,
LXMLTreeBuilderForXML,
LXMLTreeBuilder,
diff --git a/tests/test_html5lib.py b/tests/test_html5lib.py
index 5abc29d..b9bdd1d 100644
--- a/tests/test_html5lib.py
+++ b/tests/test_html5lib.py
@@ -1,5 +1,5 @@
-from beautifulsoup.builder import HTML5TreeBuilder
-from beautifulsoup.element import Comment, SoupStrainer
+from bs4.builder import HTML5TreeBuilder
+from bs4.element import Comment, SoupStrainer
from test_lxml import (
TestLXMLBuilder,
TestLXMLBuilderInvalidMarkup,
diff --git a/tests/test_lxml.py b/tests/test_lxml.py
index a96fbbb..0eec688 100644
--- a/tests/test_lxml.py
+++ b/tests/test_lxml.py
@@ -2,10 +2,10 @@
import re
-from beautifulsoup import BeautifulSoup
-from beautifulsoup.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
-from beautifulsoup.element import Comment, Doctype, SoupStrainer
-from beautifulsoup.testing import SoupTest
+from bs4 import BeautifulSoup
+from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
+from bs4.element import Comment, Doctype, SoupStrainer
+from bs4.testing import SoupTest
class TestLXMLBuilder(SoupTest):
diff --git a/tests/test_soup.py b/tests/test_soup.py
index 690db39..d283b8a 100644
--- a/tests/test_soup.py
+++ b/tests/test_soup.py
@@ -2,9 +2,9 @@
"""Tests of Beautiful Soup as a whole."""
import unittest
-from beautifulsoup.element import SoupStrainer
-from beautifulsoup.dammit import EntitySubstitution, UnicodeDammit
-from beautifulsoup.testing import SoupTest
+from bs4.element import SoupStrainer
+from bs4.dammit import EntitySubstitution, UnicodeDammit
+from bs4.testing import SoupTest
class TestSelectiveParsing(SoupTest):
diff --git a/tests/test_tree.py b/tests/test_tree.py
index ea10367..c61b326 100644
--- a/tests/test_tree.py
+++ b/tests/test_tree.py
@@ -12,10 +12,10 @@ methods tested here.
import copy
import cPickle as pickle
import re
-from beautifulsoup import BeautifulSoup
-from beautifulsoup.builder import builder_registry
-from beautifulsoup.element import CData, SoupStrainer, Tag
-from beautifulsoup.testing import SoupTest
+from bs4 import BeautifulSoup
+from bs4.builder import builder_registry
+from bs4.element import CData, SoupStrainer, Tag
+from bs4.testing import SoupTest
class TreeTest(SoupTest):