summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- beautifulsoup/__init__.py 295
-rw-r--r-- beautifulsoup/builder/__init__.py 259
-rw-r--r-- beautifulsoup/builder/_html5lib.py 233
-rw-r--r-- beautifulsoup/builder/_lxml.py 108
-rw-r--r-- beautifulsoup/dammit.py 410
-rw-r--r-- beautifulsoup/element.py 855
-rw-r--r-- beautifulsoup/testing.py 37
-rw-r--r-- beautifulsoup/util.py 21
8 files changed, 0 insertions, 2218 deletions
diff --git a/beautifulsoup/__init__.py b/beautifulsoup/__init__.py
deleted file mode 100644
index 518e95f..0000000
--- a/beautifulsoup/__init__.py
+++ /dev/null
@@ -1,295 +0,0 @@
-"""Beautiful Soup
-Elixir and Tonic
-"The Screen-Scraper's Friend"
-http://www.crummy.com/software/BeautifulSoup/
-
-Beautiful Soup uses a plug-in parser to parse a (possibly invalid) XML
-or HTML document into a tree representation. The parser does the work
-of building a parse tree, and Beautiful Soup provides methods
-and Pythonic idioms that make it easy to navigate, search, and modify
-the parse tree.
-
-Beautiful Soup works with Python 2.5 and up. To get it to work, you
-must install either lxml or html5lib.
-
-For more than you ever wanted to know about Beautiful Soup, see the
-documentation:
-http://www.crummy.com/software/BeautifulSoup/documentation.html
-
-Here, have some legalese:
-
-Copyright (c) 2004-2011, Leonard Richardson
-
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above
- copyright notice, this list of conditions and the following
- disclaimer in the documentation and/or other materials provided
- with the distribution.
-
- * Neither the name of the the Beautiful Soup Consortium and All
- Night Kosher Bakery nor the names of its contributors may be
- used to endorse or promote products derived from this software
- without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
-
-"""
-from __future__ import generators
-
-__author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "4.0.0"
-__copyright__ = "Copyright (c) 2004-2011 Leonard Richardson"
-__license__ = "New-style BSD"
-
-__all__ = ['BeautifulSoup']
-
-import re
-
-from util import isList, buildSet
-from builder import builder_registry
-from dammit import UnicodeDammit
-from element import DEFAULT_OUTPUT_ENCODING, NavigableString, Tag
-
-
-class BeautifulSoup(Tag):
- """
- This class defines the basic interface called by the tree builders.
-
- These methods will be called by the parser:
- reset()
- feed(markup)
-
- The tree builder may call these methods from its feed() implementation:
- handle_starttag(name, attrs) # See note about return value
- handle_endtag(name)
- handle_data(data) # Appends to the current data node
- endData(containerClass=NavigableString) # Ends the current data node
-
- No matter how complicated the underlying parser is, you should be
- able to build a tree using 'start tag' events, 'end tag' events,
- 'data' events, and "done with data" events.
-
- If you encounter an empty-element tag (aka a self-closing tag,
- like HTML's <br> tag), call handle_starttag and then
- handle_endtag.
- """
- ROOT_TAG_NAME = u'[document]'
-
- # If the end-user gives no indication which tree builder they
- # want, look for one with these features.
- DEFAULT_BUILDER_FEATURES = ['html']
-
- # Used when determining whether a text node is all whitespace and
- # can be replaced with a single space. A text node that contains
- # fancy Unicode spaces (usually non-breaking) should be left
- # alone.
- STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }
-
- def __init__(self, markup="", features=None, builder=None,
- parse_only=None, from_encoding=None):
- """The Soup object is initialized as the 'root tag', and the
- provided markup (which can be a string or a file-like object)
- is fed into the underlying parser."""
-
- if builder is None:
- if isinstance(features, basestring):
- features = [features]
- if features is None or len(features) == 0:
- features = self.DEFAULT_BUILDER_FEATURES
- builder_class = builder_registry.lookup(*features)
- if builder_class is None:
- raise ValueError(
- "Couldn't find a tree builder with the features you "
- "requested: %s. Do you need to install a parser library?"
- % ",".join(features))
- builder = builder_class()
- self.builder = builder
- self.is_xml = builder.is_xml
- self.builder.soup = self
-
- self.parse_only = parse_only
-
- self.reset()
-
- if hasattr(markup, 'read'): # It's a file-type object.
- markup = markup.read()
- self.markup, self.original_encoding, self.declared_html_encoding = (
- self.builder.prepare_markup(markup, from_encoding))
-
- try:
- self._feed()
- except StopParsing:
- pass
-
- # Clear out the markup and the builder so they can be GCed.
- self.markup = None
- self.builder.soup = None
- self.builder = None
-
- def _feed(self):
- # Reset the builder, then feed it the prepared markup.
- self.builder.reset()
-
- self.builder.feed(self.markup)
- # Close out any unfinished strings and close all the open tags.
- self.endData()
- while self.currentTag.name != self.ROOT_TAG_NAME:
- self.popTag()
-
- def reset(self):
- Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
- self.hidden = 1
- self.builder.reset()
- self.currentData = []
- self.currentTag = None
- self.tagStack = []
- self.pushTag(self)
-
- def popTag(self):
- tag = self.tagStack.pop()
- #print "Pop", tag.name
- if self.tagStack:
- self.currentTag = self.tagStack[-1]
- return self.currentTag
-
- def pushTag(self, tag):
- #print "Push", tag.name
- if self.currentTag:
- self.currentTag.contents.append(tag)
- self.tagStack.append(tag)
- self.currentTag = self.tagStack[-1]
-
- def endData(self, containerClass=NavigableString):
- if self.currentData:
- currentData = u''.join(self.currentData)
- if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
- not buildSet([tag.name for tag in self.tagStack]).intersection(
- self.builder.preserve_whitespace_tags)):
- if '\n' in currentData:
- currentData = '\n'
- else:
- currentData = ' '
- self.currentData = []
- if self.parse_only and len(self.tagStack) <= 1 and \
- (not self.parse_only.text or \
- not self.parse_only.search(currentData)):
- return
- o = containerClass(currentData)
- self.object_was_parsed(o)
-
- def object_was_parsed(self, o):
- """Add an object to the parse tree."""
- o.setup(self.currentTag, self.previous)
- if self.previous:
- self.previous.next = o
- self.previous = o
- self.currentTag.contents.append(o)
-
-
- def _popToTag(self, name, inclusivePop=True):
- """Pops the tag stack up to and including the most recent
- instance of the given tag. If inclusivePop is false, pops the tag
- stack up to but *not* including the most recent instance of
- the given tag."""
- #print "Popping to %s" % name
- if name == self.ROOT_TAG_NAME:
- return
-
- numPops = 0
- mostRecentTag = None
- for i in range(len(self.tagStack)-1, 0, -1):
- if name == self.tagStack[i].name:
- numPops = len(self.tagStack)-i
- break
- if not inclusivePop:
- numPops = numPops - 1
-
- for i in range(0, numPops):
- mostRecentTag = self.popTag()
- return mostRecentTag
-
- def handle_starttag(self, name, attrs):
- """Push a start tag on to the stack.
-
- If this method returns None, the tag was rejected by the
- SoupStrainer. You should proceed as if the tag had not occurred
- in the document. For instance, if this was a self-closing tag,
- don't call handle_endtag.
- """
-
- #print "Start tag %s: %s" % (name, attrs)
- self.endData()
-
- if (self.parse_only and len(self.tagStack) <= 1
- and (self.parse_only.text
- or not self.parse_only.searchTag(name, attrs))):
- return None
-
- tag = Tag(self, self.builder, name, attrs, self.currentTag,
- self.previous)
- if tag is None:
- return tag
- if self.previous:
- self.previous.next = tag
- self.previous = tag
- self.pushTag(tag)
- return tag
-
-
- def handle_endtag(self, name):
- #print "End tag: " + name
- self.endData()
- self._popToTag(name)
-
- def handle_data(self, data):
- self.currentData.append(data)
-
- def decode(self, pretty_print=False,
- eventual_encoding=DEFAULT_OUTPUT_ENCODING,
- substitute_html_entities=False):
- """Returns a string or Unicode representation of this document.
- To get Unicode, pass None for eventual_encoding."""
- if self.is_xml:
- # Print the XML declaration
- encoding_part = ''
- if eventual_encoding != None:
- encoding_part = ' encoding="%s"' % eventual_encoding
- prefix = u'<?xml version="1.0"%s>\n' % encoding_part
- else:
- prefix = u''
- if not pretty_print:
- indent_level = None
- else:
- indent_level = 0
- return prefix + super(BeautifulSoup, self).decode(
- indent_level, eventual_encoding,
- substitute_html_entities)
-
-
-class StopParsing(Exception):
- pass
-
-
-#By default, act as an HTML pretty-printer.
-if __name__ == '__main__':
- import sys
- soup = BeautifulSoup(sys.stdin)
- print soup.prettify()
diff --git a/beautifulsoup/builder/__init__.py b/beautifulsoup/builder/__init__.py
deleted file mode 100644
index 10c6b7f..0000000
--- a/beautifulsoup/builder/__init__.py
+++ /dev/null
@@ -1,259 +0,0 @@
-from collections import defaultdict
-import re
-import sys
-
-__all__ = [
- 'HTMLTreeBuilder',
- 'SAXTreeBuilder',
- 'TreeBuilder',
- 'TreeBuilderRegistry',
- ]
-
-# Some useful features for a TreeBuilder to have.
-FAST = 'fast'
-PERMISSIVE = 'permissive'
-XML = 'xml'
-HTML = 'html'
-HTML_5 = 'html5'
-
-
-class TreeBuilderRegistry(object):
-
- def __init__(self):
- self.builders_for_feature = defaultdict(list)
- self.builders = []
-
- def register(self, treebuilder_class):
- """Register a treebuilder based on its advertised features."""
- for feature in treebuilder_class.features:
- self.builders_for_feature[feature].insert(0, treebuilder_class)
- self.builders.insert(0, treebuilder_class)
-
- def lookup(self, *features):
- if len(self.builders) == 0:
- # There are no builders at all.
- return None
-
- if len(features) == 0:
- # They didn't ask for any features. Give them the most
- # recently registered builder.
- return self.builders[0]
-
- # Go down the list of features in order, and eliminate any builders
- # that don't match every feature.
- features = list(features)
- features.reverse()
- candidates = None
- candidate_set = None
- while len(features) > 0:
- feature = features.pop()
- we_have_the_feature = self.builders_for_feature.get(feature, [])
- if len(we_have_the_feature) > 0:
- if candidates is None:
- candidates = we_have_the_feature
- candidate_set = set(candidates)
- else:
- # Eliminate any candidates that don't have this feature.
- candidate_set = candidate_set.intersection(
- set(we_have_the_feature))
-
- # The only valid candidates are the ones in candidate_set.
- # Go through the original list of candidates and pick the first one
- # that's in candidate_set.
- if candidate_set is None:
- return None
- for candidate in candidates:
- if candidate in candidate_set:
- return candidate
- return None
-
-# The BeautifulSoup class will take feature lists from developers and use them
-# to look up builders in this registry.
-builder_registry = TreeBuilderRegistry()
-
-
-class TreeBuilder(object):
- """Turn a document into a Beautiful Soup object tree."""
-
- features = []
-
- is_xml = False
- preserve_whitespace_tags = set()
- empty_element_tags = None # A tag will be considered an empty-element
- # tag when and only when it has no contents.
-
- def __init__(self):
- self.soup = None
-
- def reset(self):
- pass
-
- def can_be_empty_element(self, tag_name):
- """Might a tag with this name be an empty-element tag?
-
- The final markup may or may not actually present this tag as
- self-closing.
-
- For instance: an HTMLTreeBuilder does not consider a <p> tag to be
- an empty-element tag (it's not in
- HTMLTreeBuilder.empty_element_tags). This means an empty <p> tag
- will be presented as "<p></p>", not "<p />".
-
- The default implementation has no opinion about which tags are
- empty-element tags, so a tag will be presented as an
- empty-element tag if and only if it has no contents.
- "<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
- be left alone.
- """
- if self.empty_element_tags is None:
- return True
- return tag_name in self.empty_element_tags
-
- def feed(self, markup):
- raise NotImplementedError()
-
- def prepare_markup(self, markup, user_specified_encoding=None,
- document_declared_encoding=None):
- return markup, None, None
-
- def test_fragment_to_document(self, fragment):
- """Wrap an HTML fragment to make it look like a document.
-
- Different parsers do this differently. For instance, lxml
- introduces an empty <head> tag, and html5lib
- doesn't. Abstracting this away lets us write simple tests
- which run HTML fragments through the parser and compare the
- results against other HTML fragments.
-
- This method should not be used outside of tests.
- """
- return fragment
-
- def set_up_substitutions(self, tag):
- pass
-
-
-class SAXTreeBuilder(TreeBuilder):
- """A Beautiful Soup treebuilder that listens for SAX events."""
-
- def feed(self, markup):
- raise NotImplementedError()
-
- def close(self):
- pass
-
- def startElement(self, name, attrs):
- attrs = dict((key[1], value) for key, value in attrs.items())
- #print "Start %s, %r" % (name, attrs)
- self.soup.handle_starttag(name, attrs)
-
- def endElement(self, name):
- #print "End %s" % name
- self.soup.handle_endtag(name)
-
- def startElementNS(self, nsTuple, nodeName, attrs):
- # Throw away (ns, nodeName) for now.
- self.startElement(nodeName, attrs)
-
- def endElementNS(self, nsTuple, nodeName):
- # Throw away (ns, nodeName) for now.
- self.endElement(nodeName)
- #handler.endElementNS((ns, node.nodeName), node.nodeName)
-
- def startPrefixMapping(self, prefix, nodeValue):
- # Ignore the prefix for now.
- pass
-
- def endPrefixMapping(self, prefix):
- # Ignore the prefix for now.
- # handler.endPrefixMapping(prefix)
- pass
-
- def characters(self, content):
- self.soup.handle_data(content)
-
- def startDocument(self):
- pass
-
- def endDocument(self):
- pass
-
-
-class HTMLTreeBuilder(TreeBuilder):
- """This TreeBuilder knows facts about HTML.
-
- Such as which tags are empty-element tags.
- """
-
- preserve_whitespace_tags = set(['pre', 'textarea'])
- empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
- 'spacer', 'link', 'frame', 'base'])
-
- # Used by set_up_substitutions to detect the charset in a META tag
- CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
-
- def set_up_substitutions(self, tag):
- if tag.name != 'meta':
- return False
-
- http_equiv = tag.get('http-equiv')
- content = tag.get('content')
-
- if (http_equiv is not None
- and content is not None
- and http_equiv.lower() == 'content-type'):
- # This is an interesting meta tag.
- match = self.CHARSET_RE.search(content)
- if match:
- if (self.soup.declared_html_encoding is not None or
- self.soup.original_encoding == self.soup.from_encoding):
- # An HTML encoding was sniffed while converting
- # the document to Unicode, or an HTML encoding was
- # sniffed during a previous pass through the
- # document, or an encoding was specified
- # explicitly and it worked. Rewrite the meta tag.
- def rewrite(match):
- return match.group(1) + "%SOUP-ENCODING%"
- tag['content'] = self.CHARSET_RE.sub(rewrite, content)
- return True
- else:
- # This is our first pass through the document.
- # Go through it again with the encoding information.
- new_charset = match.group(3)
- if (new_charset is not None
- and new_charset != self.soup.original_encoding):
- self.soup.declared_html_encoding = new_charset
- self.soup._feed(self.soup.declared_html_encoding)
- raise StopParsing
- pass
- return False
-
-
-def register_treebuilders_from(module):
- """Copy TreeBuilders from the given module into this module."""
- # I'm fairly sure this is not the best way to do this.
- this_module = sys.modules[__package__]
- for name in module.__all__:
- obj = getattr(module, name)
-
- if issubclass(obj, TreeBuilder):
- setattr(this_module, name, obj)
- this_module.__all__.append(name)
- # Register the builder while we're at it.
- this_module.builder_registry.register(obj)
-
-# Builders are registered in reverse order of priority, so that custom
-# builder registrations will take precedence. In general, we want
-# html5lib to take precedence over lxml, because it's more reliable.
-try:
- import _lxml
- register_treebuilders_from(_lxml)
-except ImportError:
- # They don't have lxml installed.
- pass
-try:
- import _html5lib
- register_treebuilders_from(_html5lib)
-except ImportError:
- # They don't have html5lib installed.
- pass
diff --git a/beautifulsoup/builder/_html5lib.py b/beautifulsoup/builder/_html5lib.py
deleted file mode 100644
index f8a7a40..0000000
--- a/beautifulsoup/builder/_html5lib.py
+++ /dev/null
@@ -1,233 +0,0 @@
-__all__ = [
- 'HTML5TreeBuilder',
- ]
-
-from beautifulsoup.builder import (
- PERMISSIVE,
- HTML,
- HTML_5,
- HTMLTreeBuilder,
- )
-import html5lib
-from html5lib.constants import DataLossWarning
-import warnings
-from beautifulsoup.element import (
- Comment,
- Doctype,
- NavigableString,
- Tag,
- )
-
-class HTML5TreeBuilder(HTMLTreeBuilder):
- """Use html5lib to build a tree."""
-
- features = ['html5lib', PERMISSIVE, HTML_5, HTML]
-
- def prepare_markup(self, markup, user_specified_encoding):
- # Store the user-specified encoding for use later on.
- self.user_specified_encoding = user_specified_encoding
- return markup, None, None
-
- # These methods are defined by Beautiful Soup.
- def feed(self, markup):
- parser = html5lib.HTMLParser(tree=self.create_treebuilder)
- doc = parser.parse(markup, encoding=self.user_specified_encoding)
-
- # Set the character encoding detected by the tokenizer.
- if isinstance(markup, unicode):
- # We need to special-case this because html5lib sets
- # charEncoding to UTF-8 if it gets Unicode input.
- doc.original_encoding = None
- else:
- doc.original_encoding = parser.tokenizer.stream.charEncoding[0]
-
- def create_treebuilder(self, namespaceHTMLElements):
- self.underlying_builder = TreeBuilderForHtml5lib(
- self.soup, namespaceHTMLElements)
- return self.underlying_builder
-
- def test_fragment_to_document(self, fragment):
- """See `TreeBuilder`."""
- return u'<html><head></head><body>%s</body></html>' % fragment
-
-
-class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
-
- def __init__(self, soup, namespaceHTMLElements):
- self.soup = soup
- if namespaceHTMLElements:
- warnings.warn("namespaceHTMLElements not supported yet",
- DataLossWarning)
- super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
-
- def documentClass(self):
- self.soup.reset()
- return Element(self.soup, self.soup, None)
-
- def insertDoctype(self, token):
- name = token["name"]
- publicId = token["publicId"]
- systemId = token["systemId"]
-
- doctype = Doctype.for_name_and_ids(name, publicId, systemId)
- self.soup.object_was_parsed(doctype)
-
- def elementClass(self, name, namespace):
- if namespace is not None:
- warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning)
- return Element(Tag(self.soup, self.soup.builder, name), self.soup, namespace)
-
- def commentClass(self, data):
- return TextNode(Comment(data), self.soup)
-
- def fragmentClass(self):
- self.soup = BeautifulSoup("")
- self.soup.name = "[document_fragment]"
- return Element(self.soup, self.soup, None)
-
- def appendChild(self, node):
- self.soup.insert(len(self.soup.contents), node.element)
-
- def testSerializer(self, element):
- return testSerializer(element)
-
- def getDocument(self):
- return self.soup
-
- def getFragment(self):
- return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element
-
-class AttrList(object):
- def __init__(self, element):
- self.element = element
- self.attrs = dict(self.element.attrs)
- def __iter__(self):
- return self.attrs.items().__iter__()
- def __setitem__(self, name, value):
- "set attr", name, value
- self.element[name] = value
- def items(self):
- return self.attrs.items()
- def keys(self):
- return self.attrs.keys()
- def __getitem__(self, name):
- return self.attrs[name]
- def __contains__(self, name):
- return name in self.attrs.keys()
-
-
-class Element(html5lib.treebuilders._base.Node):
- def __init__(self, element, soup, namespace):
- html5lib.treebuilders._base.Node.__init__(self, element.name)
- self.element = element
- self.soup = soup
- self.namespace = namespace
-
- def _nodeIndex(self, node, refNode):
- # Finds a node by identity rather than equality
- for index in range(len(self.element.contents)):
- if id(self.element.contents[index]) == id(refNode.element):
- return index
- return None
-
- def appendChild(self, node):
- if (node.element.__class__ == NavigableString and self.element.contents
- and self.element.contents[-1].__class__ == NavigableString):
- # Concatenate new text onto old text node
- # (TODO: This has O(n^2) performance, for input like "a</a>a</a>a</a>...")
- newStr = NavigableString(self.element.contents[-1]+node.element)
-
- # Remove the old text node
- # (Can't simply use .extract() by itself, because it fails if
- # an equal text node exists within the parent node)
- oldElement = self.element.contents[-1]
- del self.element.contents[-1]
- oldElement.parent = None
- oldElement.extract()
-
- self.element.insert(len(self.element.contents), newStr)
- else:
- self.element.insert(len(self.element.contents), node.element)
- node.parent = self
-
- def getAttributes(self):
- return AttrList(self.element)
-
- def setAttributes(self, attributes):
- if attributes is not None and attributes != {}:
- for name, value in attributes.items():
- self.element[name] = value
- # The attributes may contain variables that need substitution.
- # Call set_up_substitutions manually.
- # The Tag constructor calls this method automatically,
- # but html5lib creates a Tag object before setting up
- # the attributes.
- self.element.contains_substitutions = (
- self.soup.builder.set_up_substitutions(
- self.element))
- attributes = property(getAttributes, setAttributes)
-
- def insertText(self, data, insertBefore=None):
- text = TextNode(NavigableString(data), self.soup)
- if insertBefore:
- self.insertBefore(text, insertBefore)
- else:
- self.appendChild(text)
-
- def insertBefore(self, node, refNode):
- index = self._nodeIndex(node, refNode)
- if (node.element.__class__ == NavigableString and self.element.contents
- and self.element.contents[index-1].__class__ == NavigableString):
- # (See comments in appendChild)
- newStr = NavigableString(self.element.contents[index-1]+node.element)
- oldNode = self.element.contents[index-1]
- del self.element.contents[index-1]
- oldNode.parent = None
- oldNode.extract()
-
- self.element.insert(index-1, newStr)
- else:
- self.element.insert(index, node.element)
- node.parent = self
-
- def removeChild(self, node):
- index = self._nodeIndex(node.parent, node)
- del node.parent.element.contents[index]
- node.element.parent = None
- node.element.extract()
- node.parent = None
-
- def reparentChildren(self, newParent):
- while self.element.contents:
- child = self.element.contents[0]
- child.extract()
- if isinstance(child, Tag):
- newParent.appendChild(Element(child, self.soup, namespaces["html"]))
- else:
- newParent.appendChild(TextNode(child, self.soup))
-
- def cloneNode(self):
- node = Element(Tag(self.soup, self.soup.builder, self.element.name), self.soup, self.namespace)
- for key,value in self.attributes:
- node.attributes[key] = value
- return node
-
- def hasContent(self):
- return self.element.contents
-
- def getNameTuple(self):
- if self.namespace == None:
- return namespaces["html"], self.name
- else:
- return self.namespace, self.name
-
- nameTuple = property(getNameTuple)
-
-class TextNode(Element):
- def __init__(self, element, soup):
- html5lib.treebuilders._base.Node.__init__(self, None)
- self.element = element
- self.soup = soup
-
- def cloneNode(self):
- raise NotImplementedError
diff --git a/beautifulsoup/builder/_lxml.py b/beautifulsoup/builder/_lxml.py
deleted file mode 100644
index 23ac485..0000000
--- a/beautifulsoup/builder/_lxml.py
+++ /dev/null
@@ -1,108 +0,0 @@
-__all__ = [
- 'LXMLTreeBuilderForXML',
- 'LXMLTreeBuilder',
- ]
-
-from lxml import etree
-from beautifulsoup.element import Comment, Doctype
-from beautifulsoup.builder import (
- FAST,
- HTML,
- HTMLTreeBuilder,
- PERMISSIVE,
- TreeBuilder,
- XML)
-from beautifulsoup.dammit import UnicodeDammit
-import types
-
-LXML = 'lxml'
-
-class LXMLTreeBuilderForXML(TreeBuilder):
- DEFAULT_PARSER_CLASS = etree.XMLParser
-
- is_xml = True
-
- # Well, it's permissive by XML parser standards.
- features = [LXML, XML, FAST, PERMISSIVE]
-
- @property
- def default_parser(self):
- # This can either return a parser object or a class, which
- # will be instantiated with default arguments.
- return etree.XMLParser(target=self, strip_cdata=False, recover=True)
-
- def __init__(self, parser=None, empty_element_tags=None):
- if empty_element_tags is not None:
- self.empty_element_tags = set(empty_element_tags)
- if parser is None:
- # Use the default parser.
- parser = self.default_parser
- if callable(parser):
- # Instantiate the parser with default arguments
- parser = parser(target=self, strip_cdata=False)
- self.parser = parser
- self.soup = None
-
- def prepare_markup(self, markup, user_specified_encoding=None,
- document_declared_encoding=None):
- """
- :return: A 3-tuple (markup, original encoding, encoding
- declared within markup).
- """
- if isinstance(markup, unicode):
- return markup, None, None
-
- try_encodings = [user_specified_encoding, document_declared_encoding]
- dammit = UnicodeDammit(markup, try_encodings, isHTML=True)
- return (dammit.markup, dammit.original_encoding,
- dammit.declared_html_encoding)
-
- def feed(self, markup):
- self.parser.feed(markup)
- self.parser.close()
-
- def close(self):
- pass
-
- def start(self, name, attrs):
- self.soup.handle_starttag(name, attrs)
-
- def end(self, name):
- self.soup.endData()
- completed_tag = self.soup.tagStack[-1]
- self.soup.handle_endtag(name)
-
- def pi(self, target, data):
- pass
-
- def data(self, content):
- self.soup.handle_data(content)
-
- def doctype(self, name, pubid, system):
- self.soup.endData()
- doctype = Doctype.for_name_and_ids(name, pubid, system)
- self.soup.object_was_parsed(doctype)
-
- def comment(self, content):
- "Handle comments as Comment objects."
- self.soup.endData()
- self.soup.handle_data(content)
- self.soup.endData(Comment)
-
- def test_fragment_to_document(self, fragment):
- """See `TreeBuilder`."""
- return u'<?xml version="1.0" encoding="utf-8">\n%s' % fragment
-
-
-class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
-
- features = [LXML, HTML, FAST]
- is_xml = False
-
- @property
- def default_parser(self):
- return etree.HTMLParser
-
- def test_fragment_to_document(self, fragment):
- """See `TreeBuilder`."""
- return u'<html><body>%s</body></html>' % fragment
diff --git a/beautifulsoup/dammit.py b/beautifulsoup/dammit.py
deleted file mode 100644
index 4483118..0000000
--- a/beautifulsoup/dammit.py
+++ /dev/null
@@ -1,410 +0,0 @@
-"""Beautiful Soup bonus library: Unicode, Dammit
-
-This class forces XML data into a standard format (usually to UTF-8 or
-Unicode). It is heavily based on code from Mark Pilgrim's Universal
-Feed Parser. It does not rewrite the XML or HTML to reflect a new
-encoding; that's the tree builder's job.
-"""
-
-import codecs
-from htmlentitydefs import codepoint2name
-import re
-import types
-
-# Autodetects character encodings. Very useful.
-# Download from http://chardet.feedparser.org/
-# or 'apt-get install python-chardet'
-# or 'easy_install chardet'
-try:
- import chardet
- #import chardet.constants
- #chardet.constants._debug = 1
-except ImportError:
- chardet = None
-
-# Available from http://cjkpython.i18n.org/.
-try:
- import iconv_codec
-except ImportError:
- pass
-
-
-class EntitySubstitution(object):
-
- """Substitute XML or HTML entities for the corresponding characters."""
-
- def _populate_class_variables():
- lookup = {}
- characters = []
- for codepoint, name in codepoint2name.items():
- if codepoint == 34:
- # There's no point in turning the quotation mark into
- # &quot;, unless it happens within an attribute value, which
- # is handled elsewhere.
- continue;
- character = unichr(codepoint)
- characters.append(character)
- lookup[character] = name
- re_definition = "[%s]" % "".join(characters)
- return lookup, re.compile(re_definition)
- CHARACTER_TO_HTML_ENTITY, CHARACTER_TO_HTML_ENTITY_RE = (
- _populate_class_variables())
-
-
- CHARACTER_TO_XML_ENTITY = {
- "'" : "apos",
- '"' : "quot",
- "&" : "amp",
- "<" : "lt",
- ">" : "gt",
- }
-
- BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
- "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
- ")")
-
- @classmethod
- def _substitute_html_entity(cls, matchobj):
- entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
- return "&%s;" % entity
-
- @classmethod
- def _substitute_xml_entity(cls, matchobj):
- """Used with a regular expression to substitute the
- appropriate XML entity for an XML special character."""
- entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
- return "&%s;" % entity
-
- @classmethod
- def substitute_xml(cls, value, make_quoted_attribute=False):
- """Substitute XML entities for special XML characters.
-
- :param value: A string to be substituted. The less-than sign will
- become &lt;, the greater-than sign will become &gt;, and any
- ampersands that are not part of an entity defition will
- become &amp;.
-
- :param make_quoted_attribute: If True, then the string will be
- quoted, as befits an attribute value.
-
- Ordinarily, the string will be quoted using double quotes.
-
- Bob's Bar -> "Bob's Bar"
-
- If the string contains double quotes, it will be quoted using
- single quotes.
-
- Welcome to "my bar" -> 'Welcome to "my bar"'
-
- If the string contains both single and double quotes, the
- double quotes will be escaped, and the string will be quoted
- using double quotes.
-
- Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's bar&quot;
- """
- if make_quoted_attribute:
- quote_with = '"'
- if '"' in value:
- if "'" in value:
- # The string contains both single and double
- # quotes. Turn the double quotes into
- # entities. We quote the double quotes rather than
- # the single quotes because the entity name is
- # "&quot;" whether this is HTML or XML. If we
- # quoted the single quotes, we'd have to decide
- # between &apos; and &squot;.
- replace_with = "&quot;"
- value = value.replace('"', replace_with)
- else:
- # There are double quotes but no single quotes.
- # We can use single quotes to quote the attribute.
- quote_with = "'"
-
- # Escape angle brackets, and ampersands that aren't part of
- # entities.
- value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
- cls._substitute_xml_entity, value)
- if make_quoted_attribute:
- return quote_with + value + quote_with
- else:
- return value
-
- @classmethod
- def substitute_html(cls, s):
- """Replace certain Unicode characters with named HTML entities.
-
- This differs from data.encode(encoding, 'xmlcharrefreplace')
- in that the goal is to make the result more readable (to those
- with ASCII displays) rather than to recover from
- errors. There's absolutely nothing wrong with a UTF-8 string
- containg a LATIN SMALL LETTER E WITH ACUTE, but replacing that
- character with "&eacute;" will make it more readable to some
- people.
- """
- return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
- cls._substitute_html_entity, s)
-
-
-class UnicodeDammit:
- """A class for detecting the encoding of a *ML document and
- converting it to a Unicode string. If the source encoding is
- windows-1252, can replace MS smart quotes with their HTML or XML
- equivalents."""
-
- # This dictionary maps commonly seen values for "charset" in HTML
- # meta tags to the corresponding Python codec names. It only covers
- # values that aren't in Python's aliases and can't be determined
- # by the heuristics in find_codec.
- CHARSET_ALIASES = { "macintosh" : "mac-roman",
- "x-sjis" : "shift-jis" }
-
- ENCODINGS_WITH_SMART_QUOTES = [
- "windows-1252",
- "iso-8859-1",
- "iso-8859-2",
- ]
-
- def __init__(self, markup, override_encodings=[],
- smart_quotes_to=None, isHTML=False):
- self.declared_html_encoding = None
- self.markup, document_encoding, sniffed_encoding = \
- self._detectEncoding(markup, isHTML)
- self.smart_quotes_to = smart_quotes_to
- self.tried_encodings = []
- if markup == '' or isinstance(markup, unicode):
- self.original_encoding = None
- self.unicode = unicode(markup)
- return
-
- u = None
- for proposed_encoding in (
- override_encodings + [document_encoding, sniffed_encoding]):
- if proposed_encoding is not None:
- u = self._convert_from(proposed_encoding)
- if u:
- break
-
- # If no luck and we have auto-detection library, try that:
- if not u and chardet and not isinstance(self.markup, unicode):
- u = self._convert_from(chardet.detect(self.markup)['encoding'])
-
- # As a last resort, try utf-8 and windows-1252:
- if not u:
- for proposed_encoding in ("utf-8", "windows-1252"):
- u = self._convert_from(proposed_encoding)
- if u:
- break
-
- self.unicode = u
- if not u: self.original_encoding = None
-
- def _sub_ms_char(self, match):
- """Changes a MS smart quote character to an XML or HTML
- entity."""
- orig = match.group(1)
- sub = self.MS_CHARS.get(orig)
- if type(sub) == types.TupleType:
- if self.smart_quotes_to == 'xml':
- sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
- else:
- sub = '&'.encode() + sub[0].encode() + ';'.encode()
- else:
- sub = sub.encode()
- return sub
-
- def _convert_from(self, proposed):
- proposed = self.find_codec(proposed)
- if not proposed or proposed in self.tried_encodings:
- return None
- self.tried_encodings.append(proposed)
- markup = self.markup
-
- # Convert smart quotes to HTML if coming from an encoding
- # that might have them.
- if (self.smart_quotes_to is not None
- and proposed.lower() in self.ENCODINGS_WITH_SMART_QUOTES):
- smart_quotes_re = "([\x80-\x9f])"
- smart_quotes_compiled = re.compile(smart_quotes_re)
- markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)
-
- try:
- # print "Trying to convert document to %s" % proposed
- u = self._to_unicode(markup, proposed)
- self.markup = u
- self.original_encoding = proposed
- except Exception, e:
- # print "That didn't work!"
- # print e
- return None
- #print "Correct encoding: %s" % proposed
- return self.markup
-
- def _to_unicode(self, data, encoding):
- '''Given a string and its encoding, decodes the string into Unicode.
- %encoding is a string recognized by encodings.aliases'''
-
- # strip Byte Order Mark (if present)
- if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
- and (data[2:4] != '\x00\x00'):
- encoding = 'utf-16be'
- data = data[2:]
- elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
- and (data[2:4] != '\x00\x00'):
- encoding = 'utf-16le'
- data = data[2:]
- elif data[:3] == '\xef\xbb\xbf':
- encoding = 'utf-8'
- data = data[3:]
- elif data[:4] == '\x00\x00\xfe\xff':
- encoding = 'utf-32be'
- data = data[4:]
- elif data[:4] == '\xff\xfe\x00\x00':
- encoding = 'utf-32le'
- data = data[4:]
- newdata = unicode(data, encoding)
- return newdata
-
- def _detectEncoding(self, xml_data, isHTML=False):
- """Given a document, tries to detect its XML encoding."""
- xml_encoding = sniffed_xml_encoding = None
- try:
- if xml_data[:4] == '\x4c\x6f\xa7\x94':
- # EBCDIC
- xml_data = self._ebcdic_to_ascii(xml_data)
- elif xml_data[:4] == '\x00\x3c\x00\x3f':
- # UTF-16BE
- sniffed_xml_encoding = 'utf-16be'
- xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
- elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
- and (xml_data[2:4] != '\x00\x00'):
- # UTF-16BE with BOM
- sniffed_xml_encoding = 'utf-16be'
- xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
- elif xml_data[:4] == '\x3c\x00\x3f\x00':
- # UTF-16LE
- sniffed_xml_encoding = 'utf-16le'
- xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
- elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
- (xml_data[2:4] != '\x00\x00'):
- # UTF-16LE with BOM
- sniffed_xml_encoding = 'utf-16le'
- xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
- elif xml_data[:4] == '\x00\x00\x00\x3c':
- # UTF-32BE
- sniffed_xml_encoding = 'utf-32be'
- xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
- elif xml_data[:4] == '\x3c\x00\x00\x00':
- # UTF-32LE
- sniffed_xml_encoding = 'utf-32le'
- xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
- elif xml_data[:4] == '\x00\x00\xfe\xff':
- # UTF-32BE with BOM
- sniffed_xml_encoding = 'utf-32be'
- xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
- elif xml_data[:4] == '\xff\xfe\x00\x00':
- # UTF-32LE with BOM
- sniffed_xml_encoding = 'utf-32le'
- xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
- elif xml_data[:3] == '\xef\xbb\xbf':
- # UTF-8 with BOM
- sniffed_xml_encoding = 'utf-8'
- xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
- else:
- sniffed_xml_encoding = 'ascii'
- pass
- except:
- xml_encoding_match = None
- xml_encoding_re = '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode()
- xml_encoding_match = re.compile(xml_encoding_re).match(xml_data)
- if not xml_encoding_match and isHTML:
- meta_re = '<\s*meta[^>]+charset=([^>]*?)[;\'">]'.encode()
- regexp = re.compile(meta_re, re.I)
- xml_encoding_match = regexp.search(xml_data)
- if xml_encoding_match is not None:
- xml_encoding = xml_encoding_match.groups()[0].decode(
- 'ascii').lower()
- if isHTML:
- self.declared_html_encoding = xml_encoding
- if sniffed_xml_encoding and \
- (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
- 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
- 'utf-16', 'utf-32', 'utf_16', 'utf_32',
- 'utf16', 'u16')):
- xml_encoding = sniffed_xml_encoding
- return xml_data, xml_encoding, sniffed_xml_encoding
-
-
- def find_codec(self, charset):
- return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
- or (charset and self._codec(charset.replace("-", ""))) \
- or (charset and self._codec(charset.replace("-", "_"))) \
- or charset
-
- def _codec(self, charset):
- if not charset: return charset
- codec = None
- try:
- codecs.lookup(charset)
- codec = charset
- except (LookupError, ValueError):
- pass
- return codec
-
- EBCDIC_TO_ASCII_MAP = None
- def _ebcdic_to_ascii(self, s):
- c = self.__class__
- if not c.EBCDIC_TO_ASCII_MAP:
- emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
- 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
- 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
- 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
- 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
- 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
- 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
- 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
- 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
- 201,202,106,107,108,109,110,111,112,113,114,203,204,205,
- 206,207,208,209,126,115,116,117,118,119,120,121,122,210,
- 211,212,213,214,215,216,217,218,219,220,221,222,223,224,
- 225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
- 73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
- 82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
- 90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
- 250,251,252,253,254,255)
- import string
- c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
- ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
- return s.translate(c.EBCDIC_TO_ASCII_MAP)
-
- MS_CHARS = { '\x80' : ('euro', '20AC'),
- '\x81' : ' ',
- '\x82' : ('sbquo', '201A'),
- '\x83' : ('fnof', '192'),
- '\x84' : ('bdquo', '201E'),
- '\x85' : ('hellip', '2026'),
- '\x86' : ('dagger', '2020'),
- '\x87' : ('Dagger', '2021'),
- '\x88' : ('circ', '2C6'),
- '\x89' : ('permil', '2030'),
- '\x8A' : ('Scaron', '160'),
- '\x8B' : ('lsaquo', '2039'),
- '\x8C' : ('OElig', '152'),
- '\x8D' : '?',
- '\x8E' : ('#x17D', '17D'),
- '\x8F' : '?',
- '\x90' : '?',
- '\x91' : ('lsquo', '2018'),
- '\x92' : ('rsquo', '2019'),
- '\x93' : ('ldquo', '201C'),
- '\x94' : ('rdquo', '201D'),
- '\x95' : ('bull', '2022'),
- '\x96' : ('ndash', '2013'),
- '\x97' : ('mdash', '2014'),
- '\x98' : ('tilde', '2DC'),
- '\x99' : ('trade', '2122'),
- '\x9a' : ('scaron', '161'),
- '\x9b' : ('rsaquo', '203A'),
- '\x9c' : ('oelig', '153'),
- '\x9d' : '?',
- '\x9e' : ('#x17E', '17E'),
- '\x9f' : ('Yuml', ''),}
diff --git a/beautifulsoup/element.py b/beautifulsoup/element.py
deleted file mode 100644
index 61ed4ab..0000000
--- a/beautifulsoup/element.py
+++ /dev/null
@@ -1,855 +0,0 @@
-import re
-import types
-try:
- from htmlentitydefs import name2codepoint
-except ImportError:
- name2codepoint = {}
-from beautifulsoup.dammit import EntitySubstitution
-
-from util import isList
-
-DEFAULT_OUTPUT_ENCODING = "utf-8"
-
-
-class PageElement(object):
- """Contains the navigational information for some part of the page
- (either a tag or a piece of text)"""
-
- def setup(self, parent=None, previous=None):
- """Sets up the initial relations between this element and
- other elements."""
- self.parent = parent
- self.previous = previous
- self.next = None
- self.previousSibling = None
- self.nextSibling = None
- if self.parent and self.parent.contents:
- self.previousSibling = self.parent.contents[-1]
- self.previousSibling.nextSibling = self
-
- def replaceWith(self, replaceWith):
- oldParent = self.parent
- myIndex = self.parent.contents.index(self)
- if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent:
- # We're replacing this element with one of its siblings.
- index = self.parent.contents.index(replaceWith)
- if index and index < myIndex:
- # Furthermore, it comes before this element. That
- # means that when we extract it, the index of this
- # element will change.
- myIndex = myIndex - 1
- self.extract()
- oldParent.insert(myIndex, replaceWith)
-
- def extract(self):
- """Destructively rips this element out of the tree."""
- if self.parent:
- try:
- self.parent.contents.remove(self)
- except ValueError:
- pass
-
- #Find the two elements that would be next to each other if
- #this element (and any children) hadn't been parsed. Connect
- #the two.
- lastChild = self._lastRecursiveChild()
- nextElement = lastChild.next
-
- if self.previous:
- self.previous.next = nextElement
- if nextElement:
- nextElement.previous = self.previous
- self.previous = None
- lastChild.next = None
-
- self.parent = None
- if self.previousSibling:
- self.previousSibling.nextSibling = self.nextSibling
- if self.nextSibling:
- self.nextSibling.previousSibling = self.previousSibling
- self.previousSibling = self.nextSibling = None
- return self
-
- def _lastRecursiveChild(self):
- "Finds the last element beneath this object to be parsed."
- lastChild = self
- while hasattr(lastChild, 'contents') and lastChild.contents:
- lastChild = lastChild.contents[-1]
- return lastChild
-
- def insert(self, position, newChild):
- if (isinstance(newChild, basestring)
- or isinstance(newChild, unicode)) \
- and not isinstance(newChild, NavigableString):
- newChild = NavigableString(newChild)
-
- position = min(position, len(self.contents))
- if hasattr(newChild, 'parent') and newChild.parent != None:
- # We're 'inserting' an element that's already one
- # of this object's children.
- if newChild.parent == self:
- index = self.find(newChild)
- if index and index < position:
- # Furthermore we're moving it further down the
- # list of this object's children. That means that
- # when we extract this element, our target index
- # will jump down one.
- position = position - 1
- newChild.extract()
-
- newChild.parent = self
- previousChild = None
- if position == 0:
- newChild.previousSibling = None
- newChild.previous = self
- else:
- previousChild = self.contents[position-1]
- newChild.previousSibling = previousChild
- newChild.previousSibling.nextSibling = newChild
- newChild.previous = previousChild._lastRecursiveChild()
- if newChild.previous:
- newChild.previous.next = newChild
-
- newChildsLastElement = newChild._lastRecursiveChild()
-
- if position >= len(self.contents):
- newChild.nextSibling = None
-
- parent = self
- parentsNextSibling = None
- while not parentsNextSibling:
- parentsNextSibling = parent.nextSibling
- parent = parent.parent
- if not parent: # This is the last element in the document.
- break
- if parentsNextSibling:
- newChildsLastElement.next = parentsNextSibling
- else:
- newChildsLastElement.next = None
- else:
- nextChild = self.contents[position]
- newChild.nextSibling = nextChild
- if newChild.nextSibling:
- newChild.nextSibling.previousSibling = newChild
- newChildsLastElement.next = nextChild
-
- if newChildsLastElement.next:
- newChildsLastElement.next.previous = newChildsLastElement
- self.contents.insert(position, newChild)
-
- def append(self, tag):
- """Appends the given tag to the contents of this tag."""
- self.insert(len(self.contents), tag)
-
- def find_next(self, name=None, attrs={}, text=None, **kwargs):
- """Returns the first item that matches the given criteria and
- appears after this Tag in the document."""
- return self._findOne(self.find_all_next, name, attrs, text, **kwargs)
- findNext = find_next # BS3
-
- def find_all_next(self, name=None, attrs={}, text=None, limit=None,
- **kwargs):
- """Returns all items that match the given criteria and appear
- after this Tag in the document."""
- return self._find_all(name, attrs, text, limit, self.next_elements,
- **kwargs)
- findAllNext = find_all_next # BS3
-
- def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs):
- """Returns the closest sibling to this Tag that matches the
- given criteria and appears after this Tag in the document."""
- return self._findOne(self.find_next_siblings, name, attrs, text,
- **kwargs)
- findNextSibling = find_next_sibling # BS3
-
- def find_next_siblings(self, name=None, attrs={}, text=None, limit=None,
- **kwargs):
- """Returns the siblings of this Tag that match the given
- criteria and appear after this Tag in the document."""
- return self._find_all(name, attrs, text, limit,
- self.next_siblings, **kwargs)
- findNextSiblings = find_next_siblings # BS3
- fetchNextSiblings = find_next_siblings # BS2
-
- def find_previous(self, name=None, attrs={}, text=None, **kwargs):
- """Returns the first item that matches the given criteria and
- appears before this Tag in the document."""
- return self._findOne(
- self.find_all_previous, name, attrs, text, **kwargs)
- findPrevious = find_previous # BS3
-
- def find_all_previous(self, name=None, attrs={}, text=None, limit=None,
- **kwargs):
- """Returns all items that match the given criteria and appear
- before this Tag in the document."""
- return self._find_all(name, attrs, text, limit, self.previous_elements,
- **kwargs)
- findAllPrevious = find_all_previous # BS3
- fetchPrevious = find_all_previous # BS2
-
- def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs):
- """Returns the closest sibling to this Tag that matches the
- given criteria and appears before this Tag in the document."""
- return self._findOne(self.find_previous_siblings, name, attrs, text,
- **kwargs)
- findPreviousSibling = find_previous_sibling # BS3
-
- def find_previous_siblings(self, name=None, attrs={}, text=None,
- limit=None, **kwargs):
- """Returns the siblings of this Tag that match the given
- criteria and appear before this Tag in the document."""
- return self._find_all(name, attrs, text, limit,
- self.previous_siblings, **kwargs)
- findPreviousSiblings = find_previous_siblings # BS3
- fetchPreviousSiblings = find_previous_siblings # BS2
-
- def find_parent(self, name=None, attrs={}, **kwargs):
- """Returns the closest parent of this Tag that matches the given
- criteria."""
- # NOTE: We can't use _findOne because findParents takes a different
- # set of arguments.
- r = None
- l = self.find_parents(name, attrs, 1)
- if l:
- r = l[0]
- return r
- findParent = find_parent # BS3
-
- def find_parents(self, name=None, attrs={}, limit=None, **kwargs):
- """Returns the parents of this Tag that match the given
- criteria."""
-
- return self._find_all(name, attrs, None, limit, self.parents,
- **kwargs)
- findParents = find_parents # BS3
- fetchParents = find_parents # BS2
-
- #These methods do the real heavy lifting.
-
- def _findOne(self, method, name, attrs, text, **kwargs):
- r = None
- l = method(name, attrs, text, 1, **kwargs)
- if l:
- r = l[0]
- return r
-
- def _find_all(self, name, attrs, text, limit, generator, **kwargs):
- "Iterates over a generator looking for things that match."
-
- if isinstance(name, SoupStrainer):
- strainer = name
- else:
- # Build a SoupStrainer
- strainer = SoupStrainer(name, attrs, text, **kwargs)
- results = ResultSet(strainer)
- while True:
- try:
- i = generator.next()
- except StopIteration:
- break
- if i:
- found = strainer.search(i)
- if found:
- results.append(found)
- if limit and len(results) >= limit:
- break
- return results
-
- #These generators can be used to navigate starting from both
- #NavigableStrings and Tags.
- @property
- def next_elements(self):
- i = self
- while i:
- i = i.next
- yield i
-
- @property
- def next_siblings(self):
- i = self
- while i:
- i = i.nextSibling
- yield i
-
- @property
- def previous_elements(self):
- i = self
- while i:
- i = i.previous
- yield i
-
- @property
- def previous_siblings(self):
- i = self
- while i:
- i = i.previousSibling
- yield i
-
- @property
- def parents(self):
- i = self
- while i:
- i = i.parent
- yield i
-
- # Old non-property versions of the generators, for backwards
- # compatibility with BS3.
- def nextGenerator(self):
- return self.next_elements
-
- def nextSiblingGenerator(self):
- return self.next_siblings
-
- def previousGenerator(self):
- return self.previous_elements
-
- def previousSiblingGenerator(self):
- return self.previous_siblings
-
- def parentGenerator(self):
- return self.parents
-
- # Utility methods
- def substituteEncoding(self, str, encoding=None):
- encoding = encoding or "utf-8"
- return str.replace("%SOUP-ENCODING%", encoding)
-
- def toEncoding(self, s, encoding=None):
- """Encodes an object to a string in some encoding, or to Unicode.
- ."""
- if isinstance(s, unicode):
- if encoding:
- s = s.encode(encoding)
- elif isinstance(s, str):
- if encoding:
- s = s.encode(encoding)
- else:
- s = unicode(s)
- else:
- if encoding:
- s = self.toEncoding(str(s), encoding)
- else:
- s = unicode(s)
- return s
-
-class NavigableString(unicode, PageElement):
-
- PREFIX = ''
- SUFFIX = ''
-
- def __new__(cls, value):
- """Create a new NavigableString.
-
- When unpickling a NavigableString, this method is called with
- the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
- passed in to the superclass's __new__ or the superclass won't know
- how to handle non-ASCII characters.
- """
- if isinstance(value, unicode):
- return unicode.__new__(cls, value)
- return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
-
- def __getnewargs__(self):
- return (unicode(self),)
-
- def __getattr__(self, attr):
- """text.string gives you text. This is for backwards
- compatibility for Navigable*String, but for CData* it lets you
- get the string without the CData wrapper."""
- if attr == 'string':
- return self
- else:
- raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
-
- def output_ready(self, substitute_html_entities=False):
- if substitute_html_entities:
- output = EntitySubstitution.substitute_html(self)
- else:
- output = self
- return self.PREFIX + output + self.SUFFIX
-
-
-class CData(NavigableString):
-
- PREFIX = u'<![CDATA['
- SUFFIX = u']]>'
-
-
-class ProcessingInstruction(NavigableString):
-
- PREFIX = u'<?'
- SUFFIX = u'?>'
-
-
-class Comment(NavigableString):
-
- PREFIX = u'<!--'
- SUFFIX = u'-->'
-
-class Declaration(NavigableString):
- PREFIX = u'<!'
- SUFFIX = u'!>'
-
-
-class Doctype(NavigableString):
-
- @classmethod
- def for_name_and_ids(cls, name, pub_id, system_id):
- value = name
- if pub_id is not None:
- value += ' PUBLIC "%s"' % pub_id
- if system_id is not None:
- value += ' SYSTEM "%s"' % system_id
-
- return Doctype(value)
-
- PREFIX = u'<!DOCTYPE '
- SUFFIX = u'>'
-
-
-class Tag(PageElement):
-
- """Represents a found HTML tag with its attributes and contents."""
-
- def __init__(self, parser, builder, name, attrs=None, parent=None,
- previous=None):
- "Basic constructor."
-
- # We don't actually store the parser object: that lets extracted
- # chunks be garbage-collected.
- self.parserClass = parser.__class__
- self.name = name
- if attrs == None:
- attrs = {}
- else:
- attrs = dict(attrs)
- self.attrs = attrs
- self.contents = []
- self.setup(parent, previous)
- self.hidden = False
-
- # Set up any substitutions, such as the charset in a META tag.
- self.contains_substitutions = builder.set_up_substitutions(self)
-
- self.can_be_empty_element = builder.can_be_empty_element(name)
-
- @property
- def is_empty_element(self):
- """Is this tag an empty-element tag? (aka a self-closing tag)
-
- A tag that has contents is never an empty-element tag.
-
- A tag that has no contents may or may not be an empty-element
- tag. It depends on the builder used to create the tag. If the
- builder has a designated list of empty-element tags, then only
- a tag whose name shows up in that list is considered an
- empty-element tag.
-
- If the builder has no designated list of empty-element tags,
- then any tag with no contents is an empty-element tag.
- """
- return len(self.contents) == 0 and self.can_be_empty_element
- isSelfClosing = is_empty_element # BS3
-
-
- @property
- def string(self):
- """Convenience property to get the single string within this tag.
-
- :Return: If this tag has a single string child, return value
- is that string. If this tag has no children, or more than one
- child, return value is None. If this tag has one child tag,
- return value is the 'string' attribute of the child tag,
- recursively.
- """
- if len(self.contents) != 1:
- return None
- child = self.contents[0]
- if isinstance(child, NavigableString):
- return child
- return child.string
-
- def get(self, key, default=None):
- """Returns the value of the 'key' attribute for the tag, or
- the value given for 'default' if it doesn't have that
- attribute."""
- return self.attrs.get(key, default)
-
- def has_key(self, key):
- return self.attrs.has_key(key)
-
- def __getitem__(self, key):
- """tag[key] returns the value of the 'key' attribute for the tag,
- and throws an exception if it's not there."""
- return self.attrs[key]
-
- def __iter__(self):
- "Iterating over a tag iterates over its contents."
- return iter(self.contents)
-
- def __len__(self):
- "The length of a tag is the length of its list of contents."
- return len(self.contents)
-
- def __contains__(self, x):
- return x in self.contents
-
- def __nonzero__(self):
- "A tag is non-None even if it has no contents."
- return True
-
- def __setitem__(self, key, value):
- """Setting tag[key] sets the value of the 'key' attribute for the
- tag."""
- self.attrs[key] = value
-
- def __delitem__(self, key):
- "Deleting tag[key] deletes all 'key' attributes for the tag."
- if self.attrs.has_key(key):
- del self.attrs[key]
-
- def __call__(self, *args, **kwargs):
- """Calling a tag like a function is the same as calling its
- find_all() method. Eg. tag('a') returns a list of all the A tags
- found within this tag."""
- return apply(self.find_all, args, kwargs)
-
- def __getattr__(self, tag):
- #print "Getattr %s.%s" % (self.__class__, tag)
- if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
- return self.find(tag[:-3])
- elif tag.find('__') != 0:
- return self.find(tag)
- raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag)
-
- def __eq__(self, other):
- """Returns true iff this tag has the same name, the same attributes,
- and the same contents (recursively) as the given tag.
-
- XXX: right now this will return false if two tags have the
- same attributes in a different order. Should this be fixed?"""
- if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
- return False
- for i in range(0, len(self.contents)):
- if self.contents[i] != other.contents[i]:
- return False
- return True
-
- def __ne__(self, other):
- """Returns true iff this tag is not identical to the other tag,
- as defined in __eq__."""
- return not self == other
-
- def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
- """Renders this tag as a string."""
- return self.encode(encoding)
-
- def __unicode__(self):
- return self.decode()
-
- def __str__(self):
- return self.encode()
-
- def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
- indent_level=None, substitute_html_entities=False):
- return self.decode(indent_level, encoding,
- substitute_html_entities).encode(encoding)
-
- def decode(self, indent_level=None,
- eventual_encoding=DEFAULT_OUTPUT_ENCODING,
- substitute_html_entities=False):
- """Returns a Unicode representation of this tag and its contents.
-
- :param eventual_encoding: The tag is destined to be
- encoded into this encoding. This method is _not_
- responsible for performing that encoding. This information
- is passed in so that it can be substituted in if the
- document contains a <META> tag that mentions the document's
- encoding.
- """
- attrs = []
- if self.attrs:
- for key, val in sorted(self.attrs.items()):
- if val is None:
- decoded = key
- else:
- if not isinstance(val, basestring):
- val = str(val)
- if (self.contains_substitutions
- and eventual_encoding is not None
- and '%SOUP-ENCODING%' in val):
- val = self.substituteEncoding(val, eventual_encoding)
-
- decoded = (key + '='
- + EntitySubstitution.substitute_xml(val, True))
- attrs.append(decoded)
- close = ''
- closeTag = ''
- if self.is_empty_element:
- close = ' /'
- else:
- closeTag = '</%s>' % self.name
-
- pretty_print = (indent_level is not None)
- if pretty_print:
- space = (' ' * (indent_level-1))
- indent_contents = indent_level + 1
- else:
- space = ''
- indent_contents = None
- contents = self.decode_contents(
- indent_contents, eventual_encoding, substitute_html_entities)
-
- if self.hidden:
- # This is the 'document root' object.
- s = contents
- else:
- s = []
- attributeString = ''
- if attrs:
- attributeString = ' ' + ' '.join(attrs)
- if pretty_print:
- s.append(space)
- s.append('<%s%s%s>' % (self.name, attributeString, close))
- if pretty_print:
- s.append("\n")
- s.append(contents)
- if pretty_print and contents and contents[-1] != "\n":
- s.append("\n")
- if pretty_print and closeTag:
- s.append(space)
- s.append(closeTag)
- if pretty_print and closeTag and self.nextSibling:
- s.append("\n")
- s = ''.join(s)
- return s
-
- def decompose(self):
- """Recursively destroys the contents of this tree."""
- contents = [i for i in self.contents]
- for i in contents:
- if isinstance(i, Tag):
- i.decompose()
- else:
- i.extract()
- self.extract()
-
- def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
- return self.encode(encoding, True)
-
- def decode_contents(self, indent_level=None,
- eventual_encoding=DEFAULT_OUTPUT_ENCODING,
- substitute_html_entities=False):
- """Renders the contents of this tag as a Unicode string.
-
- :param eventual_encoding: The tag is destined to be
- encoded into this encoding. This method is _not_
- responsible for performing that encoding. This information
- is passed in so that it can be substituted in if the
- document contains a <META> tag that mentions the document's
- encoding.
- """
- pretty_print = (indent_level is not None)
- s=[]
- for c in self:
- text = None
- if isinstance(c, NavigableString):
- text = c.output_ready(substitute_html_entities)
- elif isinstance(c, Tag):
- s.append(c.decode(indent_level, eventual_encoding,
- substitute_html_entities))
- if text and indent_level:
- text = text.strip()
- if text:
- if pretty_print:
- s.append(" " * (indent_level-1))
- s.append(text)
- if pretty_print:
- s.append("\n")
- return ''.join(s)
-
- #Soup methods
-
- def find(self, name=None, attrs={}, recursive=True, text=None,
- **kwargs):
- """Return only the first child of this Tag matching the given
- criteria."""
- r = None
- l = self.find_all(name, attrs, recursive, text, 1, **kwargs)
- if l:
- r = l[0]
- return r
- findChild = find
-
- def find_all(self, name=None, attrs={}, recursive=True, text=None,
- limit=None, **kwargs):
- """Extracts a list of Tag objects that match the given
- criteria. You can specify the name of the Tag and any
- attributes you want the Tag to have.
-
- The value of a key-value pair in the 'attrs' map can be a
- string, a list of strings, a regular expression object, or a
- callable that takes a string and returns whether or not the
- string matches for some custom definition of 'matches'. The
- same is true of the tag name."""
- generator = self.recursive_children
- if not recursive:
- generator = self.children
- return self._find_all(name, attrs, text, limit, generator, **kwargs)
- findAll = find_all # BS3
- findChildren = find_all # BS2
-
- #Generator methods
- @property
- def children(self):
- for i in range(0, len(self.contents)):
- yield self.contents[i]
- raise StopIteration
-
- @property
- def recursive_children(self):
- if not len(self.contents):
- raise StopIteration
- stopNode = self._lastRecursiveChild().next
- current = self.contents[0]
- while current is not stopNode:
- yield current
- current = current.next
-
- # Old names for backwards compatibility
- def childGenerator(self):
- return self.children
-
- def recursiveChildGenerator(self):
- return self.recursive_children
-
-
-# Next, a couple classes to represent queries and their results.
-class SoupStrainer(object):
- """Encapsulates a number of ways of matching a markup element (tag or
- text)."""
-
- def __init__(self, name=None, attrs={}, text=None, **kwargs):
- self.name = name
- if isinstance(attrs, basestring):
- kwargs['class'] = attrs
- attrs = None
- if kwargs:
- if attrs:
- attrs = attrs.copy()
- attrs.update(kwargs)
- else:
- attrs = kwargs
- self.attrs = attrs
- self.text = text
-
- def __str__(self):
- if self.text:
- return self.text
- else:
- return "%s|%s" % (self.name, self.attrs)
-
- def searchTag(self, markupName=None, markupAttrs={}):
- found = None
- markup = None
- if isinstance(markupName, Tag):
- markup = markupName
- markupAttrs = markup
- callFunctionWithTagData = callable(self.name) \
- and not isinstance(markupName, Tag)
-
- if (not self.name) \
- or callFunctionWithTagData \
- or (markup and self._matches(markup, self.name)) \
- or (not markup and self._matches(markupName, self.name)):
- if callFunctionWithTagData:
- match = self.name(markupName, markupAttrs)
- else:
- match = True
- markupAttrMap = None
- for attr, matchAgainst in self.attrs.items():
- if not markupAttrMap:
- if hasattr(markupAttrs, 'get'):
- markupAttrMap = markupAttrs
- else:
- markupAttrMap = {}
- for k,v in markupAttrs:
- markupAttrMap[k] = v
- attrValue = markupAttrMap.get(attr)
- if not self._matches(attrValue, matchAgainst):
- match = False
- break
- if match:
- if markup:
- found = markup
- else:
- found = markupName
- return found
-
- def search(self, markup):
- #print 'looking for %s in %s' % (self, markup)
- found = None
- # If given a list of items, scan it for a text element that
- # matches.
- if isList(markup) and not isinstance(markup, Tag):
- for element in markup:
- if isinstance(element, NavigableString) \
- and self.search(element):
- found = element
- break
- # If it's a Tag, make sure its name or attributes match.
- # Don't bother with Tags if we're searching for text.
- elif isinstance(markup, Tag):
- if not self.text:
- found = self.searchTag(markup)
- # If it's text, make sure the text matches.
- elif isinstance(markup, NavigableString) or \
- isinstance(markup, basestring):
- if self._matches(markup, self.text):
- found = markup
- else:
- raise Exception, "I don't know how to match against a %s" \
- % markup.__class__
- return found
-
- def _matches(self, markup, matchAgainst):
- #print "Matching %s against %s" % (markup, matchAgainst)
- result = False
- if matchAgainst == True and type(matchAgainst) == types.BooleanType:
- result = markup != None
- elif callable(matchAgainst):
- result = matchAgainst(markup)
- else:
- #Custom match methods take the tag as an argument, but all
- #other ways of matching match the tag name as a string.
- if isinstance(markup, Tag):
- markup = markup.name
- if markup is not None and not isinstance(markup, basestring):
- markup = unicode(markup)
- #Now we know that chunk is either a string, or None.
- if hasattr(matchAgainst, 'match'):
- # It's a regexp object.
- result = markup and matchAgainst.search(markup)
- elif (isList(matchAgainst)
- and (markup is not None
- or not isinstance(matchAgainst, basestring))):
- result = markup in matchAgainst
- elif hasattr(matchAgainst, 'items'):
- result = markup.has_key(matchAgainst)
- elif matchAgainst and isinstance(markup, basestring):
- if isinstance(markup, unicode):
- matchAgainst = unicode(matchAgainst)
- else:
- matchAgainst = str(matchAgainst)
-
- if not result:
- result = matchAgainst == markup
- return result
-
-
-class ResultSet(list):
- """A ResultSet is just a list that keeps track of the SoupStrainer
- that created it."""
- def __init__(self, source):
- list.__init__([])
- self.source = source
diff --git a/beautifulsoup/testing.py b/beautifulsoup/testing.py
deleted file mode 100644
index 8fd9abf..0000000
--- a/beautifulsoup/testing.py
+++ /dev/null
@@ -1,37 +0,0 @@
-"""Helper classes for tests."""
-
-import unittest
-from beautifulsoup import BeautifulSoup
-from beautifulsoup.element import Comment, SoupStrainer
-from beautifulsoup.builder import LXMLTreeBuilder
-
-class SoupTest(unittest.TestCase):
-
- @property
- def default_builder(self):
- return LXMLTreeBuilder()
-
- def soup(self, markup, **kwargs):
- """Build a Beautiful Soup object from markup."""
- builder = kwargs.pop('builder', self.default_builder)
- return BeautifulSoup(markup, builder=builder, **kwargs)
-
- def document_for(self, markup):
- """Turn an HTML fragment into a document.
-
- The details depend on the builder.
- """
- return self.default_builder.test_fragment_to_document(markup)
-
- def assertSoupEquals(self, to_parse, compare_parsed_to=None):
- builder = self.default_builder
- obj = BeautifulSoup(to_parse, builder=builder)
- if compare_parsed_to is None:
- compare_parsed_to = to_parse
-
- self.assertEquals(obj.decode(), self.document_for(compare_parsed_to))
-
-
-
-
-
diff --git a/beautifulsoup/util.py b/beautifulsoup/util.py
deleted file mode 100644
index 5978865..0000000
--- a/beautifulsoup/util.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# Helper functions and mixin classes for Beautiful Soup
-
-import types
-try:
- set
-except NameError:
- from sets import Set as set
-
-def isList(l):
- """Convenience method that works with all 2.x versions of Python
- to determine whether or not something is listlike."""
- return ((hasattr(l, '__iter__') and not isinstance(l, basestring))
- or (type(l) in (types.ListType, types.TupleType)))
-
-def buildSet(args=None):
- """Turns a list or a string into a set."""
- if isinstance(args, str):
- return set([args])
- if args is None:
- return set()
- return set(args)