diff options
-rw-r--r-- | beautifulsoup/__init__.py | 295 | ||||
-rw-r--r-- | beautifulsoup/builder/__init__.py | 259 | ||||
-rw-r--r-- | beautifulsoup/builder/_html5lib.py | 233 | ||||
-rw-r--r-- | beautifulsoup/builder/_lxml.py | 108 | ||||
-rw-r--r-- | beautifulsoup/dammit.py | 410 | ||||
-rw-r--r-- | beautifulsoup/element.py | 855 | ||||
-rw-r--r-- | beautifulsoup/testing.py | 37 | ||||
-rw-r--r-- | beautifulsoup/util.py | 21 |
8 files changed, 0 insertions, 2218 deletions
diff --git a/beautifulsoup/__init__.py b/beautifulsoup/__init__.py deleted file mode 100644 index 518e95f..0000000 --- a/beautifulsoup/__init__.py +++ /dev/null @@ -1,295 +0,0 @@ -"""Beautiful Soup -Elixir and Tonic -"The Screen-Scraper's Friend" -http://www.crummy.com/software/BeautifulSoup/ - -Beautiful Soup uses a plug-in parser to parse a (possibly invalid) XML -or HTML document into a tree representation. The parser does the work -of building a parse tree, and Beautiful Soup provides provides methods -and Pythonic idioms that make it easy to navigate, search, and modify -the parse tree. - -Beautiful Soup works with Python 2.5 and up. To get it to work, you -must install either lxml or html5lib. - -For more than you ever wanted to know about Beautiful Soup, see the -documentation: -http://www.crummy.com/software/BeautifulSoup/documentation.html - -Here, have some legalese: - -Copyright (c) 2004-2011, Leonard Richardson - -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer in the documentation and/or other materials provided - with the distribution. - - * Neither the name of the the Beautiful Soup Consortium and All - Night Kosher Bakery nor the names of its contributors may be - used to endorse or promote products derived from this software - without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR -CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT. - -""" -from __future__ import generators - -__author__ = "Leonard Richardson (leonardr@segfault.org)" -__version__ = "4.0.0" -__copyright__ = "Copyright (c) 2004-2011 Leonard Richardson" -__license__ = "New-style BSD" - -__all__ = ['BeautifulSoup'] - -import re - -from util import isList, buildSet -from builder import builder_registry -from dammit import UnicodeDammit -from element import DEFAULT_OUTPUT_ENCODING, NavigableString, Tag - - -class BeautifulSoup(Tag): - """ - This class defines the basic interface called by the tree builders. - - These methods will be called by the parser: - reset() - feed(markup) - - The tree builder may call these methods from its feed() implementation: - handle_starttag(name, attrs) # See note about return value - handle_endtag(name) - handle_data(data) # Appends to the current data node - endData(containerClass=NavigableString) # Ends the current data node - - No matter how complicated the underlying parser is, you should be - able to build a tree using 'start tag' events, 'end tag' events, - 'data' events, and "done with data" events. - - If you encounter an empty-element tag (aka a self-closing tag, - like HTML's <br> tag), call handle_starttag and then - handle_endtag. - """ - ROOT_TAG_NAME = u'[document]' - - # If the end-user gives no indication which tree builder they - # want, look for one with these features. - DEFAULT_BUILDER_FEATURES = ['html'] - - # Used when determining whether a text node is all whitespace and - # can be replaced with a single space. A text node that contains - # fancy Unicode spaces (usually non-breaking) should be left - # alone. - STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, } - - def __init__(self, markup="", features=None, builder=None, - parse_only=None, from_encoding=None): - """The Soup object is initialized as the 'root tag', and the - provided markup (which can be a string or a file-like object) - is fed into the underlying parser.""" - - if builder is None: - if isinstance(features, basestring): - features = [features] - if features is None or len(features) == 0: - features = self.DEFAULT_BUILDER_FEATURES - builder_class = builder_registry.lookup(*features) - if builder_class is None: - raise ValueError( - "Couldn't find a tree builder with the features you " - "requested: %s. Do you need to install a parser library?" - % ",".join(features)) - builder = builder_class() - self.builder = builder - self.is_xml = builder.is_xml - self.builder.soup = self - - self.parse_only = parse_only - - self.reset() - - if hasattr(markup, 'read'): # It's a file-type object. - markup = markup.read() - self.markup, self.original_encoding, self.declared_html_encoding = ( - self.builder.prepare_markup(markup, from_encoding)) - - try: - self._feed() - except StopParsing: - pass - - # Clear out the markup and the builder so they can be CGed. - self.markup = None - self.builder.soup = None - self.builder = None - - def _feed(self): - # Convert the document to Unicode. - self.builder.reset() - - self.builder.feed(self.markup) - # Close out any unfinished strings and close all the open tags. - self.endData() - while self.currentTag.name != self.ROOT_TAG_NAME: - self.popTag() - - def reset(self): - Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME) - self.hidden = 1 - self.builder.reset() - self.currentData = [] - self.currentTag = None - self.tagStack = [] - self.pushTag(self) - - def popTag(self): - tag = self.tagStack.pop() - #print "Pop", tag.name - if self.tagStack: - self.currentTag = self.tagStack[-1] - return self.currentTag - - def pushTag(self, tag): - #print "Push", tag.name - if self.currentTag: - self.currentTag.contents.append(tag) - self.tagStack.append(tag) - self.currentTag = self.tagStack[-1] - - def endData(self, containerClass=NavigableString): - if self.currentData: - currentData = u''.join(self.currentData) - if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and - not buildSet([tag.name for tag in self.tagStack]).intersection( - self.builder.preserve_whitespace_tags)): - if '\n' in currentData: - currentData = '\n' - else: - currentData = ' ' - self.currentData = [] - if self.parse_only and len(self.tagStack) <= 1 and \ - (not self.parse_only.text or \ - not self.parse_only.search(currentData)): - return - o = containerClass(currentData) - self.object_was_parsed(o) - - def object_was_parsed(self, o): - """Add an object to the parse tree.""" - o.setup(self.currentTag, self.previous) - if self.previous: - self.previous.next = o - self.previous = o - self.currentTag.contents.append(o) - - - def _popToTag(self, name, inclusivePop=True): - """Pops the tag stack up to and including the most recent - instance of the given tag. If inclusivePop is false, pops the tag - stack up to but *not* including the most recent instqance of - the given tag.""" - #print "Popping to %s" % name - if name == self.ROOT_TAG_NAME: - return - - numPops = 0 - mostRecentTag = None - for i in range(len(self.tagStack)-1, 0, -1): - if name == self.tagStack[i].name: - numPops = len(self.tagStack)-i - break - if not inclusivePop: - numPops = numPops - 1 - - for i in range(0, numPops): - mostRecentTag = self.popTag() - return mostRecentTag - - def handle_starttag(self, name, attrs): - """Push a start tag on to the stack. - - If this method returns None, the tag was rejected by the - SoupStrainer. You should proceed as if the tag had not occured - in the document. For instance, if this was a self-closing tag, - don't call handle_endtag. - """ - - #print "Start tag %s: %s" % (name, attrs) - self.endData() - - if (self.parse_only and len(self.tagStack) <= 1 - and (self.parse_only.text - or not self.parse_only.searchTag(name, attrs))): - return None - - tag = Tag(self, self.builder, name, attrs, self.currentTag, - self.previous) - if tag is None: - return tag - if self.previous: - self.previous.next = tag - self.previous = tag - self.pushTag(tag) - return tag - - - def handle_endtag(self, name): - #print "End tag: " + name - self.endData() - self._popToTag(name) - - def handle_data(self, data): - self.currentData.append(data) - - def decode(self, pretty_print=False, - eventual_encoding=DEFAULT_OUTPUT_ENCODING, - substitute_html_entities=False): - """Returns a string or Unicode representation of this document. - To get Unicode, pass None for encoding.""" - if self.is_xml: - # Print the XML declaration - encoding_part = '' - if eventual_encoding != None: - encoding_part = ' encoding="%s"' % eventual_encoding - prefix = u'<?xml version="1.0"%s>\n' % encoding_part - else: - prefix = u'' - if not pretty_print: - indent_level = None - else: - indent_level = 0 - return prefix + super(BeautifulSoup, self).decode( - indent_level, eventual_encoding, - substitute_html_entities) - - -class StopParsing(Exception): - pass - - -#By default, act as an HTML pretty-printer. -if __name__ == '__main__': - import sys - soup = BeautifulSoup(sys.stdin) - print soup.prettify() diff --git a/beautifulsoup/builder/__init__.py b/beautifulsoup/builder/__init__.py deleted file mode 100644 index 10c6b7f..0000000 --- a/beautifulsoup/builder/__init__.py +++ /dev/null @@ -1,259 +0,0 @@ -from collections import defaultdict -import re -import sys - -__all__ = [ - 'HTMLTreeBuilder', - 'SAXTreeBuilder', - 'TreeBuilder', - 'TreeBuilderRegistry', - ] - -# Some useful features for a TreeBuilder to have. -FAST = 'fast' -PERMISSIVE = 'permissive' -XML = 'xml' -HTML = 'html' -HTML_5 = 'html5' - - -class TreeBuilderRegistry(object): - - def __init__(self): - self.builders_for_feature = defaultdict(list) - self.builders = [] - - def register(self, treebuilder_class): - """Register a treebuilder based on its advertised features.""" - for feature in treebuilder_class.features: - self.builders_for_feature[feature].insert(0, treebuilder_class) - self.builders.insert(0, treebuilder_class) - - def lookup(self, *features): - if len(self.builders) == 0: - # There are no builders at all. - return None - - if len(features) == 0: - # They didn't ask for any features. Give them the most - # recently registered builder. - return self.builders[0] - - # Go down the list of features in order, and eliminate any builders - # that don't match every feature. - features = list(features) - features.reverse() - candidates = None - candidate_set = None - while len(features) > 0: - feature = features.pop() - we_have_the_feature = self.builders_for_feature.get(feature, []) - if len(we_have_the_feature) > 0: - if candidates is None: - candidates = we_have_the_feature - candidate_set = set(candidates) - else: - # Eliminate any candidates that don't have this feature. - candidate_set = candidate_set.intersection( - set(we_have_the_feature)) - - # The only valid candidates are the ones in candidate_set. - # Go through the original list of candidates and pick the first one - # that's in candidate_set. - if candidate_set is None: - return None - for candidate in candidates: - if candidate in candidate_set: - return candidate - return None - -# The BeautifulSoup class will take feature lists from developers and use them -# to look up builders in this registry. -builder_registry = TreeBuilderRegistry() - - -class TreeBuilder(object): - """Turn a document into a Beautiful Soup object tree.""" - - features = [] - - is_xml = False - preserve_whitespace_tags = set() - empty_element_tags = None # A tag will be considered an empty-element - # tag when and only when it has no contents. - - def __init__(self): - self.soup = None - - def reset(self): - pass - - def can_be_empty_element(self, tag_name): - """Might a tag with this name be an empty-element tag? - - The final markup may or may not actually present this tag as - self-closing. - - For instance: an HTMLBuilder does not consider a <p> tag to be - an empty-element tag (it's not in - HTMLBuilder.empty_element_tags). This means an empty <p> tag - will be presented as "<p></p>", not "<p />". - - The default implementation has no opinion about which tags are - empty-element tags, so a tag will be presented as an - empty-element tag if and only if it has no contents. - "<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will - be left alone. - """ - if self.empty_element_tags is None: - return True - return tag_name in self.empty_element_tags - - def feed(self, markup): - raise NotImplementedError() - - def prepare_markup(self, markup, user_specified_encoding=None, - document_declared_encoding=None): - return markup, None, None - - def test_fragment_to_document(self, fragment): - """Wrap an HTML fragment to make it look like a document. - - Different parsers do this differently. For instance, lxml - introduces an empty <head> tag, and html5lib - doesn't. Abstracting this away lets us write simple tests - which run HTML fragments through the parser and compare the - results against other HTML fragments. - - This method should not be used outside of tests. - """ - return fragment - - def set_up_substitutions(self, tag): - pass - - -class SAXTreeBuilder(TreeBuilder): - """A Beautiful Soup treebuilder that listens for SAX events.""" - - def feed(self, markup): - raise NotImplementedError() - - def close(self): - pass - - def startElement(self, name, attrs): - attrs = dict((key[1], value) for key, value in attrs.items()) - #print "Start %s, %r" % (name, attrs) - self.soup.handle_starttag(name, attrs) - - def endElement(self, name): - #print "End %s" % name - self.soup.handle_endtag(name) - - def startElementNS(self, nsTuple, nodeName, attrs): - # Throw away (ns, nodeName) for now. - self.startElement(nodeName, attrs) - - def endElementNS(self, nsTuple, nodeName): - # Throw away (ns, nodeName) for now. - self.endElement(nodeName) - #handler.endElementNS((ns, node.nodeName), node.nodeName) - - def startPrefixMapping(self, prefix, nodeValue): - # Ignore the prefix for now. - pass - - def endPrefixMapping(self, prefix): - # Ignore the prefix for now. - # handler.endPrefixMapping(prefix) - pass - - def characters(self, content): - self.soup.handle_data(content) - - def startDocument(self): - pass - - def endDocument(self): - pass - - -class HTMLTreeBuilder(TreeBuilder): - """This TreeBuilder knows facts about HTML. - - Such as which tags are empty-element tags. - """ - - preserve_whitespace_tags = set(['pre', 'textarea']) - empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta', - 'spacer', 'link', 'frame', 'base']) - - # Used by set_up_substitutions to detect the charset in a META tag - CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M) - - def set_up_substitutions(self, tag): - if tag.name != 'meta': - return False - - http_equiv = tag.get('http-equiv') - content = tag.get('content') - - if (http_equiv is not None - and content is not None - and http_equiv.lower() == 'content-type'): - # This is an interesting meta tag. - match = self.CHARSET_RE.search(content) - if match: - if (self.soup.declared_html_encoding is not None or - self.soup.original_encoding == self.soup.from_encoding): - # An HTML encoding was sniffed while converting - # the document to Unicode, or an HTML encoding was - # sniffed during a previous pass through the - # document, or an encoding was specified - # explicitly and it worked. Rewrite the meta tag. - def rewrite(match): - return match.group(1) + "%SOUP-ENCODING%" - tag['content'] = self.CHARSET_RE.sub(rewrite, content) - return True - else: - # This is our first pass through the document. - # Go through it again with the encoding information. - new_charset = match.group(3) - if (new_charset is not None - and new_charset != self.soup.original_encoding): - self.soup.declared_html_encoding = new_charset - self.soup._feed(self.soup.declared_html_encoding) - raise StopParsing - pass - return False - - -def register_treebuilders_from(module): - """Copy TreeBuilders from the given module into this module.""" - # I'm fairly sure this is not the best way to do this. - this_module = sys.modules[__package__] - for name in module.__all__: - obj = getattr(module, name) - - if issubclass(obj, TreeBuilder): - setattr(this_module, name, obj) - this_module.__all__.append(name) - # Register the builder while we're at it. - this_module.builder_registry.register(obj) - -# Builders are registered in reverse order of priority, so that custom -# builder registrations will take precedence. In general, we want -# html5lib to take precedence over lxml, because it's more reliable. -try: - import _lxml - register_treebuilders_from(_lxml) -except ImportError: - # They don't have lxml installed. - pass -try: - import _html5lib - register_treebuilders_from(_html5lib) -except ImportError: - # They don't have html5lib installed. - pass diff --git a/beautifulsoup/builder/_html5lib.py b/beautifulsoup/builder/_html5lib.py deleted file mode 100644 index f8a7a40..0000000 --- a/beautifulsoup/builder/_html5lib.py +++ /dev/null @@ -1,233 +0,0 @@ -__all__ = [ - 'HTML5TreeBuilder', - ] - -from beautifulsoup.builder import ( - PERMISSIVE, - HTML, - HTML_5, - HTMLTreeBuilder, - ) -import html5lib -from html5lib.constants import DataLossWarning -import warnings -from beautifulsoup.element import ( - Comment, - Doctype, - NavigableString, - Tag, - ) - -class HTML5TreeBuilder(HTMLTreeBuilder): - """Use html5lib to build a tree.""" - - features = ['html5lib', PERMISSIVE, HTML_5, HTML] - - def prepare_markup(self, markup, user_specified_encoding): - # Store the user-specified encoding for use later on. - self.user_specified_encoding = user_specified_encoding - return markup, None, None - - # These methods are defined by Beautiful Soup. - def feed(self, markup): - parser = html5lib.HTMLParser(tree=self.create_treebuilder) - doc = parser.parse(markup, encoding=self.user_specified_encoding) - - # Set the character encoding detected by the tokenizer. - if isinstance(markup, unicode): - # We need to special-case this because html5lib sets - # charEncoding to UTF-8 if it gets Unicode input. - doc.original_encoding = None - else: - doc.original_encoding = parser.tokenizer.stream.charEncoding[0] - - def create_treebuilder(self, namespaceHTMLElements): - self.underlying_builder = TreeBuilderForHtml5lib( - self.soup, namespaceHTMLElements) - return self.underlying_builder - - def test_fragment_to_document(self, fragment): - """See `TreeBuilder`.""" - return u'<html><head></head><body>%s</body></html>' % fragment - - -class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder): - - def __init__(self, soup, namespaceHTMLElements): - self.soup = soup - if namespaceHTMLElements: - warnings.warn("namespaceHTMLElements not supported yet", - DataLossWarning) - super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements) - - def documentClass(self): - self.soup.reset() - return Element(self.soup, self.soup, None) - - def insertDoctype(self, token): - name = token["name"] - publicId = token["publicId"] - systemId = token["systemId"] - - doctype = Doctype.for_name_and_ids(name, publicId, systemId) - self.soup.object_was_parsed(doctype) - - def elementClass(self, name, namespace): - if namespace is not None: - warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning) - return Element(Tag(self.soup, self.soup.builder, name), self.soup, namespace) - - def commentClass(self, data): - return TextNode(Comment(data), self.soup) - - def fragmentClass(self): - self.soup = BeautifulSoup("") - self.soup.name = "[document_fragment]" - return Element(self.soup, self.soup, None) - - def appendChild(self, node): - self.soup.insert(len(self.soup.contents), node.element) - - def testSerializer(self, element): - return testSerializer(element) - - def getDocument(self): - return self.soup - - def getFragment(self): - return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element - -class AttrList(object): - def __init__(self, element): - self.element = element - self.attrs = dict(self.element.attrs) - def __iter__(self): - return self.attrs.items().__iter__() - def __setitem__(self, name, value): - "set attr", name, value - self.element[name] = value - def items(self): - return self.attrs.items() - def keys(self): - return self.attrs.keys() - def __getitem__(self, name): - return self.attrs[name] - def __contains__(self, name): - return name in self.attrs.keys() - - -class Element(html5lib.treebuilders._base.Node): - def __init__(self, element, soup, namespace): - html5lib.treebuilders._base.Node.__init__(self, element.name) - self.element = element - self.soup = soup - self.namespace = namespace - - def _nodeIndex(self, node, refNode): - # Finds a node by identity rather than equality - for index in range(len(self.element.contents)): - if id(self.element.contents[index]) == id(refNode.element): - return index - return None - - def appendChild(self, node): - if (node.element.__class__ == NavigableString and self.element.contents - and self.element.contents[-1].__class__ == NavigableString): - # Concatenate new text onto old text node - # (TODO: This has O(n^2) performance, for input like "a</a>a</a>a</a>...") - newStr = NavigableString(self.element.contents[-1]+node.element) - - # Remove the old text node - # (Can't simply use .extract() by itself, because it fails if - # an equal text node exists within the parent node) - oldElement = self.element.contents[-1] - del self.element.contents[-1] - oldElement.parent = None - oldElement.extract() - - self.element.insert(len(self.element.contents), newStr) - else: - self.element.insert(len(self.element.contents), node.element) - node.parent = self - - def getAttributes(self): - return AttrList(self.element) - - def setAttributes(self, attributes): - if attributes is not None and attributes != {}: - for name, value in attributes.items(): - self.element[name] = value - # The attributes may contain variables that need substitution. - # Call set_up_substitutions manually. - # The Tag constructor calls this method automatically, - # but html5lib creates a Tag object before setting up - # the attributes. - self.element.contains_substitutions = ( - self.soup.builder.set_up_substitutions( - self.element)) - attributes = property(getAttributes, setAttributes) - - def insertText(self, data, insertBefore=None): - text = TextNode(NavigableString(data), self.soup) - if insertBefore: - self.insertBefore(text, insertBefore) - else: - self.appendChild(text) - - def insertBefore(self, node, refNode): - index = self._nodeIndex(node, refNode) - if (node.element.__class__ == NavigableString and self.element.contents - and self.element.contents[index-1].__class__ == NavigableString): - # (See comments in appendChild) - newStr = NavigableString(self.element.contents[index-1]+node.element) - oldNode = self.element.contents[index-1] - del self.element.contents[index-1] - oldNode.parent = None - oldNode.extract() - - self.element.insert(index-1, newStr) - else: - self.element.insert(index, node.element) - node.parent = self - - def removeChild(self, node): - index = self._nodeIndex(node.parent, node) - del node.parent.element.contents[index] - node.element.parent = None - node.element.extract() - node.parent = None - - def reparentChildren(self, newParent): - while self.element.contents: - child = self.element.contents[0] - child.extract() - if isinstance(child, Tag): - newParent.appendChild(Element(child, self.soup, namespaces["html"])) - else: - newParent.appendChild(TextNode(child, self.soup)) - - def cloneNode(self): - node = Element(Tag(self.soup, self.soup.builder, self.element.name), self.soup, self.namespace) - for key,value in self.attributes: - node.attributes[key] = value - return node - - def hasContent(self): - return self.element.contents - - def getNameTuple(self): - if self.namespace == None: - return namespaces["html"], self.name - else: - return self.namespace, self.name - - nameTuple = property(getNameTuple) - -class TextNode(Element): - def __init__(self, element, soup): - html5lib.treebuilders._base.Node.__init__(self, None) - self.element = element - self.soup = soup - - def cloneNode(self): - raise NotImplementedError diff --git a/beautifulsoup/builder/_lxml.py b/beautifulsoup/builder/_lxml.py deleted file mode 100644 index 23ac485..0000000 --- a/beautifulsoup/builder/_lxml.py +++ /dev/null @@ -1,108 +0,0 @@ -__all__ = [ - 'LXMLTreeBuilderForXML', - 'LXMLTreeBuilder', - ] - -from lxml import etree -from beautifulsoup.element import Comment, Doctype -from beautifulsoup.builder import ( - FAST, - HTML, - HTMLTreeBuilder, - PERMISSIVE, - TreeBuilder, - XML) -from beautifulsoup.dammit import UnicodeDammit -import types - -LXML = 'lxml' - -class LXMLTreeBuilderForXML(TreeBuilder): - DEFAULT_PARSER_CLASS = etree.XMLParser - - is_xml = True - - # Well, it's permissive by XML parser standards. - features = [LXML, XML, FAST, PERMISSIVE] - - @property - def default_parser(self): - # This can either return a parser object or a class, which - # will be instantiated with default arguments. - return etree.XMLParser(target=self, strip_cdata=False, recover=True) - - def __init__(self, parser=None, empty_element_tags=None): - if empty_element_tags is not None: - self.empty_element_tags = set(empty_element_tags) - if parser is None: - # Use the default parser. - parser = self.default_parser - if callable(parser): - # Instantiate the parser with default arguments - parser = parser(target=self, strip_cdata=False) - self.parser = parser - self.soup = None - - def prepare_markup(self, markup, user_specified_encoding=None, - document_declared_encoding=None): - """ - :return: A 3-tuple (markup, original encoding, encoding - declared within markup). - """ - if isinstance(markup, unicode): - return markup, None, None - - try_encodings = [user_specified_encoding, document_declared_encoding] - dammit = UnicodeDammit(markup, try_encodings, isHTML=True) - return (dammit.markup, dammit.original_encoding, - dammit.declared_html_encoding) - - def feed(self, markup): - self.parser.feed(markup) - self.parser.close() - - def close(self): - pass - - def start(self, name, attrs): - self.soup.handle_starttag(name, attrs) - - def end(self, name): - self.soup.endData() - completed_tag = self.soup.tagStack[-1] - self.soup.handle_endtag(name) - - def pi(self, target, data): - pass - - def data(self, content): - self.soup.handle_data(content) - - def doctype(self, name, pubid, system): - self.soup.endData() - doctype = Doctype.for_name_and_ids(name, pubid, system) - self.soup.object_was_parsed(doctype) - - def comment(self, content): - "Handle comments as Comment objects." - self.soup.endData() - self.soup.handle_data(content) - self.soup.endData(Comment) - - def test_fragment_to_document(self, fragment): - """See `TreeBuilder`.""" - return u'<?xml version="1.0" encoding="utf-8">\n%s' % fragment - - -class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): - - features = [LXML, HTML, FAST] - is_xml = False - - @property - def default_parser(self): - return etree.HTMLParser - - def test_fragment_to_document(self, fragment): - """See `TreeBuilder`.""" - return u'<html><body>%s</body></html>' % fragment diff --git a/beautifulsoup/dammit.py b/beautifulsoup/dammit.py deleted file mode 100644 index 4483118..0000000 --- a/beautifulsoup/dammit.py +++ /dev/null @@ -1,410 +0,0 @@ -"""Beautiful Soup bonus library: Unicode, Dammit - -This class forces XML data into a standard format (usually to UTF-8 or -Unicode). It is heavily based on code from Mark Pilgrim's Universal -Feed Parser. It does not rewrite the XML or HTML to reflect a new -encoding; that's the tree builder's job. -""" - -import codecs -from htmlentitydefs import codepoint2name -import re -import types - -# Autodetects character encodings. Very useful. -# Download from http://chardet.feedparser.org/ -# or 'apt-get install python-chardet' -# or 'easy_install chardet' -try: - import chardet - #import chardet.constants - #chardet.constants._debug = 1 -except ImportError: - chardet = None - -# Available from http://cjkpython.i18n.org/. -try: - import iconv_codec -except ImportError: - pass - - -class EntitySubstitution(object): - - """Substitute XML or HTML entities for the corresponding characters.""" - - def _populate_class_variables(): - lookup = {} - characters = [] - for codepoint, name in codepoint2name.items(): - if codepoint == 34: - # There's no point in turning the quotation mark into - # ", unless it happens within an attribute value, which - # is handled elsewhere. - continue; - character = unichr(codepoint) - characters.append(character) - lookup[character] = name - re_definition = "[%s]" % "".join(characters) - return lookup, re.compile(re_definition) - CHARACTER_TO_HTML_ENTITY, CHARACTER_TO_HTML_ENTITY_RE = ( - _populate_class_variables()) - - - CHARACTER_TO_XML_ENTITY = { - "'" : "apos", - '"' : "quot", - "&" : "amp", - "<" : "lt", - ">" : "gt", - } - - BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|" - "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)" - ")") - - @classmethod - def _substitute_html_entity(cls, matchobj): - entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0)) - return "&%s;" % entity - - @classmethod - def _substitute_xml_entity(cls, matchobj): - """Used with a regular expression to substitute the - appropriate XML entity for an XML special character.""" - entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)] - return "&%s;" % entity - - @classmethod - def substitute_xml(cls, value, make_quoted_attribute=False): - """Substitute XML entities for special XML characters. - - :param value: A string to be substituted. The less-than sign will - become <, the greater-than sign will become >, and any - ampersands that are not part of an entity defition will - become &. - - :param make_quoted_attribute: If True, then the string will be - quoted, as befits an attribute value. - - Ordinarily, the string will be quoted using double quotes. - - Bob's Bar -> "Bob's Bar" - - If the string contains double quotes, it will be quoted using - single quotes. - - Welcome to "my bar" -> 'Welcome to "my bar"' - - If the string contains both single and double quotes, the - double quotes will be escaped, and the string will be quoted - using double quotes. - - Welcome to "Bob's Bar" -> "Welcome to "Bob's bar" - """ - if make_quoted_attribute: - quote_with = '"' - if '"' in value: - if "'" in value: - # The string contains both single and double - # quotes. Turn the double quotes into - # entities. We quote the double quotes rather than - # the single quotes because the entity name is - # """ whether this is HTML or XML. If we - # quoted the single quotes, we'd have to decide - # between ' and &squot;. - replace_with = """ - value = value.replace('"', replace_with) - else: - # There are double quotes but no single quotes. - # We can use single quotes to quote the attribute. - quote_with = "'" - - # Escape angle brackets, and ampersands that aren't part of - # entities. - value = cls.BARE_AMPERSAND_OR_BRACKET.sub( - cls._substitute_xml_entity, value) - if make_quoted_attribute: - return quote_with + value + quote_with - else: - return value - - @classmethod - def substitute_html(cls, s): - """Replace certain Unicode characters with named HTML entities. - - This differs from data.encode(encoding, 'xmlcharrefreplace') - in that the goal is to make the result more readable (to those - with ASCII displays) rather than to recover from - errors. There's absolutely nothing wrong with a UTF-8 string - containg a LATIN SMALL LETTER E WITH ACUTE, but replacing that - character with "é" will make it more readable to some - people. - """ - return cls.CHARACTER_TO_HTML_ENTITY_RE.sub( - cls._substitute_html_entity, s) - - -class UnicodeDammit: - """A class for detecting the encoding of a *ML document and - converting it to a Unicode string. If the source encoding is - windows-1252, can replace MS smart quotes with their HTML or XML - equivalents.""" - - # This dictionary maps commonly seen values for "charset" in HTML - # meta tags to the corresponding Python codec names. It only covers - # values that aren't in Python's aliases and can't be determined - # by the heuristics in find_codec. - CHARSET_ALIASES = { "macintosh" : "mac-roman", - "x-sjis" : "shift-jis" } - - ENCODINGS_WITH_SMART_QUOTES = [ - "windows-1252", - "iso-8859-1", - "iso-8859-2", - ] - - def __init__(self, markup, override_encodings=[], - smart_quotes_to=None, isHTML=False): - self.declared_html_encoding = None - self.markup, document_encoding, sniffed_encoding = \ - self._detectEncoding(markup, isHTML) - self.smart_quotes_to = smart_quotes_to - self.tried_encodings = [] - if markup == '' or isinstance(markup, unicode): - self.original_encoding = None - self.unicode = unicode(markup) - return - - u = None - for proposed_encoding in ( - override_encodings + [document_encoding, sniffed_encoding]): - if proposed_encoding is not None: - u = self._convert_from(proposed_encoding) - if u: - break - - # If no luck and we have auto-detection library, try that: - if not u and chardet and not isinstance(self.markup, unicode): - u = self._convert_from(chardet.detect(self.markup)['encoding']) - - # As a last resort, try utf-8 and windows-1252: - if not u: - for proposed_encoding in ("utf-8", "windows-1252"): - u = self._convert_from(proposed_encoding) - if u: - break - - self.unicode = u - if not u: self.original_encoding = None - - def _sub_ms_char(self, match): - """Changes a MS smart quote character to an XML or HTML - entity.""" - orig = match.group(1) - sub = self.MS_CHARS.get(orig) - if type(sub) == types.TupleType: - if self.smart_quotes_to == 'xml': - sub = '&#x'.encode() + sub[1].encode() + ';'.encode() - else: - sub = '&'.encode() + sub[0].encode() + ';'.encode() - else: - sub = sub.encode() - return sub - - def _convert_from(self, proposed): - proposed = self.find_codec(proposed) - if not proposed or proposed in self.tried_encodings: - return None - self.tried_encodings.append(proposed) - markup = self.markup - - # Convert smart quotes to HTML if coming from an encoding - # that might have them. - if (self.smart_quotes_to is not None - and proposed.lower() in self.ENCODINGS_WITH_SMART_QUOTES): - smart_quotes_re = "([\x80-\x9f])" - smart_quotes_compiled = re.compile(smart_quotes_re) - markup = smart_quotes_compiled.sub(self._sub_ms_char, markup) - - try: - # print "Trying to convert document to %s" % proposed - u = self._to_unicode(markup, proposed) - self.markup = u - self.original_encoding = proposed - except Exception, e: - # print "That didn't work!" - # print e - return None - #print "Correct encoding: %s" % proposed - return self.markup - - def _to_unicode(self, data, encoding): - '''Given a string and its encoding, decodes the string into Unicode. - %encoding is a string recognized by encodings.aliases''' - - # strip Byte Order Mark (if present) - if (len(data) >= 4) and (data[:2] == '\xfe\xff') \ - and (data[2:4] != '\x00\x00'): - encoding = 'utf-16be' - data = data[2:] - elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \ - and (data[2:4] != '\x00\x00'): - encoding = 'utf-16le' - data = data[2:] - elif data[:3] == '\xef\xbb\xbf': - encoding = 'utf-8' - data = data[3:] - elif data[:4] == '\x00\x00\xfe\xff': - encoding = 'utf-32be' - data = data[4:] - elif data[:4] == '\xff\xfe\x00\x00': - encoding = 'utf-32le' - data = data[4:] - newdata = unicode(data, encoding) - return newdata - - def _detectEncoding(self, xml_data, isHTML=False): - """Given a document, tries to detect its XML encoding.""" - xml_encoding = sniffed_xml_encoding = None - try: - if xml_data[:4] == '\x4c\x6f\xa7\x94': - # EBCDIC - xml_data = self._ebcdic_to_ascii(xml_data) - elif xml_data[:4] == '\x00\x3c\x00\x3f': - # UTF-16BE - sniffed_xml_encoding = 'utf-16be' - xml_data = unicode(xml_data, 'utf-16be').encode('utf-8') - elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \ - and (xml_data[2:4] != '\x00\x00'): - # UTF-16BE with BOM - sniffed_xml_encoding = 'utf-16be' - xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8') - elif xml_data[:4] == '\x3c\x00\x3f\x00': - # UTF-16LE - sniffed_xml_encoding = 'utf-16le' - xml_data = unicode(xml_data, 'utf-16le').encode('utf-8') - elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \ - (xml_data[2:4] != '\x00\x00'): - # UTF-16LE with BOM - sniffed_xml_encoding = 'utf-16le' - xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8') - elif xml_data[:4] == '\x00\x00\x00\x3c': - # UTF-32BE - sniffed_xml_encoding = 'utf-32be' - xml_data = unicode(xml_data, 'utf-32be').encode('utf-8') - elif xml_data[:4] == '\x3c\x00\x00\x00': - # UTF-32LE - sniffed_xml_encoding = 'utf-32le' - xml_data = unicode(xml_data, 'utf-32le').encode('utf-8') - elif xml_data[:4] == '\x00\x00\xfe\xff': - # UTF-32BE with BOM - sniffed_xml_encoding = 'utf-32be' - xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8') - elif xml_data[:4] == '\xff\xfe\x00\x00': - # UTF-32LE with BOM - sniffed_xml_encoding = 'utf-32le' - xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8') - elif xml_data[:3] == '\xef\xbb\xbf': - # UTF-8 with BOM - sniffed_xml_encoding = 'utf-8' - xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8') - else: - sniffed_xml_encoding = 'ascii' - pass - except: - xml_encoding_match = None - xml_encoding_re = '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode() - xml_encoding_match = re.compile(xml_encoding_re).match(xml_data) - if not xml_encoding_match and isHTML: - meta_re = '<\s*meta[^>]+charset=([^>]*?)[;\'">]'.encode() - regexp = re.compile(meta_re, re.I) - xml_encoding_match = regexp.search(xml_data) - if xml_encoding_match is not None: - xml_encoding = xml_encoding_match.groups()[0].decode( - 'ascii').lower() - if isHTML: - self.declared_html_encoding = xml_encoding - if sniffed_xml_encoding and \ - (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', - 'iso-10646-ucs-4', 'ucs-4', 'csucs4', - 'utf-16', 'utf-32', 'utf_16', 'utf_32', - 'utf16', 'u16')): - xml_encoding = sniffed_xml_encoding - return xml_data, xml_encoding, sniffed_xml_encoding - - - def find_codec(self, charset): - return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \ - or (charset and self._codec(charset.replace("-", ""))) \ - or (charset and self._codec(charset.replace("-", "_"))) \ - or charset - - def _codec(self, charset): - if not charset: return charset - codec = None - try: - codecs.lookup(charset) - codec = charset - except (LookupError, ValueError): - pass - return codec - - EBCDIC_TO_ASCII_MAP = None - def _ebcdic_to_ascii(self, s): - c = self.__class__ - if not c.EBCDIC_TO_ASCII_MAP: - emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15, - 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31, - 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7, - 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26, - 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33, - 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94, - 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63, - 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34, - 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200, - 201,202,106,107,108,109,110,111,112,113,114,203,204,205, - 206,207,208,209,126,115,116,117,118,119,120,121,122,210, - 211,212,213,214,215,216,217,218,219,220,221,222,223,224, - 225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72, - 73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81, - 82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89, - 90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57, - 250,251,252,253,254,255) - import string - c.EBCDIC_TO_ASCII_MAP = string.maketrans( \ - ''.join(map(chr, range(256))), ''.join(map(chr, emap))) - return s.translate(c.EBCDIC_TO_ASCII_MAP) - - MS_CHARS = { '\x80' : ('euro', '20AC'), - '\x81' : ' ', - '\x82' : ('sbquo', '201A'), - '\x83' : ('fnof', '192'), - '\x84' : ('bdquo', '201E'), - '\x85' : ('hellip', '2026'), - '\x86' : ('dagger', '2020'), - '\x87' : ('Dagger', '2021'), - '\x88' : ('circ', '2C6'), - '\x89' : ('permil', '2030'), - '\x8A' : ('Scaron', '160'), - '\x8B' : ('lsaquo', '2039'), - '\x8C' : ('OElig', '152'), - '\x8D' : '?', - '\x8E' : ('#x17D', '17D'), - '\x8F' : '?', - '\x90' : '?', - '\x91' : ('lsquo', '2018'), - '\x92' : ('rsquo', '2019'), - '\x93' : ('ldquo', '201C'), - '\x94' : ('rdquo', '201D'), - '\x95' : ('bull', '2022'), - '\x96' : ('ndash', '2013'), - '\x97' : ('mdash', '2014'), - '\x98' : ('tilde', '2DC'), - '\x99' : ('trade', '2122'), - '\x9a' : ('scaron', '161'), - '\x9b' : ('rsaquo', '203A'), - '\x9c' : ('oelig', '153'), - '\x9d' : '?', - '\x9e' : ('#x17E', '17E'), - '\x9f' : ('Yuml', ''),} diff --git a/beautifulsoup/element.py b/beautifulsoup/element.py deleted file mode 100644 index 61ed4ab..0000000 --- a/beautifulsoup/element.py +++ /dev/null @@ -1,855 +0,0 @@ -import re -import types -try: - from htmlentitydefs import name2codepoint -except ImportError: - name2codepoint = {} -from beautifulsoup.dammit import EntitySubstitution - -from util import isList - -DEFAULT_OUTPUT_ENCODING = "utf-8" - - -class PageElement(object): - """Contains the navigational information for some part of the page - (either a tag or a piece of text)""" - - def setup(self, parent=None, previous=None): - """Sets up the initial relations between this element and - other elements.""" - self.parent = parent - self.previous = previous - self.next = None - self.previousSibling = None - self.nextSibling = None - if self.parent and self.parent.contents: - self.previousSibling = self.parent.contents[-1] - self.previousSibling.nextSibling = self - - def replaceWith(self, replaceWith): - oldParent = self.parent - myIndex = self.parent.contents.index(self) - if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent: - # We're replacing this element with one of its siblings. - index = self.parent.contents.index(replaceWith) - if index and index < myIndex: - # Furthermore, it comes before this element. That - # means that when we extract it, the index of this - # element will change. - myIndex = myIndex - 1 - self.extract() - oldParent.insert(myIndex, replaceWith) - - def extract(self): - """Destructively rips this element out of the tree.""" - if self.parent: - try: - self.parent.contents.remove(self) - except ValueError: - pass - - #Find the two elements that would be next to each other if - #this element (and any children) hadn't been parsed. Connect - #the two. - lastChild = self._lastRecursiveChild() - nextElement = lastChild.next - - if self.previous: - self.previous.next = nextElement - if nextElement: - nextElement.previous = self.previous - self.previous = None - lastChild.next = None - - self.parent = None - if self.previousSibling: - self.previousSibling.nextSibling = self.nextSibling - if self.nextSibling: - self.nextSibling.previousSibling = self.previousSibling - self.previousSibling = self.nextSibling = None - return self - - def _lastRecursiveChild(self): - "Finds the last element beneath this object to be parsed." - lastChild = self - while hasattr(lastChild, 'contents') and lastChild.contents: - lastChild = lastChild.contents[-1] - return lastChild - - def insert(self, position, newChild): - if (isinstance(newChild, basestring) - or isinstance(newChild, unicode)) \ - and not isinstance(newChild, NavigableString): - newChild = NavigableString(newChild) - - position = min(position, len(self.contents)) - if hasattr(newChild, 'parent') and newChild.parent != None: - # We're 'inserting' an element that's already one - # of this object's children. - if newChild.parent == self: - index = self.find(newChild) - if index and index < position: - # Furthermore we're moving it further down the - # list of this object's children. That means that - # when we extract this element, our target index - # will jump down one. - position = position - 1 - newChild.extract() - - newChild.parent = self - previousChild = None - if position == 0: - newChild.previousSibling = None - newChild.previous = self - else: - previousChild = self.contents[position-1] - newChild.previousSibling = previousChild - newChild.previousSibling.nextSibling = newChild - newChild.previous = previousChild._lastRecursiveChild() - if newChild.previous: - newChild.previous.next = newChild - - newChildsLastElement = newChild._lastRecursiveChild() - - if position >= len(self.contents): - newChild.nextSibling = None - - parent = self - parentsNextSibling = None - while not parentsNextSibling: - parentsNextSibling = parent.nextSibling - parent = parent.parent - if not parent: # This is the last element in the document. - break - if parentsNextSibling: - newChildsLastElement.next = parentsNextSibling - else: - newChildsLastElement.next = None - else: - nextChild = self.contents[position] - newChild.nextSibling = nextChild - if newChild.nextSibling: - newChild.nextSibling.previousSibling = newChild - newChildsLastElement.next = nextChild - - if newChildsLastElement.next: - newChildsLastElement.next.previous = newChildsLastElement - self.contents.insert(position, newChild) - - def append(self, tag): - """Appends the given tag to the contents of this tag.""" - self.insert(len(self.contents), tag) - - def find_next(self, name=None, attrs={}, text=None, **kwargs): - """Returns the first item that matches the given criteria and - appears after this Tag in the document.""" - return self._findOne(self.find_all_next, name, attrs, text, **kwargs) - findNext = find_next # BS3 - - def find_all_next(self, name=None, attrs={}, text=None, limit=None, - **kwargs): - """Returns all items that match the given criteria and appear - after this Tag in the document.""" - return self._find_all(name, attrs, text, limit, self.next_elements, - **kwargs) - findAllNext = find_all_next # BS3 - - def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs): - """Returns the closest sibling to this Tag that matches the - given criteria and appears after this Tag in the document.""" - return self._findOne(self.find_next_siblings, name, attrs, text, - **kwargs) - findNextSibling = find_next_sibling # BS3 - - def find_next_siblings(self, name=None, attrs={}, text=None, limit=None, - **kwargs): - """Returns the siblings of this Tag that match the given - criteria and appear after this Tag in the document.""" - return self._find_all(name, attrs, text, limit, - self.next_siblings, **kwargs) - findNextSiblings = find_next_siblings # BS3 - fetchNextSiblings = find_next_siblings # BS2 - - def find_previous(self, name=None, attrs={}, text=None, **kwargs): - """Returns the first item that matches the given criteria and - appears before this Tag in the document.""" - return self._findOne( - self.find_all_previous, name, attrs, text, **kwargs) - findPrevious = find_previous # BS3 - - def find_all_previous(self, name=None, attrs={}, text=None, limit=None, - **kwargs): - """Returns all items that match the given criteria and appear - before this Tag in the document.""" - return self._find_all(name, attrs, text, limit, self.previous_elements, - **kwargs) - findAllPrevious = find_all_previous # BS3 - fetchPrevious = find_all_previous # BS2 - - def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs): - """Returns the closest sibling to this Tag that matches the - given criteria and appears before this Tag in the document.""" - return self._findOne(self.find_previous_siblings, name, attrs, text, - **kwargs) - findPreviousSibling = find_previous_sibling # BS3 - - def find_previous_siblings(self, name=None, attrs={}, text=None, - limit=None, **kwargs): - """Returns the siblings of this Tag that match the given - criteria and appear before this Tag in the document.""" - return self._find_all(name, attrs, text, limit, - self.previous_siblings, **kwargs) - findPreviousSiblings = find_previous_siblings # BS3 - fetchPreviousSiblings = find_previous_siblings # BS2 - - def find_parent(self, name=None, attrs={}, **kwargs): - """Returns the closest parent of this Tag that matches the given - criteria.""" - # NOTE: We can't use _findOne because findParents takes a different - # set of arguments. - r = None - l = self.find_parents(name, attrs, 1) - if l: - r = l[0] - return r - findParent = find_parent # BS3 - - def find_parents(self, name=None, attrs={}, limit=None, **kwargs): - """Returns the parents of this Tag that match the given - criteria.""" - - return self._find_all(name, attrs, None, limit, self.parents, - **kwargs) - findParents = find_parents # BS3 - fetchParents = find_parents # BS2 - - #These methods do the real heavy lifting. - - def _findOne(self, method, name, attrs, text, **kwargs): - r = None - l = method(name, attrs, text, 1, **kwargs) - if l: - r = l[0] - return r - - def _find_all(self, name, attrs, text, limit, generator, **kwargs): - "Iterates over a generator looking for things that match." - - if isinstance(name, SoupStrainer): - strainer = name - else: - # Build a SoupStrainer - strainer = SoupStrainer(name, attrs, text, **kwargs) - results = ResultSet(strainer) - while True: - try: - i = generator.next() - except StopIteration: - break - if i: - found = strainer.search(i) - if found: - results.append(found) - if limit and len(results) >= limit: - break - return results - - #These generators can be used to navigate starting from both - #NavigableStrings and Tags. - @property - def next_elements(self): - i = self - while i: - i = i.next - yield i - - @property - def next_siblings(self): - i = self - while i: - i = i.nextSibling - yield i - - @property - def previous_elements(self): - i = self - while i: - i = i.previous - yield i - - @property - def previous_siblings(self): - i = self - while i: - i = i.previousSibling - yield i - - @property - def parents(self): - i = self - while i: - i = i.parent - yield i - - # Old non-property versions of the generators, for backwards - # compatibility with BS3. - def nextGenerator(self): - return self.next_elements - - def nextSiblingGenerator(self): - return self.next_siblings - - def previousGenerator(self): - return self.previous_elements - - def previousSiblingGenerator(self): - return self.previous_siblings - - def parentGenerator(self): - return self.parents - - # Utility methods - def substituteEncoding(self, str, encoding=None): - encoding = encoding or "utf-8" - return str.replace("%SOUP-ENCODING%", encoding) - - def toEncoding(self, s, encoding=None): - """Encodes an object to a string in some encoding, or to Unicode. - .""" - if isinstance(s, unicode): - if encoding: - s = s.encode(encoding) - elif isinstance(s, str): - if encoding: - s = s.encode(encoding) - else: - s = unicode(s) - else: - if encoding: - s = self.toEncoding(str(s), encoding) - else: - s = unicode(s) - return s - -class NavigableString(unicode, PageElement): - - PREFIX = '' - SUFFIX = '' - - def __new__(cls, value): - """Create a new NavigableString. - - When unpickling a NavigableString, this method is called with - the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be - passed in to the superclass's __new__ or the superclass won't know - how to handle non-ASCII characters. - """ - if isinstance(value, unicode): - return unicode.__new__(cls, value) - return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) - - def __getnewargs__(self): - return (unicode(self),) - - def __getattr__(self, attr): - """text.string gives you text. This is for backwards - compatibility for Navigable*String, but for CData* it lets you - get the string without the CData wrapper.""" - if attr == 'string': - return self - else: - raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr) - - def output_ready(self, substitute_html_entities=False): - if substitute_html_entities: - output = EntitySubstitution.substitute_html(self) - else: - output = self - return self.PREFIX + output + self.SUFFIX - - -class CData(NavigableString): - - PREFIX = u'<![CDATA[' - SUFFIX = u']]>' - - -class ProcessingInstruction(NavigableString): - - PREFIX = u'<?' - SUFFIX = u'?>' - - -class Comment(NavigableString): - - PREFIX = u'<!--' - SUFFIX = u'-->' - -class Declaration(NavigableString): - PREFIX = u'<!' - SUFFIX = u'!>' - - -class Doctype(NavigableString): - - @classmethod - def for_name_and_ids(cls, name, pub_id, system_id): - value = name - if pub_id is not None: - value += ' PUBLIC "%s"' % pub_id - if system_id is not None: - value += ' SYSTEM "%s"' % system_id - - return Doctype(value) - - PREFIX = u'<!DOCTYPE ' - SUFFIX = u'>' - - -class Tag(PageElement): - - """Represents a found HTML tag with its attributes and contents.""" - - def __init__(self, parser, builder, name, attrs=None, parent=None, - previous=None): - "Basic constructor." - - # We don't actually store the parser object: that lets extracted - # chunks be garbage-collected. - self.parserClass = parser.__class__ - self.name = name - if attrs == None: - attrs = {} - else: - attrs = dict(attrs) - self.attrs = attrs - self.contents = [] - self.setup(parent, previous) - self.hidden = False - - # Set up any substitutions, such as the charset in a META tag. - self.contains_substitutions = builder.set_up_substitutions(self) - - self.can_be_empty_element = builder.can_be_empty_element(name) - - @property - def is_empty_element(self): - """Is this tag an empty-element tag? (aka a self-closing tag) - - A tag that has contents is never an empty-element tag. - - A tag that has no contents may or may not be an empty-element - tag. It depends on the builder used to create the tag. If the - builder has a designated list of empty-element tags, then only - a tag whose name shows up in that list is considered an - empty-element tag. - - If the builder has no designated list of empty-element tags, - then any tag with no contents is an empty-element tag. - """ - return len(self.contents) == 0 and self.can_be_empty_element - isSelfClosing = is_empty_element # BS3 - - - @property - def string(self): - """Convenience property to get the single string within this tag. - - :Return: If this tag has a single string child, return value - is that string. If this tag has no children, or more than one - child, return value is None. If this tag has one child tag, - return value is the 'string' attribute of the child tag, - recursively. - """ - if len(self.contents) != 1: - return None - child = self.contents[0] - if isinstance(child, NavigableString): - return child - return child.string - - def get(self, key, default=None): - """Returns the value of the 'key' attribute for the tag, or - the value given for 'default' if it doesn't have that - attribute.""" - return self.attrs.get(key, default) - - def has_key(self, key): - return self.attrs.has_key(key) - - def __getitem__(self, key): - """tag[key] returns the value of the 'key' attribute for the tag, - and throws an exception if it's not there.""" - return self.attrs[key] - - def __iter__(self): - "Iterating over a tag iterates over its contents." - return iter(self.contents) - - def __len__(self): - "The length of a tag is the length of its list of contents." - return len(self.contents) - - def __contains__(self, x): - return x in self.contents - - def __nonzero__(self): - "A tag is non-None even if it has no contents." - return True - - def __setitem__(self, key, value): - """Setting tag[key] sets the value of the 'key' attribute for the - tag.""" - self.attrs[key] = value - - def __delitem__(self, key): - "Deleting tag[key] deletes all 'key' attributes for the tag." - if self.attrs.has_key(key): - del self.attrs[key] - - def __call__(self, *args, **kwargs): - """Calling a tag like a function is the same as calling its - find_all() method. Eg. tag('a') returns a list of all the A tags - found within this tag.""" - return apply(self.find_all, args, kwargs) - - def __getattr__(self, tag): - #print "Getattr %s.%s" % (self.__class__, tag) - if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3: - return self.find(tag[:-3]) - elif tag.find('__') != 0: - return self.find(tag) - raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag) - - def __eq__(self, other): - """Returns true iff this tag has the same name, the same attributes, - and the same contents (recursively) as the given tag. - - XXX: right now this will return false if two tags have the - same attributes in a different order. Should this be fixed?""" - if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other): - return False - for i in range(0, len(self.contents)): - if self.contents[i] != other.contents[i]: - return False - return True - - def __ne__(self, other): - """Returns true iff this tag is not identical to the other tag, - as defined in __eq__.""" - return not self == other - - def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING): - """Renders this tag as a string.""" - return self.encode(encoding) - - def __unicode__(self): - return self.decode() - - def __str__(self): - return self.encode() - - def encode(self, encoding=DEFAULT_OUTPUT_ENCODING, - indent_level=None, substitute_html_entities=False): - return self.decode(indent_level, encoding, - substitute_html_entities).encode(encoding) - - def decode(self, indent_level=None, - eventual_encoding=DEFAULT_OUTPUT_ENCODING, - substitute_html_entities=False): - """Returns a Unicode representation of this tag and its contents. - - :param eventual_encoding: The tag is destined to be - encoded into this encoding. This method is _not_ - responsible for performing that encoding. This information - is passed in so that it can be substituted in if the - document contains a <META> tag that mentions the document's - encoding. - """ - attrs = [] - if self.attrs: - for key, val in sorted(self.attrs.items()): - if val is None: - decoded = key - else: - if not isinstance(val, basestring): - val = str(val) - if (self.contains_substitutions - and eventual_encoding is not None - and '%SOUP-ENCODING%' in val): - val = self.substituteEncoding(val, eventual_encoding) - - decoded = (key + '=' - + EntitySubstitution.substitute_xml(val, True)) - attrs.append(decoded) - close = '' - closeTag = '' - if self.is_empty_element: - close = ' /' - else: - closeTag = '</%s>' % self.name - - pretty_print = (indent_level is not None) - if pretty_print: - space = (' ' * (indent_level-1)) - indent_contents = indent_level + 1 - else: - space = '' - indent_contents = None - contents = self.decode_contents( - indent_contents, eventual_encoding, substitute_html_entities) - - if self.hidden: - # This is the 'document root' object. - s = contents - else: - s = [] - attributeString = '' - if attrs: - attributeString = ' ' + ' '.join(attrs) - if pretty_print: - s.append(space) - s.append('<%s%s%s>' % (self.name, attributeString, close)) - if pretty_print: - s.append("\n") - s.append(contents) - if pretty_print and contents and contents[-1] != "\n": - s.append("\n") - if pretty_print and closeTag: - s.append(space) - s.append(closeTag) - if pretty_print and closeTag and self.nextSibling: - s.append("\n") - s = ''.join(s) - return s - - def decompose(self): - """Recursively destroys the contents of this tree.""" - contents = [i for i in self.contents] - for i in contents: - if isinstance(i, Tag): - i.decompose() - else: - i.extract() - self.extract() - - def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING): - return self.encode(encoding, True) - - def decode_contents(self, indent_level=None, - eventual_encoding=DEFAULT_OUTPUT_ENCODING, - substitute_html_entities=False): - """Renders the contents of this tag as a Unicode string. - - :param eventual_encoding: The tag is destined to be - encoded into this encoding. This method is _not_ - responsible for performing that encoding. This information - is passed in so that it can be substituted in if the - document contains a <META> tag that mentions the document's - encoding. - """ - pretty_print = (indent_level is not None) - s=[] - for c in self: - text = None - if isinstance(c, NavigableString): - text = c.output_ready(substitute_html_entities) - elif isinstance(c, Tag): - s.append(c.decode(indent_level, eventual_encoding, - substitute_html_entities)) - if text and indent_level: - text = text.strip() - if text: - if pretty_print: - s.append(" " * (indent_level-1)) - s.append(text) - if pretty_print: - s.append("\n") - return ''.join(s) - - #Soup methods - - def find(self, name=None, attrs={}, recursive=True, text=None, - **kwargs): - """Return only the first child of this Tag matching the given - criteria.""" - r = None - l = self.find_all(name, attrs, recursive, text, 1, **kwargs) - if l: - r = l[0] - return r - findChild = find - - def find_all(self, name=None, attrs={}, recursive=True, text=None, - limit=None, **kwargs): - """Extracts a list of Tag objects that match the given - criteria. You can specify the name of the Tag and any - attributes you want the Tag to have. - - The value of a key-value pair in the 'attrs' map can be a - string, a list of strings, a regular expression object, or a - callable that takes a string and returns whether or not the - string matches for some custom definition of 'matches'. The - same is true of the tag name.""" - generator = self.recursive_children - if not recursive: - generator = self.children - return self._find_all(name, attrs, text, limit, generator, **kwargs) - findAll = find_all # BS3 - findChildren = find_all # BS2 - - #Generator methods - @property - def children(self): - for i in range(0, len(self.contents)): - yield self.contents[i] - raise StopIteration - - @property - def recursive_children(self): - if not len(self.contents): - raise StopIteration - stopNode = self._lastRecursiveChild().next - current = self.contents[0] - while current is not stopNode: - yield current - current = current.next - - # Old names for backwards compatibility - def childGenerator(self): - return self.children - - def recursiveChildGenerator(self): - return self.recursive_children - - -# Next, a couple classes to represent queries and their results. -class SoupStrainer(object): - """Encapsulates a number of ways of matching a markup element (tag or - text).""" - - def __init__(self, name=None, attrs={}, text=None, **kwargs): - self.name = name - if isinstance(attrs, basestring): - kwargs['class'] = attrs - attrs = None - if kwargs: - if attrs: - attrs = attrs.copy() - attrs.update(kwargs) - else: - attrs = kwargs - self.attrs = attrs - self.text = text - - def __str__(self): - if self.text: - return self.text - else: - return "%s|%s" % (self.name, self.attrs) - - def searchTag(self, markupName=None, markupAttrs={}): - found = None - markup = None - if isinstance(markupName, Tag): - markup = markupName - markupAttrs = markup - callFunctionWithTagData = callable(self.name) \ - and not isinstance(markupName, Tag) - - if (not self.name) \ - or callFunctionWithTagData \ - or (markup and self._matches(markup, self.name)) \ - or (not markup and self._matches(markupName, self.name)): - if callFunctionWithTagData: - match = self.name(markupName, markupAttrs) - else: - match = True - markupAttrMap = None - for attr, matchAgainst in self.attrs.items(): - if not markupAttrMap: - if hasattr(markupAttrs, 'get'): - markupAttrMap = markupAttrs - else: - markupAttrMap = {} - for k,v in markupAttrs: - markupAttrMap[k] = v - attrValue = markupAttrMap.get(attr) - if not self._matches(attrValue, matchAgainst): - match = False - break - if match: - if markup: - found = markup - else: - found = markupName - return found - - def search(self, markup): - #print 'looking for %s in %s' % (self, markup) - found = None - # If given a list of items, scan it for a text element that - # matches. - if isList(markup) and not isinstance(markup, Tag): - for element in markup: - if isinstance(element, NavigableString) \ - and self.search(element): - found = element - break - # If it's a Tag, make sure its name or attributes match. - # Don't bother with Tags if we're searching for text. - elif isinstance(markup, Tag): - if not self.text: - found = self.searchTag(markup) - # If it's text, make sure the text matches. - elif isinstance(markup, NavigableString) or \ - isinstance(markup, basestring): - if self._matches(markup, self.text): - found = markup - else: - raise Exception, "I don't know how to match against a %s" \ - % markup.__class__ - return found - - def _matches(self, markup, matchAgainst): - #print "Matching %s against %s" % (markup, matchAgainst) - result = False - if matchAgainst == True and type(matchAgainst) == types.BooleanType: - result = markup != None - elif callable(matchAgainst): - result = matchAgainst(markup) - else: - #Custom match methods take the tag as an argument, but all - #other ways of matching match the tag name as a string. - if isinstance(markup, Tag): - markup = markup.name - if markup is not None and not isinstance(markup, basestring): - markup = unicode(markup) - #Now we know that chunk is either a string, or None. - if hasattr(matchAgainst, 'match'): - # It's a regexp object. - result = markup and matchAgainst.search(markup) - elif (isList(matchAgainst) - and (markup is not None - or not isinstance(matchAgainst, basestring))): - result = markup in matchAgainst - elif hasattr(matchAgainst, 'items'): - result = markup.has_key(matchAgainst) - elif matchAgainst and isinstance(markup, basestring): - if isinstance(markup, unicode): - matchAgainst = unicode(matchAgainst) - else: - matchAgainst = str(matchAgainst) - - if not result: - result = matchAgainst == markup - return result - - -class ResultSet(list): - """A ResultSet is just a list that keeps track of the SoupStrainer - that created it.""" - def __init__(self, source): - list.__init__([]) - self.source = source diff --git a/beautifulsoup/testing.py b/beautifulsoup/testing.py deleted file mode 100644 index 8fd9abf..0000000 --- a/beautifulsoup/testing.py +++ /dev/null @@ -1,37 +0,0 @@ -"""Helper classes for tests.""" - -import unittest -from beautifulsoup import BeautifulSoup -from beautifulsoup.element import Comment, SoupStrainer -from beautifulsoup.builder import LXMLTreeBuilder - -class SoupTest(unittest.TestCase): - - @property - def default_builder(self): - return LXMLTreeBuilder() - - def soup(self, markup, **kwargs): - """Build a Beautiful Soup object from markup.""" - builder = kwargs.pop('builder', self.default_builder) - return BeautifulSoup(markup, builder=builder, **kwargs) - - def document_for(self, markup): - """Turn an HTML fragment into a document. - - The details depend on the builder. - """ - return self.default_builder.test_fragment_to_document(markup) - - def assertSoupEquals(self, to_parse, compare_parsed_to=None): - builder = self.default_builder - obj = BeautifulSoup(to_parse, builder=builder) - if compare_parsed_to is None: - compare_parsed_to = to_parse - - self.assertEquals(obj.decode(), self.document_for(compare_parsed_to)) - - - - - diff --git a/beautifulsoup/util.py b/beautifulsoup/util.py deleted file mode 100644 index 5978865..0000000 --- a/beautifulsoup/util.py +++ /dev/null @@ -1,21 +0,0 @@ -# Helper functions and mixin classes for Beautiful Soup - -import types -try: - set -except NameError: - from sets import Set as set - -def isList(l): - """Convenience method that works with all 2.x versions of Python - to determine whether or not something is listlike.""" - return ((hasattr(l, '__iter__') and not isinstance(l, basestring)) - or (type(l) in (types.ListType, types.TupleType))) - -def buildSet(args=None): - """Turns a list or a string into a set.""" - if isinstance(args, str): - return set([args]) - if args is None: - return set() - return set(args) |