diff options
-rw-r--r-- | CHANGELOG | 5 | ||||
-rw-r--r-- | beautifulsoup/__init__.py | 82 | ||||
-rw-r--r-- | beautifulsoup/builder/__init__.py | 83 | ||||
-rw-r--r-- | beautifulsoup/builder/_html5lib.py | 8 | ||||
-rw-r--r-- | beautifulsoup/builder/_lxml.py | 15 | ||||
-rw-r--r-- | tests/test_builder_registry.py | 115 | ||||
-rw-r--r-- | tests/test_html5lib.py | 4 | ||||
-rw-r--r-- | tests/test_lxml.py | 4 | ||||
-rw-r--r-- | tests/test_soup.py | 2 | ||||
-rw-r--r-- | tests/test_tree.py | 5 |
10 files changed, 254 insertions, 69 deletions
@@ -21,6 +21,11 @@ Some attributes have also been renamed: * Tag.isSelfClosing -> Tag.is_empty_element +So have some arguments to popular methods: + + * BeautifulSoup(parseOnlyThese=...) -> BeautifulSoup(parse_only=...) + * BeautifulSoup(fromEncoding=...) -> BeautifulSoup(from_encoding=...) + == Generators are now properties == The generators have been given more sensible (and PEP 8-compliant) diff --git a/beautifulsoup/__init__.py b/beautifulsoup/__init__.py index 968be08..c998924 100644 --- a/beautifulsoup/__init__.py +++ b/beautifulsoup/__init__.py @@ -3,34 +3,14 @@ Elixir and Tonic "The Screen-Scraper's Friend" http://www.crummy.com/software/BeautifulSoup/ -Beautiful Soup parses a (possibly invalid) XML or HTML document into a -tree representation. It provides methods and Pythonic idioms that make -it easy to navigate, search, and modify the tree. +Beautiful Soup uses a plug-in parser to parse a (possibly invalid) XML +or HTML document into a tree representation. The parser does the work +of building a parse tree, and Beautiful Soup provides methods +and Pythonic idioms that make it easy to navigate, search, and modify +the parse tree. -A well-formed XML/HTML document yields a well-formed data -structure. An ill-formed XML/HTML document yields a correspondingly -ill-formed data structure. If your document is only locally -well-formed, you can use this library to find and process the -well-formed part of it. - -Beautiful Soup works with Python 2.2 and up. It has no external -dependencies, but you'll have more success at converting data to UTF-8 -if you also install these three packages: - -* chardet, for auto-detecting character encodings - http://chardet.feedparser.org/ -* cjkcodecs and iconv_codec, which add more encodings to the ones supported - by stock Python. 
- http://cjkpython.i18n.org/ - -Beautiful Soup defines classes for two main parsing strategies: - - * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific - language that kind of looks like XML. - - * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid - or invalid. This class has web browser-like heuristics for - obtaining a sensible parse tree in the face of common HTML errors. +Beautiful Soup works with Python 2.5 and up. To get it to work, you +must install either lxml or html5lib. For more than you ever wanted to know about Beautiful Soup, see the documentation: @@ -38,7 +18,7 @@ http://www.crummy.com/software/BeautifulSoup/documentation.html Here, have some legalese: -Copyright (c) 2004-2009, Leonard Richardson +Copyright (c) 2004-2011, Leonard Richardson All rights reserved. @@ -84,6 +64,7 @@ __all__ = ['BeautifulSoup'] import re from util import isList, isString, buildSet +from builder import builder_registry from dammit import UnicodeDammit from element import Entities, NavigableString, Tag @@ -112,40 +93,45 @@ class BeautifulSoup(Tag): """ ROOT_TAG_NAME = u'[document]' + # If the end-user gives no indication which tree builder they + # want, look for one with these features. + DEFAULT_BUILDER_FEATURES = ['html'] + # Used when determining whether a text node is all whitespace and # can be replaced with a single space. A text node that contains # fancy Unicode spaces (usually non-breaking) should be left # alone. 
STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, } - @classmethod - def default_builder(self): - try: - from builder import HTML5TreeBuilder - return HTML5TreeBuilder() - except ImportError: - from builder import LXMLTreeBuilder - return LXMLTreeBuilder() - - def __init__(self, markup="", builder=None, parseOnlyThese=None, - fromEncoding=None): + def __init__(self, markup="", features=None, builder=None, + parse_only=None, from_encoding=None): """The Soup object is initialized as the 'root tag', and the provided markup (which can be a string or a file-like object) is fed into the underlying parser.""" if builder is None: - builder = self.default_builder() + if isinstance(features, basestring): + features = [features] + if len(features) == 0: + features = self.DEFAULT_BUILDER_FEATURES + builder_class = builder_registry.lookup(*features) + if builder_class is None: + raise ValueError( + "Couldn't find a tree builder with the features you " + "requested: %s. Do you need to install a parser library?" + % ",".join(features)) + builder = builder_class() self.builder = builder self.builder.soup = self - self.parseOnlyThese = parseOnlyThese + self.parse_only = parse_only self.reset() if hasattr(markup, 'read'): # It's a file-type object. 
markup = markup.read() self.markup, self.original_encoding, self.declared_html_encoding = ( - self.builder.prepare_markup(markup, fromEncoding)) + self.builder.prepare_markup(markup, from_encoding)) try: self._feed() @@ -201,9 +187,9 @@ class BeautifulSoup(Tag): else: currentData = ' ' self.currentData = [] - if self.parseOnlyThese and len(self.tagStack) <= 1 and \ - (not self.parseOnlyThese.text or \ - not self.parseOnlyThese.search(currentData)): + if self.parse_only and len(self.tagStack) <= 1 and \ + (not self.parse_only.text or \ + not self.parse_only.search(currentData)): return o = containerClass(currentData) self.object_was_parsed(o) @@ -251,9 +237,9 @@ class BeautifulSoup(Tag): #print "Start tag %s: %s" % (name, attrs) self.endData() - if (self.parseOnlyThese and len(self.tagStack) <= 1 - and (self.parseOnlyThese.text - or not self.parseOnlyThese.searchTag(name, attrs))): + if (self.parse_only and len(self.tagStack) <= 1 + and (self.parse_only.text + or not self.parse_only.searchTag(name, attrs))): return None tag = Tag(self, self.builder, name, attrs, self.currentTag, diff --git a/beautifulsoup/builder/__init__.py b/beautifulsoup/builder/__init__.py index 0888cef..b97c5f9 100644 --- a/beautifulsoup/builder/__init__.py +++ b/beautifulsoup/builder/__init__.py @@ -1,3 +1,4 @@ +from collections import defaultdict import re import sys from beautifulsoup.element import Entities @@ -6,19 +7,77 @@ __all__ = [ 'HTMLTreeBuilder', 'SAXTreeBuilder', 'TreeBuilder', + 'TreeBuilderRegistry', ] -# Some useful keywords. +# Some useful features for a TreeBuilder to have. 
FAST = 'fast' -ACCURATE = 'accurate' +PERMISSIVE = 'permissive' XML = 'xml' HTML = 'html' +HTML_5 = 'html5' + + +class TreeBuilderRegistry(object): + + def __init__(self): + self.builders_for_feature = defaultdict(list) + self.builders = [] + + def register(self, treebuilder_class): + """Register a treebuilder based on its advertised features.""" + for feature in treebuilder_class.features: + self.builders_for_feature[feature].insert(0, treebuilder_class) + self.builders.insert(0, treebuilder_class) + + def lookup(self, *features): + if len(self.builders) == 0: + # There are no builders at all. + return None + + if len(features) == 0: + # They didn't ask for any features. Give them the most + # recently registered builder. + return self.builders[0] + + # Go down the list of features in order, and eliminate any builders + # that don't match every feature. + features = list(features) + features.reverse() + candidates = None + candidate_set = None + while len(features) > 0: + feature = features.pop() + we_have_the_feature = self.builders_for_feature.get(feature, []) + if len(we_have_the_feature) > 0: + if candidates is None: + candidates = we_have_the_feature + candidate_set = set(candidates) + else: + # Eliminate any candidates that don't have this feature. + candidate_set = candidate_set.intersection( + set(we_have_the_feature)) + + # The only valid candidates are the ones in candidate_set. + # Go through the original list of candidates and pick the first one + # that's in candidate_set. + if candidate_set is None: + return None + for candidate in candidates: + if candidate in candidate_set: + return candidate + return None + +# The BeautifulSoup class will take feature lists from developers and use them +# to look up builders in this registry. 
+builder_registry = TreeBuilderRegistry() -builders_for_tag = {} class TreeBuilder(Entities): """Turn a document into a Beautiful Soup object tree.""" + features = [] + assume_html = False preserve_whitespace_tags = set() empty_element_tags = None # A tag will be considered an empty-element @@ -150,7 +209,7 @@ class HTMLTreeBuilder(TreeBuilder): match = self.CHARSET_RE.search(content) if match: if (self.soup.declared_html_encoding is not None or - self.soup.original_encoding == self.soup.fromEncoding): + self.soup.original_encoding == self.soup.from_encoding): # An HTML encoding was sniffed while converting # the document to Unicode, or an HTML encoding was # sniffed during a previous pass through the @@ -173,27 +232,31 @@ class HTMLTreeBuilder(TreeBuilder): return False -def register_builders_from(module): - """Copy everything in __all___ from the given module into this module.""" +def register_treebuilders_from(module): + """Copy TreeBuilders from the given module into this module.""" # I'm fairly sure this is not the best way to do this. this_module = sys.modules[__package__] for name in module.__all__: obj = getattr(module, name) - setattr(this_module, name, obj) - this_module.__all__.append(name) + + if issubclass(obj, TreeBuilder): + setattr(this_module, name, obj) + this_module.__all__.append(name) + # Register the builder while we're at it. + this_module.builder_registry.register(obj) # Builders are registered in reverse order of priority, so that custom # builder registrations will take precedence. In general, we want # html5lib to take precedence over lxml, because it's more reliable. try: import _lxml - register_builders_from(_lxml) + register_treebuilders_from(_lxml) except ImportError: # They don't have lxml installed. pass try: import _html5lib - register_builders_from(_html5lib) + register_treebuilders_from(_html5lib) except ImportError: # They don't have html5lib installed. 
pass diff --git a/beautifulsoup/builder/_html5lib.py b/beautifulsoup/builder/_html5lib.py index 020b7ea..f8a7a40 100644 --- a/beautifulsoup/builder/_html5lib.py +++ b/beautifulsoup/builder/_html5lib.py @@ -3,8 +3,9 @@ __all__ = [ ] from beautifulsoup.builder import ( - ACCURATE, + PERMISSIVE, HTML, + HTML_5, HTMLTreeBuilder, ) import html5lib @@ -20,7 +21,7 @@ from beautifulsoup.element import ( class HTML5TreeBuilder(HTMLTreeBuilder): """Use html5lib to build a tree.""" - tags = [ACCURATE, HTML] + features = ['html5lib', PERMISSIVE, HTML_5, HTML] def prepare_markup(self, markup, user_specified_encoding): # Store the user-specified encoding for use later on. @@ -55,7 +56,8 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder): def __init__(self, soup, namespaceHTMLElements): self.soup = soup if namespaceHTMLElements: - warnings.warn("namespaceHTMLElements not supported yet", DataLossWarning) + warnings.warn("namespaceHTMLElements not supported yet", + DataLossWarning) super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements) def documentClass(self): diff --git a/beautifulsoup/builder/_lxml.py b/beautifulsoup/builder/_lxml.py index c2f368c..4c7a826 100644 --- a/beautifulsoup/builder/_lxml.py +++ b/beautifulsoup/builder/_lxml.py @@ -5,13 +5,24 @@ __all__ = [ from lxml import etree from beautifulsoup.element import Comment, Doctype -from beautifulsoup.builder import TreeBuilder, HTMLTreeBuilder +from beautifulsoup.builder import ( + FAST, + HTML, + HTMLTreeBuilder, + PERMISSIVE, + TreeBuilder, + XML) from beautifulsoup.dammit import UnicodeDammit import types +LXML = 'lxml' + class LXMLTreeBuilderForXML(TreeBuilder): DEFAULT_PARSER_CLASS = etree.XMLParser + # Well, it's permissive by XML parser standards. 
+ features = [LXML, XML, FAST, PERMISSIVE] + @property def default_parser(self): # This can either return a parser object or a class, which @@ -79,6 +90,8 @@ class LXMLTreeBuilderForXML(TreeBuilder): class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): + features = [LXML, HTML, FAST] + @property def default_parser(self): return etree.HTMLParser diff --git a/tests/test_builder_registry.py b/tests/test_builder_registry.py new file mode 100644 index 0000000..17e3fb1 --- /dev/null +++ b/tests/test_builder_registry.py @@ -0,0 +1,115 @@ +"""Tests of the builder registry.""" + +import unittest + +from beautifulsoup import BeautifulSoup +from beautifulsoup.builder import ( + builder_registry as registry, + LXMLTreeBuilderForXML, + LXMLTreeBuilder, + TreeBuilderRegistry, + HTML5TreeBuilder +) + + + +class BuiltInRegistryTest(unittest.TestCase): + """Test the built-in registry with the default builders registered.""" + + def test_combination(self): + self.assertEquals(registry.lookup('fast', 'html'), + LXMLTreeBuilder) + self.assertEquals(registry.lookup('permissive', 'xml'), + LXMLTreeBuilderForXML) + self.assertEquals(registry.lookup('permissive', 'html'), + HTML5TreeBuilder) + + def test_lookup_by_markup_type(self): + self.assertEquals(registry.lookup('html'), HTML5TreeBuilder) + self.assertEquals(registry.lookup('xml'), LXMLTreeBuilderForXML) + + def test_named_library(self): + self.assertEquals(registry.lookup('lxml', 'xml'), + LXMLTreeBuilderForXML) + self.assertEquals(registry.lookup('lxml', 'html'), + LXMLTreeBuilder) + self.assertEquals(registry.lookup('html5lib'), + HTML5TreeBuilder) + + def test_unimplemented_combinations(self): + self.assertEquals(registry.lookup('fast', 'permissive', 'html'), + None) + + def test_beautifulsoup_constructor_does_lookup(self): + # You can pass in a string. + BeautifulSoup("", features="html") + # Or a list of strings. 
+ BeautifulSoup("", features=["html", "permissive"]) + + # You'll get an exception if BS can't find an appropriate + # builder. + self.assertRaises(ValueError, BeautifulSoup, + "", features="no-such-feature") + +class RegistryTest(unittest.TestCase): + """Test the TreeBuilderRegistry class in general.""" + + def setUp(self): + self.registry = TreeBuilderRegistry() + + def builder_for_features(self, *feature_list): + cls = type('Builder_' + '_'.join(feature_list), + (object,), {'features' : feature_list}) + + self.registry.register(cls) + return cls + + def test_register_with_no_features(self): + builder = self.builder_for_features() + + # Since the builder advertises no features, you can't find it + # by looking up features. + self.assertEquals(self.registry.lookup('foo'), None) + + # But you can find it by doing a lookup with no features, if + # this happens to be the only registered builder. + self.assertEquals(self.registry.lookup(), builder) + + def test_register_with_features_makes_lookup_succeed(self): + builder = self.builder_for_features('foo', 'bar') + self.assertEquals(self.registry.lookup('foo'), builder) + self.assertEquals(self.registry.lookup('bar'), builder) + + def test_lookup_fails_when_no_builder_implements_feature(self): + builder = self.builder_for_features('foo', 'bar') + self.assertEquals(self.registry.lookup('baz'), None) + + def test_lookup_gets_most_recent_registration_when_no_feature_specified(self): + builder1 = self.builder_for_features('foo') + builder2 = self.builder_for_features('bar') + self.assertEquals(self.registry.lookup(), builder2) + + def test_lookup_fails_when_no_tree_builders_registered(self): + self.assertEquals(self.registry.lookup(), None) + + def test_lookup_gets_most_recent_builder_supporting_all_features(self): + has_one = self.builder_for_features('foo') + has_the_other = self.builder_for_features('bar') + has_both_early = self.builder_for_features('foo', 'bar', 'baz') + has_both_late = 
self.builder_for_features('foo', 'bar', 'quux') + lacks_one = self.builder_for_features('bar') + has_the_other = self.builder_for_features('foo') + + # There are two builders featuring 'foo' and 'bar', but + # the one that also features 'quux' was registered later. + self.assertEquals(self.registry.lookup('foo', 'bar'), + has_both_late) + + # There is only one builder featuring 'foo', 'bar', and 'baz'. + self.assertEquals(self.registry.lookup('foo', 'bar', 'baz'), + has_both_early) + + def test_lookup_fails_when_cannot_reconcile_requested_features(self): + builder1 = self.builder_for_features('foo', 'bar') + builder2 = self.builder_for_features('foo', 'baz') + self.assertEquals(self.registry.lookup('bar', 'baz'), None) diff --git a/tests/test_html5lib.py b/tests/test_html5lib.py index aa0bad2..5abc29d 100644 --- a/tests/test_html5lib.py +++ b/tests/test_html5lib.py @@ -18,7 +18,7 @@ class TestHTML5Builder(TestLXMLBuilder): strainer = SoupStrainer("b") markup = "<p>A <b>bold</b> statement.</p>" soup = self.soup(markup, - parseOnlyThese=strainer) + parse_only=strainer) self.assertEquals( soup.decode(), self.document_for(markup)) @@ -210,7 +210,7 @@ class TestHTML5LibEncodingConversion(TestLXMLBuilderEncodingConversion): # A real-world test to make sure we can convert ISO-8859-9 (a # Hebrew encoding) to UTF-8. 
soup = self.soup(self.HEBREW_DOCUMENT, - fromEncoding="iso-8859-8") + from_encoding="iso-8859-8") self.assertEquals(soup.original_encoding, 'iso8859-8') self.assertEquals( soup.encode('utf-8'), diff --git a/tests/test_lxml.py b/tests/test_lxml.py index 9d08aef..df2f341 100644 --- a/tests/test_lxml.py +++ b/tests/test_lxml.py @@ -325,7 +325,7 @@ class TestLXMLBuilder(SoupTest): def test_soupstrainer(self): strainer = SoupStrainer("b") soup = self.soup("A <b>bold</b> <meta /> <i>statement</i>", - parseOnlyThese=strainer) + parse_only=strainer) self.assertEquals(soup.decode(), "<b>bold</b>") @@ -506,7 +506,7 @@ class TestLXMLBuilderEncodingConversion(SoupTest): # A real-world test to make sure we can convert ISO-8859-9 (a # Hebrew encoding) to UTF-8. soup = self.soup(self.HEBREW_DOCUMENT, - fromEncoding="iso-8859-8") + from_encoding="iso-8859-8") self.assertEquals(soup.original_encoding, 'iso-8859-8') self.assertEquals( soup.encode('utf-8'), diff --git a/tests/test_soup.py b/tests/test_soup.py index 01dff53..bb2262a 100644 --- a/tests/test_soup.py +++ b/tests/test_soup.py @@ -12,7 +12,7 @@ class TestSelectiveParsing(SoupTest): def test_parse_with_soupstrainer(self): markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>" strainer = SoupStrainer("b") - soup = self.soup(markup, parseOnlyThese=strainer) + soup = self.soup(markup, parse_only=strainer) self.assertEquals(soup.encode(), "<b>Yes</b><b>Yes <c>Yes</c></b>") diff --git a/tests/test_tree.py b/tests/test_tree.py index 384d518..0b3d72e 100644 --- a/tests/test_tree.py +++ b/tests/test_tree.py @@ -13,6 +13,7 @@ import copy import cPickle as pickle import re from beautifulsoup import BeautifulSoup +from beautifulsoup.builder import builder_registry from beautifulsoup.element import CData, SoupStrainer, Tag from beautifulsoup.testing import SoupTest @@ -523,7 +524,7 @@ class TestTreeModification(SoupTest): self.assertEqual(soup.decode(), self.document_for('<a id2="foo"></a>')) def test_new_tag_creation(self): - builder = 
BeautifulSoup.default_builder() + builder = builder_registry.lookup('html5lib')() soup = self.soup("<body></body>", builder=builder) a = Tag(soup, builder, 'a') ol = Tag(soup, builder, 'ol') @@ -863,7 +864,7 @@ class TestSubstitutions(SoupTest): # meta tag got filtered out by the strainer. This test makes # sure that doesn't happen. strainer = SoupStrainer('pre') - soup = self.soup(markup, parseOnlyThese=strainer) + soup = self.soup(markup, parse_only=strainer) self.assertEquals(soup.contents[0].name, 'pre') |