summaryrefslogtreecommitdiff
path: root/beautifulsoup
diff options
context:
space:
mode:
Diffstat (limited to 'beautifulsoup')
-rw-r--r--beautifulsoup/__init__.py82
-rw-r--r--beautifulsoup/builder/__init__.py83
-rw-r--r--beautifulsoup/builder/_html5lib.py8
-rw-r--r--beautifulsoup/builder/_lxml.py15
4 files changed, 126 insertions, 62 deletions
diff --git a/beautifulsoup/__init__.py b/beautifulsoup/__init__.py
index 968be08..c998924 100644
--- a/beautifulsoup/__init__.py
+++ b/beautifulsoup/__init__.py
@@ -3,34 +3,14 @@ Elixir and Tonic
"The Screen-Scraper's Friend"
http://www.crummy.com/software/BeautifulSoup/
-Beautiful Soup parses a (possibly invalid) XML or HTML document into a
-tree representation. It provides methods and Pythonic idioms that make
-it easy to navigate, search, and modify the tree.
+Beautiful Soup uses a plug-in parser to parse a (possibly invalid) XML
+or HTML document into a tree representation. The parser does the work
+of building a parse tree, and Beautiful Soup provides methods
+and Pythonic idioms that make it easy to navigate, search, and modify
+the parse tree.
-A well-formed XML/HTML document yields a well-formed data
-structure. An ill-formed XML/HTML document yields a correspondingly
-ill-formed data structure. If your document is only locally
-well-formed, you can use this library to find and process the
-well-formed part of it.
-
-Beautiful Soup works with Python 2.2 and up. It has no external
-dependencies, but you'll have more success at converting data to UTF-8
-if you also install these three packages:
-
-* chardet, for auto-detecting character encodings
- http://chardet.feedparser.org/
-* cjkcodecs and iconv_codec, which add more encodings to the ones supported
- by stock Python.
- http://cjkpython.i18n.org/
-
-Beautiful Soup defines classes for two main parsing strategies:
-
- * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
- language that kind of looks like XML.
-
- * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
- or invalid. This class has web browser-like heuristics for
- obtaining a sensible parse tree in the face of common HTML errors.
+Beautiful Soup works with Python 2.5 and up. To get it to work, you
+must install either lxml or html5lib.
For more than you ever wanted to know about Beautiful Soup, see the
documentation:
@@ -38,7 +18,7 @@ http://www.crummy.com/software/BeautifulSoup/documentation.html
Here, have some legalese:
-Copyright (c) 2004-2009, Leonard Richardson
+Copyright (c) 2004-2011, Leonard Richardson
All rights reserved.
@@ -84,6 +64,7 @@ __all__ = ['BeautifulSoup']
import re
from util import isList, isString, buildSet
+from builder import builder_registry
from dammit import UnicodeDammit
from element import Entities, NavigableString, Tag
@@ -112,40 +93,45 @@ class BeautifulSoup(Tag):
"""
ROOT_TAG_NAME = u'[document]'
+ # If the end-user gives no indication which tree builder they
+ # want, look for one with these features.
+ DEFAULT_BUILDER_FEATURES = ['html']
+
# Used when determining whether a text node is all whitespace and
# can be replaced with a single space. A text node that contains
# fancy Unicode spaces (usually non-breaking) should be left
# alone.
STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }
- @classmethod
- def default_builder(self):
- try:
- from builder import HTML5TreeBuilder
- return HTML5TreeBuilder()
- except ImportError:
- from builder import LXMLTreeBuilder
- return LXMLTreeBuilder()
-
- def __init__(self, markup="", builder=None, parseOnlyThese=None,
- fromEncoding=None):
+ def __init__(self, markup="", features=None, builder=None,
+ parse_only=None, from_encoding=None):
"""The Soup object is initialized as the 'root tag', and the
provided markup (which can be a string or a file-like object)
is fed into the underlying parser."""
if builder is None:
- builder = self.default_builder()
+ if isinstance(features, basestring):
+ features = [features]
+ if len(features) == 0:
+ features = self.DEFAULT_BUILDER_FEATURES
+ builder_class = builder_registry.lookup(*features)
+ if builder_class is None:
+ raise ValueError(
+ "Couldn't find a tree builder with the features you "
+ "requested: %s. Do you need to install a parser library?"
+ % ",".join(features))
+ builder = builder_class()
self.builder = builder
self.builder.soup = self
- self.parseOnlyThese = parseOnlyThese
+ self.parse_only = parse_only
self.reset()
if hasattr(markup, 'read'): # It's a file-type object.
markup = markup.read()
self.markup, self.original_encoding, self.declared_html_encoding = (
- self.builder.prepare_markup(markup, fromEncoding))
+ self.builder.prepare_markup(markup, from_encoding))
try:
self._feed()
@@ -201,9 +187,9 @@ class BeautifulSoup(Tag):
else:
currentData = ' '
self.currentData = []
- if self.parseOnlyThese and len(self.tagStack) <= 1 and \
- (not self.parseOnlyThese.text or \
- not self.parseOnlyThese.search(currentData)):
+ if self.parse_only and len(self.tagStack) <= 1 and \
+ (not self.parse_only.text or \
+ not self.parse_only.search(currentData)):
return
o = containerClass(currentData)
self.object_was_parsed(o)
@@ -251,9 +237,9 @@ class BeautifulSoup(Tag):
#print "Start tag %s: %s" % (name, attrs)
self.endData()
- if (self.parseOnlyThese and len(self.tagStack) <= 1
- and (self.parseOnlyThese.text
- or not self.parseOnlyThese.searchTag(name, attrs))):
+ if (self.parse_only and len(self.tagStack) <= 1
+ and (self.parse_only.text
+ or not self.parse_only.searchTag(name, attrs))):
return None
tag = Tag(self, self.builder, name, attrs, self.currentTag,
diff --git a/beautifulsoup/builder/__init__.py b/beautifulsoup/builder/__init__.py
index 0888cef..b97c5f9 100644
--- a/beautifulsoup/builder/__init__.py
+++ b/beautifulsoup/builder/__init__.py
@@ -1,3 +1,4 @@
+from collections import defaultdict
import re
import sys
from beautifulsoup.element import Entities
@@ -6,19 +7,77 @@ __all__ = [
'HTMLTreeBuilder',
'SAXTreeBuilder',
'TreeBuilder',
+ 'TreeBuilderRegistry',
]
-# Some useful keywords.
+# Some useful features for a TreeBuilder to have.
FAST = 'fast'
-ACCURATE = 'accurate'
+PERMISSIVE = 'permissive'
XML = 'xml'
HTML = 'html'
+HTML_5 = 'html5'
+
+
+# Registry of tree builder classes, indexed by the feature names each
+# class advertises in its `features` attribute. Both containers are
+# kept with the most recently registered class first.
+class TreeBuilderRegistry(object):
+
+ # Start with an empty registry.
+ def __init__(self):
+ # Feature name -> list of builder classes advertising that feature.
+ self.builders_for_feature = defaultdict(list)
+ # Every registered builder class.
+ self.builders = []
+
+ def register(self, treebuilder_class):
+ """Register a treebuilder based on its advertised features."""
+ # Insert at index 0 so that later registrations come first in
+ # every list and are therefore preferred by lookup().
+ for feature in treebuilder_class.features:
+ self.builders_for_feature[feature].insert(0, treebuilder_class)
+ self.builders.insert(0, treebuilder_class)
+
+ # Return the preferred builder class for the given feature names, or
+ # None if no suitable class is registered.
+ #
+ # With no features, the most recently registered builder is returned.
+ # Otherwise the result is the first class, in the registration order
+ # of the first recognized feature, that also advertises every other
+ # recognized feature.
+ #
+ # NOTE(review): a feature that no registered builder advertises is
+ # skipped rather than treated as a mismatch, so None is returned only
+ # when none of the features is recognized or the recognized ones
+ # share no builder.
+ def lookup(self, *features):
+ if len(self.builders) == 0:
+ # There are no builders at all.
+ return None
+
+ if len(features) == 0:
+ # They didn't ask for any features. Give them the most
+ # recently registered builder.
+ return self.builders[0]
+
+ # Go down the list of features in order, and eliminate any builders
+ # that don't match every feature.
+ # Reversing and then popping from the end walks the features in
+ # the order the caller gave them.
+ features = list(features)
+ features.reverse()
+ # candidates: ordered builder list for the first recognized feature.
+ # candidate_set: the builders still matching every recognized feature.
+ candidates = None
+ candidate_set = None
+ while len(features) > 0:
+ feature = features.pop()
+ # .get() is used instead of indexing so that looking up an
+ # unknown feature does not add an empty entry to the defaultdict.
+ we_have_the_feature = self.builders_for_feature.get(feature, [])
+ if len(we_have_the_feature) > 0:
+ if candidates is None:
+ candidates = we_have_the_feature
+ candidate_set = set(candidates)
+ else:
+ # Eliminate any candidates that don't have this feature.
+ candidate_set = candidate_set.intersection(
+ set(we_have_the_feature))
+
+ # The only valid candidates are the ones in candidate_set.
+ # Go through the original list of candidates and pick the first one
+ # that's in candidate_set.
+ if candidate_set is None:
+ # None of the requested features is advertised by any builder.
+ return None
+ for candidate in candidates:
+ if candidate in candidate_set:
+ return candidate
+ # The recognized features have no builder in common.
+ return None
+
+# The BeautifulSoup class will take feature lists from developers and use them
+# to look up builders in this registry.
+builder_registry = TreeBuilderRegistry()
-builders_for_tag = {}
class TreeBuilder(Entities):
"""Turn a document into a Beautiful Soup object tree."""
+ features = []
+
assume_html = False
preserve_whitespace_tags = set()
empty_element_tags = None # A tag will be considered an empty-element
@@ -150,7 +209,7 @@ class HTMLTreeBuilder(TreeBuilder):
match = self.CHARSET_RE.search(content)
if match:
if (self.soup.declared_html_encoding is not None or
- self.soup.original_encoding == self.soup.fromEncoding):
+ self.soup.original_encoding == self.soup.from_encoding):
# An HTML encoding was sniffed while converting
# the document to Unicode, or an HTML encoding was
# sniffed during a previous pass through the
@@ -173,27 +232,31 @@ class HTMLTreeBuilder(TreeBuilder):
return False
-def register_builders_from(module):
- """Copy everything in __all___ from the given module into this module."""
+def register_treebuilders_from(module):
+ """Copy TreeBuilders from the given module into this module."""
# I'm fairly sure this is not the best way to do this.
this_module = sys.modules[__package__]
for name in module.__all__:
obj = getattr(module, name)
- setattr(this_module, name, obj)
- this_module.__all__.append(name)
+
+ if issubclass(obj, TreeBuilder):
+ setattr(this_module, name, obj)
+ this_module.__all__.append(name)
+ # Register the builder while we're at it.
+ this_module.builder_registry.register(obj)
# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want
# html5lib to take precedence over lxml, because it's more reliable.
try:
import _lxml
- register_builders_from(_lxml)
+ register_treebuilders_from(_lxml)
except ImportError:
# They don't have lxml installed.
pass
try:
import _html5lib
- register_builders_from(_html5lib)
+ register_treebuilders_from(_html5lib)
except ImportError:
# They don't have html5lib installed.
pass
diff --git a/beautifulsoup/builder/_html5lib.py b/beautifulsoup/builder/_html5lib.py
index 020b7ea..f8a7a40 100644
--- a/beautifulsoup/builder/_html5lib.py
+++ b/beautifulsoup/builder/_html5lib.py
@@ -3,8 +3,9 @@ __all__ = [
]
from beautifulsoup.builder import (
- ACCURATE,
+ PERMISSIVE,
HTML,
+ HTML_5,
HTMLTreeBuilder,
)
import html5lib
@@ -20,7 +21,7 @@ from beautifulsoup.element import (
class HTML5TreeBuilder(HTMLTreeBuilder):
"""Use html5lib to build a tree."""
- tags = [ACCURATE, HTML]
+ features = ['html5lib', PERMISSIVE, HTML_5, HTML]
def prepare_markup(self, markup, user_specified_encoding):
# Store the user-specified encoding for use later on.
@@ -55,7 +56,8 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
def __init__(self, soup, namespaceHTMLElements):
self.soup = soup
if namespaceHTMLElements:
- warnings.warn("namespaceHTMLElements not supported yet", DataLossWarning)
+ warnings.warn("namespaceHTMLElements not supported yet",
+ DataLossWarning)
super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
def documentClass(self):
diff --git a/beautifulsoup/builder/_lxml.py b/beautifulsoup/builder/_lxml.py
index c2f368c..4c7a826 100644
--- a/beautifulsoup/builder/_lxml.py
+++ b/beautifulsoup/builder/_lxml.py
@@ -5,13 +5,24 @@ __all__ = [
from lxml import etree
from beautifulsoup.element import Comment, Doctype
-from beautifulsoup.builder import TreeBuilder, HTMLTreeBuilder
+from beautifulsoup.builder import (
+ FAST,
+ HTML,
+ HTMLTreeBuilder,
+ PERMISSIVE,
+ TreeBuilder,
+ XML)
from beautifulsoup.dammit import UnicodeDammit
import types
+LXML = 'lxml'
+
class LXMLTreeBuilderForXML(TreeBuilder):
DEFAULT_PARSER_CLASS = etree.XMLParser
+ # Well, it's permissive by XML parser standards.
+ features = [LXML, XML, FAST, PERMISSIVE]
+
@property
def default_parser(self):
# This can either return a parser object or a class, which
@@ -79,6 +90,8 @@ class LXMLTreeBuilderForXML(TreeBuilder):
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
+ features = [LXML, HTML, FAST]
+
@property
def default_parser(self):
return etree.HTMLParser