summary | refs | log | tree | commit | diff
path: root/beautifulsoup
diff options
context:
space:
mode:
Diffstat (limited to 'beautifulsoup')
-rw-r--r-- beautifulsoup/__init__.py 34
-rw-r--r-- beautifulsoup/builder/lxml_builder.py 23
-rw-r--r-- beautifulsoup/testing.py 3
3 files changed, 25 insertions, 35 deletions
diff --git a/beautifulsoup/__init__.py b/beautifulsoup/__init__.py
index 5d66bc7..922005c 100644
--- a/beautifulsoup/__init__.py
+++ b/beautifulsoup/__init__.py
@@ -76,16 +76,10 @@ from __future__ import generators
__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.0.0"
-__copyright__ = "Copyright (c) 2004-2009 Leonard Richardson"
+__copyright__ = "Copyright (c) 2004-2011 Leonard Richardson"
__license__ = "New-style BSD"
-__all__ = ['BeautifulSoup',
-
- # Stuff imported from other packages
- 'Entities',
-
- 'BeautifulStoneSoup',
- 'ICantBelieveItsBeautifulSoup']
+__all__ = ['BeautifulSoup']
import re
@@ -94,7 +88,7 @@ from dammit import UnicodeDammit
from element import Entities, NavigableString, Tag
-class BeautifulStoneSoup(Tag):
+class BeautifulSoup(Tag):
"""
This class defines the basic interface called by the tree builders.
@@ -128,9 +122,12 @@ class BeautifulStoneSoup(Tag):
@classmethod
def default_builder(self):
- from lxml import etree
- from builder.lxml_builder import LXMLTreeBuilder
- return LXMLTreeBuilder(parser_class=etree.XMLParser)
+ try:
+ from builder.html5_builder import HTML5TreeBuilder
+ return HTML5TreeBuilder()
+ except ImportError:
+ from builder.lxml_builder import LXMLTreeBuilder
+ return LXMLTreeBuilder()
def __init__(self, markup="", builder=None, parseOnlyThese=None,
fromEncoding=None):
@@ -278,19 +275,6 @@ class BeautifulStoneSoup(Tag):
self.currentData.append(data)
-class BeautifulSoup(BeautifulStoneSoup):
- """A convenience class for parsing HTML without creating a builder."""
-
- @classmethod
- def default_builder(self):
- try:
- from builder.html5_builder import HTML5TreeBuilder
- return HTML5TreeBuilder()
- except ImportError:
- from builder.lxml_builder import LXMLTreeBuilder
- return LXMLTreeBuilder()
-
-
class StopParsing(Exception):
pass
diff --git a/beautifulsoup/builder/lxml_builder.py b/beautifulsoup/builder/lxml_builder.py
index 2c264b3..afdf760 100644
--- a/beautifulsoup/builder/lxml_builder.py
+++ b/beautifulsoup/builder/lxml_builder.py
@@ -1,15 +1,16 @@
from lxml import etree
from beautifulsoup.element import Comment, Doctype
-from beautifulsoup.builder import HTMLTreeBuilder
+from beautifulsoup.builder import TreeBuilder, HTMLTreeBuilder
from beautifulsoup.dammit import UnicodeDammit
-class LXMLTreeBuilder(HTMLTreeBuilder):
+class LXMLTreeBuilderForXML(TreeBuilder):
+ DEFAULT_PARSER_CLASS = etree.XMLParser
- def __init__(self, parser_class=etree.HTMLParser):
- # etree.HTMLParser's constructor has an argument strip_cdata,
- # but it does nothing. CDATA sections are always stripped when
- # passed through HTMLParser.
- self.parser = parser_class(target=self)
+ def __init__(self, parser_class=None):
+ # strip_cdata only has an effect on XMLParser. HTMLParser's
+ # constructor accepts strip_cdata but ignores it.
+ parser_class = parser_class or self.DEFAULT_PARSER_CLASS
+ self.parser = parser_class(target=self, strip_cdata=False)
self.soup = None
def prepare_markup(self, markup, user_specified_encoding=None,
@@ -23,8 +24,8 @@ class LXMLTreeBuilder(HTMLTreeBuilder):
try_encodings = [user_specified_encoding, document_declared_encoding]
dammit = UnicodeDammit(markup, try_encodings, isHTML=True)
- return dammit.markup, dammit.original_encoding, dammit.declared_html_encoding
-
+ return (dammit.markup, dammit.original_encoding,
+ dammit.declared_html_encoding)
def feed(self, markup):
self.parser.feed(markup)
@@ -60,3 +61,7 @@ class LXMLTreeBuilder(HTMLTreeBuilder):
"""See `TreeBuilder`."""
return u'<html><body>%s</body></html>' % fragment
+
+class LXMLTreeBuilder(LXMLTreeBuilderForXML, HTMLTreeBuilder):
+
+ DEFAULT_PARSER_CLASS = etree.HTMLParser
diff --git a/beautifulsoup/testing.py b/beautifulsoup/testing.py
index 74937d9..9b1e858 100644
--- a/beautifulsoup/testing.py
+++ b/beautifulsoup/testing.py
@@ -13,7 +13,8 @@ class SoupTest(unittest.TestCase):
def soup(self, markup, **kwargs):
"""Build a Beautiful Soup object from markup."""
- return BeautifulSoup(markup, builder=self.default_builder, **kwargs)
+ builder = kwargs.pop('builder', self.default_builder)
+ return BeautifulSoup(markup, builder=builder, **kwargs)
def document_for(self, markup):
"""Turn an HTML fragment into a document.