summary | refs | log | tree | commit | diff
path: root/beautifulsoup
diff options
context:
space:
mode:
Diffstat (limited to 'beautifulsoup')
-rw-r--r-- beautifulsoup/__init__.py 28
-rw-r--r-- beautifulsoup/builder/__init__.py 4
-rw-r--r-- beautifulsoup/builder/lxml_builder.py 15
3 files changed, 29 insertions, 18 deletions
diff --git a/beautifulsoup/__init__.py b/beautifulsoup/__init__.py
index f2c20de..32ea73f 100644
--- a/beautifulsoup/__init__.py
+++ b/beautifulsoup/__init__.py
@@ -144,37 +144,29 @@ class BeautifulStoneSoup(Tag):
self.builder.soup = self
self.parseOnlyThese = parseOnlyThese
- self.fromEncoding = fromEncoding
self.reset()
if hasattr(markup, 'read'): # It's a file-type object.
markup = markup.read()
- self.markup = markup
+ self.markup, self.originalEncoding, self.declaredHTMLEncoding = (
+ self.builder.prepare_markup(markup, fromEncoding))
+
try:
- self._feed(isHTML=self.builder.assume_html)
+ self._feed()
except StopParsing:
pass
- self.markup = None # The markup can now be GCed.
+
+ # Clear out the markup and the builder so they can be GCed.
+ self.markup = None
self.builder.soup = None
- self.builder = None # So can the builder.
+ self.builder = None
- def _feed(self, inDocumentEncoding=None, isHTML=False):
+ def _feed(self):
# Convert the document to Unicode.
- markup = self.markup
- if isinstance(markup, unicode):
- if not hasattr(self, 'originalEncoding'):
- self.originalEncoding = None
- else:
- dammit = UnicodeDammit\
- (markup, [self.fromEncoding, inDocumentEncoding],
- isHTML=isHTML)
- markup = dammit.unicode
- self.originalEncoding = dammit.originalEncoding
- self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
self.builder.reset()
- self.builder.feed(markup)
+ self.builder.feed(self.markup)
# Close out any unfinished strings and close all the open tags.
self.endData()
while self.currentTag.name != self.ROOT_TAG_NAME:
diff --git a/beautifulsoup/builder/__init__.py b/beautifulsoup/builder/__init__.py
index cf5e6c6..5bf5929 100644
--- a/beautifulsoup/builder/__init__.py
+++ b/beautifulsoup/builder/__init__.py
@@ -25,6 +25,10 @@ class TreeBuilder(Entities):
def feed(self, markup):
raise NotImplementedError()
+ def prepare_markup(self, markup, user_specified_encoding=None,
+ document_declared_encoding=None):
+ return markup, None, None
+
def test_fragment_to_document(self, fragment):
"""Wrap an HTML fragment to make it look like a document.
diff --git a/beautifulsoup/builder/lxml_builder.py b/beautifulsoup/builder/lxml_builder.py
index 9ced9f0..a1f8c1e 100644
--- a/beautifulsoup/builder/lxml_builder.py
+++ b/beautifulsoup/builder/lxml_builder.py
@@ -1,6 +1,7 @@
from lxml import etree
from beautifulsoup.element import Comment, Doctype
from beautifulsoup.builder import HTMLTreeBuilder
+from beautifulsoup.dammit import UnicodeDammit
class LXMLTreeBuilder(HTMLTreeBuilder):
@@ -11,6 +12,20 @@ class LXMLTreeBuilder(HTMLTreeBuilder):
self.parser = parser_class(target=self)
self.soup = None
+ def prepare_markup(self, markup, user_specified_encoding=None,
+ document_declared_encoding=None):
+ """
+ :return: A 3-tuple (markup, original encoding, encoding
+ declared within markup).
+ """
+ if isinstance(markup, unicode):
+ return markup, None, None
+
+ try_encodings = [user_specified_encoding, document_declared_encoding]
+ dammit = UnicodeDammit(markup, try_encodings, isHTML=True)
+ return dammit.markup, dammit.originalEncoding, dammit.declaredHTMLEncoding
+
+
def feed(self, markup):
self.parser.feed(markup)
self.parser.close()