diff options
Diffstat (limited to 'beautifulsoup')
-rw-r--r-- | beautifulsoup/__init__.py | 28 | ||||
-rw-r--r-- | beautifulsoup/builder/__init__.py | 4 | ||||
-rw-r--r-- | beautifulsoup/builder/lxml_builder.py | 15 |
3 files changed, 29 insertions, 18 deletions
diff --git a/beautifulsoup/__init__.py b/beautifulsoup/__init__.py index f2c20de..32ea73f 100644 --- a/beautifulsoup/__init__.py +++ b/beautifulsoup/__init__.py @@ -144,37 +144,29 @@ class BeautifulStoneSoup(Tag): self.builder.soup = self self.parseOnlyThese = parseOnlyThese - self.fromEncoding = fromEncoding self.reset() if hasattr(markup, 'read'): # It's a file-type object. markup = markup.read() - self.markup = markup + self.markup, self.originalEncoding, self.declaredHTMLEncoding = ( + self.builder.prepare_markup(markup, fromEncoding)) + try: - self._feed(isHTML=self.builder.assume_html) + self._feed() except StopParsing: pass - self.markup = None # The markup can now be GCed. + + # Clear out the markup and the builder so they can be GCed. + self.markup = None self.builder.soup = None - self.builder = None # So can the builder. + self.builder = None - def _feed(self, inDocumentEncoding=None, isHTML=False): + def _feed(self): # Convert the document to Unicode. - markup = self.markup - if isinstance(markup, unicode): - if not hasattr(self, 'originalEncoding'): - self.originalEncoding = None - else: - dammit = UnicodeDammit\ - (markup, [self.fromEncoding, inDocumentEncoding], - isHTML=isHTML) - markup = dammit.unicode - self.originalEncoding = dammit.originalEncoding - self.declaredHTMLEncoding = dammit.declaredHTMLEncoding self.builder.reset() - self.builder.feed(markup) + self.builder.feed(self.markup) # Close out any unfinished strings and close all the open tags. 
self.endData() while self.currentTag.name != self.ROOT_TAG_NAME: diff --git a/beautifulsoup/builder/__init__.py b/beautifulsoup/builder/__init__.py index cf5e6c6..5bf5929 100644 --- a/beautifulsoup/builder/__init__.py +++ b/beautifulsoup/builder/__init__.py @@ -25,6 +25,10 @@ class TreeBuilder(Entities): def feed(self, markup): raise NotImplementedError() + def prepare_markup(self, markup, user_specified_encoding=None, + document_declared_encoding=None): + return markup, None, None + def test_fragment_to_document(self, fragment): """Wrap an HTML fragment to make it look like a document. diff --git a/beautifulsoup/builder/lxml_builder.py b/beautifulsoup/builder/lxml_builder.py index 9ced9f0..a1f8c1e 100644 --- a/beautifulsoup/builder/lxml_builder.py +++ b/beautifulsoup/builder/lxml_builder.py @@ -1,6 +1,7 @@ from lxml import etree from beautifulsoup.element import Comment, Doctype from beautifulsoup.builder import HTMLTreeBuilder +from beautifulsoup.dammit import UnicodeDammit class LXMLTreeBuilder(HTMLTreeBuilder): @@ -11,6 +12,20 @@ class LXMLTreeBuilder(HTMLTreeBuilder): self.parser = parser_class(target=self) self.soup = None + def prepare_markup(self, markup, user_specified_encoding=None, + document_declared_encoding=None): + """ + :return: A 3-tuple (markup, original encoding, encoding + declared within markup). + """ + if isinstance(markup, unicode): + return markup, None, None + + try_encodings = [user_specified_encoding, document_declared_encoding] + dammit = UnicodeDammit(markup, try_encodings, isHTML=True) + return dammit.markup, dammit.originalEncoding, dammit.declaredHTMLEncoding + + def feed(self, markup): self.parser.feed(markup) self.parser.close() |