diff options
author | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-18 15:13:41 -0500 |
---|---|---|
committer | Leonard Richardson <leonard.richardson@canonical.com> | 2011-02-18 15:13:41 -0500 |
commit | 8249b803d9bab9c06be02a244e629cb732f4f5b1 (patch) | |
tree | 447cddabac142fefd583df1acd6268f6abcb8f5c /beautifulsoup | |
parent | 0dda99b15112df7225e647db9702fbd62dcc8ea8 (diff) | |
parent | e170ff33e67e806cf33e2e51fcefcfa0b9310d96 (diff) |
Ported the rest of the HTML tests, including tests of broken HTML from the TODO. Made Unicode, Dammit PEP-8 compliant.
Diffstat (limited to 'beautifulsoup')
-rw-r--r-- | beautifulsoup/__init__.py | 2 | ||||
-rw-r--r-- | beautifulsoup/builder/__init__.py | 10 | ||||
-rw-r--r-- | beautifulsoup/builder/html5lib_builder.py | 4 | ||||
-rw-r--r-- | beautifulsoup/builder/lxml_builder.py | 2 | ||||
-rw-r--r-- | beautifulsoup/dammit.py | 73 |
5 files changed, 49 insertions, 42 deletions
diff --git a/beautifulsoup/__init__.py b/beautifulsoup/__init__.py index 32ea73f..5d66bc7 100644 --- a/beautifulsoup/__init__.py +++ b/beautifulsoup/__init__.py @@ -149,7 +149,7 @@ class BeautifulStoneSoup(Tag): if hasattr(markup, 'read'): # It's a file-type object. markup = markup.read() - self.markup, self.originalEncoding, self.declaredHTMLEncoding = ( + self.markup, self.original_encoding, self.declared_html_encoding = ( self.builder.prepare_markup(markup, fromEncoding)) try: diff --git a/beautifulsoup/builder/__init__.py b/beautifulsoup/builder/__init__.py index 5bf5929..5c275d7 100644 --- a/beautifulsoup/builder/__init__.py +++ b/beautifulsoup/builder/__init__.py @@ -120,8 +120,8 @@ class HTMLTreeBuilder(TreeBuilder): # This is an interesting meta tag. match = self.CHARSET_RE.search(content) if match: - if (self.soup.declaredHTMLEncoding is not None or - self.soup.originalEncoding == self.soup.fromEncoding): + if (self.soup.declared_html_encoding is not None or + self.soup.original_encoding == self.soup.fromEncoding): # An HTML encoding was sniffed while converting # the document to Unicode, or an HTML encoding was # sniffed during a previous pass through the @@ -136,9 +136,9 @@ class HTMLTreeBuilder(TreeBuilder): # Go through it again with the encoding information. new_charset = match.group(3) if (new_charset is not None - and new_charset != self.soup.originalEncoding): - self.soup.declaredHTMLEncoding = new_charset - self.soup._feed(self.soup.declaredHTMLEncoding) + and new_charset != self.soup.original_encoding): + self.soup.declared_html_encoding = new_charset + self.soup._feed(self.soup.declared_html_encoding) raise StopParsing pass return False diff --git a/beautifulsoup/builder/html5lib_builder.py b/beautifulsoup/builder/html5lib_builder.py index 95151da..0a24ce1 100644 --- a/beautifulsoup/builder/html5lib_builder.py +++ b/beautifulsoup/builder/html5lib_builder.py @@ -27,9 +27,9 @@ class HTML5TreeBuilder(HTMLTreeBuilder): if isinstance(markup, unicode): # We need to special-case this because html5lib sets # charEncoding to UTF-8 if it gets Unicode input. - doc.originalEncoding = None + doc.original_encoding = None else: - doc.originalEncoding = parser.tokenizer.stream.charEncoding[0] + doc.original_encoding = parser.tokenizer.stream.charEncoding[0] def create_treebuilder(self, namespaceHTMLElements): self.underlying_builder = TreeBuilderForHtml5lib( diff --git a/beautifulsoup/builder/lxml_builder.py b/beautifulsoup/builder/lxml_builder.py index a1f8c1e..2c264b3 100644 --- a/beautifulsoup/builder/lxml_builder.py +++ b/beautifulsoup/builder/lxml_builder.py @@ -23,7 +23,7 @@ class LXMLTreeBuilder(HTMLTreeBuilder): try_encodings = [user_specified_encoding, document_declared_encoding] dammit = UnicodeDammit(markup, try_encodings, isHTML=True) - return dammit.markup, dammit.originalEncoding, dammit.declaredHTMLEncoding + return dammit.markup, dammit.original_encoding, dammit.declared_html_encoding def feed(self, markup): diff --git a/beautifulsoup/dammit.py b/beautifulsoup/dammit.py index 954ca54..455b0bf 100644 --- a/beautifulsoup/dammit.py +++ b/beautifulsoup/dammit.py @@ -3,23 +3,24 @@ This class forces XML data into a standard format (usually to UTF-8 or Unicode). It is heavily based on code from Mark Pilgrim's Universal Feed Parser. It does not rewrite the XML or HTML to reflect a new -encoding; that's Beautiful Soup's job. +encoding; that's the tree builder's job. """ import codecs import re import types -# Autodetects character encodings. +# Autodetects character encodings. Very useful. # Download from http://chardet.feedparser.org/ +# or 'apt-get install python-chardet' +# or 'easy_install chardet' try: import chardet -# import chardet.constants -# chardet.constants._debug = 1 + #import chardet.constants + #chardet.constants._debug = 1 except ImportError: chardet = None -# cjkcodecs and iconv_codec make Python know about more character encodings. # Both are available from http://cjkpython.i18n.org/ # They're built in if you use Python 2.4. try: @@ -45,46 +46,53 @@ class UnicodeDammit: CHARSET_ALIASES = { "macintosh" : "mac-roman", "x-sjis" : "shift-jis" } - def __init__(self, markup, overrideEncodings=[], - smartQuotesTo='xml', isHTML=False): - self.declaredHTMLEncoding = None - self.markup, documentEncoding, sniffedEncoding = \ + ENCODINGS_WITH_SMART_QUOTES = [ + "windows-1252", + "iso-8859-1", + "iso-8859-2", + ] + + def __init__(self, markup, override_encodings=[], + smart_quotes_to=None, isHTML=False): + self.declared_html_encoding = None + self.markup, document_encoding, sniffed_encoding = \ self._detectEncoding(markup, isHTML) - self.smartQuotesTo = smartQuotesTo - self.triedEncodings = [] + self.smart_quotes_to = smart_quotes_to + self.tried_encodings = [] if markup == '' or isinstance(markup, unicode): - self.originalEncoding = None + self.original_encoding = None self.unicode = unicode(markup) return u = None - for proposedEncoding in ( - overrideEncodings + [documentEncoding, sniffedEncoding]): - if proposedEncoding is not None: - u = self._convertFrom(proposedEncoding) + for proposed_encoding in ( + override_encodings + [document_encoding, sniffed_encoding]): + if proposed_encoding is not None: + u = self._convert_from(proposed_encoding) if u: break # If no luck and we have auto-detection library, try that: if not u and chardet and not isinstance(self.markup, unicode): - u = self._convertFrom(chardet.detect(self.markup)['encoding']) + u = self._convert_from(chardet.detect(self.markup)['encoding']) # As a last resort, try utf-8 and windows-1252: if not u: for proposed_encoding in ("utf-8", "windows-1252"): - u = self._convertFrom(proposed_encoding) - if u: break + u = self._convert_from(proposed_encoding) + if u: + break self.unicode = u - if not u: self.originalEncoding = None + if not u: self.original_encoding = None - def _subMSChar(self, match): + def _sub_ms_char(self, match): """Changes a MS smart quote character to an XML or HTML entity.""" orig = match.group(1) sub = self.MS_CHARS.get(orig) if type(sub) == types.TupleType: - if self.smartQuotesTo == 'xml': + if self.smart_quotes_to == 'xml': sub = '&#x'.encode() + sub[1].encode() + ';'.encode() else: sub = '&'.encode() + sub[0].encode() + ';'.encode() @@ -92,27 +100,26 @@ class UnicodeDammit: sub = sub.encode() return sub - def _convertFrom(self, proposed): + def _convert_from(self, proposed): proposed = self.find_codec(proposed) - if not proposed or proposed in self.triedEncodings: + if not proposed or proposed in self.tried_encodings: return None - self.triedEncodings.append(proposed) + self.tried_encodings.append(proposed) markup = self.markup # Convert smart quotes to HTML if coming from an encoding # that might have them. - if self.smartQuotesTo and proposed.lower() in("windows-1252", - "iso-8859-1", - "iso-8859-2"): + if (self.smart_quotes_to is not None + and proposed.lower() in self.ENCODINGS_WITH_SMART_QUOTES): smart_quotes_re = "([\x80-\x9f])" smart_quotes_compiled = re.compile(smart_quotes_re) - markup = smart_quotes_compiled.sub(self._subMSChar, markup) + markup = smart_quotes_compiled.sub(self._sub_ms_char, markup) try: # print "Trying to convert document to %s" % proposed - u = self._toUnicode(markup, proposed) + u = self._to_unicode(markup, proposed) self.markup = u - self.originalEncoding = proposed + self.original_encoding = proposed except Exception, e: # print "That didn't work!" # print e @@ -120,7 +127,7 @@ class UnicodeDammit: #print "Correct encoding: %s" % proposed return self.markup - def _toUnicode(self, data, encoding): + def _to_unicode(self, data, encoding): '''Given a string and its encoding, decodes the string into Unicode. %encoding is a string recognized by encodings.aliases''' @@ -205,7 +212,7 @@ class UnicodeDammit: xml_encoding = xml_encoding_match.groups()[0].decode( 'ascii').lower() if isHTML: - self.declaredHTMLEncoding = xml_encoding + self.declared_html_encoding = xml_encoding if sniffed_xml_encoding and \ (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', |