summaryrefslogtreecommitdiff
path: root/bs4/dammit.py
diff options
context:
space:
mode:
Diffstat (limited to 'bs4/dammit.py')
-rw-r--r--bs4/dammit.py94
1 files changed, 81 insertions, 13 deletions
diff --git a/bs4/dammit.py b/bs4/dammit.py
index 74fa7f0..e72a867 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -43,6 +43,9 @@ except ImportError:
return None
# Available from http://cjkpython.i18n.org/.
+#
+# TODO: This doesn't work anymore and the closest thing, iconv_codecs,
+# is GPL-licensed. Check whether this is still necessary.
try:
import iconv_codec
except ImportError:
@@ -63,8 +66,7 @@ encoding_res[unicode] = {
}
class EntitySubstitution(object):
-
- """Substitute XML or HTML entities for the corresponding characters."""
+ """The ability to substitute XML or HTML entities for certain characters."""
def _populate_class_variables():
lookup = {}
@@ -110,13 +112,15 @@ class EntitySubstitution(object):
@classmethod
def _substitute_html_entity(cls, matchobj):
+ """Used with a regular expression to substitute the
+ appropriate HTML entity for a special character."""
entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
return "&%s;" % entity
@classmethod
def _substitute_xml_entity(cls, matchobj):
"""Used with a regular expression to substitute the
- appropriate XML entity for an XML special character."""
+ appropriate XML entity for a special character."""
entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
return "&%s;" % entity
@@ -211,6 +215,8 @@ class EntitySubstitution(object):
containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
character with "é" will make it more readable to some
people.
+
+ :param s: A Unicode string.
"""
return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
cls._substitute_html_entity, s)
@@ -238,6 +244,15 @@ class EncodingDetector:
"""
def __init__(self, markup, override_encodings=None, is_html=False,
exclude_encodings=None):
+ """Constructor.
+
+ :param markup: Some markup in an unknown encoding.
+ :param override_encodings: These encodings will be tried first.
+ :param is_html: If True, this markup is considered to be HTML. Otherwise
+ it's assumed to be XML.
+ :param exclude_encodings: These encodings will not be tried, even
+ if they otherwise would be.
+ """
self.override_encodings = override_encodings or []
exclude_encodings = exclude_encodings or []
self.exclude_encodings = set([x.lower() for x in exclude_encodings])
@@ -249,6 +264,12 @@ class EncodingDetector:
self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)
def _usable(self, encoding, tried):
+ """Should we even bother to try this encoding?
+
+ :param encoding: Name of an encoding.
+ :param tried: Encodings that have already been tried. This will be modified
+ as a side effect.
+ """
if encoding is not None:
encoding = encoding.lower()
if encoding in self.exclude_encodings:
@@ -260,7 +281,10 @@ class EncodingDetector:
@property
def encodings(self):
- """Yield a number of encodings that might work for this markup."""
+ """Yield a number of encodings that might work for this markup.
+
+ :yield: A sequence of strings.
+ """
tried = set()
for e in self.override_encodings:
if self._usable(e, tried):
@@ -293,7 +317,11 @@ class EncodingDetector:
@classmethod
def strip_byte_order_mark(cls, data):
- """If a byte-order mark is present, strip it and return the encoding it implies."""
+ """If a byte-order mark is present, strip it and return the encoding it implies.
+
+ :param data: Some markup.
+ :return: A 2-tuple (modified data, implied encoding)
+ """
encoding = None
if isinstance(data, unicode):
# Unicode data cannot have a byte-order mark.
@@ -325,6 +353,13 @@ class EncodingDetector:
An HTML encoding is declared in a <meta> tag, hopefully near the
beginning of the document.
+
+ :param markup: Some markup.
+ :param is_html: If True, this markup is considered to be HTML. Otherwise
+ it's assumed to be XML.
+ :param search_entire_document: Since an encoding is supposed to be declared near the beginning
+ of the document, most of the time it's only necessary to search a few kilobytes of data.
+ Set this to True to force this method to search the entire document.
"""
if search_entire_document:
xml_endpos = html_endpos = len(markup)
@@ -372,6 +407,21 @@ class UnicodeDammit:
def __init__(self, markup, override_encodings=[],
smart_quotes_to=None, is_html=False, exclude_encodings=[]):
+ """Constructor.
+
+ :param markup: A bytestring representing markup in an unknown encoding.
+ :param override_encodings: These encodings will be tried first,
+ before any sniffing code is run.
+
+ :param smart_quotes_to: By default, Microsoft smart quotes will, like all other characters, be converted
+ to Unicode characters. Setting this to 'ascii' will convert them to ASCII quotes instead.
+ Setting it to 'xml' will convert them to XML entity references, and setting it to 'html'
+ will convert them to HTML entity references.
+ :param is_html: If True, this markup is considered to be HTML. Otherwise
+ it's assumed to be XML.
+ :param exclude_encodings: These encodings will not be considered, even
+ if the sniffing code thinks they might make sense.
+ """
self.smart_quotes_to = smart_quotes_to
self.tried_encodings = []
self.contains_replacement_characters = False
@@ -438,6 +488,10 @@ class UnicodeDammit:
return sub
def _convert_from(self, proposed, errors="strict"):
+ """Attempt to convert the markup to the proposed encoding.
+
+ :param proposed: The name of a character encoding.
+ """
proposed = self.find_codec(proposed)
if not proposed or (proposed, errors) in self.tried_encodings:
return None
@@ -465,17 +519,27 @@ class UnicodeDammit:
return self.markup
def _to_unicode(self, data, encoding, errors="strict"):
- '''Given a string and its encoding, decodes the string into Unicode.
- %encoding is a string recognized by encodings.aliases'''
+ """Given a string and its encoding, decodes the string into Unicode.
+
+ :param encoding: The name of an encoding.
+ """
return unicode(data, encoding, errors)
@property
def declared_html_encoding(self):
+ """If the markup is an HTML document, returns the encoding declared _within_
+ the document.
+ """
if not self.is_html:
return None
return self.detector.declared_encoding
def find_codec(self, charset):
+ """Convert the name of a character set to a codec name.
+
+ :param charset: The name of a character set.
+ :return: The name of a codec.
+ """
value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
or (charset and self._codec(charset.replace("-", "")))
or (charset and self._codec(charset.replace("-", "_")))
@@ -814,12 +878,16 @@ class UnicodeDammit:
Currently the only situation supported is Windows-1252 (or its
subset ISO-8859-1), embedded in UTF-8.
- The input must be a bytestring. If you've already converted
- the document to Unicode, you're too late.
-
- The output is a bytestring in which `embedded_encoding`
- characters have been converted to their `main_encoding`
- equivalents.
+ :param in_bytes: A bytestring that you suspect contains
+ characters from multiple encodings. Note that this _must_
+ be a bytestring. If you've already converted the document
+ to Unicode, you're too late.
+ :param main_encoding: The primary encoding of `in_bytes`.
+ :param embedded_encoding: The encoding that was used to embed characters
+ in the main document.
+ :return: A bytestring in which `embedded_encoding`
+ characters have been converted to their `main_encoding`
+ equivalents.
"""
if embedded_encoding.replace('_', '-').lower() not in (
'windows-1252', 'windows_1252'):