1 files changed, 81 insertions, 13 deletions
diff --git a/bs4/dammit.py b/bs4/dammit.py
index 74fa7f0..e72a867 100644
--- a/bs4/dammit.py
+++ b/bs4/dammit.py
@@ -43,6 +43,9 @@ except ImportError:
             return None
 
 # Available from http://cjkpython.i18n.org/.
+#
+# TODO: This doesn't work anymore and the closest thing, iconv_codecs,
+# is GPL-licensed. Check whether this is still necessary.
 try:
     import iconv_codec
 except ImportError:
@@ -63,8 +66,7 @@ encoding_res[unicode] = {
 }
 
 class EntitySubstitution(object):
-
-    """Substitute XML or HTML entities for the corresponding characters."""
+    """The ability to substitute XML or HTML entities for certain characters."""
 
     def _populate_class_variables():
         lookup = {}
@@ -110,13 +112,15 @@ class EntitySubstitution(object):
 
     @classmethod
     def _substitute_html_entity(cls, matchobj):
+        """Used with a regular expression to substitute the
+        appropriate HTML entity for a special character."""
         entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
         return "&%s;" % entity
 
     @classmethod
     def _substitute_xml_entity(cls, matchobj):
         """Used with a regular expression to substitute the
-        appropriate XML entity for an XML special character."""
+        appropriate XML entity for a special character."""
         entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
         return "&%s;" % entity
 
@@ -211,6 +215,8 @@ class EntitySubstitution(object):
         containg a LATIN SMALL LETTER E WITH ACUTE, but replacing that
         character with "&eacute;" will make it more readable to some
         people.
+
+        :param s: A Unicode string.
         """
         return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
             cls._substitute_html_entity, s)
@@ -238,6 +244,15 @@ class EncodingDetector:
     """
     def __init__(self, markup, override_encodings=None, is_html=False,
                  exclude_encodings=None):
+        """Constructor.
+
+        :param markup: Some markup in an unknown encoding.
+        :param override_encodings: These encodings will be tried first.
+        :param is_html: If True, this markup is considered to be HTML. Otherwise
+            it's assumed to be XML.
+        :param exclude_encodings: These encodings will not be tried, even
+            if they otherwise would be.
+        """
         self.override_encodings = override_encodings or []
         exclude_encodings = exclude_encodings or []
         self.exclude_encodings = set([x.lower() for x in exclude_encodings])
@@ -249,6 +264,12 @@ class EncodingDetector:
         self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)
 
     def _usable(self, encoding, tried):
+        """Should we even bother to try this encoding?
+
+        :param encoding: Name of an encoding.
+        :param tried: Encodings that have already been tried. This will be modified
+            as a side effect.
+        """
         if encoding is not None:
             encoding = encoding.lower()
             if encoding in self.exclude_encodings:
@@ -260,7 +281,10 @@ class EncodingDetector:
 
     @property
     def encodings(self):
-        """Yield a number of encodings that might work for this markup."""
+        """Yield a number of encodings that might work for this markup.
+
+        :yield: A sequence of strings.
+        """
         tried = set()
         for e in self.override_encodings:
             if self._usable(e, tried):
@@ -293,7 +317,11 @@ class EncodingDetector:
 
     @classmethod
     def strip_byte_order_mark(cls, data):
-        """If a byte-order mark is present, strip it and return the encoding it implies."""
+        """If a byte-order mark is present, strip it and return the encoding it implies.
+
+        :param data: Some markup.
+        :return: A 2-tuple (modified data, implied encoding)
+        """
         encoding = None
         if isinstance(data, unicode):
             # Unicode data cannot have a byte-order mark.
@@ -325,6 +353,13 @@ class EncodingDetector:
 
         An HTML encoding is declared in a <meta> tag, hopefully near the
         beginning of the document.
+
+        :param markup: Some markup.
+        :param is_html: If True, this markup is considered to be HTML. Otherwise
+            it's assumed to be XML.
+        :param search_entire_document: Since an encoding is supposed to be declared near the beginning
+            of the document, most of the time it's only necessary to search a few kilobytes of data.
+            Set this to True to force this method to search the entire document.
         """
         if search_entire_document:
             xml_endpos = html_endpos = len(markup)
@@ -372,6 +407,21 @@ class UnicodeDammit:
 
     def __init__(self, markup, override_encodings=[],
                  smart_quotes_to=None, is_html=False, exclude_encodings=[]):
+        """Constructor.
+
+        :param markup: A bytestring representing markup in an unknown encoding.
+        :param override_encodings: These encodings will be tried first,
+           before any sniffing code is run.
+
+        :param smart_quotes_to: By default, Microsoft smart quotes will, like all other characters, be converted
+           to Unicode characters. Setting this to 'ascii' will convert them to ASCII quotes instead.
+           Setting it to 'xml' will convert them to XML entity references, and setting it to 'html'
+           will convert them to HTML entity references.
+        :param is_html: If True, this markup is considered to be HTML. Otherwise
+            it's assumed to be XML.
+        :param exclude_encodings: These encodings will not be considered, even
+            if the sniffing code thinks they might make sense.
+        """
         self.smart_quotes_to = smart_quotes_to
         self.tried_encodings = []
         self.contains_replacement_characters = False
@@ -438,6 +488,10 @@ class UnicodeDammit:
         return sub
 
     def _convert_from(self, proposed, errors="strict"):
+        """Attempt to convert the markup to the proposed encoding.
+
+        :param proposed: The name of a character encoding.
+        """
         proposed = self.find_codec(proposed)
         if not proposed or (proposed, errors) in self.tried_encodings:
             return None
@@ -465,17 +519,27 @@ class UnicodeDammit:
         return self.markup
 
     def _to_unicode(self, data, encoding, errors="strict"):
-        '''Given a string and its encoding, decodes the string into Unicode.
-        %encoding is a string recognized by encodings.aliases'''
+        """Given a string and its encoding, decodes the string into Unicode.
+
+        :param encoding: The name of an encoding.
+        """
         return unicode(data, encoding, errors)
 
     @property
     def declared_html_encoding(self):
+        """If the markup is an HTML document, returns the encoding declared _within_
+        the document.
+        """
         if not self.is_html:
             return None
         return self.detector.declared_encoding
 
     def find_codec(self, charset):
+        """Convert the name of a character set to a codec name.
+
+        :param charset: The name of a character set.
+        :return: The name of a codec.
+        """
         value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
                or (charset and self._codec(charset.replace("-", "")))
                or (charset and self._codec(charset.replace("-", "_")))
@@ -814,12 +878,16 @@ class UnicodeDammit:
         Currently the only situation supported is Windows-1252 (or its
         subset ISO-8859-1), embedded in UTF-8.
 
-        The input must be a bytestring. If you've already converted
-        the document to Unicode, you're too late.
-
-        The output is a bytestring in which `embedded_encoding`
-        characters have been converted to their `main_encoding`
-        equivalents.
+        :param in_bytes: A bytestring that you suspect contains
+            characters from multiple encodings. Note that this _must_
+            be a bytestring. If you've already converted the document
+            to Unicode, you're too late.
+        :param main_encoding: The primary encoding of `in_bytes`.
+        :param embedded_encoding: The encoding that was used to embed characters
+            in the main document.
+        :return: A bytestring in which `embedded_encoding`
+          characters have been converted to their `main_encoding`
+          equivalents.
         """
         if embedded_encoding.replace('_', '-').lower() not in (
             'windows-1252', 'windows_1252'):