diff options
author | Leonard Richardson <leonardr@segfault.org> | 2019-12-24 08:58:05 -0500 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2019-12-24 08:58:05 -0500 |
commit | b7dec273425095f8e2ab1f1c58ecc124ca63e856 (patch) | |
tree | ae20e27a551f99570996f262df71337ab16c1ed1 /bs4/dammit.py | |
parent | ee3f30b0e778747fb701856a33713369f778834f (diff) |
Added docstrings to all public methods in dammit.py.
Diffstat (limited to 'bs4/dammit.py')
-rw-r--r-- | bs4/dammit.py | 94 |
1 files changed, 81 insertions, 13 deletions
diff --git a/bs4/dammit.py b/bs4/dammit.py index 74fa7f0..e72a867 100644 --- a/bs4/dammit.py +++ b/bs4/dammit.py @@ -43,6 +43,9 @@ except ImportError: return None # Available from http://cjkpython.i18n.org/. +# +# TODO: This doesn't work anymore and the closest thing, iconv_codecs, +# is GPL-licensed. Check whether this is still necessary. try: import iconv_codec except ImportError: @@ -63,8 +66,7 @@ encoding_res[unicode] = { } class EntitySubstitution(object): - - """Substitute XML or HTML entities for the corresponding characters.""" + """The ability to substitute XML or HTML entities for certain characters.""" def _populate_class_variables(): lookup = {} @@ -110,13 +112,15 @@ class EntitySubstitution(object): @classmethod def _substitute_html_entity(cls, matchobj): + """Used with a regular expression to substitute the + appropriate HTML entity for a special character.""" entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0)) return "&%s;" % entity @classmethod def _substitute_xml_entity(cls, matchobj): """Used with a regular expression to substitute the - appropriate XML entity for an XML special character.""" + appropriate XML entity for a special character.""" entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)] return "&%s;" % entity @@ -211,6 +215,8 @@ class EntitySubstitution(object): containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that character with "&eacute;" will make it more readable to some people. + + :param s: A Unicode string. """ return cls.CHARACTER_TO_HTML_ENTITY_RE.sub( cls._substitute_html_entity, s) @@ -238,6 +244,15 @@ class EncodingDetector: """ def __init__(self, markup, override_encodings=None, is_html=False, exclude_encodings=None): + """Constructor. + + :param markup: Some markup in an unknown encoding. + :param override_encodings: These encodings will be tried first. + :param is_html: If True, this markup is considered to be HTML. Otherwise + it's assumed to be XML. 
+ :param exclude_encodings: These encodings will not be tried, even + if they otherwise would be. + """ self.override_encodings = override_encodings or [] exclude_encodings = exclude_encodings or [] self.exclude_encodings = set([x.lower() for x in exclude_encodings]) @@ -249,6 +264,12 @@ class EncodingDetector: self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup) def _usable(self, encoding, tried): + """Should we even bother to try this encoding? + + :param encoding: Name of an encoding. + :param tried: Encodings that have already been tried. This will be modified + as a side effect. + """ if encoding is not None: encoding = encoding.lower() if encoding in self.exclude_encodings: @@ -260,7 +281,10 @@ class EncodingDetector: @property def encodings(self): - """Yield a number of encodings that might work for this markup.""" + """Yield a number of encodings that might work for this markup. + + :yield: A sequence of strings. + """ tried = set() for e in self.override_encodings: if self._usable(e, tried): @@ -293,7 +317,11 @@ class EncodingDetector: @classmethod def strip_byte_order_mark(cls, data): - """If a byte-order mark is present, strip it and return the encoding it implies.""" + """If a byte-order mark is present, strip it and return the encoding it implies. + + :param data: Some markup. + :return: A 2-tuple (modified data, implied encoding) + """ encoding = None if isinstance(data, unicode): # Unicode data cannot have a byte-order mark. @@ -325,6 +353,13 @@ class EncodingDetector: An HTML encoding is declared in a <meta> tag, hopefully near the beginning of the document. + + :param markup: Some markup. + :param is_html: If True, this markup is considered to be HTML. Otherwise + it's assumed to be XML. + :param search_entire_document: Since an encoding is supposed to be declared near the beginning + of the document, most of the time it's only necessary to search a few kilobytes of data. 
+ Set this to True to force this method to search the entire document. """ if search_entire_document: xml_endpos = html_endpos = len(markup) @@ -372,6 +407,21 @@ class UnicodeDammit: def __init__(self, markup, override_encodings=[], smart_quotes_to=None, is_html=False, exclude_encodings=[]): + """Constructor. + + :param markup: A bytestring representing markup in an unknown encoding. + :param override_encodings: These encodings will be tried first, + before any sniffing code is run. + + :param smart_quotes_to: By default, Microsoft smart quotes will, like all other characters, be converted + to Unicode characters. Setting this to 'ascii' will convert them to ASCII quotes instead. + Setting it to 'xml' will convert them to XML entity references, and setting it to 'html' + will convert them to HTML entity references. + :param is_html: If True, this markup is considered to be HTML. Otherwise + it's assumed to be XML. + :param exclude_encodings: These encodings will not be considered, even + if the sniffing code thinks they might make sense. + """ self.smart_quotes_to = smart_quotes_to self.tried_encodings = [] self.contains_replacement_characters = False @@ -438,6 +488,10 @@ class UnicodeDammit: return sub def _convert_from(self, proposed, errors="strict"): + """Attempt to convert the markup to the proposed encoding. + + :param proposed: The name of a character encoding. + """ proposed = self.find_codec(proposed) if not proposed or (proposed, errors) in self.tried_encodings: return None @@ -465,17 +519,27 @@ class UnicodeDammit: return self.markup def _to_unicode(self, data, encoding, errors="strict"): - '''Given a string and its encoding, decodes the string into Unicode. - %encoding is a string recognized by encodings.aliases''' + """Given a string and its encoding, decodes the string into Unicode. + + :param encoding: The name of an encoding. 
+ """ return unicode(data, encoding, errors) @property def declared_html_encoding(self): + """If the markup is an HTML document, returns the encoding declared _within_ + the document. + """ if not self.is_html: return None return self.detector.declared_encoding def find_codec(self, charset): + """Convert the name of a character set to a codec name. + + :param charset: The name of a character set. + :return: The name of a codec. + """ value = (self._codec(self.CHARSET_ALIASES.get(charset, charset)) or (charset and self._codec(charset.replace("-", ""))) or (charset and self._codec(charset.replace("-", "_"))) @@ -814,12 +878,16 @@ class UnicodeDammit: Currently the only situation supported is Windows-1252 (or its subset ISO-8859-1), embedded in UTF-8. - The input must be a bytestring. If you've already converted - the document to Unicode, you're too late. - - The output is a bytestring in which `embedded_encoding` - characters have been converted to their `main_encoding` - equivalents. + :param in_bytes: A bytestring that you suspect contains + characters from multiple encodings. Note that this _must_ + be a bytestring. If you've already converted the document + to Unicode, you're too late. + :param main_encoding: The primary encoding of `in_bytes`. + :param embedded_encoding: The encoding that was used to embed characters + in the main document. + :return: A bytestring in which `embedded_encoding` + characters have been converted to their `main_encoding` + equivalents. """ if embedded_encoding.replace('_', '-').lower() not in ( 'windows-1252', 'windows_1252'): |