diff options
author | Leonard Richardson <leonardr@segfault.org> | 2019-12-24 10:41:57 -0500 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2019-12-24 10:41:57 -0500 |
commit | bef726b23d0770860cd347b03009ffb027159572 (patch) | |
tree | 325b698568a6fcb63018753db4830a579254f6ca /bs4/builder/_lxml.py | |
parent | 5952879a2458fdeb74673d3ccd61fd312c7d66df (diff) |
Added docstrings for some but not all tree builders.
Diffstat (limited to 'bs4/builder/_lxml.py')
-rw-r--r-- | bs4/builder/_lxml.py | 46 |
1 files changed, 37 insertions, 9 deletions
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py index c83dd2d..1b44d75 100644 --- a/bs4/builder/_lxml.py +++ b/bs4/builder/_lxml.py @@ -62,10 +62,13 @@ class LXMLTreeBuilderForXML(TreeBuilder): # But instead we build an XMLParser or HTMLParser object to serve # as the target of parse messages, and those messages don't include # line numbers. + # See: https://bugs.launchpad.net/lxml/+bug/1846906 def initialize_soup(self, soup): """Let the BeautifulSoup object know about the standard namespace mapping. + + :param soup: A `BeautifulSoup`. """ super(LXMLTreeBuilderForXML, self).initialize_soup(soup) self._register_namespaces(self.DEFAULT_NSMAPS) @@ -75,6 +78,8 @@ class LXMLTreeBuilderForXML(TreeBuilder): while parsing the document. This might be useful later on when creating CSS selectors. + + :param mapping: A dictionary mapping namespace prefixes to URIs. """ for key, value in mapping.items(): if key and key not in self.soup._namespaces: @@ -84,14 +89,23 @@ class LXMLTreeBuilderForXML(TreeBuilder): self.soup._namespaces[key] = value def default_parser(self, encoding): - # This can either return a parser object or a class, which - # will be instantiated with default arguments. + """Find the default parser for the given encoding. + + :param encoding: A string. + :return: Either a parser object or a class, which + will be instantiated with default arguments. + """ if self._default_parser is not None: return self._default_parser return etree.XMLParser( target=self, strip_cdata=False, recover=True, encoding=encoding) def parser_for(self, encoding): + """Instantiate an appropriate parser for the given encoding. + + :param encoding: A string. + :return: A parser object such as an `etree.XMLParser`. + """ # Use the default parser. 
parser = self.default_parser(encoding) @@ -124,17 +138,31 @@ class LXMLTreeBuilderForXML(TreeBuilder): def prepare_markup(self, markup, user_specified_encoding=None, exclude_encodings=None, document_declared_encoding=None): - """ - :yield: A series of 4-tuples. + """Run any preliminary steps necessary to make incoming markup + acceptable to the parser. + + lxml really wants to get a bytestring and convert it to + Unicode itself. So instead of using UnicodeDammit to convert + the bytestring to Unicode using different encodings, this + implementation uses EncodingDetector to iterate over the + encodings, and tell lxml to try to parse the document as each + one in turn. + + :param markup: Some markup -- hopefully a bytestring. + :param user_specified_encoding: The user asked to try this encoding. + :param document_declared_encoding: The markup itself claims to be + in this encoding. + :param exclude_encodings: The user asked _not_ to try any of + these encodings. + + :yield: A series of 4-tuples: (markup, encoding, declared encoding, has undergone character replacement) - Each 4-tuple represents a strategy for parsing the document. + Each 4-tuple represents a strategy for converting the + document to Unicode and parsing it. Each strategy will be tried + in turn. """ - # Instead of using UnicodeDammit to convert the bytestring to - # Unicode using different encodings, use EncodingDetector to - # iterate over the encodings, and tell lxml to try to parse - # the document as each one in turn. is_html = not self.is_xml if is_html: self.processing_instruction_class = ProcessingInstruction |