summaryrefslogtreecommitdiff
path: root/bs4/builder/_lxml.py
diff options
context:
space:
mode:
author: Leonard Richardson <leonardr@segfault.org> 2019-12-24 10:41:57 -0500
committer: Leonard Richardson <leonardr@segfault.org> 2019-12-24 10:41:57 -0500
commit: bef726b23d0770860cd347b03009ffb027159572 (patch)
tree: 325b698568a6fcb63018753db4830a579254f6ca /bs4/builder/_lxml.py
parent: 5952879a2458fdeb74673d3ccd61fd312c7d66df (diff)
Added docstrings for some but not all tree builders.
Diffstat (limited to 'bs4/builder/_lxml.py')
-rw-r--r-- bs4/builder/_lxml.py 46
1 files changed, 37 insertions, 9 deletions
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index c83dd2d..1b44d75 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -62,10 +62,13 @@ class LXMLTreeBuilderForXML(TreeBuilder):
# But instead we build an XMLParser or HTMLParser object to serve
# as the target of parse messages, and those messages don't include
# line numbers.
+ # See: https://bugs.launchpad.net/lxml/+bug/1846906
def initialize_soup(self, soup):
"""Let the BeautifulSoup object know about the standard namespace
mapping.
+
+ :param soup: A `BeautifulSoup`.
"""
super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
self._register_namespaces(self.DEFAULT_NSMAPS)
@@ -75,6 +78,8 @@ class LXMLTreeBuilderForXML(TreeBuilder):
while parsing the document.
This might be useful later on when creating CSS selectors.
+
+ :param mapping: A dictionary mapping namespace prefixes to URIs.
"""
for key, value in mapping.items():
if key and key not in self.soup._namespaces:
@@ -84,14 +89,23 @@ class LXMLTreeBuilderForXML(TreeBuilder):
self.soup._namespaces[key] = value
def default_parser(self, encoding):
- # This can either return a parser object or a class, which
- # will be instantiated with default arguments.
+ """Find the default parser for the given encoding.
+
+ :param encoding: A string.
+ :return: Either a parser object or a class, which
+ will be instantiated with default arguments.
+ """
if self._default_parser is not None:
return self._default_parser
return etree.XMLParser(
target=self, strip_cdata=False, recover=True, encoding=encoding)
def parser_for(self, encoding):
+ """Instantiate an appropriate parser for the given encoding.
+
+ :param encoding: A string.
+ :return: A parser object such as an `etree.XMLParser`.
+ """
# Use the default parser.
parser = self.default_parser(encoding)
@@ -124,17 +138,31 @@ class LXMLTreeBuilderForXML(TreeBuilder):
def prepare_markup(self, markup, user_specified_encoding=None,
exclude_encodings=None,
document_declared_encoding=None):
- """
- :yield: A series of 4-tuples.
+ """Run any preliminary steps necessary to make incoming markup
+ acceptable to the parser.
+
+ lxml really wants to get a bytestring and convert it to
+ Unicode itself. So instead of using UnicodeDammit to convert
+ the bytestring to Unicode using different encodings, this
+ implementation uses EncodingDetector to iterate over the
+ encodings, and tell lxml to try to parse the document as each
+ one in turn.
+
+ :param markup: Some markup -- hopefully a bytestring.
+ :param user_specified_encoding: The user asked to try this encoding.
+ :param document_declared_encoding: The markup itself claims to be
+ in this encoding.
+ :param exclude_encodings: The user asked _not_ to try any of
+ these encodings.
+
+ :yield: A series of 4-tuples:
(markup, encoding, declared encoding,
has undergone character replacement)
- Each 4-tuple represents a strategy for parsing the document.
+ Each 4-tuple represents a strategy for converting the
+ document to Unicode and parsing it. Each strategy will be tried
+ in turn.
"""
- # Instead of using UnicodeDammit to convert the bytestring to
- # Unicode using different encodings, use EncodingDetector to
- # iterate over the encodings, and tell lxml to try to parse
- # the document as each one in turn.
is_html = not self.is_xml
if is_html:
self.processing_instruction_class = ProcessingInstruction