diff options
-rw-r--r-- | bs4/__init__.py | 8 | ||||
-rw-r--r-- | bs4/builder/__init__.py | 1 | ||||
-rw-r--r-- | bs4/builder/_lxml.py | 5 | ||||
-rw-r--r-- | doc/source/index.rst | 4 |
4 files changed, 10 insertions, 8 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py index 34a72e4..b74acee 100644 --- a/bs4/__init__.py +++ b/bs4/__init__.py @@ -77,7 +77,7 @@ class BeautifulSoup(Tag): ASCII_SPACES = '\x20\x0a\x09\x0c\x0d' - NO_PARSER_SPECIFIED_WARNING = 'Parser was not explicitly specified. Using the best available parser for this system ("%s"). The same code on other systems may use a different parser and behave differently.' + NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nTo get rid of this warning, change this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n" def __init__(self, markup="", features=None, builder=None, parse_only=None, from_encoding=None, **kwargs): @@ -155,9 +155,9 @@ class BeautifulSoup(Tag): % ",".join(features)) builder = builder_class() if not (original_features == builder.NAME or - (not isinstance(builder.NAME, basestring) and - original_features in builder.NAME)): - warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % builder.NAME) + original_features in builder.ALTERNATE_NAMES): + warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict( + parser=builder.NAME)) self.builder = builder self.is_xml = builder.is_xml diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py index 0e84fae..820bc80 100644 --- a/bs4/builder/__init__.py +++ b/bs4/builder/__init__.py @@ -81,6 +81,7 @@ class TreeBuilder(object): """Turn a document into a Beautiful Soup object tree.""" NAME = "[Unknown tree builder]" + ALTERNATE_NAMES = [] features = [] is_xml = False diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py index 110e9d2..978c8df 100644 --- a/bs4/builder/_lxml.py +++ b/bs4/builder/_lxml.py @@ -214,9 +214,10 @@ class LXMLTreeBuilderForXML(TreeBuilder): class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): - NAME = [LXML, "lxml-html"] + NAME = LXML + ALTERNATE_NAMES = ["lxml-html"] - features = NAME + [HTML, FAST, PERMISSIVE] + features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE] is_xml = False def default_parser(self, encoding): diff --git a/doc/source/index.rst b/doc/source/index.rst index 11d9f88..0d91c1c 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -68,7 +68,7 @@ Running the "three sisters" document through Beautiful Soup gives us a data structure:: from bs4 import BeautifulSoup - soup = BeautifulSoup(html_doc) + soup = BeautifulSoup(html_doc, 'html.parser') print(soup.prettify()) # <html> @@ -270,7 +270,7 @@ This table summarizes the advantages and disadvantages of each parser library: | lxml's HTML parser | ``BeautifulSoup(markup, "lxml")`` | * Very fast | * External C dependency | | | | * Lenient | | +----------------------+--------------------------------------------+--------------------------------+--------------------------+ -| lxml's XML parser | ``BeautifulSoup(markup, ["lxml", "xml"])`` | * Very fast | * External C dependency | +| lxml's XML parser | ``BeautifulSoup(markup, "lxml-xml")`` | * Very fast | * External C dependency | | | ``BeautifulSoup(markup, "xml")`` | * The only currently supported | | | | | XML parser | | +----------------------+--------------------------------------------+--------------------------------+--------------------------+ |