summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--NEWS.txt12
-rw-r--r--bs4/__init__.py23
-rw-r--r--bs4/builder/_html5lib.py18
-rw-r--r--bs4/builder/_htmlparser.py3
-rw-r--r--bs4/builder/_lxml.py62
-rw-r--r--bs4/element.py37
-rw-r--r--bs4/testing.py60
-rw-r--r--bs4/tests/test_html5lib.py6
-rw-r--r--bs4/tests/test_lxml.py12
-rw-r--r--bs4/tests/test_soup.py38
-rw-r--r--bs4/tests/test_tree.py21
-rw-r--r--setup.py2
12 files changed, 246 insertions, 48 deletions
diff --git a/NEWS.txt b/NEWS.txt
index cd5d305..c93541e 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -1,3 +1,15 @@
+= 4.0.0b8 () =
+
+* All tree builders now preserve namespace information in the
+ documents they parse.
+
+ However, there is no special support for namespace-oriented
+ searching or tree manipulation. When you search the tree, you need
+ to use namespace prefixes exactly as they're used in the original
+ document.
+
+* The string representation of a DOCTYPE always ends in a newline.
+
= 4.0.0b7 (20110223) =
* Upon decoding to string, any characters that can't be represented in
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 13dac85..9b5c155 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -17,7 +17,7 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""
__author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "4.0.0b7"
+__version__ = "4.0.0b8"
__copyright__ = "Copyright (c) 2004-2012 Leonard Richardson"
__license__ = "MIT"
@@ -193,9 +193,9 @@ class BeautifulSoup(Tag):
self.tagStack = []
self.pushTag(self)
- def new_tag(self, name, **attrs):
+ def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
"""Create a new tag associated with this soup."""
- return Tag(None, self.builder, name, attrs)
+ return Tag(None, self.builder, name, namespace, nsprefix, attrs)
def new_string(self, s):
"""Create a new NavigableString associated with this soup."""
@@ -249,7 +249,7 @@ class BeautifulSoup(Tag):
self.previous_element = o
self.currentTag.contents.append(o)
- def _popToTag(self, name, inclusivePop=True):
+ def _popToTag(self, name, nsprefix=None, inclusivePop=True):
"""Pops the tag stack up to and including the most recent
instance of the given tag. If inclusivePop is false, pops the tag
stack up to but *not* including the most recent instance of
@@ -262,7 +262,8 @@ class BeautifulSoup(Tag):
mostRecentTag = None
for i in range(len(self.tagStack) - 1, 0, -1):
- if name == self.tagStack[i].name:
+ if (name == self.tagStack[i].name
+ and nsprefix == self.tagStack[i].nsprefix == nsprefix):
numPops = len(self.tagStack) - i
break
if not inclusivePop:
@@ -272,7 +273,7 @@ class BeautifulSoup(Tag):
mostRecentTag = self.popTag()
return mostRecentTag
- def handle_starttag(self, name, attrs):
+ def handle_starttag(self, name, namespace, nsprefix, attrs):
"""Push a start tag on to the stack.
If this method returns None, the tag was rejected by the
@@ -281,7 +282,7 @@ class BeautifulSoup(Tag):
don't call handle_endtag.
"""
- #print "Start tag %s: %s" % (name, attrs)
+ # print "Start tag %s: %s" % (name, attrs)
self.endData()
if (self.parse_only and len(self.tagStack) <= 1
@@ -289,8 +290,8 @@ class BeautifulSoup(Tag):
or not self.parse_only.search_tag(name, attrs))):
return None
- tag = Tag(self, self.builder, name, attrs, self.currentTag,
- self.previous_element)
+ tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
+ self.currentTag, self.previous_element)
if tag is None:
return tag
if self.previous_element:
@@ -299,10 +300,10 @@ class BeautifulSoup(Tag):
self.pushTag(tag)
return tag
- def handle_endtag(self, name):
+ def handle_endtag(self, name, nsprefix=None):
#print "End tag: " + name
self.endData()
- self._popToTag(name)
+ self._popToTag(name, nsprefix)
def handle_data(self, data):
self.currentData.append(data)
diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py
index 0d7a1a9..26b1773 100644
--- a/bs4/builder/_html5lib.py
+++ b/bs4/builder/_html5lib.py
@@ -8,12 +8,9 @@ from bs4.builder import (
HTML_5,
HTMLTreeBuilder,
)
+from bs4.element import NamespacedAttribute
import html5lib
-from html5lib.constants import (
- DataLossWarning,
- namespaces,
- )
-import warnings
+from html5lib.constants import namespaces
from bs4.element import (
Comment,
Doctype,
@@ -58,9 +55,6 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
def __init__(self, soup, namespaceHTMLElements):
self.soup = soup
- if namespaceHTMLElements:
- warnings.warn("namespaceHTMLElements not supported yet",
- DataLossWarning)
super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
def documentClass(self):
@@ -76,9 +70,7 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
self.soup.object_was_parsed(doctype)
def elementClass(self, name, namespace):
- if namespace is not None:
- warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning)
- tag = self.soup.new_tag(name)
+ tag = self.soup.new_tag(name, namespace)
return Element(tag, self.soup, namespace)
def commentClass(self, data):
@@ -144,6 +136,8 @@ class Element(html5lib.treebuilders._base.Node):
def setAttributes(self, attributes):
if attributes is not None and attributes != {}:
for name, value in list(attributes.items()):
+ if isinstance(name, tuple):
+ name = NamespacedAttribute(*name)
self.element[name] = value
# The attributes may contain variables that need substitution.
# Call set_up_substitutions manually.
@@ -189,7 +183,7 @@ class Element(html5lib.treebuilders._base.Node):
TextNode(child, self.soup))
def cloneNode(self):
- tag = self.soup.new_tag(self.element.name)
+ tag = self.soup.new_tag(self.element.name, self.namespace)
node = Element(tag, self.soup, self.namespace)
for key,value in self.attributes:
node.attributes[key] = value
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index 62473cf..c307ff8 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -40,7 +40,8 @@ HTMLPARSER = 'html.parser'
class BeautifulSoupHTMLParser(HTMLParser):
def handle_starttag(self, name, attrs):
- self.soup.handle_starttag(name, dict(attrs))
+ # XXX namespace: HTMLParser supplies only a tag name and attrs, so namespace and nsprefix are passed as None.
+ self.soup.handle_starttag(name, None, None, dict(attrs))
def handle_endtag(self, name):
self.soup.handle_endtag(name)
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index cc3cb86..e5e30d4 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -5,7 +5,7 @@ __all__ = [
import collections
from lxml import etree
-from bs4.element import Comment, Doctype
+from bs4.element import Comment, Doctype, NamespacedAttribute
from bs4.builder import (
FAST,
HTML,
@@ -42,6 +42,15 @@ class LXMLTreeBuilderForXML(TreeBuilder):
parser = parser(target=self, strip_cdata=False)
self.parser = parser
self.soup = None
+ self.nsmaps = None
+
+ def _getNsTag(self, tag):
+ # Split the namespace URL out of a fully-qualified lxml tag
+ # name. Copied from lxml's src/lxml/sax.py.
+ if tag[0] == '{':
+ return tuple(tag[1:].split('}', 1))
+ else:
+ return (None, tag)
def prepare_markup(self, markup, user_specified_encoding=None,
document_declared_encoding=None):
@@ -63,15 +72,56 @@ class LXMLTreeBuilderForXML(TreeBuilder):
self.parser.close()
def close(self):
- pass
-
- def start(self, name, attrs):
- self.soup.handle_starttag(name, attrs)
+ self.nsmaps = None
+
+ def start(self, name, attrs, nsmap={}):
+ nsprefix = None
+ # Invert each namespace map as it comes in.
+ if len(nsmap) == 0 and self.nsmaps != None:
+ # There are no new namespaces for this tag, but namespaces
+ # are in play, so we need a separate tag stack to know
+ # when they end.
+ self.nsmaps.append(None)
+ elif len(nsmap) > 0:
+ # A new namespace mapping has come into play.
+ if self.nsmaps is None:
+ self.nsmaps = []
+ inverted_nsmap = dict((value, key) for key, value in nsmap.items())
+ self.nsmaps.append(inverted_nsmap)
+ # Also treat the namespace mapping as a set of attributes on the
+ # tag, so we can recreate it later.
+ attrs = attrs.copy()
+ for prefix, namespace in nsmap.items():
+ attribute = NamespacedAttribute(
+ "xmlns", prefix, "http://www.w3.org/2000/xmlns/")
+ attrs[attribute] = namespace
+ namespace, name = self._getNsTag(name)
+ if namespace is not None:
+ for inverted_nsmap in reversed(self.nsmaps):
+ if inverted_nsmap is not None and namespace in inverted_nsmap:
+ nsprefix = inverted_nsmap[namespace]
+ break
+ self.soup.handle_starttag(name, namespace, nsprefix, attrs)
def end(self, name):
self.soup.endData()
completed_tag = self.soup.tagStack[-1]
- self.soup.handle_endtag(name)
+ namespace, name = self._getNsTag(name)
+ nsprefix = None
+ if namespace is not None:
+ for inverted_nsmap in reversed(self.nsmaps):
+ if inverted_nsmap is not None and namespace in inverted_nsmap:
+ nsprefix = inverted_nsmap[namespace]
+ break
+ self.soup.handle_endtag(name, nsprefix)
+ if self.nsmaps != None:
+ # This tag, or one of its parents, introduced a namespace
+ # mapping, so pop it off the stack.
+ self.nsmaps.pop()
+ if len(self.nsmaps) == 0:
+ # Namespaces are no longer in play, so don't bother keeping
+ # track of the namespace stack.
+ self.nsmaps = None
def pi(self, target, data):
pass
diff --git a/bs4/element.py b/bs4/element.py
index 997378a..efc6ec7 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -22,6 +22,19 @@ def _alias(attr):
return alias
+class NamespacedAttribute(unicode):
+
+ def __new__(cls, prefix, name, namespace=None):
+ if name is None:
+ obj = unicode.__new__(cls, prefix)
+ else:
+ obj = unicode.__new__(cls, prefix + ":" + name)
+ obj.prefix = prefix
+ obj.name = name
+ obj.namespace = namespace
+ return obj
+
+
class PageElement(object):
"""Contains the navigational information for some part of the page
(either a tag or a piece of text)"""
@@ -500,15 +513,15 @@ class Doctype(NavigableString):
return Doctype(value)
PREFIX = u'<!DOCTYPE '
- SUFFIX = u'>'
+ SUFFIX = u'>\n'
class Tag(PageElement):
"""Represents a found HTML tag with its attributes and contents."""
- def __init__(self, parser=None, builder=None, name=None, attrs=None,
- parent=None, previous=None):
+ def __init__(self, parser=None, builder=None, name=None, namespace=None,
+ nsprefix=None, attrs=None, parent=None, previous=None):
"Basic constructor."
if parser is None:
@@ -520,6 +533,8 @@ class Tag(PageElement):
if name is None:
raise ValueError("No value provided for new tag's name.")
self.name = name
+ self.namespace = namespace
+ self.nsprefix = nsprefix
if attrs is None:
attrs = {}
else:
@@ -659,6 +674,9 @@ class Tag(PageElement):
def has_attr(self, key):
return key in self.attrs
+ def __hash__(self):
+ return str(self).__hash__()
+
def __getitem__(self, key):
"""tag[key] returns the value of the 'key' attribute for the tag,
and throws an exception if it's not there."""
@@ -779,7 +797,7 @@ class Tag(PageElement):
and '%SOUP-ENCODING%' in val):
val = self.substitute_encoding(val, eventual_encoding)
- decoded = (key + '='
+ decoded = (str(key) + '='
+ EntitySubstitution.substitute_xml(val, True))
attrs.append(decoded)
close = ''
@@ -789,6 +807,10 @@ class Tag(PageElement):
else:
closeTag = '</%s>' % self.name
+ prefix = ''
+ if self.nsprefix:
+ prefix = self.nsprefix + ":"
+
pretty_print = (indent_level is not None)
if pretty_print:
space = (' ' * (indent_level - 1))
@@ -809,7 +831,8 @@ class Tag(PageElement):
attribute_string = ' ' + ' '.join(attrs)
if pretty_print:
s.append(space)
- s.append('<%s%s%s>' % (self.name, attribute_string, close))
+ s.append('<%s%s%s%s>' % (
+ prefix, self.name, attribute_string, close))
if pretty_print:
s.append("\n")
s.append(contents)
@@ -986,7 +1009,7 @@ class SoupStrainer(object):
searchTag = search_tag
def search(self, markup):
- #print 'looking for %s in %s' % (self, markup)
+ # print 'looking for %s in %s' % (self, markup)
found = None
# If given a list of items, scan it for a text element that
# matches.
@@ -1012,7 +1035,7 @@ class SoupStrainer(object):
return found
def _matches(self, markup, match_against):
- #print "Matching %s against %s" % (markup, match_against)
+ # print "Matching %s against %s" % (markup, match_against)
result = False
if isinstance(markup, list) or isinstance(markup, tuple):
diff --git a/bs4/testing.py b/bs4/testing.py
index dc20812..d7b01aa 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -358,6 +358,66 @@ class HTMLTreeBuilderSmokeTest(object):
# For the rest of the story, see TestSubstitutions in
# test_tree.py.
+class XMLTreeBuilderSmokeTest(object):
+
+ def test_docstring_generated(self):
+ soup = self.soup("<root/>")
+ self.assertEqual(
+ soup.encode(), b'<?xml version="1.0" encoding="utf-8">\n<root/>')
+
+ def test_docstring_includes_correct_encoding(self):
+ soup = self.soup("<root/>")
+ self.assertEqual(
+ soup.encode("latin1"),
+ b'<?xml version="1.0" encoding="latin1">\n<root/>')
+
+ def test_real_xhtml_document(self):
+ """A real XHTML document should come out the same as it went in."""
+ markup = b"""<?xml version="1.0" encoding="utf-8">
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head><title>Hello.</title></head>
+<body>Goodbye.</body>
+</html>"""
+ soup = self.soup(markup)
+ self.assertEqual(soup.encode("utf-8"), markup)
+
+
+ def test_tags_are_empty_element_if_and_only_if_they_are_empty(self):
+ self.assertSoupEquals("<p>", "<p/>")
+ self.assertSoupEquals("<p>foo</p>")
+
+ def test_namespaces_are_preserved(self):
+ markup = '<root xmlns:a="http://example.com/" xmlns:b="http://example.net/"><a:foo>This tag is in the a namespace</a:foo><b:foo>This tag is in the b namespace</b:foo></root>'
+ soup = self.soup(markup)
+ root = soup.root
+ self.assertEqual("http://example.com/", root['xmlns:a'])
+ self.assertEqual("http://example.net/", root['xmlns:b'])
+
+
+class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
+ """Smoke test for a tree builder that supports HTML5."""
+
+ def test_html_tags_have_namespace(self):
+ markup = "<a>"
+ soup = self.soup(markup)
+ self.assertEqual("http://www.w3.org/1999/xhtml", soup.a.namespace)
+
+ def test_svg_tags_have_namespace(self):
+ markup = '<svg><circle/></svg>'
+ soup = self.soup(markup)
+ namespace = "http://www.w3.org/2000/svg"
+ self.assertEqual(namespace, soup.svg.namespace)
+ self.assertEqual(namespace, soup.circle.namespace)
+
+
+ def test_mathml_tags_have_namespace(self):
+ markup = '<math><msqrt>5</msqrt></math>'
+ soup = self.soup(markup)
+ namespace = 'http://www.w3.org/1998/Math/MathML'
+ self.assertEqual(namespace, soup.math.namespace)
+ self.assertEqual(namespace, soup.msqrt.namespace)
+
def skipIf(condition, reason):
def nothing(test, *args, **kwargs):
diff --git a/bs4/tests/test_html5lib.py b/bs4/tests/test_html5lib.py
index f1edddf..0828cfd 100644
--- a/bs4/tests/test_html5lib.py
+++ b/bs4/tests/test_html5lib.py
@@ -7,7 +7,7 @@ except ImportError, e:
HTML5LIB_PRESENT = False
from bs4.element import SoupStrainer
from bs4.testing import (
- HTMLTreeBuilderSmokeTest,
+ HTML5TreeBuilderSmokeTest,
SoupTest,
skipIf,
)
@@ -15,8 +15,8 @@ from bs4.testing import (
@skipIf(
not HTML5LIB_PRESENT,
"html5lib seems not to be present, not testing its tree builder.")
-class HTML5LibBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
- """See ``HTMLTreeBuilderSmokeTest``."""
+class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
+ """See ``HTML5TreeBuilderSmokeTest``."""
@property
def default_builder(self):
diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py
index 92b7389..27ec570 100644
--- a/bs4/tests/test_lxml.py
+++ b/bs4/tests/test_lxml.py
@@ -14,6 +14,7 @@ from bs4.testing import skipIf
from bs4.tests import test_htmlparser
from bs4.testing import (
HTMLTreeBuilderSmokeTest,
+ XMLTreeBuilderSmokeTest,
SoupTest,
skipIf,
)
@@ -35,3 +36,14 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
"<p>foo&#x10000000000000;bar</p>", "<p>foobar</p>")
self.assertSoupEquals(
"<p>foo&#1000000000;bar</p>", "<p>foobar</p>")
+
+@skipIf(
+ not LXML_PRESENT,
+ "lxml seems not to be present, not testing its XML tree builder.")
+class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
+ """See ``XMLTreeBuilderSmokeTest``."""
+
+ @property
+ def default_builder(self):
+ return LXMLTreeBuilderForXML()
+
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index 2b7c003..33ab0fa 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -3,7 +3,10 @@
import unittest
from bs4 import BeautifulSoup
-from bs4.element import SoupStrainer
+from bs4.element import (
+ SoupStrainer,
+ NamespacedAttribute,
+ )
from bs4.dammit import EntitySubstitution, UnicodeDammit
from bs4.testing import SoupTest
import warnings
@@ -16,7 +19,7 @@ class TestDeprecatedConstructorArguments(SoupTest):
msg = str(w[0].message)
self.assertTrue("parseOnlyThese" in msg)
self.assertTrue("parse_only" in msg)
- self.assertEquals(b"<b></b>", soup.encode())
+ self.assertEqual(b"<b></b>", soup.encode())
def test_fromEncoding_renamed_to_from_encoding(self):
with warnings.catch_warnings(record=True) as w:
@@ -25,7 +28,7 @@ class TestDeprecatedConstructorArguments(SoupTest):
msg = str(w[0].message)
self.assertTrue("fromEncoding" in msg)
self.assertTrue("from_encoding" in msg)
- self.assertEquals("utf8", soup.original_encoding)
+ self.assertEqual("utf8", soup.original_encoding)
def test_unrecognized_keyword_argument(self):
self.assertRaises(
@@ -206,7 +209,7 @@ class TestUnicodeDammit(unittest.TestCase):
b"<html><meta charset=euc-jp /></html>",
b"<html><meta charset=euc-jp/></html>"):
dammit = UnicodeDammit(data, is_html=True)
- self.assertEquals(
+ self.assertEqual(
"euc-jp", dammit.original_encoding)
def test_last_ditch_entity_replacement(self):
@@ -233,3 +236,30 @@ class TestUnicodeDammit(unittest.TestCase):
msg = w[0].message
self.assertTrue(isinstance(msg, UnicodeWarning))
self.assertTrue("Some characters could not be decoded" in str(msg))
+
+
+class TestNamedspacedAttribute(SoupTest):
+
+ def test_name_may_be_none(self):
+ a = NamespacedAttribute("xmlns", None)
+ self.assertEqual(a, "xmlns")
+
+ def test_attribute_is_equivalent_to_colon_separated_string(self):
+ a = NamespacedAttribute("a", "b")
+ self.assertEqual("a:b", a)
+
+ def test_attributes_are_equivalent_if_prefix_and_name_identical(self):
+ a = NamespacedAttribute("a", "b", "c")
+ b = NamespacedAttribute("a", "b", "c")
+ self.assertEqual(a, b)
+
+ # The actual namespace is not considered.
+ c = NamespacedAttribute("a", "b", None)
+ self.assertEqual(a, c)
+
+ # But name and prefix are important.
+ d = NamespacedAttribute("a", "z", "c")
+ self.assertNotEqual(a, d)
+
+ e = NamespacedAttribute("z", "b", "c")
+ self.assertNotEqual(a, e)
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
index 6aa02cb..c75b561 100644
--- a/bs4/tests/test_tree.py
+++ b/bs4/tests/test_tree.py
@@ -18,7 +18,13 @@ from bs4.builder import (
builder_registry,
HTMLParserTreeBuilder,
)
-from bs4.element import CData, NavigableString, SoupStrainer, Tag
+from bs4.element import (
+ CData,
+ Doctype,
+ NavigableString,
+ SoupStrainer,
+ Tag,
+)
from bs4.testing import (
SoupTest,
skipIf,
@@ -97,8 +103,8 @@ class TestFindAllBasicNamespaces(TreeTest):
def test_find_by_namespaced_name(self):
soup = self.soup('<mathml:msqrt>4</mathml:msqrt><a svg:fill="red">')
- self.assertEquals("4", soup.find("mathml:msqrt").string)
- self.assertEquals("a", soup.find(attrs= { "svg:fill" : "red" }).name)
+ self.assertEqual("4", soup.find("mathml:msqrt").string)
+ self.assertEqual("a", soup.find(attrs= { "svg:fill" : "red" }).name)
class TestFindAllByName(TreeTest):
@@ -1277,3 +1283,12 @@ class TestNavigableStringSubclasses(SoupTest):
self.assertEqual(str(soup), "<![CDATA[foo]]>")
self.assertEqual(soup.find(text="foo"), "foo")
self.assertEqual(soup.contents[0], "foo")
+
+ def test_doctype_ends_in_newline(self):
+ # Unlike other NavigableString subclasses, a DOCTYPE always ends
+ # in a newline.
+ doctype = Doctype("foo")
+ soup = self.soup("")
+ soup.insert(1, doctype)
+ self.assertEqual(soup.encode(), b"<!DOCTYPE foo>\n")
+
diff --git a/setup.py b/setup.py
index 0d5b7d7..878d06c 100644
--- a/setup.py
+++ b/setup.py
@@ -7,7 +7,7 @@ except ImportError:
from distutils.command.build_py import build_py
setup(name="beautifulsoup4",
- version = "4.0.0b7",
+ version = "4.0.0b8",
author="Leonard Richardson",
author_email='leonardr@segfault.org',
url="http://www.crummy.com/software/BeautifulSoup/bs4/",