diff options
-rw-r--r-- | README.txt | 7 | ||||
-rw-r--r-- | TODO | 41 | ||||
-rw-r--r-- | bs4/element.py | 8 | ||||
-rw-r--r-- | bs4/tests/test_tree.py | 17 |
4 files changed, 55 insertions, 18 deletions
@@ -141,6 +141,13 @@ You can write this: (But the old code will still work.) +Some of the generators used to yield None after they were done, and +then stop. That was a bug. Now, the generators just stop. + +There are two new generators, .strings and .stripped_strings. .strings +yields NavigableString objects, and .stripped_strings yields Python +strings that have had whitespace stripped. + == tag.string is recursive == tag.string now operates recursively. If tag A contains a single tag B @@ -1,26 +1,43 @@ -soup.new_tar("<br>") should create an empty-element tag if the soup +Bugs +---- + +* I think whitespace may not be processed correctly. + +* Characters like & < > should always be converted to HTML entities on + output, even if substitute_html_entities is False. + +Big features +------------ + +* Add namespace support. + +* soup.new_tag("<br>") should create an empty-element tag if the soup was created with an HTML-aware builder, but not otherwise. This requires keeping around information about the builder. -Is whitespace being processed correctly? +Optimizations +------------- -if len(tag) > 3 and tag.endswith('Tag'): -> endswith('_tag') markup_attr_map can be optimized since it's always a map now. -Can we get rid of isList? -Split self.assertRaises(ValueError, tree.index, 1) into a separate test -Bare ampersands should be converted to HTML entities upon output. +BS3 features not yet ported +--------------------------- + +* In BS3, "soup.aTag" is the same as 'soup.find("a")'. This lets you +locate a tag called (let's say) "find" with attribute +access. "soup.find" won't do what you want, but "soup.findTag" will. -Add namespace support. +This still works in BS4 but it's deprecated. I could make +"soup.find_tag" work the same way as "soup.find('find')", but I don't +think it's worth it. 
-XML handling: +CDATA +----- The elementtree XMLParser has a strip_cdata argument that, when set to False, should allow Beautiful Soup to preserve CDATA sections instead -of treating them as text. (This argument is also present for -HTMLParser, but does nothing.) - -Later: +of treating them as text. Except it doesn't. (This argument is also +present for HTMLParser, and also does nothing there.) Currently, html5lib converts CDATA sections into comments. An as-yet-unreleased version of html5lib changes the parser's handling of diff --git a/bs4/element.py b/bs4/element.py index 75d5d9d..08a0181 100644 --- a/bs4/element.py +++ b/bs4/element.py @@ -1,6 +1,7 @@ import collections import re import sys +import warnings from bs4.dammit import EntitySubstitution DEFAULT_OUTPUT_ENCODING = "utf-8" @@ -618,7 +619,12 @@ class Tag(PageElement): def __getattr__(self, tag): #print "Getattr %s.%s" % (self.__class__, tag) if len(tag) > 3 and tag.endswith('Tag'): - return self.find(tag[:-3]) + # BS3: soup.aTag -> soup.find("a") + tag_name = tag[:-3] + warnings.warn( + '.%sTag is deprecated, use .find("%s") instead.' % ( + tag_name, tag_name)) + return self.find(tag_name) # We special case contents to avoid recursion. elif not tag.startswith("__") and not tag=="contents": return self.find(tag) diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py index 60b9b91..c991a85 100644 --- a/bs4/tests/test_tree.py +++ b/bs4/tests/test_tree.py @@ -2,7 +2,7 @@ """Tests for Beautiful Soup's tree traversal methods. The tree traversal methods are the main advantage of using Beautiful -Soup over other parsers. +Soup over just using a parser. Different parsers will build different Beautiful Soup trees given the same markup, but all Beautiful Soup trees can be traversed with the @@ -12,6 +12,7 @@ methods tested here. 
import copy import pickle import re +import warnings from bs4 import BeautifulSoup from bs4.builder import builder_registry from bs4.element import CData, SoupStrainer, Tag @@ -778,14 +779,20 @@ class TestElementObjects(SoupTest): self.assertEqual(len(soup.top.contents), 3) def test_member_access_invokes_find(self): - """Accessing a Python member .foo or .fooTag invokes find('foo')""" + """Accessing a Python member .foo invokes find('foo')""" soup = self.soup('<b><i></i></b>') self.assertEqual(soup.b, soup.find('b')) - self.assertEqual(soup.bTag, soup.find('b')) self.assertEqual(soup.b.i, soup.find('b').find('i')) - self.assertEqual(soup.bTag.iTag, soup.find('b').find('i')) self.assertEqual(soup.a, None) - self.assertEqual(soup.aTag, None) + + def test_deprecated_member_access(self): + soup = self.soup('<b><i></i></b>') + with warnings.catch_warnings(record=True) as w: + tag = soup.bTag + self.assertEqual(soup.b, tag) + self.assertEqual( + '.bTag is deprecated, use .find("b") instead.', + str(w[0].message)) def test_has_attr(self): """has_attr() checks for the presence of an attribute. |