diff options
-rw-r--r-- | NEWS.txt | 4 | ||||
-rw-r--r-- | bs4/__init__.py | 6 | ||||
-rw-r--r-- | bs4/builder/_htmlparser.py | 10 | ||||
-rw-r--r-- | bs4/tests/test_soup.py | 7 | ||||
-rw-r--r-- | doc/source/index.rst | 4 |
5 files changed, 27 insertions, 4 deletions
@@ -1,5 +1,9 @@
 = 4.3.2 (Unreleased) =
 
+* Fixed a bug in which short Unicode input was improperly encoded to
+  ASCII when checking whether or not it was a file on
+  disk. [bug=1227016]
+
 * Combined two tests to stop a spurious test failure when tests are
   run by nosetests. [bug=1212445]
 
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 272d44a..6d44c95 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -164,7 +164,11 @@ class BeautifulSoup(Tag):
             # involving passing non-markup to Beautiful Soup.
             # Beautiful Soup will still parse the input as markup,
             # just in case that's what the user really wants.
-            if os.path.exists(markup):
+            if isinstance(markup, unicode):
+                possible_filename = markup.encode("utf8")
+            else:
+                possible_filename = markup
+            if os.path.exists(possible_filename):
                 warnings.warn(
                     '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
             if markup[:5] == "http:" or markup[:6] == "https:":
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index 4b80f79..ca8d8b8 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -45,7 +45,15 @@ HTMLPARSER = 'html.parser'
 class BeautifulSoupHTMLParser(HTMLParser):
     def handle_starttag(self, name, attrs):
         # XXX namespace
-        self.soup.handle_starttag(name, None, None, dict(attrs))
+        attr_dict = {}
+        for key, value in attrs:
+            # Change None attribute values to the empty string
+            # for consistency with the other tree builders.
+            if value is None:
+                value = ''
+            attr_dict[key] = value
+            attrvalue = '""'
+        self.soup.handle_starttag(name, None, None, attr_dict)
 
     def handle_endtag(self, name):
         self.soup.handle_endtag(name)
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index de93513..79a2bc5 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -36,6 +36,13 @@ except ImportError, e:
 PYTHON_2_PRE_2_7 = (sys.version_info < (2,7))
 PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
 
+class TestConstructor(SoupTest):
+
+    def test_short_unicode_input(self):
+        data = u"<h1>éé</h1>"
+        soup = self.soup(data)
+        self.assertEqual(u"éé", soup.h1.string)
+
 class TestDeprecatedConstructorArguments(SoupTest):
 
     def test_parseOnlyThese_renamed_to_parse_only(self):
diff --git a/doc/source/index.rst b/doc/source/index.rst
index f7ee5f9..0e5f6d1 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -1213,8 +1213,8 @@ against each tag's 'href' attribute::
 You can filter an attribute based on `a string`_, `a regular
 expression`_, `a list`_, `a function`_, or `the value True`_.
 
-This code finds all tags that have an ``id`` attribute, regardless of
-what the value is::
+This code finds all tags whose ``id`` attribute has a value,
+regardless of what the value is::
 
   soup.find_all(id=True)
   # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,