diff options
-rw-r--r-- | NEWS.txt | 3 |
-rw-r--r-- | bs4/__init__.py | 13 |
-rw-r--r-- | bs4/tests/test_soup.py | 6 |
3 files changed, 20 insertions, 2 deletions
@@ -4,6 +4,9 @@
   ASCII when checking whether or not it was a file on disk.
   [bug=1227016]
 
+* Fixed a crash when a short input contains data not valid in
+  filenames. [bug=1232604]
+
 * Combined two tests to stop a spurious test failure when tests are
   run by nosetests. [bug=1212445]
 
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 6d44c95..341efc6 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -164,11 +164,20 @@ class BeautifulSoup(Tag):
             # involving passing non-markup to Beautiful Soup.
             # Beautiful Soup will still parse the input as markup,
             # just in case that's what the user really wants.
-            if isinstance(markup, unicode):
+            if (isinstance(markup, unicode)
+                and not os.path.supports_unicode_filenames):
                 possible_filename = markup.encode("utf8")
             else:
                 possible_filename = markup
-            if os.path.exists(possible_filename):
+            is_file = False
+            try:
+                is_file = os.path.exists(possible_filename)
+            except Exception, e:
+                # This is almost certainly a problem involving
+                # characters not valid in filenames on this
+                # system. Just let it go.
+                pass
+            if is_file:
                 warnings.warn(
                     '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
             if markup[:5] == "http:" or markup[:6] == "https:":
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index 79a2bc5..b0247fe 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -43,6 +43,12 @@ class TestConstructor(SoupTest):
         soup = self.soup(data)
         self.assertEqual(u"éé", soup.h1.string)
 
+    def test_embedded_null(self):
+        data = u"<h1>foo\0bar</h1>"
+        soup = self.soup(data)
+        self.assertEqual(u"foo\0bar", soup.h1.string)
+
+
+
 class TestDeprecatedConstructorArguments(SoupTest):
 
     def test_parseOnlyThese_renamed_to_parse_only(self):