summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- NEWS.txt 3
-rw-r--r-- bs4/__init__.py 13
-rw-r--r-- bs4/tests/test_soup.py 6
3 files changed, 20 insertions, 2 deletions
diff --git a/NEWS.txt b/NEWS.txt
index 6df956a..259771e 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -4,6 +4,9 @@
ASCII when checking whether or not it was a file on
disk. [bug=1227016]
+* Fixed a crash when a short input contains data not valid in
+ filenames. [bug=1232604]
+
* Combined two tests to stop a spurious test failure when tests are
run by nosetests. [bug=1212445]
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 6d44c95..341efc6 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -164,11 +164,20 @@ class BeautifulSoup(Tag):
# involving passing non-markup to Beautiful Soup.
# Beautiful Soup will still parse the input as markup,
# just in case that's what the user really wants.
- if isinstance(markup, unicode):
+ if (isinstance(markup, unicode)
+ and not os.path.supports_unicode_filenames):
possible_filename = markup.encode("utf8")
else:
possible_filename = markup
- if os.path.exists(possible_filename):
+ is_file = False
+ try:
+ is_file = os.path.exists(possible_filename)
+ except Exception, e:
+ # This is almost certainly a problem involving
+ # characters not valid in filenames on this
+ # system. Just let it go.
+ pass
+ if is_file:
warnings.warn(
'"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
if markup[:5] == "http:" or markup[:6] == "https:":
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index 79a2bc5..b0247fe 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -43,6 +43,12 @@ class TestConstructor(SoupTest):
soup = self.soup(data)
self.assertEqual(u"éé", soup.h1.string)
+ def test_embedded_null(self):
+ data = u"<h1>foo\0bar</h1>"
+ soup = self.soup(data)
+ self.assertEqual(u"foo\0bar", soup.h1.string)
+
+
class TestDeprecatedConstructorArguments(SoupTest):
def test_parseOnlyThese_renamed_to_parse_only(self):