summary | refs | log | tree | commit | diff
path: root/bs4
diff options
context:
space:
mode:
Diffstat (limited to 'bs4')
-rw-r--r-- bs4/__init__.py | 13
-rw-r--r-- bs4/tests/test_soup.py | 6
2 files changed, 17 insertions, 2 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 6d44c95..341efc6 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -164,11 +164,20 @@ class BeautifulSoup(Tag):
# involving passing non-markup to Beautiful Soup.
# Beautiful Soup will still parse the input as markup,
# just in case that's what the user really wants.
- if isinstance(markup, unicode):
+ if (isinstance(markup, unicode)
+ and not os.path.supports_unicode_filenames):
possible_filename = markup.encode("utf8")
else:
possible_filename = markup
- if os.path.exists(possible_filename):
+ is_file = False
+ try:
+ is_file = os.path.exists(possible_filename)
+ except Exception, e:
+ # This is almost certainly a problem involving
+ # characters not valid in filenames on this
+ # system. Just let it go.
+ pass
+ if is_file:
warnings.warn(
'"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
if markup[:5] == "http:" or markup[:6] == "https:":
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index 79a2bc5..b0247fe 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -43,6 +43,12 @@ class TestConstructor(SoupTest):
soup = self.soup(data)
self.assertEqual(u"éé", soup.h1.string)
+ def test_embedded_null(self):
+ data = u"<h1>foo\0bar</h1>"
+ soup = self.soup(data)
+ self.assertEqual(u"foo\0bar", soup.h1.string)
+
+
class TestDeprecatedConstructorArguments(SoupTest):
def test_parseOnlyThese_renamed_to_parse_only(self):