summary | refs | log | tree | commit | diff
path: root/bs4
diff options
context:
space:
mode:
author: Leonard Richardson <leonardr@segfault.org> 2013-10-01 21:55:22 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2013-10-01 21:55:22 -0400
commit: 6a06b9d998ea9502a93db14ebb65395b20c1b30f (patch)
tree: 0999ebae9dcc14522bdbb8bb1a3289d7d7498cbf /bs4
parent: 623d8c13b79003921fd13b59328d0c28e01eabd0 (diff)
Fixed a bug in which short Unicode input was improperly encoded to ASCII when checking whether or not it was a file on
disk. [bug=1227016]
Diffstat (limited to 'bs4')
-rw-r--r-- bs4/__init__.py 6
-rw-r--r-- bs4/builder/_htmlparser.py 10
-rw-r--r-- bs4/tests/test_soup.py 7
3 files changed, 21 insertions, 2 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 272d44a..6d44c95 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -164,7 +164,11 @@ class BeautifulSoup(Tag):
# involving passing non-markup to Beautiful Soup.
# Beautiful Soup will still parse the input as markup,
# just in case that's what the user really wants.
- if os.path.exists(markup):
+ if isinstance(markup, unicode):
+ possible_filename = markup.encode("utf8")
+ else:
+ possible_filename = markup
+ if os.path.exists(possible_filename):
warnings.warn(
'"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
if markup[:5] == "http:" or markup[:6] == "https:":
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index 4b80f79..ca8d8b8 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -45,7 +45,15 @@ HTMLPARSER = 'html.parser'
class BeautifulSoupHTMLParser(HTMLParser):
def handle_starttag(self, name, attrs):
# XXX namespace
- self.soup.handle_starttag(name, None, None, dict(attrs))
+ attr_dict = {}
+ for key, value in attrs:
+ # Change None attribute values to the empty string
+ # for consistency with the other tree builders.
+ if value is None:
+ value = ''
+ attr_dict[key] = value
+ attrvalue = '""'
+ self.soup.handle_starttag(name, None, None, attr_dict)
def handle_endtag(self, name):
self.soup.handle_endtag(name)
diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py
index de93513..79a2bc5 100644
--- a/bs4/tests/test_soup.py
+++ b/bs4/tests/test_soup.py
@@ -36,6 +36,13 @@ except ImportError, e:
PYTHON_2_PRE_2_7 = (sys.version_info < (2,7))
PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
+class TestConstructor(SoupTest):
+
+ def test_short_unicode_input(self):
+ data = u"<h1>éé</h1>"
+ soup = self.soup(data)
+ self.assertEqual(u"éé", soup.h1.string)
+
class TestDeprecatedConstructorArguments(SoupTest):
def test_parseOnlyThese_renamed_to_parse_only(self):