summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- CHANGELOG 8
-rw-r--r-- bs4/__init__.py 24
-rw-r--r-- bs4/element.py 43
3 files changed, 47 insertions, 28 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 74619a1..6e3a423 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -11,8 +11,14 @@ Python 2 was revision 70f546b1e689a70e2f103795efce6d261a3dadf7.
tags than there are levels in the Python interpreter stack.
[bug=1471755]
+* The copy and deepcopy algorithms no longer use recursive function
+ calls, either, making it possible to copy deeply nested trees.
+
+* Making a copy of a BeautifulSoup object no longer parses the document
+ again, which should improve performance significantly.
+
* Tag.prettify() will now consistently end prettified markup with
- a newline. This is a side effect of the work done for bug #1471755.
+ a newline.
* Added unit tests for fuzz test cases created by third
parties. Most of these tests are skipped since they either point
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 5e1bebe..9c7e67d 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -117,7 +117,7 @@ class BeautifulSoup(Tag):
ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"
-
+
def __init__(self, markup="", features=None, builder=None,
parse_only=None, from_encoding=None, exclude_encodings=None,
element_classes=None, **kwargs):
@@ -349,19 +349,19 @@ class BeautifulSoup(Tag):
self.markup = None
self.builder.soup = None
- def __copy__(self):
- """Copy a BeautifulSoup object by converting the document to a string and parsing it again."""
- copy = type(self)(
- self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
- )
+ def _clone(self):
+ """Create a new BeautifulSoup object with the same TreeBuilder,
+ but not associated with any markup.
- # Although we encoded the tree to UTF-8, that may not have
- # been the encoding of the original markup. Set the copy's
- # .original_encoding to reflect the original object's
- # .original_encoding.
- copy.original_encoding = self.original_encoding
- return copy
+ This is the first step of the deepcopy process.
+ """
+ clone = type(self)("", None, self.builder)
+ # Keep track of the encoding of the original document,
+ # since we won't be parsing it again.
+ clone.original_encoding = self.original_encoding
+ return clone
+
def __getstate__(self):
# Frequently a tree builder can't be pickled.
d = dict(self.__dict__)
diff --git a/bs4/element.py b/bs4/element.py
index 018f2b3..e75d326 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -966,7 +966,7 @@ class NavigableString(str, PageElement):
return type(self)(self)
def __copy__(self):
- return self.__deepcopy__()
+ return self.__deepcopy__({})
def __getnewargs__(self):
return (str(self),)
@@ -1312,21 +1312,11 @@ class Tag(PageElement):
parserClass = _alias("parser_class") # BS3
- def __deepcopy__(self, recursive=True):
+ def __deepcopy__(self, memo, recursive=True):
"""A deepcopy of a Tag is a new Tag, unconnected to the parse tree.
Its contents are a copy of the old Tag's contents.
"""
- clone = type(self)(
- None, self.builder, self.name, self.namespace,
- self.prefix, self.attrs, is_xml=self._is_xml,
- sourceline=self.sourceline, sourcepos=self.sourcepos,
- can_be_empty_element=self.can_be_empty_element,
- cdata_list_attributes=self.cdata_list_attributes,
- preserve_whitespace_tags=self.preserve_whitespace_tags,
- interesting_string_types=self.interesting_string_types
- )
- for attr in ('can_be_empty_element', 'hidden'):
- setattr(clone, attr, getattr(self, attr))
+ clone = self._clone()
if recursive:
# Clone this tag's descendants recursively, but without
@@ -1338,7 +1328,9 @@ class Tag(PageElement):
# just closed.
tag_stack.pop()
else:
- descendant_clone = element.__copy__(recursive=False)
+ descendant_clone = element.__deepcopy__(
+ memo, recursive=False
+ )
# Add to its parent's .contents
tag_stack[-1].append(descendant_clone)
@@ -1349,8 +1341,29 @@ class Tag(PageElement):
return clone
def __copy__(self):
- return self.__deepcopy__()
+ # A copy of a Tag must always be a deep copy, because the
+ # Tag's children can only have one parent at a time.
+ return self.__deepcopy__({})
+
+ def _clone(self):
+ """Create a new Tag just like this one, but with no
+ contents and unattached to any parse tree.
+ This is the first step in the deepcopy process.
+ """
+ clone = type(self)(
+ None, self.builder, self.name, self.namespace,
+ self.prefix, self.attrs, is_xml=self._is_xml,
+ sourceline=self.sourceline, sourcepos=self.sourcepos,
+ can_be_empty_element=self.can_be_empty_element,
+ cdata_list_attributes=self.cdata_list_attributes,
+ preserve_whitespace_tags=self.preserve_whitespace_tags,
+ interesting_string_types=self.interesting_string_types
+ )
+ for attr in ('can_be_empty_element', 'hidden'):
+ setattr(clone, attr, getattr(self, attr))
+ return clone
+
@property
def is_empty_element(self):
"""Is this tag an empty-element tag? (aka a self-closing tag)