diff options
author | Leonard Richardson <leonardr@segfault.org> | 2023-03-26 13:27:11 -0400 |
---|---|---|
committer | Leonard Richardson <leonardr@segfault.org> | 2023-03-26 13:27:11 -0400 |
commit | 544a069c2d802f726c888e9f921f7dc9b79f341a (patch) | |
tree | 11f74078fe7117d393e0367827e96d71bcc4b883 | |
parent | f7dbd541b2473f5e72b818bb64cea434c39d0725 (diff) |
Implement a proper BeautifulSoup.deepcopy rather than parsing the document again.
-rw-r--r-- | CHANGELOG | 8 | ||||
-rw-r--r-- | bs4/__init__.py | 24 | ||||
-rw-r--r-- | bs4/element.py | 43 |
3 files changed, 47 insertions, 28 deletions
```diff
@@ -11,8 +11,14 @@ Python 2 was revision 70f546b1e689a70e2f103795efce6d261a3dadf7.
   tags than there are levels in the Python interpreter stack.
   [bug=1471755]
 
+* The copy and deepcopy algorithms no longer use recursive function
+  calls, either, making it possible to copy deeply nested trees.
+
+* Making a copy of a BeautifulSoup object no longer parses the document
+  again, which should improve performance significantly.
+
 * Tag.prettify() will now consistently end prettified markup with
-  a newline. This is a side effect of the work done for bug #1471755.
+  a newline.
 
 * Added unit tests for fuzz test cases created by third parties.
   Most of these tests are skipped since they either point
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 5e1bebe..9c7e67d 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -117,7 +117,7 @@ class BeautifulSoup(Tag):
     ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
 
     NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"
-
+
     def __init__(self, markup="", features=None, builder=None,
                  parse_only=None, from_encoding=None, exclude_encodings=None,
                  element_classes=None, **kwargs):
@@ -349,19 +349,19 @@ class BeautifulSoup(Tag):
         self.markup = None
         self.builder.soup = None
 
-    def __copy__(self):
-        """Copy a BeautifulSoup object by converting the document to a string and parsing it again."""
-        copy = type(self)(
-            self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
-        )
+    def _clone(self):
+        """Create a new BeautifulSoup object with the same TreeBuilder,
+        but not associated with any markup.
 
-        # Although we encoded the tree to UTF-8, that may not have
-        # been the encoding of the original markup. Set the copy's
-        # .original_encoding to reflect the original object's
-        # .original_encoding.
-        copy.original_encoding = self.original_encoding
-        return copy
+        This is the first step of the deepcopy process.
+        """
+        clone = type(self)("", None, self.builder)
 
+        # Keep track of the encoding of the original document,
+        # since we won't be parsing it again.
+        clone.original_encoding = self.original_encoding
+        return clone
+
     def __getstate__(self):
         # Frequently a tree builder can't be pickled.
         d = dict(self.__dict__)
diff --git a/bs4/element.py b/bs4/element.py
index 018f2b3..e75d326 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -966,7 +966,7 @@ class NavigableString(str, PageElement):
         return type(self)(self)
 
     def __copy__(self):
-        return self.__deepcopy__()
+        return self.__deepcopy__({})
 
     def __getnewargs__(self):
         return (str(self),)
@@ -1312,21 +1312,11 @@ class Tag(PageElement):
 
     parserClass = _alias("parser_class")  # BS3
 
-    def __deepcopy__(self, recursive=True):
+    def __deepcopy__(self, memo, recursive=True):
         """A deepcopy of a Tag is a new Tag, unconnected to the parse tree.
         Its contents are a copy of the old Tag's contents.
         """
-        clone = type(self)(
-            None, self.builder, self.name, self.namespace,
-            self.prefix, self.attrs, is_xml=self._is_xml,
-            sourceline=self.sourceline, sourcepos=self.sourcepos,
-            can_be_empty_element=self.can_be_empty_element,
-            cdata_list_attributes=self.cdata_list_attributes,
-            preserve_whitespace_tags=self.preserve_whitespace_tags,
-            interesting_string_types=self.interesting_string_types
-        )
-        for attr in ('can_be_empty_element', 'hidden'):
-            setattr(clone, attr, getattr(self, attr))
+        clone = self._clone()
 
         if recursive:
             # Clone this tag's descendants recursively, but without
@@ -1338,7 +1328,9 @@ class Tag(PageElement):
                     # just closed.
                     tag_stack.pop()
                 else:
-                    descendant_clone = element.__copy__(recursive=False)
+                    descendant_clone = element.__deepcopy__(
+                        memo, recursive=False
+                    )
                     # Add to its parent's .contents
                     tag_stack[-1].append(descendant_clone)
 
@@ -1349,8 +1341,29 @@ class Tag(PageElement):
         return clone
 
     def __copy__(self):
-        return self.__deepcopy__()
+        # A copy of a Tag must always be a deep copy, because the
+        # Tag's children can only have one parent at a time.
+        return self.__deepcopy__({})
+
+    def _clone(self):
+        """Create a new Tag just like this one, but with no
+        contents and unattached to any parse tree.
 
+        This is the first step in the deepcopy process.
+        """
+        clone = type(self)(
+            None, self.builder, self.name, self.namespace,
+            self.prefix, self.attrs, is_xml=self._is_xml,
+            sourceline=self.sourceline, sourcepos=self.sourcepos,
+            can_be_empty_element=self.can_be_empty_element,
+            cdata_list_attributes=self.cdata_list_attributes,
+            preserve_whitespace_tags=self.preserve_whitespace_tags,
+            interesting_string_types=self.interesting_string_types
+        )
+        for attr in ('can_be_empty_element', 'hidden'):
+            setattr(clone, attr, getattr(self, attr))
+        return clone
+
     @property
     def is_empty_element(self):
         """Is this tag an empty-element tag? (aka a self-closing tag)
```