summaryrefslogtreecommitdiff
path: root/bs4/__init__.py
diff options
context:
space:
mode:
authorIsaac Muse <isaacmuse@gmail.com>2019-01-05 12:24:35 -0700
committerIsaac Muse <isaacmuse@gmail.com>2019-01-05 12:24:35 -0700
commit987cadea802228838cab3e7e6cd00a2542604a51 (patch)
tree0302cd786fb5545353c664cd18ecae85763076a5 /bs4/__init__.py
parentfed3ff99f661fbbf994469d522c9016e12be76d7 (diff)
Fix for performance with the linkage fix.
The exact situations have been pinned down, and now solve current known issues without excessive and aggressive recursion.
Diffstat (limited to 'bs4/__init__.py')
-rw-r--r--bs4/__init__.py137
1 files changed, 40 insertions, 97 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py
index c58ff85..3e34f63 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -435,113 +435,56 @@ class BeautifulSoup(Tag):
if previous_element is None:
previous_element = o.previous_element
+ fix = parent.next_element is not None
+
o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)
self._most_recent_element = o
parent.contents.append(o)
# Check if we are inserting into an already parsed node.
- if parent.next_element is not None:
-
- # Check that links are proper across tag parent boundaries
- child = self._linkage_fixer(parent)
+ if fix:
+ self._linkage_fixer(parent)
- def _linkage_fixer(self, el, _recursive_call=False):
+ def _linkage_fixer(self, el):
"""Make sure linkage of this fragment is sound."""
- descendant = None
-
- # If element is document element,
- # it should have no previous element, previous sibling, or next sibling.
- if el.parent is None:
- if el.previous_element is not None:
- el.previous_element = None
- if el.previous_sibling is not None:
- el.previous_element = None
- if el.next_sibling is not None:
- el.next_sibling = None
-
- idx = 0
- child = None
- last_child = None
- last_idx = len(el.contents) - 1
- for child in el.contents:
- descendant = None
-
- # Parent should link next element to their first child
- # That child should have no previous sibling
- if idx == 0:
- if el.parent is not None:
- if el.next_element is not child:
- el.next_element = child
-
- if child.previous_element is not el:
- child.previous_element = el
-
- if child.previous_sibling is not None:
- child.previous_sibling = None
-
- # If not the first child, previous index should link as sibling to last index.
- # Previous element should match the last index or the last bubbled up descendant (of a Tag sibling).
- else:
- if child.previous_sibling is not el.contents[idx - 1]:
- child.previous_sibling = el.contents[idx - 1]
- if el.contents[idx - 1].next_sibling is not child:
- el.contents[idx - 1].next_sibling = child
-
- if last_child is not None:
- if child.previous_element is not last_child:
- child.previous_element = last_child
- if last_child.next_element is not child:
- last_child.next_element = child
-
- # This index is a tag, dig deeper for a "last descendant" fixing linkage along the way
- if isinstance(child, Tag) and child.contents:
- descendant = self._linkage_fixer(child, True)
- # A bubbled up descendant should have no next siblings
- # as it is last in its content list.
- if descendant.next_sibling is not None:
- descendant.next_sibling = None
-
- # Mark last child as either the bubbled up descendant or the current child
- if descendant is not None:
- last_child = descendant
- else:
- last_child = child
-
- # If last child in list, there are no next siblings
- if idx == last_idx:
- if child.next_sibling is not None:
- child.next_sibling = None
- idx += 1
-
- # The child to return is either the last descendant (if available)
- # or the last processed child (if any). If neither is available,
- # the parent element is its own last descendant.
- child = descendant if descendant is not None else child
- if child is None:
- child = el
-
- # If not a recursive call, we are done processing this element.
+
+ first = el.contents[0]
+ child = el.contents[-1]
+ descendant = child
+
+ if child is first and el.parent is not None:
+ # Parent should be linked to first child
+ el.next_element = child
+ # We are no longer linked to whatever this element is
+ prev_el = child.previous_element
+ if prev_el is not None and prev_el is not el:
+ prev_el.next_element = None
+ # First child should be linked to the parent, and no previous siblings.
+ child.previous_element = el
+ child.previous_sibling = None
+
+ # We have no sibling as we've been appended as the last.
+ child.next_sibling = None
+
+ # This index is a tag, dig deeper for a "last descendant"
+ if isinstance(child, Tag) and child.contents:
+ descendant = child._last_descendant(False)
+
# As the final step, link last descendant. It should be linked
# to the parent's next sibling (if found), else walk up the chain
- # and find a parent with a sibling.
- if not _recursive_call and child is not None:
- child.next_element = None
- target = el
- while True:
- if target is None:
- break
- elif target.next_sibling is not None:
- child.next_element = target.next_sibling
- target.next_sibling.previous_element = child
- break
- target = target.parent
-
- # We are done, so nothing to return
- return None
- else:
- # Return the child to the recursive caller
- return child
+ # and find a parent with a sibling. It should have no next sibling.
+ descendant.next_element = None
+ descendant.next_sibling = None
+ target = el
+ while True:
+ if target is None:
+ break
+ elif target.next_sibling is not None:
+ descendant.next_element = target.next_sibling
+ target.next_sibling.previous_element = child
+ break
+ target = target.parent
def _popToTag(self, name, nsprefix=None, inclusivePop=True):
"""Pops the tag stack up to and including the most recent