summary refs log tree commit diff
diff options
context:
space:
mode:
author Leonard Richardson <leonardr@segfault.org> 2019-07-21 14:58:16 -0400
committer Leonard Richardson <leonardr@segfault.org> 2019-07-21 14:58:16 -0400
commit b2294f4f05d9e8583613560986f8aa64b18866b9 (patch)
tree 5af13a59eca15ea082cb46ea286bc9c5b91996da
parent 819fa4255063d6b8d16f62469afa6c6e504f284a (diff)
Adapt Chris Mayo's code to track line number and position when using html.parser.
-rw-r--r-- CHANGELOG 7
-rw-r--r-- bs4/__init__.py 6
-rw-r--r-- bs4/builder/__init__.py 24
-rw-r--r-- bs4/builder/_htmlparser.py 9
-rw-r--r-- bs4/element.py 6
-rw-r--r-- bs4/testing.py 17
-rw-r--r-- bs4/tests/test_htmlparser.py 11
-rw-r--r-- doc/source/index.rst 14
8 files changed, 87 insertions, 7 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 26610f5..62a75ce 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,10 @@
+= Unreleased
+
+* When the html.parser is in use, Beautiful Soup will, by default,
+ record the position in the original document where each tag was
+ encountered. This includes line number (Tag.lineno) and position
+ within a line (Tag.offset). Based on code by Chris Mayo.
+
= 4.8.0 (20190720, "One Small Soup")
This release focuses on making it easier to customize Beautiful Soup's
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 9cd01c8..753aa73 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -531,7 +531,8 @@ class BeautifulSoup(Tag):
return most_recently_popped
- def handle_starttag(self, name, namespace, nsprefix, attrs):
+ def handle_starttag(self, name, namespace, nsprefix, attrs, lineno=None,
+ offset=None):
"""Push a start tag on to the stack.
If this method returns None, the tag was rejected by the
@@ -549,7 +550,8 @@ class BeautifulSoup(Tag):
return None
tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
- self.currentTag, self._most_recent_element)
+ self.currentTag, self._most_recent_element, lineno=lineno,
+ offset=offset)
if tag is None:
return tag
if self._most_recent_element is not None:
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index e087f07..e28242b 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -99,8 +99,13 @@ class TreeBuilder(object):
DEFAULT_PRESERVE_WHITESPACE_TAGS = set()
USE_DEFAULT = object()
+
+ # Most parsers don't keep track of line numbers.
+ TRACKS_LINE_NUMBERS = False
- def __init__(self, multi_valued_attributes=USE_DEFAULT, preserve_whitespace_tags=USE_DEFAULT):
+ def __init__(self, multi_valued_attributes=USE_DEFAULT,
+ preserve_whitespace_tags=USE_DEFAULT,
+ store_line_numbers=USE_DEFAULT):
"""Constructor.
:param multi_valued_attributes: If this is set to None, the
@@ -113,7 +118,17 @@ class TreeBuilder(object):
probably doesn't make sense to an end-user, so the argument name
is `multi_valued_attributes`.
- :param preserve_whitespace_tags:
+ :param preserve_whitespace_tags: A list of tags to treat
+ the way <pre> tags are treated in HTML. Tags in this list
+ will have their whitespace preserved.
+
+ :param store_line_numbers: If the parser keeps track of the
+ line numbers and positions of the original markup, that
+ information will, by default, be stored in each corresponding
+ `Tag` object. You can turn this off by passing
+ store_line_numbers=False. If the parser you're using doesn't
+ keep track of this information, then setting store_line_numbers=True
+ will do nothing.
"""
self.soup = None
if multi_valued_attributes is self.USE_DEFAULT:
@@ -122,7 +137,10 @@ class TreeBuilder(object):
if preserve_whitespace_tags is self.USE_DEFAULT:
preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS
self.preserve_whitespace_tags = preserve_whitespace_tags
-
+ if store_line_numbers == self.USE_DEFAULT:
+ store_line_numbers = self.TRACKS_LINE_NUMBERS
+ self.store_line_numbers = store_line_numbers
+
def initialize_soup(self, soup):
"""The BeautifulSoup object has been initialized and is now
being associated with the TreeBuilder.
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index 56b8b91..e2c87c1 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -99,7 +99,10 @@ class BeautifulSoupHTMLParser(HTMLParser):
attr_dict[key] = value
attrvalue = '""'
#print "START", name
- tag = self.soup.handle_starttag(name, None, None, attr_dict)
+ lineno, offset = self.getpos()
+ tag = self.soup.handle_starttag(
+ name, None, None, attr_dict, lineno=lineno, offset=offset
+ )
if tag and tag.is_empty_element and handle_empty_element:
# Unlike other parsers, html.parser doesn't send separate end tag
# events for empty-element tags. (It's handled in
@@ -214,6 +217,10 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
NAME = HTMLPARSER
features = [NAME, HTML, STRICT]
+ # The html.parser knows which line number and position in the
+ # original file is the source of an element.
+ TRACKS_LINE_NUMBERS = True
+
def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
super(HTMLParserTreeBuilder, self).__init__(**kwargs)
parser_args = parser_args or []
diff --git a/bs4/element.py b/bs4/element.py
index 73e3867..41acf45 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -724,7 +724,7 @@ class Tag(PageElement):
def __init__(self, parser=None, builder=None, name=None, namespace=None,
prefix=None, attrs=None, parent=None, previous=None,
- is_xml=None):
+ is_xml=None, lineno=None, offset=None):
"Basic constructor."
if parser is None:
@@ -738,6 +738,10 @@ class Tag(PageElement):
self.name = name
self.namespace = namespace
self.prefix = prefix
+ if ((not builder or builder.store_line_numbers)
+ and (lineno is not None or offset is not None)):
+ self.lineno = lineno
+ self.offset = offset
if attrs is None:
attrs = {}
elif attrs:
diff --git a/bs4/testing.py b/bs4/testing.py
index 9f12e8d..3e8d15b 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -790,6 +790,23 @@ Hello, world!
soup = self.soup(BAD_DOCUMENT)
self.linkage_validator(soup)
+ def test_tracking_line_numbers(self):
+ # In general, TreeBuilders do not keep track of
+ # line numbers from the original markup. Even if you
+ # ask for line numbers, we don't have 'em.
+ #
+ # This means that if you have a tag like <lineno> or <offset>,
+ # tag.lineno will find it rather than giving you a numeric
+ # answer.
+ #
+ # See HTMLParserTreeBuilderSmokeTest for a situation
+ # where the parser _does_ keep track of the line numbers.
+ soup = self.soup(
+ "\n <p>\n\n<lineno>\n<b>text</b></lineno><offset></p>",
+ store_line_numbers=True
+ )
+ self.assertEqual("lineno", soup.p.lineno.name)
+ self.assertEqual("offset", soup.p.offset.name)
class XMLTreeBuilderSmokeTest(object):
diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py
index 790489a..c6a6691 100644
--- a/bs4/tests/test_htmlparser.py
+++ b/bs4/tests/test_htmlparser.py
@@ -37,6 +37,17 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
# finishes working is handled.
self.assertSoupEquals("foo &# bar", "foo &amp;# bar")
+ def test_tracking_line_numbers(self):
+ # Unlike other TreeBuilders, the html.parser TreeBuilder
+ # keeps track of line number and position of each element.
+ soup = self.soup(
+ "\n <p>\n\n<lineno>\n<b>text</b></lineno><offset></p>",
+ store_line_numbers=True
+ )
+ self.assertEqual(2, soup.p.lineno)
+ self.assertEqual(3, soup.p.offset)
+ self.assertEqual("lineno", soup.p.find('lineno').name)
+
class TestHTMLParserSubclass(SoupTest):
def test_error(self):
diff --git a/doc/source/index.rst b/doc/source/index.rst
index 0c94d6a..69976fe 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -2495,6 +2495,20 @@ machines, you should specify a parser in the ``BeautifulSoup``
constructor. That will reduce the chances that your users parse a
document differently from the way you parse it.
+Line numbers
+------------
+
+The html.parser parser will keep track of where in the original
+document it found each Tag. You can access this information as
+``Tag.lineno`` (line number) and ``Tag.offset`` (position of the start
+tag within a line)::
+
+ soup = BeautifulSoup("<p>Paragraph 1</p>\n   <p>Paragraph 2</p>", 'html.parser')
+ for tag in soup.find_all('p'):
+ print(tag.lineno, tag.offset, tag.string)
+ # (1, 0, u'Paragraph 1')
+ # (2, 3, u'Paragraph 2')
+
Encodings
=========