Adapt Chris Mayo's code to track line number and position when using html.parser.

author: Leonard Richardson <leonardr@segfault.org> 2019-07-21 14:58:16 -0400
committer: Leonard Richardson <leonardr@segfault.org> 2019-07-21 14:58:16 -0400
commit: b2294f4f05d9e8583613560986f8aa64b18866b9 (patch)
tree: 5af13a59eca15ea082cb46ea286bc9c5b91996da
parent: 819fa4255063d6b8d16f62469afa6c6e504f284a (diff)
8 files changed, 87 insertions, 7 deletions
diff --git a/CHANGELOG b/CHANGELOG
index 26610f5..62a75ce 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,10 @@
+= Unreleased
+
+* When the html.parser is in use, Beautiful Soup will, by default,
+  record the position in the original document where each tag was
+  encountered. This includes line number (Tag.lineno) and position
+  within a line (Tag.offset).  Based on code by Chris Mayo.
+
 = 4.8.0 (20190720, "One Small Soup")
 
 This release focuses on making it easier to customize Beautiful Soup's
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 9cd01c8..753aa73 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -531,7 +531,8 @@ class BeautifulSoup(Tag):
 
         return most_recently_popped
 
-    def handle_starttag(self, name, namespace, nsprefix, attrs):
+    def handle_starttag(self, name, namespace, nsprefix, attrs, lineno=None,
+                        offset=None):
         """Push a start tag on to the stack.
 
         If this method returns None, the tag was rejected by the
@@ -549,7 +550,8 @@ class BeautifulSoup(Tag):
             return None
 
         tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
-                  self.currentTag, self._most_recent_element)
+                  self.currentTag, self._most_recent_element, lineno=lineno,
+                  offset=offset)
         if tag is None:
             return tag
         if self._most_recent_element is not None:
diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py
index e087f07..e28242b 100644
--- a/bs4/builder/__init__.py
+++ b/bs4/builder/__init__.py
@@ -99,8 +99,13 @@ class TreeBuilder(object):
     DEFAULT_PRESERVE_WHITESPACE_TAGS = set()
     
     USE_DEFAULT = object()
+
+    # Most parsers don't keep track of line numbers.
+    TRACKS_LINE_NUMBERS = False
     
-    def __init__(self, multi_valued_attributes=USE_DEFAULT, preserve_whitespace_tags=USE_DEFAULT):
+    def __init__(self, multi_valued_attributes=USE_DEFAULT,
+                 preserve_whitespace_tags=USE_DEFAULT,
+                 store_line_numbers=USE_DEFAULT):
         """Constructor.
 
         :param multi_valued_attributes: If this is set to None, the
@@ -113,7 +118,17 @@ class TreeBuilder(object):
         probably doesn't make sense to an end-user, so the argument name
         is `multi_valued_attributes`.
 
-        :param preserve_whitespace_tags:
+        :param preserve_whitespace_tags: A list of tags to treat
+        the way <pre> tags are treated in HTML. Tags in this list
+        will have the whitespace in their contents preserved.
+
+        :param store_line_numbers: If the parser keeps track of the
+        line numbers and positions of the original markup, that
+        information will, by default, be stored in each corresponding
+        `Tag` object. You can turn this off by passing
+        store_line_numbers=False. If the parser you're using doesn't 
+        keep track of this information, then setting store_line_numbers=True
+        will do nothing.
         """
         self.soup = None
         if multi_valued_attributes is self.USE_DEFAULT:
@@ -122,7 +137,10 @@ class TreeBuilder(object):
         if preserve_whitespace_tags is self.USE_DEFAULT:
             preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS
         self.preserve_whitespace_tags = preserve_whitespace_tags
-            
+        if store_line_numbers == self.USE_DEFAULT:
+            store_line_numbers = self.TRACKS_LINE_NUMBERS
+        self.store_line_numbers = store_line_numbers
+        
     def initialize_soup(self, soup):
         """The BeautifulSoup object has been initialized and is now
         being associated with the TreeBuilder.
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index 56b8b91..e2c87c1 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -99,7 +99,10 @@ class BeautifulSoupHTMLParser(HTMLParser):
             attr_dict[key] = value
             attrvalue = '""'
         #print "START", name
-        tag = self.soup.handle_starttag(name, None, None, attr_dict)
+        lineno, offset = self.getpos()
+        tag = self.soup.handle_starttag(
+            name, None, None, attr_dict, lineno=lineno, offset=offset
+        )
         if tag and tag.is_empty_element and handle_empty_element:
             # Unlike other parsers, html.parser doesn't send separate end tag
             # events for empty-element tags. (It's handled in
@@ -214,6 +217,10 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
     NAME = HTMLPARSER
     features = [NAME, HTML, STRICT]
 
+    # The html.parser knows the line number and position in the
+    # original document at which each element was found.
+    TRACKS_LINE_NUMBERS = True
+    
     def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
         super(HTMLParserTreeBuilder, self).__init__(**kwargs)
         parser_args = parser_args or []
diff --git a/bs4/element.py b/bs4/element.py
index 73e3867..41acf45 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -724,7 +724,7 @@ class Tag(PageElement):
 
     def __init__(self, parser=None, builder=None, name=None, namespace=None,
                  prefix=None, attrs=None, parent=None, previous=None,
-                 is_xml=None):
+                 is_xml=None, lineno=None, offset=None):
         "Basic constructor."
 
         if parser is None:
@@ -738,6 +738,10 @@ class Tag(PageElement):
         self.name = name
         self.namespace = namespace
         self.prefix = prefix
+        if ((not builder or builder.store_line_numbers)
+            and (lineno is not None or offset is not None)):
+            self.lineno = lineno
+            self.offset = offset
         if attrs is None:
             attrs = {}
         elif attrs:
diff --git a/bs4/testing.py b/bs4/testing.py
index 9f12e8d..3e8d15b 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -790,6 +790,23 @@ Hello, world!
         soup = self.soup(BAD_DOCUMENT)
         self.linkage_validator(soup)
 
+    def test_tracking_line_numbers(self):
+        # In general, TreeBuilders do not keep track of
+        # line numbers from the original markup. Even if you
+        # ask for line numbers, we don't have 'em.
+        #
+        # This means that if you have a tag like <lineno> or <offset>,
+        # tag.lineno will find it rather than giving you a numeric
+        # answer.
+        #
+        # See HTMLParserTreeBuilderSmokeTest for a situation
+        # where the parser _does_ keep track of the line numbers.
+        soup = self.soup(
+            "\n   <p>\n\n<lineno>\n<b>text</b></lineno><offset></p>",
+            store_line_numbers=True
+        )
+        self.assertEqual("lineno", soup.p.lineno.name)
+        self.assertEqual("offset", soup.p.offset.name)
 
 class XMLTreeBuilderSmokeTest(object):
 
diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py
index 790489a..c6a6691 100644
--- a/bs4/tests/test_htmlparser.py
+++ b/bs4/tests/test_htmlparser.py
@@ -37,6 +37,17 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
         # finishes working is handled.
         self.assertSoupEquals("foo &# bar", "foo &amp;# bar")
 
+    def test_tracking_line_numbers(self):
+        # Unlike other TreeBuilders, the html.parser TreeBuilder
+        # keeps track of line number and position of each element.
+        soup = self.soup(
+            "\n   <p>\n\n<lineno>\n<b>text</b></lineno><offset></p>",
+            store_line_numbers=True
+        )
+        self.assertEqual(2, soup.p.lineno)
+        self.assertEqual(3, soup.p.offset)
+        self.assertEqual("lineno", soup.p.find('lineno').name)
+
 
 class TestHTMLParserSubclass(SoupTest):
     def test_error(self):
diff --git a/doc/source/index.rst b/doc/source/index.rst
index 0c94d6a..69976fe 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -2495,6 +2495,20 @@ machines, you should specify a parser in the ``BeautifulSoup``
 constructor. That will reduce the chances that your users parse a
 document differently from the way you parse it.
 
+Line numbers
+------------
+
+The html.parser parser will keep track of where in the original
+document it found each Tag. You can access this information as
+``Tag.lineno`` (line number) and ``Tag.offset`` (position of the start
+tag within a line)::
+
+   soup = BeautifulSoup("<p>Paragraph 1</p>\n    <p>Paragraph 2</p>", 'html.parser')
+   for tag in soup.find_all('p'):
+       print(tag.lineno, tag.offset, tag.string)
+   # (1, 0, u'Paragraph 1')
+   # (2, 4, u'Paragraph 2')
+       
 Encodings
 =========
author	Leonard Richardson <leonardr@segfault.org>	2019-07-21 14:58:16 -0400
committer	Leonard Richardson <leonardr@segfault.org>	2019-07-21 14:58:16 -0400
commit	b2294f4f05d9e8583613560986f8aa64b18866b9 (patch)
tree	5af13a59eca15ea082cb46ea286bc9c5b91996da
parent	819fa4255063d6b8d16f62469afa6c6e504f284a (diff)