9 files changed, 102 insertions, 49 deletions
diff --git a/bs4/__init__.py b/bs4/__init__.py
index 753aa73..e6efb38 100644
--- a/bs4/__init__.py
+++ b/bs4/__init__.py
@@ -377,10 +377,12 @@ class BeautifulSoup(Tag):
         self.preserve_whitespace_tag_stack = []
         self.pushTag(self)
 
-    def new_tag(self, name, namespace=None, nsprefix=None, attrs={}, **kwattrs):
+    def new_tag(self, name, namespace=None, nsprefix=None, attrs={},
+                sourceline=None, sourcepos=None, **kwattrs):
         """Create a new tag associated with this soup."""
         kwattrs.update(attrs)
-        return Tag(None, self.builder, name, namespace, nsprefix, kwattrs)
+        return Tag(None, self.builder, name, namespace, nsprefix, kwattrs,
+                   sourceline=sourceline, sourcepos=sourcepos)
 
     def new_string(self, s, subclass=NavigableString):
         """Create a new NavigableString associated with this soup."""
@@ -531,8 +533,8 @@ class BeautifulSoup(Tag):
 
         return most_recently_popped
 
-    def handle_starttag(self, name, namespace, nsprefix, attrs, lineno=None,
-                        offset=None):
+    def handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None,
+                        sourcepos=None):
         """Push a start tag on to the stack.
 
         If this method returns None, the tag was rejected by the
@@ -550,8 +552,8 @@ class BeautifulSoup(Tag):
             return None
 
         tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
-                  self.currentTag, self._most_recent_element, lineno=lineno,
-                  offset=offset)
+                  self.currentTag, self._most_recent_element,
+                  sourceline=sourceline, sourcepos=sourcepos)
         if tag is None:
             return tag
         if self._most_recent_element is not None:
diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py
index 6892a93..13f697c 100644
--- a/bs4/builder/_html5lib.py
+++ b/bs4/builder/_html5lib.py
@@ -45,6 +45,10 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
 
     features = [NAME, PERMISSIVE, HTML_5, HTML]
 
+    # html5lib can tell us which line number and position in the
+    # original file is the source of an element.
+    TRACKS_LINE_NUMBERS = True
+    
     def prepare_markup(self, markup, user_specified_encoding,
                        document_declared_encoding=None, exclude_encodings=None):
         # Store the user-specified encoding for use later on.
@@ -62,7 +66,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
         if self.soup.parse_only is not None:
             warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
         parser = html5lib.HTMLParser(tree=self.create_treebuilder)
-
+        self.underlying_builder.parser = parser
         extra_kwargs = dict()
         if not isinstance(markup, unicode):
             if new_html5lib:
@@ -70,7 +74,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
             else:
                 extra_kwargs['encoding'] = self.user_specified_encoding
         doc = parser.parse(markup, **extra_kwargs)
-
+        
         # Set the character encoding detected by the tokenizer.
         if isinstance(markup, unicode):
             # We need to special-case this because html5lib sets
@@ -84,10 +88,13 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
                 # with other tree builders.
                 original_encoding = original_encoding.name
             doc.original_encoding = original_encoding
-
+        self.underlying_builder.parser = None
+            
     def create_treebuilder(self, namespaceHTMLElements):
         self.underlying_builder = TreeBuilderForHtml5lib(
-            namespaceHTMLElements, self.soup)
+            namespaceHTMLElements, self.soup,
+            store_line_numbers=self.store_line_numbers
+        )
         return self.underlying_builder
 
     def test_fragment_to_document(self, fragment):
@@ -96,15 +103,26 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
 
 
 class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
-
-    def __init__(self, namespaceHTMLElements, soup=None):
+    
+    def __init__(self, namespaceHTMLElements, soup=None,
+                 store_line_numbers=True, **kwargs):
         if soup:
             self.soup = soup
         else:
             from bs4 import BeautifulSoup
-            self.soup = BeautifulSoup("", "html.parser")
+            # TODO: Why is the parser 'html.parser' here? To avoid an
+            # infinite loop?
+            self.soup = BeautifulSoup(
+                "", "html.parser", store_line_numbers=store_line_numbers,
+                **kwargs
+            )
         super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
 
+        # This will be set later to an html5lib.html5parser.HTMLParser
+        # object, which we can use to track the current line number.
+        self.parser = None
+        self.store_line_numbers = store_line_numbers
+        
     def documentClass(self):
         self.soup.reset()
         return Element(self.soup, self.soup, None)
@@ -118,7 +136,16 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
         self.soup.object_was_parsed(doctype)
 
     def elementClass(self, name, namespace):
-        tag = self.soup.new_tag(name, namespace)
+        kwargs = {}
+        if self.parser and self.store_line_numbers:
+            # position() reports the point immediately after the end of
+            # the tag. We don't know where the tag started, but we do
+            # know where it ended -- the character just before that point.
+            sourceline, sourcepos = self.parser.tokenizer.stream.position()
+            kwargs['sourceline'] = sourceline
+            kwargs['sourcepos'] = sourcepos-1
+        tag = self.soup.new_tag(name, namespace, **kwargs)
+
         return Element(tag, self.soup, namespace)
 
     def commentClass(self, data):
@@ -126,6 +153,8 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
 
     def fragmentClass(self):
         from bs4 import BeautifulSoup
+        # TODO: Why is the parser 'html.parser' here? To avoid an
+        # infinite loop?
         self.soup = BeautifulSoup("", "html.parser")
         self.soup.name = "[document_fragment]"
         return Element(self.soup, self.soup, None)
diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py
index e2c87c1..cd50eb0 100644
--- a/bs4/builder/_htmlparser.py
+++ b/bs4/builder/_htmlparser.py
@@ -99,9 +99,10 @@ class BeautifulSoupHTMLParser(HTMLParser):
             attr_dict[key] = value
             attrvalue = '""'
         #print "START", name
-        lineno, offset = self.getpos()
+        sourceline, sourcepos = self.getpos()
         tag = self.soup.handle_starttag(
-            name, None, None, attr_dict, lineno=lineno, offset=offset
+            name, None, None, attr_dict, sourceline=sourceline,
+            sourcepos=sourcepos
         )
         if tag and tag.is_empty_element and handle_empty_element:
             # Unlike other parsers, html.parser doesn't send separate end tag
@@ -218,7 +219,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
     features = [NAME, HTML, STRICT]
 
     # The html.parser knows which line number and position in the
-    # original file is the source of a document.
+    # original file is the source of an element.
     TRACKS_LINE_NUMBERS = True
     
     def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py
index 27cadcb..85be1b5 100644
--- a/bs4/builder/_lxml.py
+++ b/bs4/builder/_lxml.py
@@ -57,6 +57,12 @@ class LXMLTreeBuilderForXML(TreeBuilder):
 
     DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS)
 
+    # NOTE: If we parsed Element objects and looked at .sourceline,
+    # we'd be able to see the line numbers from the original document.
+    # But instead we build an XMLParser or HTMLParser object to serve
+    # as the target of parse messages, and those messages don't include
+    # line numbers.
+    
     def initialize_soup(self, soup):
         """Let the BeautifulSoup object know about the standard namespace
         mapping.
diff --git a/bs4/element.py b/bs4/element.py
index 41acf45..a610008 100644
--- a/bs4/element.py
+++ b/bs4/element.py
@@ -724,7 +724,7 @@ class Tag(PageElement):
 
     def __init__(self, parser=None, builder=None, name=None, namespace=None,
                  prefix=None, attrs=None, parent=None, previous=None,
-                 is_xml=None, lineno=None, offset=None):
+                 is_xml=None, sourceline=None, sourcepos=None):
         "Basic constructor."
 
         if parser is None:
@@ -739,9 +739,9 @@ class Tag(PageElement):
         self.namespace = namespace
         self.prefix = prefix
         if ((not builder or builder.store_line_numbers)
-            and (lineno is not None or offset is not None)):
-            self.lineno = lineno
-            self.offset = offset
+            and (sourceline is not None or sourcepos is not None)):
+            self.sourceline = sourceline
+            self.sourcepos = sourcepos
         if attrs is None:
             attrs = {}
         elif attrs:
diff --git a/bs4/testing.py b/bs4/testing.py
index 3e8d15b..9f12e8d 100644
--- a/bs4/testing.py
+++ b/bs4/testing.py
@@ -790,23 +790,6 @@ Hello, world!
         soup = self.soup(BAD_DOCUMENT)
         self.linkage_validator(soup)
 
-    def test_tracking_line_numbers(self):
-        # In general, TreeBuilders do not keep track of
-        # line numbers from the original markup. Even if you
-        # ask for line numbers, we don't have 'em.
-        #
-        # This means that if you have a tag like <lineno> or <offset>,
-        # tag.lineno will find it rather than giving you a numeric
-        # answer.
-        #
-        # See HTMLParserTreeBuilderSmokeTest for a situation
-        # where the parser _does_ keep track of the line numbers.
-        soup = self.soup(
-            "\n   <p>\n\n<lineno>\n<b>text</b></lineno><offset></p>",
-            store_line_numbers=True
-        )
-        self.assertEqual("lineno", soup.p.lineno.name)
-        self.assertEqual("offset", soup.p.offset.name)
 
 class XMLTreeBuilderSmokeTest(object):
 
diff --git a/bs4/tests/test_html5lib.py b/bs4/tests/test_html5lib.py
index 371463a..6446f84 100644
--- a/bs4/tests/test_html5lib.py
+++ b/bs4/tests/test_html5lib.py
@@ -168,3 +168,17 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
         for form in soup.find_all('form'):
             inputs.extend(form.find_all('input'))
         self.assertEqual(len(inputs), 1)
+
+    def test_tracking_line_numbers(self):
+        # The html5lib TreeBuilder keeps track of the line number and
+        # position of each element (the position is the end of the tag).
+        markup = "\n   <p>\n\n<sourceline>\n<b>text</b></sourceline><sourcepos></p>"
+        soup = self.soup(markup)
+        self.assertEqual(2, soup.p.sourceline)
+        self.assertEqual(5, soup.p.sourcepos)
+        self.assertEqual("sourceline", soup.p.find('sourceline').name)
+
+        # You can deactivate this behavior.
+        soup = self.soup(markup, store_line_numbers=False)
+        self.assertEqual("sourceline", soup.p.sourceline.name)
+        self.assertEqual("sourcepos", soup.p.sourcepos.name)
diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py
index c6a6691..7be6493 100644
--- a/bs4/tests/test_htmlparser.py
+++ b/bs4/tests/test_htmlparser.py
@@ -38,17 +38,20 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
         self.assertSoupEquals("foo &# bar", "foo &amp;# bar")
 
     def test_tracking_line_numbers(self):
-        # Unlike other TreeBuilders, the html.parser TreeBuilder
-        # keeps track of line number and position of each element.
-        soup = self.soup(
-            "\n   <p>\n\n<lineno>\n<b>text</b></lineno><offset></p>",
-            store_line_numbers=True
-        )
-        self.assertEqual(2, soup.p.lineno)
-        self.assertEqual(3, soup.p.offset)
-        self.assertEqual("lineno", soup.p.find('lineno').name)
-
-
+        # The html.parser TreeBuilder keeps track of line number and
+        # position of each element.
+        markup = "\n   <p>\n\n<sourceline>\n<b>text</b></sourceline><sourcepos></p>"
+        soup = self.soup(markup)
+        self.assertEqual(2, soup.p.sourceline)
+        self.assertEqual(3, soup.p.sourcepos)
+        self.assertEqual("sourceline", soup.p.find('sourceline').name)
+
+        # You can deactivate this behavior.
+        soup = self.soup(markup, store_line_numbers=False)
+        self.assertEqual("sourceline", soup.p.sourceline.name)
+        self.assertEqual("sourcepos", soup.p.sourcepos.name)
+
+        
 class TestHTMLParserSubclass(SoupTest):
     def test_error(self):
         """Verify that our HTMLParser subclass implements error() in a way
diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py
index 3b7858f..f96e4ae 100644
--- a/bs4/tests/test_lxml.py
+++ b/bs4/tests/test_lxml.py
@@ -71,6 +71,21 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
         self.assertEqual(u"<b/>", unicode(soup.b))
         self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message))
 
+    def test_tracking_line_numbers(self):
+        # The lxml TreeBuilder cannot keep track of line numbers from
+        # the original markup. Even if you ask for line numbers, we
+        # don't have 'em.
+        #
+        # This means that if you have a tag like <sourceline> or
+        # <sourcepos>, attribute access will find it rather than
+        # giving you a numeric answer.
+        soup = self.soup(
+            "\n   <p>\n\n<sourceline>\n<b>text</b></sourceline><sourcepos></p>",
+            store_line_numbers=True
+        )
+        self.assertEqual("sourceline", soup.p.sourceline.name)
+        self.assertEqual("sourcepos", soup.p.sourcepos.name)
+        
 @skipIf(
     not LXML_PRESENT,
     "lxml seems not to be present, not testing its XML tree builder.")