FooBar
should pop to 'p', not 'b'.
Foo
| * | * should pop to 'tr', not the first 'td'
"""
nestingResetTriggers = self.NESTABLE_TAGS.get(name)
isNestable = nestingResetTriggers != None
isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
popTo = None
inclusive = True
for i in range(len(self.tagStack)-1, 0, -1):
p = self.tagStack[i]
if (not p or p.name == name) and not isNestable:
#Non-nestable tags get popped to the top or to their
#last occurance.
popTo = name
break
if (nestingResetTriggers != None
and p.name in nestingResetTriggers) \
or (nestingResetTriggers == None and isResetNesting
and self.RESET_NESTING_TAGS.has_key(p.name)):
#If we encounter one of the nesting reset triggers
#peculiar to this tag, or we encounter another tag
#that causes nesting to reset, pop up to but not
#including that tag.
popTo = p.name
inclusive = False
break
p = p.parent
if popTo:
self._popToTag(popTo, inclusive)
def unknown_starttag(self, name, attrs, selfClosing=0):
#print "Start tag %s: %s" % (name, attrs)
if self.quoteStack:
#This is not a real tag.
#print "<%s> is not real!" % name
attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
self.currentData.append('<%s%s>' % (name, attrs))
return
self.endData()
if not self.isSelfClosingTag(name) and not selfClosing:
self._smartPop(name)
if self.parseOnlyThese and len(self.tagStack) <= 1 \
and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
return
tag = Tag(self, name, attrs, self.currentTag, self.previous)
if self.previous:
self.previous.next = tag
self.previous = tag
self.pushTag(tag)
if selfClosing or self.isSelfClosingTag(name):
self.popTag()
if name in self.QUOTE_TAGS:
#print "Beginning quote (%s)" % name
self.quoteStack.append(name)
self.literal = 1
return tag
def unknown_endtag(self, name):
#print "End tag %s" % name
if self.quoteStack and self.quoteStack[-1] != name:
#This is not a real end tag.
#print "%s> is not real!" % name
self.currentData.append('%s>' % name)
return
self.endData()
self._popToTag(name)
if self.quoteStack and self.quoteStack[-1] == name:
self.quoteStack.pop()
self.literal = (len(self.quoteStack) > 0)
def handle_data(self, data):
if self.convertHTMLEntities:
if data[0] == '&':
data = self.BARE_AMPERSAND.sub("&",data)
else:
data = data.replace('&','&') \
.replace('<','<') \
.replace('>','>')
self.currentData.append(data)
def _toStringSubclass(self, text, subclass):
"""Adds a certain piece of text to the tree as a NavigableString
subclass."""
self.endData()
self.handle_data(text)
self.endData(subclass)
def handle_pi(self, text):
"""Handle a processing instruction as a ProcessingInstruction
object, possibly one with a %SOUP-ENCODING% slot into which an
encoding will be plugged later."""
if text[:3] == "xml":
text = "xml version='1.0' encoding='%SOUP-ENCODING%'"
self._toStringSubclass(text, ProcessingInstruction)
def handle_comment(self, text):
"Handle comments as Comment objects."
self._toStringSubclass(text, Comment)
def handle_charref(self, ref):
"Handle character references as data."
if ref[0] == 'x':
data = unichr(int(ref[1:],16))
else:
data = unichr(int(ref))
if u'\x80' <= data <= u'\x9F':
data = UnicodeDammit.subMSChar(chr(ord(data)), self.smartQuotesTo)
elif not self.convertHTMLEntities and not self.convertXMLEntities:
data = '%s;' % ref
self.handle_data(data)
def handle_entityref(self, ref):
"""Handle entity references as data, possibly converting known
HTML entity references to the corresponding Unicode
characters."""
replaceWithXMLEntity = self.convertXMLEntities and \
self.XML_ENTITIES_TO_CHARS.has_key(ref)
if self.convertHTMLEntities or replaceWithXMLEntity:
try:
data = unichr(name2codepoint[ref])
except KeyError:
if replaceWithXMLEntity:
data = self.XML_ENTITIES_TO_CHARS.get(ref)
else:
data="&%s" % ref
else:
data = '&%s;' % ref
self.handle_data(data)
def handle_decl(self, data):
"Handle DOCTYPEs and the like as Declaration objects."
self._toStringSubclass(data, Declaration)
def parse_declaration(self, i):
"""Treat a bogus SGML declaration as raw data. Treat a CDATA
declaration as a CData object."""
j = None
if self.rawdata[i:i+9] == '', i)
if k == -1:
k = len(self.rawdata)
data = self.rawdata[i+9:k]
j = k+3
self._toStringSubclass(data, CData)
else:
try:
j = SGMLParser.parse_declaration(self, i)
except SGMLParseError:
toHandle = self.rawdata[i:]
self.handle_data(toHandle)
j = i + len(toHandle)
return j
class BeautifulSoup(BeautifulStoneSoup):
"""This parser knows the following facts about HTML:
* Some tags have no closing tag and should be interpreted as being
closed as soon as they are encountered.
* The text inside some tags (ie. 'script') may contain tags which
are not really part of the document and which should be parsed
as text, not tags. If you want to parse the text as tags, you can
always fetch it and parse it explicitly.
* Tag nesting rules:
Most tags can't be nested at all. For instance, the occurance of
a tag should implicitly close the previous tag. Para1 Para2 should be transformed into: Para1 Para2 Some tags can be nested arbitrarily. For instance, the occurance of a tag should _not_ implicitly close the previoustag. Alice said:Bob said:Blah should NOT be transformed into: Alice said:Bob said:Blah Some tags can be nested, but the nesting is reset by the interposition of other tags. For instance, a |