root/django/trunk/contrib/BeautifulSoup.py

Revision 88, 67.9 kB (checked in by steadicat, 13 months ago)

Added BeautifulSoup with soupselect for testing web pages.

  • Property svn:keywords set to Id
Line 
1"""Beautiful Soup
2Elixir and Tonic
3"The Screen-Scraper's Friend"
4http://www.crummy.com/software/BeautifulSoup/
5
6Beautiful Soup parses a (possibly invalid) XML or HTML document into a
7tree representation. It provides methods and Pythonic idioms that make
8it easy to navigate, search, and modify the tree.
9
10A well-formed XML/HTML document yields a well-formed data
11structure. An ill-formed XML/HTML document yields a correspondingly
12ill-formed data structure. If your document is only locally
13well-formed, you can use this library to find and process the
14well-formed part of it. The BeautifulSoup class (described below) has heuristics for recovering from common HTML errors.
15
16Beautiful Soup works with Python 2.2 and up. It has no external
17dependencies, but you'll have more success at converting data to UTF-8
18if you also install these three packages:
19
20* chardet, for auto-detecting character encodings
21  http://chardet.feedparser.org/
22* cjkcodecs and iconv_codec, which add more encodings to the ones supported
23  by stock Python.
24  http://cjkpython.i18n.org/
25
26Beautiful Soup defines classes for two main parsing strategies:
27   
28 * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
29   language that kind of looks like XML.
30
31 * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
32   or invalid. This class has web browser-like heuristics for
33   obtaining a sensible parse tree in the face of common HTML errors.
34
35Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
36the encoding of an HTML or XML document, and converting it to
37Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
38
39For more than you ever wanted to know about Beautiful Soup, see the
40documentation:
41http://www.crummy.com/software/BeautifulSoup/documentation.html
42
43"""
44from __future__ import generators
45
46__author__ = "Leonard Richardson (leonardr@segfault.org)"
47__version__ = "3.0.4"
48__copyright__ = "Copyright (c) 2004-2007 Leonard Richardson"
49__license__ = "PSF"
50
51from sgmllib import SGMLParser, SGMLParseError
52import codecs
53import types
54import re
55import sgmllib
56try:
57  from htmlentitydefs import name2codepoint
58except ImportError:
59  name2codepoint = {}
60
61#This hack makes Beautiful Soup able to parse XML with namespaces
62sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
63
64DEFAULT_OUTPUT_ENCODING = "utf-8"
65
66# First, the classes that represent markup elements.
67
class PageElement:
    """Contains the navigational information for some part of the page
    (either a tag or a piece of text).

    Every element keeps five links: 'parent', 'previousSibling' and
    'nextSibling' describe the tree, while 'previous' and 'next' chain
    the elements together in the order they were parsed. The methods
    below keep both sets of links consistent."""

    def setup(self, parent=None, previous=None):
        """Sets up the initial relations between this element and
        other elements.

        If the parent already has contents, this element is linked as
        the next sibling of the parent's current last child. The
        element is NOT added to parent.contents here."""
        self.parent = parent
        self.previous = previous
        self.next = None
        self.previousSibling = None
        self.nextSibling = None
        if self.parent and self.parent.contents:
            self.previousSibling = self.parent.contents[-1]
            self.previousSibling.nextSibling = self

    def _indexByIdentity(self, siblings, element):
        """Returns the position of 'element' itself within 'siblings'.

        list.index() and list.remove() compare with ==, which matches
        any equal-looking sibling (two strings with the same text, two
        structurally equal tags). Tree surgery must act on the exact
        object, so we compare with 'is' instead. Raises ValueError if
        the element is not in the list."""
        for position in range(len(siblings)):
            if siblings[position] is element:
                return position
        raise ValueError("element is not in the list of siblings")

    def replaceWith(self, replaceWith):
        """Replaces this element in its parent with 'replaceWith'.

        'replaceWith' may be a string, a new element, or an element
        that is already somewhere in a tree (including a sibling of
        this one). Raises ValueError if this element is not listed in
        its parent's contents."""
        oldParent = self.parent
        myIndex = self._indexByIdentity(oldParent.contents, self)
        self.extract()
        # insert() compensates for the index shift that happens when
        # the replacement is a sibling located before the target
        # position, so no adjustment is needed here.
        oldParent.insert(myIndex, replaceWith)

    def extract(self):
        """Destructively rips this element out of the tree."""
        if self.parent:
            siblings = self.parent.contents
            try:
                del siblings[self._indexByIdentity(siblings, self)]
            except ValueError:
                # The parent doesn't list this element; there is
                # nothing to remove, but the links still get cleaned up.
                pass

        #Find the two elements that would be next to each other if
        #this element (and any children) hadn't been parsed. Connect
        #the two.
        lastChild = self._lastRecursiveChild()
        nextElement = lastChild.next

        if self.previous:
            self.previous.next = nextElement
        if nextElement:
            nextElement.previous = self.previous
        self.previous = None
        lastChild.next = None

        self.parent = None
        if self.previousSibling:
            self.previousSibling.nextSibling = self.nextSibling
        if self.nextSibling:
            self.nextSibling.previousSibling = self.previousSibling
        self.previousSibling = self.nextSibling = None

    def _lastRecursiveChild(self):
        "Finds the last element beneath this object to be parsed."
        lastChild = self
        while hasattr(lastChild, 'contents') and lastChild.contents:
            lastChild = lastChild.contents[-1]
        return lastChild

    def insert(self, position, newChild):
        """Inserts 'newChild' into this element's contents so that it
        ends up just before the child currently at 'position'.

        Plain strings are wrapped in a NavigableString. If newChild is
        already part of a tree it is extracted from its old location
        first. A position past the end appends."""
        # Anything that is already a PageElement (including every
        # NavigableString) is inserted as-is; only raw strings need
        # to be wrapped.
        if not isinstance(newChild, PageElement) \
           and isinstance(newChild, basestring):
            newChild = NavigableString(newChild)

        position = min(position, len(self.contents))
        if hasattr(newChild, 'parent') and newChild.parent is not None:
            # We're 'inserting' an element that's already one
            # of this object's children.
            if newChild.parent is self:
                try:
                    index = self._indexByIdentity(self.contents, newChild)
                except ValueError:
                    # Claims us as parent but isn't in our contents:
                    # extracting it won't shift anything.
                    index = None
                if index is not None and index < position:
                    # Furthermore we're moving it further down the
                    # list of this object's children. That means that
                    # when we extract this element, our target index
                    # will jump down one.
                    position = position - 1
            newChild.extract()

        newChild.parent = self
        previousChild = None
        if position == 0:
            newChild.previousSibling = None
            newChild.previous = self
        else:
            previousChild = self.contents[position-1]
            newChild.previousSibling = previousChild
            newChild.previousSibling.nextSibling = newChild
            newChild.previous = previousChild._lastRecursiveChild()
        if newChild.previous:
            newChild.previous.next = newChild

        newChildsLastElement = newChild._lastRecursiveChild()

        if position >= len(self.contents):
            newChild.nextSibling = None

            # The new child is now the last thing under this element,
            # so whatever follows it in parse order is the next
            # sibling of the nearest ancestor that has one.
            parent = self
            parentsNextSibling = None
            while not parentsNextSibling:
                parentsNextSibling = parent.nextSibling
                parent = parent.parent
                if not parent: # This is the last element in the document.
                    break
            if parentsNextSibling:
                newChildsLastElement.next = parentsNextSibling
            else:
                newChildsLastElement.next = None
        else:
            nextChild = self.contents[position]
            newChild.nextSibling = nextChild
            if newChild.nextSibling:
                newChild.nextSibling.previousSibling = newChild
            newChildsLastElement.next = nextChild

        if newChildsLastElement.next:
            newChildsLastElement.next.previous = newChildsLastElement
        self.contents.insert(position, newChild)

    def findNext(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears after this Tag in the document."""
        return self._findOne(self.findAllNext, name, attrs, text, **kwargs)

    def findAllNext(self, name=None, attrs={}, text=None, limit=None,
                    **kwargs):
        """Returns all items that match the given criteria and appear
        after this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.nextGenerator,
                             **kwargs)

    def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears after this Tag in the document."""
        return self._findOne(self.findNextSiblings, name, attrs, text,
                             **kwargs)

    def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
                         **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear after this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.nextSiblingGenerator, **kwargs)
    fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x

    def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears before this Tag in the document."""
        return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)

    def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
                        **kwargs):
        """Returns all items that match the given criteria and appear
        before this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.previousGenerator,
                             **kwargs)
    fetchPrevious = findAllPrevious # Compatibility with pre-3.x

    def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears before this Tag in the document."""
        return self._findOne(self.findPreviousSiblings, name, attrs, text,
                             **kwargs)

    def findPreviousSiblings(self, name=None, attrs={}, text=None,
                             limit=None, **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear before this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.previousSiblingGenerator, **kwargs)
    fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x

    def findParent(self, name=None, attrs={}, **kwargs):
        """Returns the closest parent of this Tag that matches the given
        criteria, or None if no parent matches."""
        # NOTE: We can't use _findOne because findParents takes a different
        # set of arguments.
        r = None
        l = self.findParents(name, attrs, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def findParents(self, name=None, attrs={}, limit=None, **kwargs):
        """Returns the parents of this Tag that match the given
        criteria."""

        return self._findAll(name, attrs, None, limit, self.parentGenerator,
                             **kwargs)
    fetchParents = findParents # Compatibility with pre-3.x

    #These methods do the real heavy lifting.

    def _findOne(self, method, name, attrs, text, **kwargs):
        """Calls the given find-all style 'method' with a limit of one
        and returns its single result, or None if nothing matched."""
        r = None
        l = method(name, attrs, text, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def _findAll(self, name, attrs, text, limit, generator, **kwargs):
        """Iterates over a generator looking for things that match.

        'name' may be a ready-made SoupStrainer, in which case attrs,
        text and kwargs are ignored. Stops early once 'limit' matches
        have been collected."""

        if isinstance(name, SoupStrainer):
            strainer = name
        else:
            # Build a SoupStrainer
            strainer = SoupStrainer(name, attrs, text, **kwargs)
        results = ResultSet(strainer)
        for i in generator():
            # The navigation generators finish by yielding None;
            # skip anything false rather than searching it.
            if i:
                found = strainer.search(i)
                if found:
                    results.append(found)
                    if limit and len(results) >= limit:
                        break
        return results

    #These Generators can be used to navigate starting from both
    #NavigableStrings and Tags. Each one yields the elements that
    #follow the starting point along one kind of link, and yields a
    #final None when the chain runs out.
    def nextGenerator(self):
        i = self
        while i:
            i = i.next
            yield i

    def nextSiblingGenerator(self):
        i = self
        while i:
            i = i.nextSibling
            yield i

    def previousGenerator(self):
        i = self
        while i:
            i = i.previous
            yield i

    def previousSiblingGenerator(self):
        i = self
        while i:
            i = i.previousSibling
            yield i

    def parentGenerator(self):
        i = self
        while i:
            i = i.parent
            yield i

    # Utility methods
    def substituteEncoding(self, str, encoding=None):
        """Replaces the %SOUP-ENCODING% placeholder in the given string
        with the name of the encoding ("utf-8" if none is given)."""
        encoding = encoding or "utf-8"
        return str.replace("%SOUP-ENCODING%", encoding)

    def toEncoding(self, s, encoding=None):
        """Encodes an object to a string in some encoding, or to Unicode.

        With an encoding, strings are encoded to it and other objects
        are converted with str() first. Without one, Unicode strings
        are returned unchanged and everything else is passed through
        unicode()."""
        if isinstance(s, unicode):
            if encoding:
                s = s.encode(encoding)
        elif isinstance(s, str):
            if encoding:
                s = s.encode(encoding)
            else:
                s = unicode(s)
        else:
            if encoding:
                s  = self.toEncoding(str(s), encoding)
            else:
                s = unicode(s)
        return s
350
class NavigableString(unicode, PageElement):
    """A piece of text from the document: a Unicode string that also
    carries the navigation links of a PageElement."""

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper."""
        if attr != 'string':
            raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__.__name__, attr))
        return self

    def __unicode__(self):
        # No encoding means "give me the Unicode text itself".
        return self.__str__(None)

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Returns this text encoded in the given encoding, or the
        Unicode string itself when the encoding is empty or None."""
        if not encoding:
            return self
        return self.encode(encoding)
370       
class CData(NavigableString):
    """Text that is rendered wrapped in a CDATA section."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        # Render the bare text first, then add the CDATA markers.
        text = NavigableString.__str__(self, encoding)
        return "<![CDATA[%s]]>" % text
375
class ProcessingInstruction(NavigableString):
    """Text that is rendered as a processing instruction (<?...?>)."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        # Fill in the encoding placeholder, if the text contains one,
        # before converting to the output encoding.
        if "%SOUP-ENCODING%" in self:
            text = self.substituteEncoding(self, encoding)
        else:
            text = self
        rendered = self.toEncoding(text, encoding)
        return "<?%s?>" % rendered
382
class Comment(NavigableString):
    """Text that is rendered as a markup comment (<!--...-->)."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        # Render the bare text first, then add the comment markers.
        text = NavigableString.__str__(self, encoding)
        return "<!--%s-->" % text
386
class Declaration(NavigableString):
    """Text that is rendered as a markup declaration (<!...>)."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        # Render the bare text first, then add the declaration markers.
        text = NavigableString.__str__(self, encoding)
        return "<!%s>" % text
390
class Tag(PageElement):

    """Represents a found HTML tag with its attributes and contents."""

    # Maps each character that is special in XML to the name of the
    # predefined XML entity that stands for it.
    XML_SPECIAL_CHARS_TO_ENTITIES = { "'" : "apos",
                                      '"' : "quot",
                                      "&" : "amp",
                                      "<" : "lt",
                                      ">" : "gt" }

    def __init__(self, parser, name, attrs=None, parent=None,
                 previous=None):
        """Basic constructor.

        'parser' is only consulted for its class and for whether
        'name' is a self-closing tag. 'attrs' is a list of (key, value)
        pairs. 'parent' and 'previous' are handed to setup()."""

        # We don't actually store the parser object: that lets extracted
        # chunks be garbage-collected
        self.parserClass = parser.__class__
        self.isSelfClosing = parser.isSelfClosingTag(name)
        self.name = name
        if attrs is None:
            attrs = []
        self.attrs = attrs
        self.contents = []
        self.setup(parent, previous)
        self.hidden = False
        self.containsSubstitutions = False

    def get(self, key, default=None):
        """Returns the value of the 'key' attribute for the tag, or
        the value given for 'default' if it doesn't have that
        attribute."""
        return self._getAttrMap().get(key, default)

    def has_key(self, key):
        "Returns whether the tag has an attribute called 'key'."
        return key in self._getAttrMap()

    def __getitem__(self, key):
        """tag[key] returns the value of the 'key' attribute for the tag,
        and throws an exception if it's not there."""
        return self._getAttrMap()[key]

    def __iter__(self):
        "Iterating over a tag iterates over its contents."
        return iter(self.contents)

    def __len__(self):
        "The length of a tag is the length of its list of contents."
        return len(self.contents)

    def __contains__(self, x):
        "'x in tag' checks the tag's direct contents."
        return x in self.contents

    def __nonzero__(self):
        "A tag is non-None even if it has no contents."
        return True

    def __setitem__(self, key, value):
        """Setting tag[key] sets the value of the 'key' attribute for the
        tag."""
        # Update every existing occurrence in the ordered attribute
        # list, or add the attribute at the end if it is new.
        found = False
        for i in range(0, len(self.attrs)):
            if self.attrs[i][0] == key:
                self.attrs[i] = (key, value)
                found = True
        if not found:
            self.attrs.append((key, value))
        self._getAttrMap()[key] = value

    def __delitem__(self, key):
        "Deleting tag[key] deletes all 'key' attributes for the tag."
        # Bad HTML can define the same attribute multiple times, so
        # every occurrence has to go. The list is filtered in one step
        # (and updated in place) because removing items from a list
        # while looping over it skips the item after each removal.
        self.attrs[:] = [item for item in self.attrs if item[0] != key]
        attrMap = self._getAttrMap()
        if key in attrMap:
            del attrMap[key]

    def __call__(self, *args, **kwargs):
        """Calling a tag like a function is the same as calling its
        findAll() method. Eg. tag('a') returns a list of all the A tags
        found within this tag."""
        return self.findAll(*args, **kwargs)

    def __getattr__(self, tag):
        """tag.foo and tag.fooTag both return the first 'foo' tag
        beneath this one (or None if there isn't one).

        Names starting with a double underscore are never treated as
        tag names: they raise AttributeError, so that protocol lookups
        and hasattr() see them as missing rather than as None."""
        #print "Getattr %s.%s" % (self.__class__, tag)
        if len(tag) > 3 and tag.endswith('Tag'):
            return self.find(tag[:-3])
        elif not tag.startswith('__'):
            return self.find(tag)
        raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__.__name__, tag))

    def __eq__(self, other):
        """Returns true iff this tag has the same name, the same attributes,
        and the same contents (recursively) as the given tag.

        NOTE: right now this will return false if two tags have the
        same attributes in a different order. Should this be fixed?"""
        if not hasattr(other, 'name') \
           or not hasattr(other, 'attrs') \
           or not hasattr(other, 'contents') \
           or self.name != other.name \
           or self.attrs != other.attrs \
           or len(self) != len(other):
            return False
        for i in range(0, len(self.contents)):
            if self.contents[i] != other.contents[i]:
                return False
        return True

    def __ne__(self, other):
        """Returns true iff this tag is not identical to the other tag,
        as defined in __eq__."""
        return not self == other

    def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Renders this tag as a string."""
        return self.__str__(encoding)

    def __unicode__(self):
        "Renders this tag as a Unicode string."
        return self.__str__(None)

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
                prettyPrint=False, indentLevel=0):
        """Returns a string or Unicode representation of this tag and
        its contents. To get Unicode, pass None for encoding.

        With prettyPrint, tags and text are put on separate lines and
        indented according to indentLevel. A hidden tag renders as its
        contents only.

        NOTE: since Python's HTML parser consumes whitespace, this
        method is not certain to reproduce the whitespace present in
        the original string."""

        encodedName = self.toEncoding(self.name, encoding)

        attrs = []
        if self.attrs:
            for key, val in self.attrs:
                fmt = '%s="%s"'
                if isString(val):
                    if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
                        val = self.substituteEncoding(val, encoding)

                    # The attribute value either:
                    #
                    # * Contains no embedded double quotes or single quotes.
                    #   No problem: we enclose it in double quotes.
                    # * Contains embedded single quotes. No problem:
                    #   double quotes work here too.
                    # * Contains embedded double quotes. No problem:
                    #   we enclose it in single quotes.
                    # * Embeds both single _and_ double quotes. This
                    #   can't happen naturally, but it can happen if
                    #   you modify an attribute value after parsing
                    #   the document. Now we have a bit of a
                    #   problem. We solve it by enclosing the
                    #   attribute in single quotes, and escaping any
                    #   embedded single quotes to XML entities.
                    if '"' in val:
                        fmt = "%s='%s'"
                        # This can't happen naturally, but it can happen
                        # if you modify an attribute value after parsing.
                        if "'" in val:
                            # &apos; is the predefined XML entity for
                            # the single quote.
                            val = val.replace("'", "&apos;")

                    # Now we're okay w/r/t quotes. But the attribute
                    # value might also contain angle brackets, or
                    # ampersands that aren't part of entities. We need
                    # to escape those to XML entities too.
                    val = re.sub(r"([<>]|&(?![^\s]+;))",
                                 lambda x: "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";",
                                 val)

                attrs.append(fmt % (self.toEncoding(key, encoding),
                                    self.toEncoding(val, encoding)))
        close = ''
        closeTag = ''
        if self.isSelfClosing:
            close = ' /'
        else:
            closeTag = '</%s>' % encodedName

        indentTag, indentContents = 0, 0
        if prettyPrint:
            indentTag = indentLevel
            space = (' ' * (indentTag-1))
            indentContents = indentTag + 1
        contents = self.renderContents(encoding, prettyPrint, indentContents)
        if self.hidden:
            s = contents
        else:
            s = []
            attributeString = ''
            if attrs:
                attributeString = ' ' + ' '.join(attrs)
            if prettyPrint:
                s.append(space)
            s.append('<%s%s%s>' % (encodedName, attributeString, close))
            if prettyPrint:
                s.append("\n")
            s.append(contents)
            if prettyPrint and contents and contents[-1] != "\n":
                s.append("\n")
            if prettyPrint and closeTag:
                s.append(space)
            s.append(closeTag)
            if prettyPrint and closeTag and self.nextSibling:
                s.append("\n")
            s = ''.join(s)
        return s

    def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
        "Renders this tag with one tag or piece of text per line, indented."
        return self.__str__(encoding, True)

    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                       prettyPrint=False, indentLevel=0):
        """Renders the contents of this tag as a string in the given
        encoding. If encoding is None, returns a Unicode string."""
        s=[]
        for c in self:
            text = None
            if isinstance(c, NavigableString):
                text = c.__str__(encoding)
            elif isinstance(c, Tag):
                s.append(c.__str__(encoding, prettyPrint, indentLevel))
            if text and prettyPrint:
                # Pretty-printing supplies its own whitespace, and
                # text that is nothing but whitespace is dropped.
                text = text.strip()
            if text:
                if prettyPrint:
                    s.append(" " * (indentLevel-1))
                s.append(text)
                if prettyPrint:
                    s.append("\n")
        return ''.join(s)

    #Soup methods

    def find(self, name=None, attrs={}, recursive=True, text=None,
             **kwargs):
        """Return only the first child of this Tag matching the given
        criteria, or None if nothing matches."""
        r = None
        l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
        if l:
            r = l[0]
        return r
    findChild = find

    def findAll(self, name=None, attrs={}, recursive=True, text=None,
                limit=None, **kwargs):
        """Extracts a list of Tag objects that match the given
        criteria.  You can specify the name of the Tag and any
        attributes you want the Tag to have.

        The value of a key-value pair in the 'attrs' map can be a
        string, a list of strings, a regular expression object, or a
        callable that takes a string and returns whether or not the
        string matches for some custom definition of 'matches'. The
        same is true of the tag name.

        With recursive=False only direct children are considered."""
        generator = self.recursiveChildGenerator
        if not recursive:
            generator = self.childGenerator
        return self._findAll(name, attrs, text, limit, generator, **kwargs)
    findChildren = findAll

    # Pre-3.x compatibility methods
    first = find
    fetch = findAll

    def fetchText(self, text=None, recursive=True, limit=None):
        return self.findAll(text=text, recursive=recursive, limit=limit)

    def firstText(self, text=None, recursive=True):
        return self.find(text=text, recursive=recursive)

    #Utility methods

    def append(self, tag):
        """Appends the given tag to the contents of this tag.

        This only adds the object to the contents list; it does not
        touch the parent, sibling or next/previous links of either
        element. Use insert() to add an element and link it in."""
        self.contents.append(tag)

    #Private methods

    def _getAttrMap(self):
        """Initializes a map representation of this tag's attributes,
        if not already initialized."""
        # Look in the instance dictionary directly: a plain attribute
        # lookup for a missing 'attrMap' would fall through to
        # __getattr__ and search the tree for a child tag of that name.
        if not self.__dict__.get('attrMap'):
            self.attrMap = {}
            for (key, value) in self.attrs:
                self.attrMap[key] = value
        return self.attrMap

    #Generator methods
    def childGenerator(self):
        "Yields the direct children of this tag, in order."
        for i in range(0, len(self.contents)):
            yield self.contents[i]

    def recursiveChildGenerator(self):
        """Yields everything beneath this tag, in document order (each
        tag is followed by its own contents)."""
        # Each stack entry is a tag plus the index of its next
        # unvisited child, so the walk can descend into a child tag
        # and later resume where it left off.
        stack = [(self, 0)]
        while stack:
            tag, start = stack.pop()
            if isinstance(tag, Tag):
                for i in range(start, len(tag.contents)):
                    a = tag.contents[i]
                    yield a
                    if isinstance(a, Tag) and tag.contents:
                        if i < len(tag.contents) - 1:
                            stack.append((tag, i+1))
                        stack.append((a, 0))
                        break
698
699# Next, a couple classes to represent queries and their results.
700