| 1 | """Beautiful Soup |
|---|
| 2 | Elixir and Tonic |
|---|
| 3 | "The Screen-Scraper's Friend" |
|---|
| 4 | http://www.crummy.com/software/BeautifulSoup/ |
|---|
| 5 | |
|---|
| 6 | Beautiful Soup parses a (possibly invalid) XML or HTML document into a |
|---|
| 7 | tree representation. It provides methods and Pythonic idioms that make |
|---|
| 8 | it easy to navigate, search, and modify the tree. |
|---|
| 9 | |
|---|
| 10 | A well-formed XML/HTML document yields a well-formed data |
|---|
| 11 | structure. An ill-formed XML/HTML document yields a correspondingly |
|---|
| 12 | ill-formed data structure. If your document is only locally |
|---|
| 13 | well-formed, you can use this library to find and process the |
|---|
| 14 | well-formed part of it. The BeautifulSoup class copes with common HTML errors. |
|---|
| 15 | |
|---|
| 16 | Beautiful Soup works with Python 2.2 and up. It has no external |
|---|
| 17 | dependencies, but you'll have more success at converting data to UTF-8 |
|---|
| 18 | if you also install these three packages: |
|---|
| 19 | |
|---|
| 20 | * chardet, for auto-detecting character encodings |
|---|
| 21 | http://chardet.feedparser.org/ |
|---|
| 22 | * cjkcodecs and iconv_codec, which add more encodings to the ones supported |
|---|
| 23 | by stock Python. |
|---|
| 24 | http://cjkpython.i18n.org/ |
|---|
| 25 | |
|---|
| 26 | Beautiful Soup defines classes for two main parsing strategies: |
|---|
| 27 | |
|---|
| 28 | * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific |
|---|
| 29 | language that kind of looks like XML. |
|---|
| 30 | |
|---|
| 31 | * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid |
|---|
| 32 | or invalid. This class has web browser-like heuristics for |
|---|
| 33 | obtaining a sensible parse tree in the face of common HTML errors. |
|---|
| 34 | |
|---|
| 35 | Beautiful Soup also defines a class (UnicodeDammit) for autodetecting |
|---|
| 36 | the encoding of an HTML or XML document, and converting it to |
|---|
| 37 | Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser. |
|---|
| 38 | |
|---|
| 39 | For more than you ever wanted to know about Beautiful Soup, see the |
|---|
| 40 | documentation: |
|---|
| 41 | http://www.crummy.com/software/BeautifulSoup/documentation.html |
|---|
| 42 | |
|---|
| 43 | """ |
|---|
| 44 | from __future__ import generators |
|---|
| 45 | |
|---|
| 46 | __author__ = "Leonard Richardson (leonardr@segfault.org)" |
|---|
| 47 | __version__ = "3.0.4" |
|---|
| 48 | __copyright__ = "Copyright (c) 2004-2007 Leonard Richardson" |
|---|
| 49 | __license__ = "PSF" |
|---|
| 50 | |
|---|
| 51 | from sgmllib import SGMLParser, SGMLParseError |
|---|
| 52 | import codecs |
|---|
| 53 | import types |
|---|
| 54 | import re |
|---|
| 55 | import sgmllib |
|---|
| 56 | try: |
|---|
| 57 | from htmlentitydefs import name2codepoint |
|---|
| 58 | except ImportError: |
|---|
| 59 | name2codepoint = {} |
|---|
| 60 | |
|---|
| 61 | #This hack makes Beautiful Soup able to parse XML with namespaces |
|---|
| 62 | sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*') |
|---|
| 63 | |
|---|
| 64 | DEFAULT_OUTPUT_ENCODING = "utf-8" |
|---|
| 65 | |
|---|
| 66 | # First, the classes that represent markup elements. |
|---|
| 67 | |
|---|
class PageElement:
    """Contains the navigational information for some part of the page
    (either a tag or a piece of text).

    Every element carries five links: 'parent', 'previous' and 'next'
    (document order), and 'previousSibling' and 'nextSibling' (order
    within the parent's 'contents' list). The methods that touch
    'self.contents' (insert, and indirectly replaceWith) only work on
    elements that have a 'contents' list."""

    def setup(self, parent=None, previous=None):
        """Sets up the initial relations between this element and
        other elements.

        'parent' is the containing element and 'previous' is the
        element that comes just before this one in document order. If
        the parent already has children, the last of them becomes this
        element's previous sibling. This method does not add the
        element to parent.contents."""
        self.parent = parent
        self.previous = previous
        self.next = None
        self.previousSibling = None
        self.nextSibling = None
        if self.parent is not None and self.parent.contents:
            self.previousSibling = self.parent.contents[-1]
            self.previousSibling.nextSibling = self

    def _indexByIdentity(self, sequence, element):
        """Returns the position of 'element' itself in 'sequence'.

        Elements are compared with 'is' rather than '==': two distinct
        tags with the same markup (or two strings with the same text)
        compare equal, so list.index()/list.remove() could pick the
        wrong one. Raises ValueError if the element is not present."""
        for i in range(len(sequence)):
            if sequence[i] is element:
                return i
        raise ValueError("element is not in the list")

    def replaceWith(self, replaceWith):
        """Removes this element from its parent and puts 'replaceWith'
        in the slot it occupied.

        'replaceWith' may be an element or a plain string. It may also
        be an element that is already in the tree (even a sibling of
        this one); in that case it is moved. Raises ValueError if this
        element is not among its parent's contents."""
        oldParent = self.parent
        myIndex = self._indexByIdentity(oldParent.contents, self)
        # Once this element is gone, myIndex designates the element
        # that used to follow it. insert() places the replacement just
        # before that element and does its own index bookkeeping when
        # the replacement is an earlier sibling, so no adjustment is
        # needed here.
        self.extract()
        oldParent.insert(myIndex, replaceWith)

    def extract(self):
        """Destructively rips this element out of the tree.

        The element keeps its own descendants, but all its links to
        the surrounding document are reset to None."""
        if self.parent is not None:
            siblings = self.parent.contents
            try:
                del siblings[self._indexByIdentity(siblings, self)]
            except ValueError:
                # The parent does not list this element; there is
                # nothing to remove from its contents.
                pass

        #Find the two elements that would be next to each other if
        #this element (and any children) hadn't been parsed. Connect
        #the two.
        lastChild = self._lastRecursiveChild()
        nextElement = lastChild.next

        # Compare against None explicitly: an empty string element is
        # falsy but is still a real node whose links must be updated.
        if self.previous is not None:
            self.previous.next = nextElement
        if nextElement is not None:
            nextElement.previous = self.previous
        self.previous = None
        lastChild.next = None

        self.parent = None
        if self.previousSibling is not None:
            self.previousSibling.nextSibling = self.nextSibling
        if self.nextSibling is not None:
            self.nextSibling.previousSibling = self.previousSibling
        self.previousSibling = self.nextSibling = None

    def _lastRecursiveChild(self):
        "Finds the last element beneath this object to be parsed."
        lastChild = self
        while hasattr(lastChild, 'contents') and lastChild.contents:
            lastChild = lastChild.contents[-1]
        return lastChild

    def insert(self, position, newChild):
        """Inserts 'newChild' into this element's contents so that it
        ends up just before the child currently at 'position'.

        A plain string is wrapped in a NavigableString first. A
        position past the end appends. If 'newChild' is already part
        of a tree it is extracted from its old location first. All
        navigation links of the affected elements are updated."""
        if isinstance(newChild, basestring) \
            and not isinstance(newChild, NavigableString):
            newChild = NavigableString(newChild)

        position = min(position, len(self.contents))
        if getattr(newChild, 'parent', None) is not None:
            # We're 'inserting' an element that's already part of a
            # tree.
            if newChild.parent is self:
                try:
                    index = self._indexByIdentity(self.contents, newChild)
                except ValueError:
                    index = None
                if index is not None and index < position:
                    # Furthermore we're moving it further down the
                    # list of this object's children. That means that
                    # when we extract this element, our target index
                    # will jump down one.
                    position = position - 1
            newChild.extract()

        newChild.parent = self
        previousChild = None
        if position == 0:
            newChild.previousSibling = None
            newChild.previous = self
        else:
            previousChild = self.contents[position-1]
            newChild.previousSibling = previousChild
            newChild.previousSibling.nextSibling = newChild
            newChild.previous = previousChild._lastRecursiveChild()
        if newChild.previous is not None:
            newChild.previous.next = newChild

        newChildsLastElement = newChild._lastRecursiveChild()

        if position >= len(self.contents):
            newChild.nextSibling = None

            # The new child is the last one. The element that follows
            # it in document order is the next sibling of the closest
            # ancestor that has one.
            parent = self
            parentsNextSibling = None
            while parentsNextSibling is None:
                parentsNextSibling = parent.nextSibling
                parent = parent.parent
                if parent is None: # This is the last element in the document.
                    break
            newChildsLastElement.next = parentsNextSibling
        else:
            nextChild = self.contents[position]
            newChild.nextSibling = nextChild
            if newChild.nextSibling is not None:
                newChild.nextSibling.previousSibling = newChild
            newChildsLastElement.next = nextChild

        if newChildsLastElement.next is not None:
            newChildsLastElement.next.previous = newChildsLastElement
        self.contents.insert(position, newChild)

    def findNext(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears after this Tag in the document."""
        return self._findOne(self.findAllNext, name, attrs, text, **kwargs)

    def findAllNext(self, name=None, attrs={}, text=None, limit=None,
                    **kwargs):
        """Returns all items that match the given criteria and appear
        after this Tag in the document."""
        # Keyword criteria are forwarded like in the other findAll*
        # methods; they used to be dropped silently here.
        return self._findAll(name, attrs, text, limit, self.nextGenerator,
                             **kwargs)

    def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears after this Tag in the document."""
        return self._findOne(self.findNextSiblings, name, attrs, text,
                             **kwargs)

    def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
                         **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear after this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.nextSiblingGenerator, **kwargs)
    fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x

    def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears before this Tag in the document."""
        return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)

    def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
                        **kwargs):
        """Returns all items that match the given criteria and appear
        before this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.previousGenerator,
                             **kwargs)
    fetchPrevious = findAllPrevious # Compatibility with pre-3.x

    def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears before this Tag in the document."""
        return self._findOne(self.findPreviousSiblings, name, attrs, text,
                             **kwargs)

    def findPreviousSiblings(self, name=None, attrs={}, text=None,
                             limit=None, **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear before this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.previousSiblingGenerator, **kwargs)
    fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x

    def findParent(self, name=None, attrs={}, **kwargs):
        """Returns the closest parent of this Tag that matches the given
        criteria, or None if there is no such parent."""
        # NOTE: We can't use _findOne because findParents takes a different
        # set of arguments.
        r = None
        l = self.findParents(name, attrs, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def findParents(self, name=None, attrs={}, limit=None, **kwargs):
        """Returns the parents of this Tag that match the given
        criteria."""

        return self._findAll(name, attrs, None, limit, self.parentGenerator,
                             **kwargs)
    fetchParents = findParents # Compatibility with pre-3.x

    #These methods do the real heavy lifting.

    def _findOne(self, method, name, attrs, text, **kwargs):
        """Calls the given find-all 'method' with a limit of 1 and
        returns its single result, or None if nothing matched."""
        r = None
        l = method(name, attrs, text, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def _findAll(self, name, attrs, text, limit, generator, **kwargs):
        """Iterates over a generator looking for things that match.

        'name' may be a ready-made SoupStrainer; otherwise one is built
        from name, attrs, text and the keyword arguments. Stops early
        once 'limit' matches have been collected."""

        if isinstance(name, SoupStrainer):
            strainer = name
        else:
            # Build a SoupStrainer
            strainer = SoupStrainer(name, attrs, text, **kwargs)
        results = ResultSet(strainer)
        for i in generator():
            # The generators end by yielding None; falsy items are
            # not searched.
            if i:
                found = strainer.search(i)
                if found:
                    results.append(found)
                    if limit and len(results) >= limit:
                        break
        return results

    #These Generators can be used to navigate starting from both
    #NavigableStrings and Tags. Each one yields the successive
    #elements along one kind of link and, as its final item, the None
    #that terminates the chain. The loops test 'is not None' so that
    #an empty string element does not cut the walk short.
    def nextGenerator(self):
        i = self
        while i is not None:
            i = i.next
            yield i

    def nextSiblingGenerator(self):
        i = self
        while i is not None:
            i = i.nextSibling
            yield i

    def previousGenerator(self):
        i = self
        while i is not None:
            i = i.previous
            yield i

    def previousSiblingGenerator(self):
        i = self
        while i is not None:
            i = i.previousSibling
            yield i

    def parentGenerator(self):
        i = self
        while i is not None:
            i = i.parent
            yield i

    # Utility methods
    def substituteEncoding(self, str, encoding=None):
        """Replaces every %SOUP-ENCODING% placeholder in 'str' with the
        name of the encoding ("utf-8" when no encoding is given)."""
        encoding = encoding or "utf-8"
        return str.replace("%SOUP-ENCODING%", encoding)

    def toEncoding(self, s, encoding=None):
        """Encodes an object to a string in some encoding, or to Unicode.

        With an encoding, strings are encoded directly and any other
        object is converted with str() first. Without one, the result
        is a Unicode string."""
        if isinstance(s, unicode):
            if encoding:
                s = s.encode(encoding)
        elif isinstance(s, str):
            if encoding:
                s = s.encode(encoding)
            else:
                s = unicode(s)
        else:
            if encoding:
                s = self.toEncoding(str(s), encoding)
            else:
                s = unicode(s)
        return s
|---|
| 350 | |
|---|
class NavigableString(unicode, PageElement):
    """A piece of text in the document: a Unicode string that also
    carries the navigation links of a PageElement."""

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper.

        Any other missing attribute raises AttributeError."""
        if attr == 'string':
            return self
        # Call form of raise: valid on every Python version, unlike
        # the older "raise Class, message" statement.
        raise AttributeError("'%s' object has no attribute '%s'"
                             % (self.__class__.__name__, attr))

    def __unicode__(self):
        """Returns the text as Unicode (no encoding applied)."""
        return self.__str__(None)

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Returns the text encoded in 'encoding', or the string itself
        when 'encoding' is None or empty."""
        if encoding:
            return self.encode(encoding)
        return self
|---|
| 370 | |
|---|
class CData(NavigableString):
    """Text that is rendered inside a CDATA wrapper."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Returns the text, encoded as NavigableString.__str__ does,
        between "<![CDATA[" and "]]>"."""
        rendered = NavigableString.__str__(self, encoding)
        return "<![CDATA[%s]]>" % rendered
|---|
| 375 | |
|---|
class ProcessingInstruction(NavigableString):
    """Text that is rendered between "<?" and "?>"."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Returns the instruction wrapped in "<?...?>", with any
        %SOUP-ENCODING% placeholder replaced by the output encoding."""
        body = self
        if "%SOUP-ENCODING%" in body:
            body = self.substituteEncoding(body, encoding)
        encodedBody = self.toEncoding(body, encoding)
        return "<?%s?>" % encodedBody
|---|
| 382 | |
|---|
class Comment(NavigableString):
    """Text that is rendered between "<!--" and "-->"."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Returns the text, encoded as NavigableString.__str__ does,
        wrapped in "<!--" and "-->"."""
        rendered = NavigableString.__str__(self, encoding)
        return "<!--%s-->" % rendered
|---|
| 386 | |
|---|
class Declaration(NavigableString):
    """Text that is rendered between "<!" and ">"."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Returns the text, encoded as NavigableString.__str__ does,
        wrapped in "<!" and ">"."""
        rendered = NavigableString.__str__(self, encoding)
        return "<!%s>" % rendered
|---|
| 390 | |
|---|
class Tag(PageElement):

    """Represents a found HTML tag with its attributes and contents.

    'attrs' is a list of (name, value) pairs in document order and
    'contents' is the list of child elements. Unknown attributes of a
    Tag object are treated as searches: tag.b (or tag.bTag) is the
    same as tag.find('b')."""

    # The five predefined XML entities, keyed by the character they
    # stand for.
    XML_SPECIAL_CHARS_TO_ENTITIES = { "'" : "apos",
                                      '"' : "quot",
                                      "&" : "amp",
                                      "<" : "lt",
                                      ">" : "gt" }

    def __init__(self, parser, name, attrs=None, parent=None,
                 previous=None):
        """Basic constructor.

        'parser' is only consulted for its class and for whether
        'name' is a self-closing tag. 'parent' and 'previous' are
        passed on to setup()."""

        # We don't actually store the parser object: that lets extracted
        # chunks be garbage-collected
        self.parserClass = parser.__class__
        self.isSelfClosing = parser.isSelfClosingTag(name)
        self.name = name
        if attrs is None:
            attrs = []
        self.attrs = attrs
        self.contents = []
        self.setup(parent, previous)
        self.hidden = False
        self.containsSubstitutions = False

    def get(self, key, default=None):
        """Returns the value of the 'key' attribute for the tag, or
        the value given for 'default' if it doesn't have that
        attribute."""
        return self._getAttrMap().get(key, default)

    def has_key(self, key):
        """Returns whether the tag has a 'key' attribute."""
        return key in self._getAttrMap()

    def __getitem__(self, key):
        """tag[key] returns the value of the 'key' attribute for the tag,
        and throws an exception if it's not there."""
        return self._getAttrMap()[key]

    def __iter__(self):
        "Iterating over a tag iterates over its contents."
        return iter(self.contents)

    def __len__(self):
        "The length of a tag is the length of its list of contents."
        return len(self.contents)

    def __contains__(self, x):
        "'x in tag' tests whether x is one of the tag's direct children."
        return x in self.contents

    def __nonzero__(self):
        "A tag is non-None even if it has no contents."
        return True

    def __setitem__(self, key, value):
        """Setting tag[key] sets the value of the 'key' attribute for the
        tag.

        Every existing occurrence of the attribute is overwritten; if
        there is none, the attribute is appended."""
        self._getAttrMap()[key] = value
        found = False
        for i in range(len(self.attrs)):
            if self.attrs[i][0] == key:
                self.attrs[i] = (key, value)
                found = True
        if not found:
            self.attrs.append((key, value))

    def __delitem__(self, key):
        "Deleting tag[key] deletes all 'key' attributes for the tag."
        # Bad HTML can define the same attribute multiple times, so
        # every occurrence has to go. The list is rebuilt in place
        # rather than removing entries while looping over it, which
        # would skip the entry after each removed one.
        self.attrs[:] = [item for item in self.attrs if item[0] != key]
        attrMap = self._getAttrMap()
        if key in attrMap:
            del attrMap[key]

    def __call__(self, *args, **kwargs):
        """Calling a tag like a function is the same as calling its
        findAll() method. Eg. tag('a') returns a list of all the A tags
        found within this tag."""
        return self.findAll(*args, **kwargs)

    def __getattr__(self, tag):
        """Treats an unknown attribute as a search for a child tag:
        tag.fooTag and tag.foo both return self.find('foo').

        Names starting with a double underscore are not searched for;
        they raise AttributeError so that protocol lookups see them
        as missing instead of receiving None."""
        if len(tag) > 3 and tag.endswith('Tag'):
            return self.find(tag[:-3])
        elif not tag.startswith('__'):
            return self.find(tag)
        raise AttributeError("'%s' object has no attribute '%s'"
                             % (self.__class__, tag))

    def __eq__(self, other):
        """Returns true iff this tag has the same name, the same attributes,
        and the same contents (recursively) as the given tag.

        NOTE: right now this will return false if two tags have the
        same attributes in a different order. Should this be fixed?"""
        for required in ('name', 'attrs', 'contents'):
            if not hasattr(other, required):
                return False
        if self.name != other.name or self.attrs != other.attrs \
               or len(self) != len(other):
            return False
        for i in range(len(self.contents)):
            if self.contents[i] != other.contents[i]:
                return False
        return True

    def __ne__(self, other):
        """Returns true iff this tag is not identical to the other tag,
        as defined in __eq__."""
        return not self == other

    def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Renders this tag as a string."""
        return self.__str__(encoding)

    def __unicode__(self):
        """Renders this tag as a Unicode string."""
        return self.__str__(None)

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
                prettyPrint=False, indentLevel=0):
        """Returns a string or Unicode representation of this tag and
        its contents. To get Unicode, pass None for encoding.

        With prettyPrint, tags and text are put on their own lines,
        indented according to indentLevel. A hidden tag renders only
        its contents.

        NOTE: since Python's HTML parser consumes whitespace, this
        method is not certain to reproduce the whitespace present in
        the original string."""

        encodedName = self.toEncoding(self.name, encoding)

        attrs = []
        if self.attrs:
            for key, val in self.attrs:
                fmt = '%s="%s"'
                if isString(val):
                    if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
                        val = self.substituteEncoding(val, encoding)

                    # The attribute value either:
                    #
                    # * Contains no embedded double quotes or single quotes.
                    #   No problem: we enclose it in double quotes.
                    # * Contains embedded single quotes. No problem:
                    #   double quotes work here too.
                    # * Contains embedded double quotes. No problem:
                    #   we enclose it in single quotes.
                    # * Embeds both single _and_ double quotes. This
                    #   can't happen naturally, but it can happen if
                    #   you modify an attribute value after parsing
                    #   the document. Now we have a bit of a
                    #   problem. We solve it by enclosing the
                    #   attribute in single quotes, and escaping any
                    #   embedded single quotes.
                    if '"' in val:
                        fmt = "%s='%s'"
                        if "'" in val:
                            # A numeric character reference is used
                            # because it is understood by XML and
                            # HTML parsers alike.
                            val = val.replace("'", "&#39;")

                    # Now we're okay w/r/t quotes. But the attribute
                    # value might also contain angle brackets, or
                    # ampersands that aren't part of entities. We need
                    # to escape those to XML entities too.
                    val = re.sub(r"([<>]|&(?![^\s]+;))",
                                 lambda x: "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";",
                                 val)

                attrs.append(fmt % (self.toEncoding(key, encoding),
                                    self.toEncoding(val, encoding)))
        close = ''
        closeTag = ''
        if self.isSelfClosing:
            close = ' /'
        else:
            closeTag = '</%s>' % encodedName

        indentTag, indentContents = 0, 0
        space = ''
        if prettyPrint:
            indentTag = indentLevel
            space = (' ' * (indentTag-1))
            indentContents = indentTag + 1
        contents = self.renderContents(encoding, prettyPrint, indentContents)
        if self.hidden:
            s = contents
        else:
            s = []
            attributeString = ''
            if attrs:
                attributeString = ' ' + ' '.join(attrs)
            if prettyPrint:
                s.append(space)
            s.append('<%s%s%s>' % (encodedName, attributeString, close))
            if prettyPrint:
                s.append("\n")
            s.append(contents)
            if prettyPrint and contents and contents[-1] != "\n":
                s.append("\n")
            if prettyPrint and closeTag:
                s.append(space)
            s.append(closeTag)
            if prettyPrint and closeTag and self.nextSibling:
                s.append("\n")
            s = ''.join(s)
        return s

    def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Renders this tag as an indented, one-item-per-line string."""
        return self.__str__(encoding, True)

    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                       prettyPrint=False, indentLevel=0):
        """Renders the contents of this tag as a string in the given
        encoding. If encoding is None, returns a Unicode string."""
        s=[]
        for c in self:
            text = None
            if isinstance(c, NavigableString):
                text = c.__str__(encoding)
            elif isinstance(c, Tag):
                s.append(c.__str__(encoding, prettyPrint, indentLevel))
            if text and prettyPrint:
                text = text.strip()
            if text:
                if prettyPrint:
                    s.append(" " * (indentLevel-1))
                s.append(text)
                if prettyPrint:
                    s.append("\n")
        return ''.join(s)

    #Soup methods

    def find(self, name=None, attrs={}, recursive=True, text=None,
             **kwargs):
        """Return only the first child of this Tag matching the given
        criteria, or None if there is no match."""
        r = None
        l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
        if l:
            r = l[0]
        return r
    findChild = find

    def findAll(self, name=None, attrs={}, recursive=True, text=None,
                limit=None, **kwargs):
        """Extracts a list of Tag objects that match the given
        criteria.  You can specify the name of the Tag and any
        attributes you want the Tag to have.

        The value of a key-value pair in the 'attrs' map can be a
        string, a list of strings, a regular expression object, or a
        callable that takes a string and returns whether or not the
        string matches for some custom definition of 'matches'. The
        same is true of the tag name.

        With recursive=False only direct children are considered."""
        generator = self.recursiveChildGenerator
        if not recursive:
            generator = self.childGenerator
        return self._findAll(name, attrs, text, limit, generator, **kwargs)
    findChildren = findAll

    # Pre-3.x compatibility methods
    first = find
    fetch = findAll

    def fetchText(self, text=None, recursive=True, limit=None):
        """Pre-3.x name for findAll() restricted to a text criterion."""
        return self.findAll(text=text, recursive=recursive, limit=limit)

    def firstText(self, text=None, recursive=True):
        """Pre-3.x name for find() restricted to a text criterion."""
        return self.find(text=text, recursive=recursive)

    #Utility methods

    def append(self, tag):
        """Appends the given tag to the contents of this tag.

        Only the contents list is changed; the navigation links of
        the appended element are left untouched."""
        self.contents.append(tag)

    #Private methods

    def _getAttrMap(self):
        """Initializes a map representation of this tag's attributes,
        if not already initialized."""
        # Look in the instance dictionary directly. A plain attribute
        # lookup would fall through to __getattr__ while the map does
        # not exist yet and run a search for a child tag called
        # 'attrMap'.
        attrMap = self.__dict__.get('attrMap')
        if not attrMap:
            attrMap = {}
            for (key, value) in self.attrs:
                attrMap[key] = value
            self.attrMap = attrMap
        return attrMap

    #Generator methods
    def childGenerator(self):
        """Yields the direct children of this tag, in order."""
        for i in range(len(self.contents)):
            yield self.contents[i]
        # Falling off the end finishes the generator; an explicit
        # "raise StopIteration" becomes a RuntimeError on interpreters
        # that implement PEP 479.

    def recursiveChildGenerator(self):
        """Yields every element beneath this tag in document order
        (each tag is followed by its own descendants)."""
        stack = [(self, 0)]
        while stack:
            tag, start = stack.pop()
            if isinstance(tag, Tag):
                for i in range(start, len(tag.contents)):
                    a = tag.contents[i]
                    yield a
                    if isinstance(a, Tag) and tag.contents:
                        # Descend into this child now; remember where
                        # to resume in the current tag afterwards.
                        if i < len(tag.contents) - 1:
                            stack.append((tag, i+1))
                        stack.append((a, 0))
                        break
|---|
| 698 | |
|---|
| 699 | # Next, a couple of classes to represent queries and their results. |
|---|
| 700 | |
|---|