| 1 | """ |
|---|
| 2 | soupselect.py |
|---|
| 3 | |
|---|
| 4 | CSS selector support for BeautifulSoup. |
|---|
| 5 | |
|---|
| 6 | soup = BeautifulSoup('<html>...') |
|---|
| 7 | select(soup, 'div') |
|---|
| 8 | - returns a list of div elements |
|---|
| 9 | |
|---|
| 10 | select(soup, 'div#main ul a') |
|---|
| 11 | - returns a list of links inside a ul inside div#main |
|---|
| 12 | |
|---|
| 13 | """ |
|---|
| 14 | |
|---|
| 15 | import re |
|---|
| 16 | |
|---|
# A bare element-name token: lowercase ASCII letters and digits only.
tag_re = re.compile(r'^[a-z0-9]+$')

# An attribute-selector token such as a[href^="http"]. Named groups, in
# order: optional tag, attribute name, optional one-character operator,
# and the (optionally double-quoted) value.
attribselect_re = re.compile(
    r'^(?P<tag>\w+)?'
    r'\[(?P<attribute>\w+)'
    r'(?P<operator>[=~\|\^\$\*]?)'
    r'=?"?(?P<value>[^\]"]*)"?\]$'
)
|---|
| 23 | |
|---|
# /^(\w+)?\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/
#   \---/   \---/\-------------/    \-------/
#     |       |         |               |
#     |       |         |           The value
#     |       |    ~,|,^,$,* or =
#     |   Attribute
#    Tag (optional)
|---|
| 31 | |
|---|
def attribute_checker(operator, attribute, value=''):
    """
    Build a predicate for a single CSS attribute selector.

    operator is the selector operator: '=', '~', '^', '$', '*' or '|'. Any
    other operator (including the empty string) produces a plain
    "attribute is present" test. attribute is the attribute name and value
    is the text to compare it against.

    Returns a function that takes an element and returns a truthy result
    when the element matches. Elements must provide get(name[, default]);
    the presence test additionally needs has_attr(name) or has_key(name).
    """
    def has_attribute(el):
        # BeautifulSoup 4 spells the presence test has_attr(), while
        # BeautifulSoup 3 (and Python 2 dicts) only offer has_key().
        # BeautifulSoup tags can answer unknown attribute names with a
        # child-tag lookup instead of raising, so check that the result is
        # callable rather than merely present.
        has_attr = getattr(el, 'has_attr', None)
        if callable(has_attr):
            return has_attr(attribute)
        return el.has_key(attribute)

    checkers = {
        # attribute is exactly value
        '=': lambda el: el.get(attribute) == value,
        # attribute includes value as one of a set of space separated tokens
        '~': lambda el: value in el.get(attribute, '').split(),
        # attribute starts with value
        '^': lambda el: el.get(attribute, '').startswith(value),
        # attribute ends with value
        '$': lambda el: el.get(attribute, '').endswith(value),
        # attribute contains value
        '*': lambda el: value in el.get(attribute, ''),
        # attribute is either exactly value or starts with value-
        '|': lambda el: (
            el.get(attribute, '') == value
            or el.get(attribute, '').startswith('%s-' % value)
        ),
    }
    return checkers.get(operator, has_attribute)
|---|
| 51 | |
|---|
| 52 | |
|---|
def select(soup, selector):
    """
    Return the elements under soup that match a CSS selector.

    soup should be a BeautifulSoup instance; selector is a CSS selector
    made of whitespace-separated tokens. Each token is applied to the
    elements matched by the token before it, starting from soup itself.
    Recognised token forms:

        tag[attr], tag[attr=value] (also ~= ^= $= *= |=), tag optional
        tag#id, #id
        tag.class, .class
        *
        tag

    Returns a list of matching elements. The list is empty when nothing
    matches, when an ID token finds no element, or when a token that
    falls through to the plain-tag case is not a valid tag name.
    """
    current_context = [soup]
    for token in selector.split():
        m = attribselect_re.match(token)
        if m:
            # Attribute selector
            tag, attribute, operator, value = m.groups()
            checker = attribute_checker(operator, attribute, value)
            current_context = [
                el
                for context in current_context
                # A missing tag name means "any tag", which findAll spells True
                for el in context.findAll(tag or True)
                if checker(el)
            ]
        elif '#' in token:
            # ID selector
            tag, element_id = token.split('#', 1)
            # Look under every matched context, not only the first one;
            # this also copes with an empty context list, which would
            # otherwise fail on indexing.
            el = None
            for context in current_context:
                el = context.find(tag or True, {'id': element_id})
                if el is not None:
                    break
            if el is None:
                return []  # No match
            current_context = [el]
        elif '.' in token:
            # Class selector
            tag, klass = token.split('.', 1)
            # The class attribute is treated as a space separated token list
            has_class = lambda attr: attr and klass in attr.split()
            current_context = [
                el
                for context in current_context
                for el in context.findAll(tag or True, {'class': has_class})
            ]
        elif token == '*':
            # Star selector
            current_context = [
                el
                for context in current_context
                for el in context.findAll(True)
            ]
        else:
            # Here we should just have a regular tag
            if not tag_re.match(token):
                return []
            current_context = [
                el
                for context in current_context
                for el in context.findAll(token)
            ]
    return current_context
|---|
| 112 | |
|---|
def monkeypatch(BeautifulSoupClass=None):
    """
    Install select() on a BeautifulSoup class as its findSelect attribute.

    When no class is passed in, BeautifulSoup.BeautifulSoup (the most
    common import location) is imported and patched instead.
    """
    target = BeautifulSoupClass
    if not target:
        from BeautifulSoup import BeautifulSoup as target
    setattr(target, 'findSelect', select)
|---|
| 121 | |
|---|
def unmonkeypatch(BeautifulSoupClass=None):
    """
    Remove the findSelect attribute from a BeautifulSoup class.

    When no class is passed in, BeautifulSoup.BeautifulSoup is imported
    and used. Raises AttributeError if the class has no findSelect
    attribute of its own.
    """
    target = BeautifulSoupClass
    if not target:
        from BeautifulSoup import BeautifulSoup as target
    del target.findSelect
|---|