root/django/trunk/contrib/soupselect.py

Revision 88, 4.0 kB (checked in by steadicat, 13 months ago)

Added BeautifulSoup with soupselect for testing web pages.

  • Property svn:keywords set to Id
Line 
"""
soupselect.py

CSS selector support for BeautifulSoup.

soup = BeautifulSoup('<html>...')
select(soup, 'div')
- returns a list of div elements

select(soup, 'div#main ul a')
- returns a list of links inside a ul inside div#main

"""
14
import re

# A plain tag name token: lowercase ASCII letters and digits only.
tag_re = re.compile(r'^[a-z0-9]+$')

# An attribute selector token such as a[href], [href] or a[href^="http"].
# The leading tag name is optional; named groups are, in order:
# tag, attribute, operator, value.
attribselect_re = re.compile(
    r'^(?P<tag>\w+)?\[(?P<attribute>\w+)(?P<operator>[=~\|\^\$\*]?)'
    r'=?"?(?P<value>[^\]"]*)"?\]$'
)

# /^(\w+)?\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/
#   \---/   \---/\-------------/    \-------/
#     |       |         |               |
#     |       |         |           The value
#     |       |    ~,|,^,$,* or =
#     |    Attribute
#    Tag (optional)
31
def attribute_checker(operator, attribute, value=''):
    """
    Takes an operator, attribute and optional value; returns a function that
    will return True for elements that match that combination.

    operator is one of '=', '~', '^', '$', '*' or '|'. Any other operator
    (including the empty string) yields a checker that only tests whether
    the attribute is present on the element.

    Elements only need a dict-style ``get(name, default)`` method.
    """
    # Unique sentinel so that presence can be detected through get() alone;
    # this avoids has_key(), which Python 3 mappings do not provide.
    missing = object()

    def has_attribute(el):
        return el.get(attribute, missing) is not missing

    return {
        '=': lambda el: el.get(attribute) == value,
        # attribute includes value as one of a set of space separated tokens
        '~': lambda el: value in el.get(attribute, '').split(),
        # attribute starts with value
        '^': lambda el: el.get(attribute, '').startswith(value),
        # attribute ends with value
        '$': lambda el: el.get(attribute, '').endswith(value),
        # attribute contains value
        '*': lambda el: value in el.get(attribute, ''),
        # attribute is either exactly value or starts with value-
        '|': lambda el: el.get(attribute, '') == value
            or el.get(attribute, '').startswith('%s-' % value),
    }.get(operator, has_attribute)
51
52
def select(soup, selector):
    """
    soup should be a BeautifulSoup instance; selector is a CSS selector
    specifying the elements you want to retrieve.

    The selector is split on whitespace and each token narrows the set of
    elements matched so far by searching inside them. A token may be:

    - an attribute selector: tag[attr], [attr], tag[attr="value"], with an
      optional operator character (~, |, ^, $, *) before the =
    - an ID selector: tag#id or #id
    - a class selector: tag.class or .class
    - the star selector: *
    - a plain tag name

    Returns a list of matching elements. An empty list is returned when an
    ID token matches nothing or when a token is in none of the forms above.
    Each token is matched against exactly one form; compound tokens such as
    div#main.wide are not split further. Results are not de-duplicated.
    """
    current_context = [soup]
    for token in selector.split():
        m = attribselect_re.match(token)
        if m:
            # Attribute selector
            tag, attribute, operator, value = m.groups()
            checker = attribute_checker(operator, attribute, value)
            found = []
            for context in current_context:
                # Passing True searches every tag when none was named.
                found.extend(
                    el for el in context.findAll(tag or True) if checker(el)
                )
        elif '#' in token:
            # ID selector: take the first element with this id found in any
            # of the current contexts (not just the first context), and cope
            # with an empty context list instead of indexing into it.
            tag, element_id = token.split('#', 1)
            el = None
            for context in current_context:
                el = context.find(tag or True, {'id': element_id})
                if el is not None:
                    break
            if el is None:
                return []  # No match
            found = [el]
        elif '.' in token:
            # Class selector
            tag, klass = token.split('.', 1)

            def has_class(attr, klass=klass):
                # attr may be empty/None for elements without a class.
                return attr and klass in attr.split()

            found = []
            for context in current_context:
                found.extend(
                    context.findAll(tag or True, {'class': has_class})
                )
        elif token == '*':
            # Star selector
            found = []
            for context in current_context:
                found.extend(context.findAll(True))
        else:
            # Here we should just have a regular tag
            if not tag_re.match(token):
                return []
            found = []
            for context in current_context:
                found.extend(context.findAll(token))
        current_context = found
    return current_context
112
def monkeypatch(BeautifulSoupClass=None):
    """
    Attach select() to a BeautifulSoup class as the method findSelect, so
    that soup.findSelect(selector) calls select(soup, selector).

    If you don't explicitly state the class to patch, defaults to the most
    common import location for BeautifulSoup.
    """
    if BeautifulSoupClass is None:
        # Imported lazily so the module itself does not require BeautifulSoup
        # to be importable unless the default is actually used.
        from BeautifulSoup import BeautifulSoup as BeautifulSoupClass
    BeautifulSoupClass.findSelect = select
121
def unmonkeypatch(BeautifulSoupClass=None):
    """
    Remove the findSelect attribute from a BeautifulSoup class.

    If you don't explicitly state the class to unpatch, defaults to the most
    common import location for BeautifulSoup.

    Raises AttributeError if the class has no findSelect attribute of its
    own.
    """
    if BeautifulSoupClass is None:
        from BeautifulSoup import BeautifulSoup as BeautifulSoupClass
    delattr(BeautifulSoupClass, 'findSelect')
Note: See TracBrowser for help on using the browser.