| 1 | #!/usr/bin/python |
|---|
| 2 | |
|---|
| 3 | r""" |
|---|
| 4 | ============== |
|---|
| 5 | smartypants.py |
|---|
| 6 | ============== |
|---|
| 7 | |
|---|
| 8 | ---------------------------- |
|---|
| 9 | SmartyPants ported to Python |
|---|
| 10 | ---------------------------- |
|---|
| 11 | |
|---|
| 12 | Ported by `Chad Miller`_ |
|---|
| 13 | Copyright (c) 2004, 2007 Chad Miller |
|---|
| 14 | |
|---|
| 15 | original `SmartyPants`_ by `John Gruber`_ |
|---|
| 16 | Copyright (c) 2003 John Gruber |
|---|
| 17 | |
|---|
| 18 | |
|---|
| 19 | Synopsis |
|---|
| 20 | ======== |
|---|
| 21 | |
|---|
| 22 | A smart-quotes plugin for Pyblosxom_. |
|---|
| 23 | |
|---|
| 24 | The original "SmartyPants" is a free web publishing plug-in for Movable Type, |
|---|
| 25 | Blosxom, and BBEdit that easily translates plain ASCII punctuation characters |
|---|
| 26 | into "smart" typographic punctuation HTML entities. |
|---|
| 27 | |
|---|
| 28 | This software, *smartypants.py*, endeavours to be a functional port of |
|---|
| 29 | SmartyPants to Python, for use with Pyblosxom_. |
|---|
| 30 | |
|---|
| 31 | |
|---|
| 32 | Description |
|---|
| 33 | =========== |
|---|
| 34 | |
|---|
| 35 | SmartyPants can perform the following transformations: |
|---|
| 36 | |
|---|
| 37 | - Straight quotes ( " and ' ) into "curly" quote HTML entities |
|---|
| 38 | - Backticks-style quotes (\`\`like this'') into "curly" quote HTML entities |
|---|
| 39 | - Dashes (``--`` and ``---``) into en- and em-dash entities |
|---|
| 40 | - Three consecutive dots (``...`` or ``. . .``) into an ellipsis entity |
|---|
| 41 | |
|---|
| 42 | This means you can write, edit, and save your posts using plain old |
|---|
| 43 | ASCII straight quotes, plain dashes, and plain dots, but your published |
|---|
| 44 | posts (and final HTML output) will appear with smart quotes, em-dashes, |
|---|
| 45 | and proper ellipses. |
|---|
| 46 | |
|---|
| 47 | SmartyPants does not modify characters within ``<pre>``, ``<code>``, ``<kbd>``, |
|---|
| 48 | ``<math>`` or ``<script>`` tag blocks. Typically, these tags are used to |
|---|
| 49 | display text where smart quotes and other "smart punctuation" would not be |
|---|
| 50 | appropriate, such as source code or example markup. |
|---|
| 51 | |
|---|
| 52 | |
|---|
| 53 | Backslash Escapes |
|---|
| 54 | ================= |
|---|
| 55 | |
|---|
| 56 | If you need to use literal straight quotes (or plain hyphens and |
|---|
| 57 | periods), SmartyPants accepts the following backslash escape sequences |
|---|
| 58 | to force non-smart punctuation. It does so by transforming the escape |
|---|
| 59 | sequence into a decimal-encoded HTML entity: |
|---|
| 60 | |
|---|
| 61 | (FIXME: table here.) |
|---|
| 62 | |
|---|
| 63 | .. comment It sucks that there's a disconnect between the visual layout and table markup when special characters are involved. |
|---|
| 64 | .. comment ====== ===== ========= |
|---|
| 65 | .. comment Escape Value Character |
|---|
| 66 | .. comment ====== ===== ========= |
|---|
| 67 | .. comment \\\\\\\\ \ \\\\ |
|---|
| 68 | .. comment \\\\" " " |
|---|
| 69 | .. comment \\\\' ' ' |
|---|
| 70 | .. comment \\\\. . . |
|---|
| 71 | .. comment \\\\- - \- |
|---|
| 72 | .. comment \\\\` ` \` |
|---|
| 73 | .. comment ====== ===== ========= |
|---|
| 74 | |
|---|
| 75 | This is useful, for example, when you want to use straight quotes as |
|---|
| 76 | foot and inch marks: 6'2" tall; a 17" iMac. |
|---|
| 77 | |
|---|
| 78 | Options |
|---|
| 79 | ======= |
|---|
| 80 | |
|---|
| 81 | For Pyblosxom users, the ``smartypants_attributes`` attribute is where you |
|---|
| 82 | specify configuration options. |
|---|
| 83 | |
|---|
| 84 | Numeric values are the easiest way to configure SmartyPants' behavior: |
|---|
| 85 | |
|---|
| 86 | "0" |
|---|
| 87 | Suppress all transformations. (Do nothing.) |
|---|
| 88 | "1" |
|---|
| 89 | Performs default SmartyPants transformations: quotes (including |
|---|
| 90 | \`\`backticks'' -style), em-dashes, and ellipses. "``--``" (dash dash) |
|---|
| 91 | is used to signify an em-dash; there is no support for en-dashes. |
|---|
| 92 | |
|---|
| 93 | "2" |
|---|
| 94 | Same as smarty_pants="1", except that it uses the old-school typewriter |
|---|
| 95 | shorthand for dashes: "``--``" (dash dash) for en-dashes, "``---``" |
|---|
| 96 | (dash dash dash) |
|---|
| 97 | for em-dashes. |
|---|
| 98 | |
|---|
| 99 | "3" |
|---|
| 100 | Same as smarty_pants="2", but inverts the shorthand for dashes: |
|---|
| 101 | "``--``" (dash dash) for em-dashes, and "``---``" (dash dash dash) for |
|---|
| 102 | en-dashes. |
|---|
| 103 | |
|---|
| 104 | "-1" |
|---|
| 105 | Stupefy mode. Reverses the SmartyPants transformation process, turning |
|---|
| 106 | the HTML entities produced by SmartyPants into their ASCII equivalents. |
|---|
| 107 | E.g. "&#8220;" is turned into a simple double-quote ("), "&#8212;" is |
|---|
| 108 | turned into two dashes, etc. |
|---|
| 109 | |
|---|
| 110 | |
|---|
| 111 | The following single-character attribute values can be combined to toggle |
|---|
| 112 | individual transformations from within the smarty_pants attribute. For |
|---|
| 113 | example, to educate normal quotes and em-dashes, but not ellipses or |
|---|
| 114 | \`\`backticks'' -style quotes: |
|---|
| 115 | |
|---|
| 116 | ``py['smartypants_attributes'] = "qd"`` |
|---|
| 117 | |
|---|
| 118 | "q" |
|---|
| 119 | Educates normal quote characters: (") and ('). |
|---|
| 120 | |
|---|
| 121 | "b" |
|---|
| 122 | Educates \`\`backticks'' -style double quotes. |
|---|
| 123 | |
|---|
| 124 | "B" |
|---|
| 125 | Educates \`\`backticks'' -style double quotes and \`single' quotes. |
|---|
| 126 | |
|---|
| 127 | "d" |
|---|
| 128 | Educates em-dashes. |
|---|
| 129 | |
|---|
| 130 | "D" |
|---|
| 131 | Educates em-dashes and en-dashes, using old-school typewriter shorthand: |
|---|
| 132 | (dash dash) for en-dashes, (dash dash dash) for em-dashes. |
|---|
| 133 | |
|---|
| 134 | "i" |
|---|
| 135 | Educates em-dashes and en-dashes, using inverted old-school typewriter |
|---|
| 136 | shorthand: (dash dash) for em-dashes, (dash dash dash) for en-dashes. |
|---|
| 137 | |
|---|
| 138 | "e" |
|---|
| 139 | Educates ellipses. |
|---|
| 140 | |
|---|
| 141 | "w" |
|---|
| 142 | Translates any instance of ``&quot;`` into a normal double-quote character. |
|---|
| 143 | This should be of no interest to most people, but of particular interest |
|---|
| 144 | to anyone who writes their posts using Dreamweaver, as Dreamweaver |
|---|
| 145 | inexplicably uses this entity to represent a literal double-quote |
|---|
| 146 | character. SmartyPants only educates normal quotes, not entities (because |
|---|
| 147 | ordinarily, entities are used for the explicit purpose of representing the |
|---|
| 148 | specific character they represent). The "w" option must be used in |
|---|
| 149 | conjunction with one (or both) of the other quote options ("q" or "b"). |
|---|
| 150 | Thus, if you wish to apply all SmartyPants transformations (quotes, en- |
|---|
| 151 | and em-dashes, and ellipses) and also translate ``&quot;`` entities into |
|---|
| 152 | regular quotes so SmartyPants can educate them, you should pass the |
|---|
| 153 | following to the smarty_pants attribute: ``"qDew"`` |
|---|
| 154 | |
|---|
| 155 | The ``smartypants_forbidden_flavours`` list contains pyblosxom flavours for |
|---|
| 156 | which no Smarty Pants rendering will occur. |
|---|
| 157 | |
|---|
| 158 | |
|---|
| 159 | Caveats |
|---|
| 160 | ======= |
|---|
| 161 | |
|---|
| 162 | Why You Might Not Want to Use Smart Quotes in Your Weblog |
|---|
| 163 | --------------------------------------------------------- |
|---|
| 164 | |
|---|
| 165 | For one thing, you might not care. |
|---|
| 166 | |
|---|
| 167 | Most normal, mentally stable individuals do not take notice of proper |
|---|
| 168 | typographic punctuation. Many design and typography nerds, however, break |
|---|
| 169 | out in a nasty rash when they encounter, say, a restaurant sign that uses |
|---|
| 170 | a straight apostrophe to spell "Joe's". |
|---|
| 171 | |
|---|
| 172 | If you're the sort of person who just doesn't care, you might well want to |
|---|
| 173 | continue not caring. Using straight quotes -- and sticking to the 7-bit |
|---|
| 174 | ASCII character set in general -- is certainly a simpler way to live. |
|---|
| 175 | |
|---|
| 176 | Even if you *do* care about accurate typography, you still might want to |
|---|
| 177 | think twice before educating the quote characters in your weblog. One side |
|---|
| 178 | effect of publishing curly quote HTML entities is that it makes your |
|---|
| 179 | weblog a bit harder for others to quote from using copy-and-paste. What |
|---|
| 180 | happens is that when someone copies text from your blog, the copied text |
|---|
| 181 | contains the 8-bit curly quote characters (as well as the 8-bit characters |
|---|
| 182 | for em-dashes and ellipses, if you use these options). These characters |
|---|
| 183 | are not standard across different text encoding methods, which is why they |
|---|
| 184 | need to be encoded as HTML entities. |
|---|
| 185 | |
|---|
| 186 | People copying text from your weblog, however, may not notice that you're |
|---|
| 187 | using curly quotes, and they'll go ahead and paste the unencoded 8-bit |
|---|
| 188 | characters copied from their browser into an email message or their own |
|---|
| 189 | weblog. When pasted as raw "smart quotes", these characters are likely to |
|---|
| 190 | get mangled beyond recognition. |
|---|
| 191 | |
|---|
| 192 | That said, my own opinion is that any decent text editor or email client |
|---|
| 193 | makes it easy to stupefy smart quote characters into their 7-bit |
|---|
| 194 | equivalents, and I don't consider it my problem if you're using an |
|---|
| 195 | indecent text editor or email client. |
|---|
| 196 | |
|---|
| 197 | |
|---|
| 198 | Algorithmic Shortcomings |
|---|
| 199 | ------------------------ |
|---|
| 200 | |
|---|
| 201 | One situation in which quotes will get curled the wrong way is when |
|---|
| 202 | apostrophes are used at the start of leading contractions. For example: |
|---|
| 203 | |
|---|
| 204 | ``'Twas the night before Christmas.`` |
|---|
| 205 | |
|---|
| 206 | In the case above, SmartyPants will turn the apostrophe into an opening |
|---|
| 207 | single-quote, when in fact it should be a closing one. I don't think |
|---|
| 208 | this problem can be solved in the general case -- every word processor |
|---|
| 209 | I've tried gets this wrong as well. In such cases, it's best to use the |
|---|
| 210 | proper HTML entity for closing single-quotes (``&#8217;``) by hand. |
|---|
| 211 | |
|---|
| 212 | |
|---|
| 213 | Bugs |
|---|
| 214 | ==== |
|---|
| 215 | |
|---|
| 216 | To file bug reports or feature requests (other than topics listed in the |
|---|
| 217 | Caveats section above) please send email to: mailto:smartypantspy@chad.org |
|---|
| 218 | |
|---|
| 219 | If the bug involves quotes being curled the wrong way, please send example |
|---|
| 220 | text to illustrate. |
|---|
| 221 | |
|---|
| 222 | To Do list |
|---|
| 223 | ---------- |
|---|
| 224 | |
|---|
| 225 | - Provide a function for use within templates to quote anything at all. |
|---|
| 226 | |
|---|
| 227 | |
|---|
| 228 | Version History |
|---|
| 229 | =============== |
|---|
| 230 | |
|---|
| 231 | 1.5_1.6: Fri, 27 Jul 2007 07:06:40 -0400 |
|---|
| 232 | - Fixed bug where blocks of precious unalterable text was instead |
|---|
| 233 | interpreted. Thanks to Le Roux and Dirk van Oosterbosch. |
|---|
| 234 | |
|---|
| 235 | 1.5_1.5: Sat, 13 Aug 2005 15:50:24 -0400 |
|---|
| 236 | - Fix bogus magical quotation when there is no hint that the |
|---|
| 237 | user wants it, e.g., in "21st century". Thanks to Nathan Hamblen. |
|---|
| 238 | - Be smarter about quotes before terminating numbers in an en-dash'ed |
|---|
| 239 | range. |
|---|
| 240 | |
|---|
| 241 | 1.5_1.4: Thu, 10 Feb 2005 20:24:36 -0500 |
|---|
| 242 | - Fix a date-processing bug, as reported by jacob childress. |
|---|
| 243 | - Begin a test-suite for ensuring correct output. |
|---|
| 244 | - Removed import of "string", since I didn't really need it. |
|---|
| 245 | (This was my first ever Python program. Sue me!) |
|---|
| 246 | |
|---|
| 247 | 1.5_1.3: Wed, 15 Sep 2004 18:25:58 -0400 |
|---|
| 248 | - Abort processing if the flavour is in forbidden-list. Default of |
|---|
| 249 | [ "rss" ] (Idea of Wolfgang SCHNERRING.) |
|---|
| 250 | - Remove stray virgules from en-dashes. Patch by Wolfgang SCHNERRING. |
|---|
| 251 | |
|---|
| 252 | 1.5_1.2: Mon, 24 May 2004 08:14:54 -0400 |
|---|
| 253 | - Some single quotes weren't replaced properly. Diff-tesuji played |
|---|
| 254 | by Benjamin GEIGER. |
|---|
| 255 | |
|---|
| 256 | 1.5_1.1: Sun, 14 Mar 2004 14:38:28 -0500 |
|---|
| 257 | - Support upcoming pyblosxom 0.9 plugin verification feature. |
|---|
| 258 | |
|---|
| 259 | 1.5_1.0: Tue, 09 Mar 2004 08:08:35 -0500 |
|---|
| 260 | - Initial release |
|---|
| 261 | |
|---|
| 262 | Version Information |
|---|
| 263 | ------------------- |
|---|
| 264 | |
|---|
| 265 | Version numbers will track the SmartyPants_ version numbers, with the addition |
|---|
| 266 | of an underscore and the smartypants.py version on the end. |
|---|
| 267 | |
|---|
| 268 | New versions will be available at `http://wiki.chad.org/SmartyPantsPy`_ |
|---|
| 269 | |
|---|
| 270 | .. _http://wiki.chad.org/SmartyPantsPy: http://wiki.chad.org/SmartyPantsPy |
|---|
| 271 | |
|---|
| 272 | Authors |
|---|
| 273 | ======= |
|---|
| 274 | |
|---|
| 275 | `John Gruber`_ did all of the hard work of writing this software in Perl for |
|---|
| 276 | `Movable Type`_ and almost all of this useful documentation. `Chad Miller`_ |
|---|
| 277 | ported it to Python to use with Pyblosxom_. |
|---|
| 278 | |
|---|
| 279 | |
|---|
| 280 | Additional Credits |
|---|
| 281 | ================== |
|---|
| 282 | |
|---|
| 283 | Portions of the SmartyPants original work are based on Brad Choate's nifty |
|---|
| 284 | MTRegex plug-in. `Brad Choate`_ also contributed a few bits of source code to |
|---|
| 285 | this plug-in. Brad Choate is a fine hacker indeed. |
|---|
| 286 | |
|---|
| 287 | `Jeremy Hedley`_ and `Charles Wiltgen`_ deserve mention for exemplary beta |
|---|
| 288 | testing of the original SmartyPants. |
|---|
| 289 | |
|---|
| 290 | `Rael Dornfest`_ ported SmartyPants to Blosxom. |
|---|
| 291 | |
|---|
| 292 | .. _Brad Choate: http://bradchoate.com/ |
|---|
| 293 | .. _Jeremy Hedley: http://antipixel.com/ |
|---|
| 294 | .. _Charles Wiltgen: http://playbacktime.com/ |
|---|
| 295 | .. _Rael Dornfest: http://raelity.org/ |
|---|
| 296 | |
|---|
| 297 | |
|---|
| 298 | Copyright and License |
|---|
| 299 | ===================== |
|---|
| 300 | |
|---|
| 301 | SmartyPants_ license:: |
|---|
| 302 | |
|---|
| 303 | Copyright (c) 2003 John Gruber |
|---|
| 304 | (http://daringfireball.net/) |
|---|
| 305 | All rights reserved. |
|---|
| 306 | |
|---|
| 307 | Redistribution and use in source and binary forms, with or without |
|---|
| 308 | modification, are permitted provided that the following conditions are |
|---|
| 309 | met: |
|---|
| 310 | |
|---|
| 311 | * Redistributions of source code must retain the above copyright |
|---|
| 312 | notice, this list of conditions and the following disclaimer. |
|---|
| 313 | |
|---|
| 314 | * Redistributions in binary form must reproduce the above copyright |
|---|
| 315 | notice, this list of conditions and the following disclaimer in |
|---|
| 316 | the documentation and/or other materials provided with the |
|---|
| 317 | distribution. |
|---|
| 318 | |
|---|
| 319 | * Neither the name "SmartyPants" nor the names of its contributors |
|---|
| 320 | may be used to endorse or promote products derived from this |
|---|
| 321 | software without specific prior written permission. |
|---|
| 322 | |
|---|
| 323 | This software is provided by the copyright holders and contributors "as |
|---|
| 324 | is" and any express or implied warranties, including, but not limited |
|---|
| 325 | to, the implied warranties of merchantability and fitness for a |
|---|
| 326 | particular purpose are disclaimed. In no event shall the copyright |
|---|
| 327 | owner or contributors be liable for any direct, indirect, incidental, |
|---|
| 328 | special, exemplary, or consequential damages (including, but not |
|---|
| 329 | limited to, procurement of substitute goods or services; loss of use, |
|---|
| 330 | data, or profits; or business interruption) however caused and on any |
|---|
| 331 | theory of liability, whether in contract, strict liability, or tort |
|---|
| 332 | (including negligence or otherwise) arising in any way out of the use |
|---|
| 333 | of this software, even if advised of the possibility of such damage. |
|---|
| 334 | |
|---|
| 335 | |
|---|
| 336 | smartypants.py license:: |
|---|
| 337 | |
|---|
| 338 | smartypants.py is a derivative work of SmartyPants. |
|---|
| 339 | |
|---|
| 340 | Redistribution and use in source and binary forms, with or without |
|---|
| 341 | modification, are permitted provided that the following conditions are |
|---|
| 342 | met: |
|---|
| 343 | |
|---|
| 344 | * Redistributions of source code must retain the above copyright |
|---|
| 345 | notice, this list of conditions and the following disclaimer. |
|---|
| 346 | |
|---|
| 347 | * Redistributions in binary form must reproduce the above copyright |
|---|
| 348 | notice, this list of conditions and the following disclaimer in |
|---|
| 349 | the documentation and/or other materials provided with the |
|---|
| 350 | distribution. |
|---|
| 351 | |
|---|
| 352 | This software is provided by the copyright holders and contributors "as |
|---|
| 353 | is" and any express or implied warranties, including, but not limited |
|---|
| 354 | to, the implied warranties of merchantability and fitness for a |
|---|
| 355 | particular purpose are disclaimed. In no event shall the copyright |
|---|
| 356 | owner or contributors be liable for any direct, indirect, incidental, |
|---|
| 357 | special, exemplary, or consequential damages (including, but not |
|---|
| 358 | limited to, procurement of substitute goods or services; loss of use, |
|---|
| 359 | data, or profits; or business interruption) however caused and on any |
|---|
| 360 | theory of liability, whether in contract, strict liability, or tort |
|---|
| 361 | (including negligence or otherwise) arising in any way out of the use |
|---|
| 362 | of this software, even if advised of the possibility of such damage. |
|---|
| 363 | |
|---|
| 364 | |
|---|
| 365 | |
|---|
| 366 | .. _John Gruber: http://daringfireball.net/ |
|---|
| 367 | .. _Chad Miller: http://web.chad.org/ |
|---|
| 368 | |
|---|
| 369 | .. _Pyblosxom: http://roughingit.subtlehints.net/pyblosxom |
|---|
| 370 | .. _SmartyPants: http://daringfireball.net/projects/smartypants/ |
|---|
| 371 | .. _Movable Type: http://www.movabletype.org/ |
|---|
| 372 | |
|---|
| 373 | """ |
|---|
| 374 | |
|---|
import re

# Option string used whenever an entry does not supply its own
# "smartypants_attributes" value ("1" = all default transformations).
default_smartypants_attr = "1"

# Matches an opening or closing <pre>, <code>, <kbd>, <script> or <math> tag.
# Group 1 is "/" for a closing tag (None otherwise); group 2 is the tag name.
# The lookahead requires the name to end right there (whitespace, "/" or ">"),
# so longer names that merely start with one of these words -- e.g.
# <presentation> or <codex> -- are not mistaken for skip tags.
tags_to_skip_regex = re.compile(
    r"<(/)?(pre|code|kbd|script|math)(?=[\s/>])[^>]*>", re.I)
|---|
| 380 | |
|---|
| 381 | |
|---|
def verify_installation(request):
    """Report whether the plugin is usable.

    The *request* argument is accepted but never inspected: no checks are
    performed and the function unconditionally returns 1 (success).
    """
    # Nothing to verify; always assert that the plugin is functional.
    return 1
|---|
| 385 | |
|---|
| 386 | |
|---|
def cb_story(args):
    """Story callback: educate the punctuation of an entry's body and title.

    Parameters:
        args -- mapping with an "entry" item (indexable by key and offering
                getData()) and a "request" item (mapping that may carry a
                "flavour" key).

    The entry may configure the plugin through two keys:

        smartypants_forbidden_flavours -- flavours for which nothing is
                                          done (default: ["rss"])
        smartypants_attributes         -- option string handed to
                                          smartyPants() (default:
                                          default_smartypants_attr)

    A missing key and a value of None are both treated as "use the default".

    Returns: None.  On success entry["body"] and entry["title"] are replaced
    by their processed versions; when processing is skipped the entry is
    left untouched.
    """
    entry = args["entry"]

    try:
        forbidden_flavours = entry["smartypants_forbidden_flavours"]
    except KeyError:
        forbidden_flavours = None
    if forbidden_flavours is None:
        # Guard against None exactly as is done for the attributes below;
        # otherwise the "in" test further down raises TypeError.
        forbidden_flavours = ["rss"]

    try:
        attributes = entry["smartypants_attributes"]
    except KeyError:
        attributes = None
    if attributes is None:
        attributes = default_smartypants_attr

    entryData = entry.getData()

    try:
        if args["request"]["flavour"] in forbidden_flavours:
            return
    except KeyError:
        # No flavour information available: sniff the start of the body and
        # abort if it looks like escaped HTML (i.e. contains "&lt;").  FIXME
        if "&lt;" in entry["body"][0:15]:
            return

    # FIXME: make these configurable, perhaps?
    entry["body"] = smartyPants(entryData, attributes)
    entry["title"] = smartyPants(entry["title"], attributes)
|---|
| 415 | |
|---|
| 416 | |
|---|
| 417 | ### internal functions below here |
|---|
| 418 | |
|---|
def smartyPants(text, attr=default_smartypants_attr):
    """Apply the SmartyPants transformations selected by *attr* to *text*.

    Parameters:
        text -- the string to process (may contain HTML tags).
        attr -- option string:

                    0  : do nothing (text is returned unchanged)
                    1  : set all
                    2  : set all, using old school en- and em- dash shortcuts
                    3  : set all, using inverted old school en and em- dash
                         shortcuts
                    -1 : "stupefy" mode

                Any other value is read one character at a time:

                    q : quotes
                    b : backtick quotes (``double'' only)
                    B : backtick quotes (``double'' and `single')
                    d : dashes
                    D : old school dashes
                    i : inverted old school dashes
                    e : ellipses
                    w : convert &quot; entities to " for Dreamweaver users

                Unknown characters are ignored.

    Returns: the processed string.  Tag tokens are copied through verbatim
    and text between tags matched by tags_to_skip_regex is left untouched.
    """
    if attr == "0":
        # Do nothing.
        return text

    # Every switch is a string; "0" means "off".
    do_quotes = "0"
    do_backticks = "0"
    do_dashes = "0"
    do_ellipses = "0"
    do_stupefy = "0"
    # Must start as "0" (not False): it is compared against "0" below, and
    # False != "0" would switch the &quot; conversion on for every call.
    convert_quot = "0"

    if attr in ("1", "2", "3"):
        # Do everything; the preset number doubles as the dash style.
        do_quotes = "1"
        do_backticks = "1"
        do_ellipses = "1"
        do_dashes = attr
    elif attr == "-1":
        # Special "stupefy" mode.
        do_stupefy = "1"
    else:
        for c in attr:
            if c == "q":
                do_quotes = "1"
            elif c == "b":
                do_backticks = "1"
            elif c == "B":
                do_backticks = "2"
            elif c == "d":
                do_dashes = "1"
            elif c == "D":
                do_dashes = "2"
            elif c == "i":
                do_dashes = "3"
            elif c == "e":
                do_ellipses = "1"
            elif c == "w":
                convert_quot = "1"
            # Any other character is an unknown option and is ignored.

    result = []

    # Names of the currently open "skip" tags.  Text is only processed
    # while this stack is empty.
    skipped_tag_stack = []

    # This is a cheat, used to get some context for one-character tokens
    # that consist of just a quote char.  What we do is remember the last
    # character of the previous text token, to use as context to curl
    # single-character quote tokens correctly.
    prev_token_last_char = ""

    for cur_token in _tokenize(text):
        if cur_token[0] == "tag":
            # Don't mess with quotes inside some tags.  This does not
            # handle self <closing/> tags!
            result.append(cur_token[1])
            skip_match = tags_to_skip_regex.match(cur_token[1])
            if skip_match is not None:
                tag_name = skip_match.group(2).lower()
                if not skip_match.group(1):
                    # Opening tag: start (or nest) a skipped region.
                    skipped_tag_stack.append(tag_name)
                elif skipped_tag_stack and tag_name == skipped_tag_stack[-1]:
                    skipped_tag_stack.pop()
                # Otherwise this close doesn't match the open.  This isn't
                # XHTML; the stray close tag is silently ignored.
            continue

        t = cur_token[1]
        last_char = t[-1:]  # Remember last char of this token before processing.

        if not skipped_tag_stack:
            t = processEscapes(t)

            if convert_quot != "0":
                t = t.replace("&quot;", '"')

            if do_dashes == "1":
                t = educateDashes(t)
            elif do_dashes == "2":
                t = educateDashesOldSchool(t)
            elif do_dashes == "3":
                t = educateDashesOldSchoolInverted(t)

            if do_ellipses != "0":
                t = educateEllipses(t)

            # Note: backticks need to be processed before quotes.
            if do_backticks != "0":
                t = educateBackticks(t)
            if do_backticks == "2":
                t = educateSingleBackticks(t)

            if do_quotes != "0":
                if t == "'":
                    # Special case: single-character ' token
                    if re.match(r"\S", prev_token_last_char):
                        t = "&#8217;"
                    else:
                        t = "&#8216;"
                elif t == '"':
                    # Special case: single-character " token
                    if re.match(r"\S", prev_token_last_char):
                        t = "&#8221;"
                    else:
                        t = "&#8220;"
                else:
                    # Normal case:
                    t = educateQuotes(t)

            if do_stupefy == "1":
                t = stupefyEntities(t)

        prev_token_last_char = last_char
        result.append(t)

    return "".join(result)
|---|
| 564 | |
|---|
| 565 | |
|---|
def educateQuotes(str):
    """
    Parameter:  String.

    Returns:    The string, with "educated" curly quote HTML entities.

    Example input:  "Isn't this fun?"
    Example output: &#8220;Isn&#8217;t this fun?&#8221;
    """

    # The parameter name shadows the builtin; work on a local alias instead.
    text = str
    punct_class = r"""[!"#\$\%'()*+,-.\/:;<=>?\@\[\\\]\^_`{|}~]"""

    # Special case if the very first character is a quote
    # followed by punctuation at a non-word-break. Close the quotes by brute
    # force.  (\B is the non-word-boundary assertion; "\\B" in a raw string
    # would instead demand a literal backslash followed by "B".)
    text = re.sub(r"""^'(?=%s\B)""" % (punct_class,), "&#8217;", text)
    text = re.sub(r"""^"(?=%s\B)""" % (punct_class,), "&#8221;", text)

    # Special case for double sets of quotes, e.g.:
    #   <p>He said, "'Quoted' words in a larger quote."</p>
    text = re.sub(r""""'(?=\w)""", "&#8220;&#8216;", text)
    text = re.sub(r"""'"(?=\w)""", "&#8216;&#8220;", text)

    # Special case for decade abbreviations (the '80s):
    text = re.sub(r"""\b'(?=\d{2}s)""", "&#8217;", text)

    close_class = r"""[^\ \t\r\n\[\{\(\-]"""
    # Decimal en-/em-dash entities.  The "#" is escaped because this text is
    # interpolated into re.VERBOSE patterns, where a bare "#" starts a comment.
    dec_dashes = r"""&\#8211;|&\#8212;"""

    # What may stand immediately before an opening quote (shared by the
    # single- and double-quote patterns below).
    opening_context = r"""
            (
                \s          |   # a whitespace char, or
                &nbsp;      |   # a non-breaking space entity, or
                --          |   # dashes, or
                &[mn]dash;  |   # named dash entities
                %s          |   # or decimal entities
                &\#x201[34];    # or hex
            )
            """ % (dec_dashes,)

    # Get most opening single quotes (the quote must be followed by a word
    # character):
    opening_single_quotes_regex = re.compile(
        opening_context + r"""'(?=\w)""", re.VERBOSE)
    text = opening_single_quotes_regex.sub(r"\1&#8216;", text)

    # Closing single quotes: first those not followed by whitespace, a
    # possessive/plural "s" or a digit ...
    text = re.sub(r"(%s)'(?!\s|s\b|\d)" % (close_class,), r"\1&#8217;", text)
    # ... then those followed by whitespace or a lone "s".
    text = re.sub(r"(%s)'(\s|s\b)" % (close_class,), r"\1&#8217;\2", text)

    # Any remaining single quotes should be opening ones:
    text = text.replace("'", "&#8216;")

    # Get most opening double quotes:
    opening_double_quotes_regex = re.compile(
        opening_context + r'''"(?=\w)''', re.VERBOSE)
    text = opening_double_quotes_regex.sub(r"\1&#8220;", text)

    # Double closing quotes: before whitespace ...
    text = re.sub(r'"(?=\s)', "&#8221;", text)
    # ... or after a character that indicates the quote should be closing.
    text = re.sub(r'(%s)"' % (close_class,), r"\1&#8221;", text)

    # Any remaining quotes should be opening ones.
    text = text.replace('"', "&#8220;")

    return text
|---|
| 660 | |
|---|
| 661 | |
|---|
def educateBackticks(str):
    """
    Parameter:  String.
    Returns:    The string, with ``backticks'' -style double quotes
                translated into HTML curly quote entities.
    Example input:  ``Isn't this fun?''
    Example output: &#8220;Isn't this fun?&#8221;
    """

    # Plain literal replacements: two backticks open, two apostrophes close.
    text = str.replace("``", "&#8220;")
    text = text.replace("''", "&#8221;")
    return text
|---|
| 674 | |
|---|
| 675 | |
|---|
def educateSingleBackticks(str):
    """
    Parameter:  String.
    Returns:    The string, with `backticks' -style single quotes
                translated into HTML curly quote entities.

    Example input:  `Isn't this fun?'
    Example output: &#8216;Isn&#8217;t this fun?&#8217;
    """

    # Every backtick opens and every apostrophe closes, including the
    # apostrophes inside words (see the example above).
    text = str.replace("`", "&#8216;")
    text = text.replace("'", "&#8217;")
    return text
|---|
| 689 | |
|---|
| 690 | |
|---|
def educateDashes(str):
    """
    Parameter:  String.

    Returns:    The string, with each instance of "--" translated to
                an em-dash HTML entity.  A run of three dashes ("---") is
                handled first and becomes an en-dash HTML entity.
    """

    # "---" must be replaced before "--", or it would be consumed as
    # "--" plus a stray "-".
    text = str.replace("---", "&#8211;")  # en
    text = text.replace("--", "&#8212;")  # em
    return text
|---|
| 702 | |
|---|
| 703 | |
|---|
def educateDashesOldSchool(str):
    """
    Translate hyphen runs using the "old school" dash convention.

    Parameter:  String.

    Returns:    The string, with each instance of "---" translated to
                an em-dash, and each remaining instance of "--"
                translated to an en-dash.
    """
    # Order matters: the longer run must be handled before the shorter
    # one, or "---" would be split into "--" plus a stray hyphen.
    dash_rules = (
        (r"---", r"—"),    # em-dash
        (r"--", r"–"),     # en-dash
    )

    converted = str
    for hyphens, dash in dash_rules:
        converted = re.sub(hyphens, dash, converted)
    return converted
|---|
| 716 | |
|---|
| 717 | |
|---|
def educateDashesOldSchoolInverted(str):
    """
    Translate hyphen runs using the inverted "old school" convention.

    Parameter:  String.

    Returns:    The string, with each instance of "--" translated to
                an em-dash, and each "---" translated to an en-dash.

    Two reasons for the inversion: first, unlike the en- and em-dash
    syntax supported by educateDashesOldSchool(), it is compatible with
    existing entries written before SmartyPants 1.1, back when "--" was
    only used for em-dashes.  Second, em-dashes are more common than
    en-dashes, so it makes sense that their shortcut should be the
    shorter one to type.  (Thanks to Aaron Swartz for the idea.)
    """
    triple_hyphen = re.compile(r"---")
    double_hyphen = re.compile(r"--")

    # Longest run first, so "---" is never seen as "--" plus "-".
    str = triple_hyphen.sub(r"–", str)    # en-dash
    str = double_hyphen.sub(r"—", str)    # em-dash
    return str
|---|
| 736 | |
|---|
| 737 | |
|---|
| 738 | |
|---|
def educateEllipses(str):
    """
    Translate three-dot sequences into an ellipsis character.

    Parameter:  String.
    Returns:    The string, with each instance of "..." and then each
                remaining instance of ". . ." (dots separated by single
                spaces) translated to an ellipsis.

    Example input:  Huh...?
    Example output: Huh…?
    """
    # Two separate passes on purpose: the tight form is collapsed over the
    # whole string before the spaced form is looked for.
    tight_collapsed = re.sub(r"\.\.\.", r"…", str)
    return re.sub(r"\. \. \.", r"…", tight_collapsed)
|---|
| 752 | |
|---|
| 753 | |
|---|
def stupefyEntities(str):
    """
    Undo "smart" punctuation, turning it back into plain ASCII.

    Parameter:  String.
    Returns:    The string, with each curly quote, en-/em-dash and
                ellipsis translated to its ASCII counterpart.

    Example input:  “Hello — world.”
    Example output: "Hello -- world."
    """
    ascii_equivalents = (
        (r"–", r"-"),       # en-dash
        (r"—", r"--"),      # em-dash
        (r"‘", r"'"),       # open single quote
        (r"’", r"'"),       # close single quote
        (r"“", r'"'),       # open double quote
        (r"”", r'"'),       # close double quote
        (r"…", r"..."),     # ellipsis
    )

    plain = str
    for fancy, replacement in ascii_equivalents:
        plain = re.sub(fancy, replacement, plain)
    return plain
|---|
| 776 | |
|---|
| 777 | |
|---|
def processEscapes(str):
    r"""
    Resolve backslash escape sequences in a single left-to-right pass.

    Parameter:  String.
    Returns:    The string, after processing the following backslash
                escape sequences.  This is useful if you want to force a
                "dumb" quote or other character to appear.

                Escape  Value
                ------  -----
                \\      a single backslash
                \"      "
                \'      '
                \.      .
                \-      -
                \`      `

    A backslash in front of any other character is left untouched.
    """
    # One combined pattern instead of six chained substitutions:
    #   * the value produced for an escaped backslash can never be picked
    #     up again as the start of another escape (so \\" yields \" and
    #     not a bare quote);
    #   * the replacement template is just a group reference, which avoids
    #     writing a lone backslash or a lone double quote as a raw
    #     triple-quoted literal (neither can be spelled that way).
    #
    # NOTE(review): the values are emitted as plain characters, so any
    # educate*() pass run over the result afterwards will treat them like
    # ordinary punctuation.  If the escapes must survive such a pass, emit
    # numeric character references here instead -- confirm with the caller.
    return re.sub(r"""\\([\\"'.`-])""", r"\1", str)
|---|
| 802 | |
|---|
| 803 | |
|---|
| 804 | def _tokenize(str): |
|---|
| 805 | """ |
|---|
| 806 | Parameter: String containing HTML markup. |
|---|
| 807 | Returns: Reference to an array of the tokens comprising the input |
|---|
| 808 | string. Each token is either a tag (possibly with nested, |
|---|
| 809 | tags contained therein, such as <a href="<MTFoo>">, or a |
|---|
| 810 | run of text between tags. Each element of the array is a |
|---|
| 811 | two-element array; the first is either 'tag' or 'text'; |
|---|
| 812 | the second is the actual value. |
|---|
| 813 | |
|---|
| 814 | Based on the _tokenize() subroutine from Brad Choate's MTRegex plugin. |
|---|
| 815 | <http://www.bradchoate.com/past/mtregex.php> |
|---|
| 816 | """ |
|---|
| 817 | |
|---|
| 818 | pos = 0 |
|---|
| 819 | length = len(str) |
|---|
| 820 | tokens = [] |
|---|
| 821 | |
|---|
| 822 | depth = 6 |
|---|
| 823 | nested_tags = "|".join(['(?:<(?:[^<>]',] * depth) + (')*>)' * depth) |
|---|
| 824 | #match = r"""(?: <! ( -- .*? -- \s* )+ > ) | # comments |
|---|
| 825 | # (?: <\? .*? \?> ) | # directives |
|---|
| 826 | # %s # nested tags """ % (nested_tags,) |
|---|
| 827 | tag_soup = re.compile(r"""([^<]*)(<[^>]*>)""") |
|---|
| 828 | |
|---|
| 829 | token_match = tag_soup.search(str) |
|---|
| 830 | |
|---|
| 831 | previous_end = 0 |
|---|
| 832 | while token_match is not None: |
|---|
| 833 | if token_match.group(1): |
|---|
| 834 | tokens.append(['text', token_match.group(1)]) |
|---|
| 835 | |
|---|
| 836 | tokens.append(['tag', token_match.group(2)]) |
|---|
| 837 | |
|---|
| 838 | previous_end = token_match.end() |
|---|
| 839 | token_match = tag_soup.search(str, token_match.end()) |
|---|
| 840 | |
|---|
| 841 | if previous_end < len(str): |
|---|
| 842 | tokens.append(['text', str[previous_end:]]) |
|---|
| 843 | |
|---|
| 844 | return tokens |
|---|
| 845 | |
|---|
| 846 | |
|---|
| 847 | |
|---|
| 848 | if __name__ == "__main__": |
|---|
| 849 | |
|---|
| 850 | import locale |
|---|