Changesets can be listed by changeset number.
The Git repository is here.
- Revision:
- 13
- Log:
Initial import of Typo 2.6.0 sources from a downloaded Tarball.
Typo is a Ruby On Rails based blog engine.
- Author:
- adh
- Date:
- Sat Jul 22 22:25:02 +0100 2006
- Size:
- 30462 Bytes
1 | #!/usr/bin/ruby |
2 | # |
3 | # Bluecloth is a Ruby implementation of Markdown, a text-to-HTML conversion |
4 | # tool. |
5 | # |
6 | # == Synopsis |
7 | # |
8 | # doc = BlueCloth::new " |
9 | # ## Test document ## |
10 | # |
11 | # Just a simple test. |
12 | # " |
13 | # |
14 | # puts doc.to_html |
15 | # |
16 | # == Authors |
17 | # |
18 | # * Michael Granger <ged@FaerieMUD.org> |
19 | # |
20 | # == Contributors |
21 | # |
22 | # * Martin Chase <stillflame@FaerieMUD.org> - Peer review, helpful suggestions |
23 | # * Florian Gross <flgr@ccan.de> - Filter options, suggestions |
24 | # |
25 | # == Copyright |
26 | # |
27 | # Original version: |
28 | # Copyright (c) 2003-2004 John Gruber |
29 | # <http://daringfireball.net/> |
30 | # All rights reserved. |
31 | # |
32 | # Ruby port: |
33 | # Copyright (c) 2004 The FaerieMUD Consortium. |
34 | # |
35 | # BlueCloth is free software; you can redistribute it and/or modify it under the |
36 | # terms of the GNU General Public License as published by the Free Software |
37 | # Foundation; either version 2 of the License, or (at your option) any later |
38 | # version. |
39 | # |
40 | # BlueCloth is distributed in the hope that it will be useful, but WITHOUT ANY |
41 | # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR |
42 | # A PARTICULAR PURPOSE. See the GNU General Public License for more details. |
43 | # |
44 | # == To-do |
45 | # |
46 | # * Refactor some of the larger uglier methods that have to do their own |
47 | # brute-force scanning because of lack of Perl features in Ruby's Regexp |
48 | # class. Alternately, could add a dependency on 'pcre' and use most Perl |
49 | # regexps. |
50 | # |
51 | # * Put the StringScanner in the render state for thread-safety. |
52 | # |
53 | # == Version |
54 | # |
55 | # $Id: bluecloth.rb 69 2004-08-25 05:27:15Z ged $ |
56 | # |
57 | |
58 | require 'digest/md5' |
59 | require 'logger' |
60 | require 'strscan' |
61 | |
62 | |
63 | ### BlueCloth is a Ruby implementation of Markdown, a text-to-HTML conversion |
64 | ### tool. |
65 | class BlueCloth < String |
66 | |
67 | ### Exception class for formatting errors. |
class FormatError < RuntimeError
  # Build the exception message from +str+, the fragment of source text near
  # the problem, and an optional +specific+ description of what went wrong.
  def initialize(str, specific = nil)
    # Wrap +str+ in an Array so that an Array-valued +str+ is inspected as a
    # whole instead of being spread across the format arguments.
    msg = "Bad markdown format near %p" % [str]
    msg += ": %s" % [specific] if specific

    super(msg)
  end
end
82 | |
83 | |
84 | # Release Version |
85 | Version = '0.0.3' |
86 | |
87 | # SVN Revision |
88 | SvnRev = %q$Rev: 69 $ |
89 | |
90 | # SVN Id tag |
91 | SvnId = %q$Id: bluecloth.rb 69 2004-08-25 05:27:15Z ged $ |
92 | |
93 | # SVN URL |
94 | SvnUrl = %q$URL: svn+ssh://svn.faeriemud.org/usr/local/svn/BlueCloth/trunk/lib/bluecloth.rb $ |
95 | |
96 | |
97 | # Rendering state struct. Keeps track of URLs, titles, and HTML blocks |
98 | # midway through a render. I prefer this to the globals of the Perl version |
99 | # because globals make me break out in hives. Or something. |
100 | RenderState = Struct::new( "RenderState", :urls, :titles, :html_blocks, :log ) |
101 | |
102 | # Tab width for #detab! if none is specified |
103 | TabWidth = 4 |
104 | |
105 | # The tag-closing string -- set to '>' for HTML |
106 | EmptyElementSuffix = "/>"; |
107 | |
108 | # Table of MD5 sums for escaped characters |
# For every character that can be backslash-escaped, record:
#   :md5   - hex MD5 digest of the character, used as its placeholder
#   :md5re - Regexp matching that placeholder
#   :re    - Regexp matching the backslash-escaped character in source text
EscapeTable = {}
'\\`*_{}[]()#.!'.each_char do |char|
  digest = Digest::MD5.hexdigest(char)

  EscapeTable[char] = {
    :md5   => digest,
    :md5re => Regexp.new(digest),
    :re    => Regexp.new('\\\\' + Regexp.escape(char)),
  }
end
119 | |
120 | |
121 | ################################################################# |
122 | ### I N S T A N C E M E T H O D S |
123 | ################################################################# |
124 | |
125 | ### Create a new BlueCloth string. |
def initialize(content = "", *restrictions)
  # +content+ is the Markdown source. Each entry in +restrictions+ names a
  # boolean attribute (e.g. :filter_html) that is switched on.

  # Log to standard error. The old $deferr global is not defined by current
  # Rubies; an undefined global is nil, and a Logger built on nil writes
  # nothing.
  @log = Logger.new($stderr)
  @log.level =
    if $DEBUG then Logger::DEBUG
    elsif $VERBOSE then Logger::INFO
    else Logger::WARN
    end
  @scanner = nil

  # Apply the requested restrictions, then set the line-folding attribute to
  # reflect what happens by default.
  @filter_html = nil
  @filter_styles = nil
  # public_send limits the dynamic call to public setters.
  restrictions.flatten.each { |name| public_send("#{name}=", true) }
  @fold_lines = true

  super(content)

  @log.debug "String is: %p" % self
end
144 | |
145 | |
146 | ###### |
147 | public |
148 | ###### |
149 | |
150 | # Filters for controlling what gets output for untrusted input. (But really, |
151 | # you're filtering bad stuff out of untrusted input at submission-time via |
152 | # untainting, aren't you?) |
153 | attr_accessor :filter_html, :filter_styles |
154 | |
155 | # RedCloth-compatibility accessor. Line-folding is part of Markdown syntax, |
156 | # so this isn't used by anything. |
157 | attr_accessor :fold_lines |
158 | |
159 | |
160 | ### Render Markdown-formatted text in this string object as HTML and return |
161 | ### it. The parameter is for compatibility with RedCloth, and is currently |
162 | ### unused, though that may change in the future. |
def to_html(lite = false)
  # Renders this string's Markdown as HTML and returns the HTML text.
  # +lite+ is accepted for RedCloth compatibility and is not consulted.

  # One StringScanner is reused by the span-level lexers.
  @scanner = StringScanner.new('')

  # Carries link URLs, link titles and hidden HTML blocks through the render.
  rs = RenderState.new({}, {}, {})

  # Work on a duplicate with normalized line endings and expanded tabs, plus
  # a couple of guaranteed newlines at the end. #dup keeps the receiver's
  # class, so #detab! is available even on Rubies where String#gsub called
  # on a subclass instance returns a plain String.
  text = dup
  text.gsub!(/\r\n?/, "\n")
  text.detab!
  text += "\n\n"
  @log.debug "Normalized line-endings: %p" % text

  # When HTML filtering is requested, neutralize angle brackets by turning
  # them into entities.
  if filter_html
    text.gsub!("<", "&lt;")
    text.gsub!(">", "&gt;")
    @log.debug "Filtered HTML: %p" % text
  end

  # Lines consisting only of spaces become truly empty.
  text.gsub!(/^ +$/, '')
  @log.debug "Tabs -> spaces/blank lines stripped: %p" % text

  # Replace HTML blocks with placeholders.
  text = hide_html_blocks(text, rs)
  @log.debug "Hid HTML blocks: %p" % text
  @log.debug "Render state: %p" % rs

  # Pull link definitions out of the text and into the render state.
  text = strip_link_definitions(text, rs)
  @log.debug "Stripped link definitions: %p" % text
  @log.debug "Render state: %p" % rs

  # Escape meta-characters.
  text = escape_special_chars(text)
  @log.debug "Escaped special characters: %p" % text

  # Transform block-level constructs.
  text = apply_block_transforms(text, rs)
  @log.debug "After block-level transforms: %p" % text

  # Swap the escaped characters back in.
  text = unescape_special_chars(text)
  @log.debug "After unescaping special characters: %p" % text

  text
end
213 | |
214 | |
215 | ### Convert tabs in +str+ to spaces. |
def detab(tabwidth = TabWidth)
  # Leave the receiver untouched: expand tabs on a duplicate and return it.
  duplicate = dup
  duplicate.detab!(tabwidth)
  duplicate
end
221 | |
222 | |
223 | ### Convert tabs to spaces in place and return self. |
def detab!(tabwidth = TabWidth)
  # Expands each tab to the next multiple of +tabwidth+ columns, line by
  # line, replaces the receiver's contents with the result and returns self.
  #
  # The -1 limit keeps trailing empty fields, so trailing newlines survive
  # the split/join round trip instead of being silently dropped.
  expanded = split(/\n/, -1).map do |line|
    line.gsub(/(.*?)\t/) do
      lead = Regexp.last_match(1)
      lead + ' ' * (tabwidth - lead.length % tabwidth)
    end
  end

  replace(expanded.join("\n"))
end
232 | |
233 | |
234 | ####### |
235 | #private |
236 | ####### |
237 | |
238 | ### Do block-level transforms on a copy of +str+ using the specified render |
239 | ### state +rs+ and return the results. |
def apply_block_transforms(str, rs)
  # Port: this was called '_runBlockGamut' in the original.
  # Runs each block-level transform in order, feeding every step the output
  # of the previous one together with the render state +rs+, and returns
  # the final text.
  @log.debug "Applying block transforms to:\n %p" % str

  steps = [
    :transform_headers,
    :transform_hrules,
    :transform_lists,
    :transform_code_blocks,
    :transform_block_quotes,
    :transform_auto_links,
    :hide_html_blocks,
    :form_paragraphs,
  ]
  text = steps.inject(str) { |current, step| __send__(step, current, rs) }

  @log.debug "Done with block transforms:\n %p" % text
  text
end
257 | |
258 | |
259 | ### Apply Markdown span transforms to a copy of the specified +str+ with the |
260 | ### given render state +rs+ and return it. |
def apply_span_transforms(str, rs)
  # Runs the span-level transforms over +str+ in a fixed order and returns
  # the transformed text.
  @log.debug "Applying span transforms to:\n %p" % str

  text = transform_code_spans(str, rs)
  text = encode_html(text)
  text = transform_images(text, rs)
  text = transform_anchors(text, rs)
  text = transform_italic_and_bold(text, rs)

  # Hard breaks: two or more spaces before a newline become a <br> tag.
  text = text.gsub(/ {2,}\n/, "<br#{EmptyElementSuffix}\n")

  @log.debug "Done with span transforms:\n %p" % text
  text
end
276 | |
277 | |
278 | # The list of tags which are considered block-level constructs and an |
279 | # alternation pattern suitable for use in regexps made from the list |
280 | StrictBlockTags = %w[ p div h[1-6] blockquote pre table dl ol ul script noscript |
281 | form fieldset iframe math ins del ] |
282 | StrictTagPattern = StrictBlockTags.join('|') |
283 | |
284 | LooseBlockTags = StrictBlockTags - %w[ins del] |
285 | LooseTagPattern = LooseBlockTags.join('|') |
286 | |
287 | # Nested blocks: |
288 | # <div> |
289 | # <div> |
290 | # tags for inner block must be indented. |
291 | # </div> |
292 | # </div> |
293 | StrictBlockRegex = %r{ |
294 | ^ # Start of line |
295 | <(#{StrictTagPattern}) # Start tag: \2 |
296 | \b # word break |
297 | (.*\n)*? # Any number of lines, minimal match |
298 | </\1> # Matching end tag |
299 | [ ]* # trailing spaces |
300 | $ # End of line or document |
301 | }ix |
302 | |
303 | # More-liberal block-matching |
304 | LooseBlockRegex = %r{ |
305 | ^ # Start of line |
306 | <(#{LooseTagPattern}) # start tag: \2 |
307 | \b # word break |
308 | (.*\n)*? # Any number of lines, minimal match |
309 | .*</\1> # Anything + Matching end tag |
310 | [ ]* # trailing spaces |
311 | $ # End of line or document |
312 | }ix |
313 | |
314 | # Special case for <hr />. |
315 | HruleBlockRegex = %r{ |
316 | ( # $1 |
317 | \A\n? # Start of doc + optional \n |
318 | | # or |
319 | .*\n\n # anything + blank line |
320 | ) |
321 | ( # save in $2 |
322 | [ ]* # Any spaces |
323 | <hr # Tag open |
324 | \b # Word break |
325 | ([^<>])*? # Attributes |
326 | /?> # Tag close |
327 | $ # followed by a blank line or end of document |
328 | ) |
329 | }ix |
330 | |
331 | ### Replace all blocks of HTML in +str+ that start in the left margin with |
332 | ### tokens. |
def hide_html_blocks(str, rs)
  # Returns a copy of +str+ in which every matched HTML block is replaced
  # by its MD5 digest on a line of its own; the block itself is stored in
  # rs.html_blocks under that digest.
  @log.debug "Hiding HTML blocks in %p" % str

  # Stores +html+ in the render state and returns its placeholder text.
  stash = lambda do |html|
    key = Digest::MD5.hexdigest(html)
    rs.html_blocks[key] = html
    @log.debug "Replacing %p with %p" % [html, key]
    "\n\n#{key}\n\n"
  end

  hidden = str.dup

  @log.debug "Finding blocks with the strict regex..."
  hidden.gsub!(StrictBlockRegex, &stash)

  @log.debug "Finding blocks with the loose regex..."
  hidden.gsub!(LooseBlockRegex, &stash)

  @log.debug "Finding hrules..."
  hidden.gsub!(HruleBlockRegex) do
    # Group 1 is the text preceding the rule and is kept as-is; only the
    # <hr> tag itself (group 2) is hidden.
    lead = Regexp.last_match(1)
    rule = Regexp.last_match(2)
    lead + stash.call(rule)
  end

  hidden
end
357 | |
358 | |
359 | # Link defs are in the form: ^[id]: url "optional title" |
360 | LinkRegex = %r{ |
361 | ^[ ]*\[(.+)\]: # id = $1 |
362 | [ ]* |
363 | \n? # maybe *one* newline |
364 | [ ]* |
365 | <?(\S+?)>? # url = $2 |
366 | [ ]* |
367 | \n? # maybe one newline |
368 | [ ]* |
369 | (?: |
370 | # Titles are delimited by "quotes" or (parens). |
371 | ["(] |
372 | (.+?) # title = $3 |
373 | [")] # Matching ) or " |
374 | [ ]* |
375 | )? # title is optional |
376 | (?:\n+|\Z) |
377 | }x |
378 | |
379 | ### Strip link definitions from +str+, storing them in the given RenderState |
380 | ### +rs+. |
def strip_link_definitions(str, rs)
  # Returns a copy of +str+ with every link definition removed. Each
  # definition's URL and optional title are stored in rs.urls / rs.titles
  # under the lower-cased link id.
  str.gsub(LinkRegex) do
    # Grab the captures before calling anything else that may run a regexp.
    found = Regexp.last_match
    id, url, title = found[1], found[2], found[3]
    key = id.downcase

    rs.urls[key] = encode_html(url)
    # Double quotes are turned into entities so the title can be emitted
    # inside a double-quoted attribute.
    rs.titles[key] = title.gsub(/"/, "&quot;") if title
    ""
  end
end
392 | |
393 | |
394 | ### Escape special characters in the given +str+ |
def escape_special_chars(str)
  # Walks the tokens produced by tokenize_html and returns the re-assembled
  # text: inside tags '*' and '_' are swapped for their MD5 placeholders,
  # and in plain text backslash escapes are encoded.
  @log.debug " Escaping special characters"
  text = ''.dup

  # The original Markdown source has something called '$tags_to_skip'
  # declared here, but it's never used, so it isn't defined.

  # The block parameter is deliberately not called +str+, so it cannot
  # shadow or clobber the method's argument.
  tokenize_html(str) do |token, chunk|
    @log.debug " Adding %p token %p" % [token, chunk]

    case token
    when :tag
      text << chunk.
        gsub(/\*/, EscapeTable['*'][:md5]).
        gsub(/_/, EscapeTable['_'][:md5])
    when :text
      text << encode_backslash_escapes(chunk)
    else
      raise TypeError, "Unknown token type %p" % token
    end
  end

  @log.debug " Text with escapes is now: %p" % text
  text
end
423 | |
424 | |
425 | ### Swap escaped special characters in a copy of the given +str+ and return |
426 | ### it. |
def unescape_special_chars(str)
  # Returns a copy of +str+ in which every MD5 placeholder from EscapeTable
  # is replaced by the character it stands for. The argument itself is not
  # modified, so frozen strings are accepted.
  restored = str.dup

  EscapeTable.each do |char, esc|
    @log.debug "Unescaping escaped %p with %p" % [char, esc[:md5re]]
    # Block form: the character is inserted literally, with no backslash
    # processing of the replacement.
    restored.gsub!(esc[:md5re]) { char }
  end

  restored
end
435 | |
436 | |
437 | ### Return a copy of the given +str+ with any backslashed special character |
438 | ### in it replaced with MD5 placeholders. |
def encode_backslash_escapes(str)
  # Returns a copy of +str+ with each backslash-escaped special character
  # replaced by that character's MD5 placeholder.

  # Escaped backslashes go first, so the second backslash of a pair is not
  # mistaken for the start of another escape.
  text = str.gsub(/\\\\/, EscapeTable['\\'][:md5])

  EscapeTable.each_pair do |char, esc|
    next if char == '\\'
    text.gsub!(esc[:re], esc[:md5])
  end

  text
end
450 | |
451 | |
452 | ### Transform any Markdown-style horizontal rules in a copy of the specified |
453 | ### +str+ and return it. |
def transform_hrules(str, rs)
  # Returns a copy of +str+ in which every line made of three or more
  # '-', '*' or '_' characters (each optionally padded by a single space)
  # is replaced by an <hr> tag. +rs+ is not used.
  @log.debug " Transforming horizontal rules"

  rule = "\n<hr#{EmptyElementSuffix}\n"
  str.gsub(/^( ?[\-\*_] ?){3,}$/) { rule }
end
458 | |
459 | |
460 | |
461 | # Patterns to match and transform lists |
462 | ListMarkerOl = %r{\d+\.} |
463 | ListMarkerUl = %r{[*+-]} |
464 | ListMarkerAny = Regexp::union( ListMarkerOl, ListMarkerUl ) |
465 | |
466 | ListRegexp = %r{ |
467 | (?: |
468 | ^[ ]{0,#{TabWidth - 1}} # Indent < tab width |
469 | (#{ListMarkerAny}) # unordered or ordered ($1) |
470 | [ ]+ # At least one space |
471 | ) |
472 | (?m:.+?) # item content (include newlines) |
473 | (?: |
474 | \z # Either EOF |
475 | | # or |
476 | \n{2,} # Blank line... |
477 | (?=\S) # ...followed by non-space |
478 | (?![ ]* # ...but not another item |
479 | (#{ListMarkerAny}) |
480 | [ ]+) |
481 | ) |
482 | }x |
483 | |
484 | ### Transform Markdown-style lists in a copy of the specified +str+ and |
485 | ### return it. |
def transform_lists(str, rs)
  # Returns a copy of +str+ in which each run matched by ListRegexp is
  # wrapped in <ul> or <ol>, with the items rendered by
  # transform_list_items.
  @log.debug " Transforming lists at %p" % (str[0,100] + '...')

  str.gsub(ListRegexp) do |list|
    # The first marker decides the list type; capture it before any other
    # regexp runs.
    marker = Regexp.last_match(1)
    @log.debug " Found list %p" % list

    tag = ListMarkerUl.match(marker) ? "ul" : "ol"
    # Widen every run of blank lines to exactly two blank lines.
    spaced = list.gsub(/\n{2,}/, "\n\n\n")
    items = transform_list_items(spaced, rs)

    "<#{tag}>\n#{items}</#{tag}>\n"
  end
end
502 | |
503 | |
504 | # Pattern for transforming list items |
505 | ListItemRegexp = %r{ |
506 | (\n)? # leading line = $1 |
507 | (^[ ]*) # leading whitespace = $2 |
508 | (#{ListMarkerAny}) [ ]+ # list marker = $3 |
509 | ((?m:.+?) # list item text = $4 |
510 | (\n{1,2})) |
511 | (?= \n* (\z | \2 (#{ListMarkerAny}) [ ]+)) |
512 | }x |
513 | |
514 | ### Transform list items in a copy of the given +str+ and return it. |
def transform_list_items(str, rs)
  # Returns a copy of +str+ with every list item matched by ListItemRegexp
  # rendered as an <li> element.
  @log.debug " Transforming list items"

  # Trim trailing blank lines
  trimmed = str.sub(/\n{2,}\z/, "\n")

  trimmed.gsub(ListItemRegexp) do |line|
    found = Regexp.last_match
    leading_line, item = found[1], found[4]
    @log.debug " Found item line %p" % line

    if leading_line || item =~ /\n{2,}/
      # Preceded by a blank line, or containing one: the item gets the full
      # block-level treatment.
      @log.debug " Found leading line or item has a blank"
      item = apply_block_transforms(outdent(item), rs)
    else
      # Otherwise only nested lists and span-level markup are processed.
      @log.debug " Recursing for sublist"
      item = transform_lists(outdent(item), rs).chomp
      item = apply_span_transforms(item, rs)
    end

    "<li>#{item}</li>\n"
  end
end
538 | |
539 | |
540 | # Pattern for matching codeblocks |
541 | CodeBlockRegexp = %r{ |
542 | (?:\n\n|\A) |
543 | ( # $1 = the code block |
544 | (?: |
545 | (?:[ ]{#{TabWidth}} | \t) # a tab or tab-width of spaces |
546 | .*\n+ |
547 | )+ |
548 | ) |
549 | (^[ ]{0,#{TabWidth - 1}}\S|\Z) # Lookahead for non-space at |
550 | # line-start, or end of doc |
551 | }x |
552 | |
553 | ### Transform Markdown-style codeblocks in a copy of the specified +str+ and |
554 | ### return it. |
def transform_code_blocks(str, rs)
  # Returns a copy of +str+ in which each block matched by CodeBlockRegexp
  # is outdented, encoded and wrapped in <pre><code> tags.
  @log.debug " Transforming code blocks"

  str.gsub(CodeBlockRegexp) do
    found = Regexp.last_match
    codeblock = found[1]
    # Group 2 is the text that ended the block; it is put back after it.
    remainder = found[2]

    body = encode_code(outdent(codeblock), rs).rstrip
    "\n\n<pre><code>#{body}\n</code></pre>\n\n#{remainder}"
  end
end
567 | |
568 | |
569 | # Pattern for matching Markdown blockquote blocks |
570 | BlockQuoteRegexp = %r{ |
571 | (?: |
572 | ^[ ]*>[ ]? # '>' at the start of a line |
573 | .+\n # rest of the first line |
574 | (?:.+\n)* # subsequent consecutive lines |
575 | \n* # blanks |
576 | )+ |
577 | }x |
578 | PreChunk = %r{ ( ^ \s* <pre> .+? </pre> ) }xm |
579 | |
580 | ### Transform Markdown-style blockquotes in a copy of the specified +str+ |
581 | ### and return it. |
def transform_block_quotes(str, rs)
  # Returns a copy of +str+ in which each run matched by BlockQuoteRegexp is
  # stripped of one level of '>' quoting, run through the block transforms
  # and wrapped in <blockquote> tags.
  @log.debug " Transforming block quotes"

  indent = " " * TabWidth
  leading_indent = /^#{indent}/

  str.gsub(BlockQuoteRegexp) do |quote|
    @log.debug "Making blockquote from %p" % quote

    inner = quote.
      gsub(/^ *> ?/, '').   # Trim one level of quoting
      gsub(/^ +$/, '')      # Trim whitespace-only lines

    # Indent the rendered content, then undo the indent inside <pre> chunks.
    body = apply_block_transforms(inner, rs).
      gsub(/^/, indent).
      gsub(PreChunk) { |pre| pre.gsub(leading_indent, '') }

    quoted = "<blockquote>\n#{body}\n</blockquote>\n\n"
    @log.debug "Blockquoted chunk is: %p" % quoted
    quoted
  end
end
600 | |
601 | |
602 | AutoAnchorURLRegexp = /<((https?|ftp):[^'">\s]+)>/ |
603 | AutoAnchorEmailRegexp = %r{ |
604 | < |
605 | ( |
606 | [-.\w]+ |
607 | \@ |
608 | [-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+ |
609 | ) |
610 | > |
611 | }xi |
612 | |
613 | ### Transform URLs in a copy of the specified +str+ into links and return |
614 | ### it. |
def transform_auto_links(str, rs)
  # Returns a copy of +str+ in which <url> and <address@host> shorthands
  # are turned into anchors. +rs+ is not used.
  @log.debug " Transforming auto-links"

  with_urls = str.gsub(AutoAnchorURLRegexp, %{<a href="\\1">\\1</a>})

  with_urls.gsub(AutoAnchorEmailRegexp) do
    # Placeholders inside the address are swapped back to real characters
    # before the address is encoded.
    encode_email_address(unescape_special_chars(Regexp.last_match(1)))
  end
end
622 | |
623 | |
624 | # Encoder functions to turn characters of an email address into encoded |
625 | # entities. |
# Encoder functions: each takes a byte value (Integer) and returns it as a
# decimal entity, a hexadecimal entity, or the literal character.
Encoders = [
  lambda {|char| "&#%03d;" % char},
  lambda {|char| "&#x%X;" % char},
  lambda {|char| char.chr },
]

### Transform a copy of the given email +addr+ into an escaped version safer
### for posting publicly. Returns an anchor whose href is the encoded
### "mailto:" URL and whose text is the encoded address.
def encode_email_address( addr )
  colon   = 0x3A   # byte value of ':'
  at_sign = 0x40   # byte value of '@'

  rval = ''.dup
  ("mailto:" + addr).each_byte do |b|
    # each_byte yields Integers, so compare against byte values; comparing
    # against the character literals ?: and ?@ never matches on Rubies
    # where those literals are Strings.
    rval <<
      case b
      when colon
        # Left literal: the link text below is found by cutting at the
        # first ':'.
        ":"
      when at_sign
        # Always entity-encoded, randomly as decimal or hexadecimal.
        Encoders[ rand(2) ][ b ]
      else
        # Roughly 10% literal, 45% hexadecimal, 45% decimal.
        r = rand(100)
        if r > 90 then Encoders[2][ b ]
        elsif r < 45 then Encoders[1][ b ]
        else Encoders[0][ b ]
        end
      end
  end

  %{<a href="%s">%s</a>} % [ rval, rval.sub(/.+?:/, '') ]
end
655 | |
656 | |
657 | # Regex for matching Setext-style headers |
658 | SetextHeaderRegexp = %r{ |
659 | (.+) # The title text ($1) |
660 | \n |
661 | ([\-=])+ # Match a line of = or -. Save only one in $2. |
662 | [ ]*\n+ |
663 | }x |
664 | |
665 | # Regexp for matching ATX-style headers |
666 | AtxHeaderRegexp = %r{ |
667 | ^(\#{1,6}) # $1 = string of #'s |
668 | [ ]* |
669 | (.+?) # $2 = Header text |
670 | [ ]* |
671 | \#* # optional closing #'s (not counted) |
672 | \n+ |
673 | }x |
674 | |
675 | ### Apply Markdown header transforms to a copy of the given +str+ and render |
676 | ### state +rs+ and return the result. |
def transform_headers(str, rs)
  # Returns a copy of +str+ with Setext-style and ATX-style headers turned
  # into <hN> elements. Header text gets the span transforms applied.
  @log.debug " Transforming headers"

  # Setext-style headers:
  #   Header 1
  #   ========
  #
  #   Header 2
  #   --------
  setext_done = str.gsub(SetextHeaderRegexp) do
    found = Regexp.last_match
    @log.debug "Found setext-style header"
    title = apply_span_transforms(found[1], rs)

    # Group 2 holds a single underline character.
    case found[2]
    when '=' then "<h1>#{title}</h1>\n\n"
    when '-' then "<h2>#{title}</h2>\n\n"
    else title
    end
  end

  # ATX-style headers: the number of leading '#' characters is the level.
  setext_done.gsub(AtxHeaderRegexp) do
    found = Regexp.last_match
    @log.debug "Found ATX-style header"
    title = apply_span_transforms(found[2], rs)
    level = found[1].length

    "<h#{level}>#{title}</h#{level}>\n\n"
  end
end
712 | |
713 | |
714 | ### Wrap all remaining paragraph-looking text in a copy of +str+ inside <p> |
715 | ### tags and return it. |
def form_paragraphs(str, rs)
  # Splits +str+ on blank lines and returns the chunks re-joined by blank
  # lines: a chunk that is a key of rs.html_blocks is swapped for the
  # stored HTML, anything else is span-transformed and wrapped in <p> tags.
  @log.debug " Forming paragraphs"

  grafs = str.
    sub(/\A\n+/, '').
    sub(/\n+\z/, '').
    split(/\n{2,}/)

  rendered = grafs.map do |graf|
    if rs.html_blocks.key?(graf)
      # Unhashify HTML blocks if this is a placeholder
      rs.html_blocks[graf]
    else
      # The opening tag replaces any leading spaces.
      apply_span_transforms(graf, rs).sub(/^[ ]*/, '<p>') + '</p>'
    end
  end

  rval = rendered.join("\n\n")
  @log.debug " Formed paragraphs: %p" % rval
  rval
end
739 | |
740 | |
741 | # Pattern to match the linkid part of an anchor tag for reference-style |
742 | # links. |
743 | RefLinkIdRegex = %r{ |
744 | [ ]? # Optional leading space |
745 | (?:\n[ ]*)? # Optional newline + spaces |
746 | \[ |
747 | (.*?) # Id = $1 |
748 | \] |
749 | }x |
750 | |
751 | InlineLinkRegex = %r{ |
752 | \( # Literal paren |
753 | [ ]* # Zero or more spaces |
754 | <?(.+?)>? # URI = $1 |
755 | [ ]* # Zero or more spaces |
756 | (?: # |
757 | ([\"\']) # Opening quote char = $2 |
758 | (.*?) # Title = $3 |
759 | \2 # Matching quote char |
760 | )? # Title is optional |
761 | \) |
762 | }x |
763 | |
764 | ### Apply Markdown anchor transforms to a copy of the specified +str+ with |
765 | ### the given render state +rs+ and return it. |
def transform_anchors(str, rs)
  # Scans +str+ for [text][id] and [text](url "title") constructs and
  # returns a copy in which those that resolve are replaced by <a> tags.
  # Bracketed text that is not a link is copied through unchanged.
  @log.debug " Transforming anchors"
  @scanner.string = str.dup
  text = ''.dup

  # Scan the whole string
  until @scanner.eos?

    # Plain text: copy everything up to the next opening bracket.
    unless @scanner.scan(/\[/)
      @log.debug " Scanning to the next link from %p" % @scanner.rest
      text << @scanner.scan(/[^\[]+/)
      next
    end

    link = ''.dup
    depth = 1
    startpos = @scanner.pos
    @log.debug " Found a bracket-open at %d" % startpos

    # Scan the rest of the tag, allowing unlimited nested []s. If the
    # scanner runs out of text before the opening bracket is closed, append
    # the text and return (it wasn't a valid anchor).
    while depth.nonzero?
      linktext = @scanner.scan_until(/\]|\[/)

      unless linktext
        @log.debug " Missing closing brace, assuming non-link."
        link << @scanner.rest
        @scanner.terminate
        return text + '[' + link
      end

      @log.debug " Found a bracket at depth %d: %p" % [depth, linktext]
      link << linktext

      # A closing bracket lowers the depth, an opening one raises it.
      depth += (linktext[-1, 1] == ']' ? -1 : 1)
      @log.debug " Depth is now #{depth}"
    end
    link.slice!(-1) # Trim final ']'
    @log.debug " Found leading link %p" % link

    # The source from the opening '[' up to the current scan position.
    # StringScanner#pos is a byte offset, so the text is cut by bytes.
    raw_source = lambda do
      @scanner.string.byteslice(startpos - 1, @scanner.pos - startpos + 1)
    end

    # Look for a reference-style second part
    if @scanner.scan(RefLinkIdRegex)
      id = @scanner[1]
      # An empty id ([text][]) means the link text doubles as the id.
      linkid = (id.empty? ? link : id).downcase
      @log.debug " Found a linkid: %p" % linkid

      if rs.urls.key?(linkid)
        # Known id: build an anchor from the link table.
        @log.debug " Found link key in the link table: %p" % rs.urls[linkid]
        url = escape_md(rs.urls[linkid])

        text << %{<a href="#{url}"}
        if rs.titles.key?(linkid)
          text << (%{ title="%s"} % escape_md(rs.titles[linkid]))
        end
        text << %{>#{link}</a>}
      else
        # Unknown id: append the raw source instead.
        raw = raw_source.call
        @log.debug " Linkid %p not found in link table" % linkid
        @log.debug " Appending original string instead: "
        @log.debug "%p" % raw
        text << raw
      end

    # ...or for an inline style second part
    elsif @scanner.scan(InlineLinkRegex)
      url = @scanner[1]
      title = @scanner[3]
      @log.debug " Found an inline link to %p" % url

      text << (%{<a href="%s"} % escape_md(url))
      if title
        # Double quotes become entities so the title fits inside a
        # double-quoted attribute.
        text << (%{ title="%s"} % escape_md(title.gsub(/"/, "&quot;")))
      end
      text << %{>#{link}</a>}

    # No linkid part: just append the first part as-is.
    else
      @log.debug "No linkid, so no anchor. Appending literal text."
      text << raw_source.call
    end
  end

  text
end
863 | |
864 | |
865 | # Pattern to match strong emphasis in Markdown text |
# Pattern to match strong emphasis in Markdown text: a '**' or '__' pair
# around text that neither starts nor ends with whitespace.
BoldRegexp = %r{ (\*\*|__) (\S|\S.+?\S) \1 }x

# Pattern to match normal emphasis in Markdown text: a '*' or '_' pair
# around text that neither starts nor ends with whitespace.
ItalicRegexp = %r{ (\*|_) (\S|\S.+?\S) \1 }x

### Transform italic- and bold-encoded text in a copy of the specified +str+
### and return it. +rs+ is not used.
def transform_italic_and_bold( str, rs )
  @log.debug " Transforming italic and bold"

  # Strong emphasis first: its delimiters would otherwise be consumed as
  # two adjacent normal-emphasis markers.
  bolded = str.gsub( BoldRegexp, %{<strong>\\2</strong>} )
  bolded.gsub( ItalicRegexp, %{<em>\\2</em>} )
end
880 | |
881 | |
882 | ### Transform backticked spans into <code> spans. |
def transform_code_spans(str, rs)
  # Returns +str+ with backtick-delimited spans turned into <code> elements.
  # When +str+ contains no backtick it is returned as-is. Raises FormatError
  # when an opening run of backticks has no matching closing run.
  @log.debug " Transforming code spans"

  # Set up the string scanner and just return the string unless there's at
  # least one backtick.
  @scanner.string = str.dup
  unless @scanner.exist?(/`/)
    @scanner.terminate
    @log.debug "No backticks found for code span in %p" % str
    return str
  end

  @log.debug "Transforming code spans in %p" % str

  # Build the transformed text anew
  text = ''.dup

  until @scanner.eos?
    # Scan up to an opening backtick. The optional character is matched
    # lazily so that a backtick sitting at the scan position is never
    # consumed as ordinary text.
    pre = @scanner.scan_until(/.??(?=`)/m)

    # No more backticks: append the rest of the string and move the scan
    # pointer to the end.
    unless pre
      text << @scanner.rest
      @scanner.terminate
      next
    end

    text << pre
    @log.debug "Found backtick at %d after '...%s'" % [@scanner.pos, text[-10, 10]]

    # The closing delimiter is the same run of backticks as the opener.
    opener = @scanner.scan(/`+/)
    len = opener.length
    closer = Regexp.new(opener)
    @log.debug "Scanning for end of code span with %p" % closer

    # Scan until the end of the closing backtick sequence.
    codespan = @scanner.scan_until(closer)
    unless codespan
      raise FormatError::new(@scanner.rest[0,20],
        "No %p found before end" % opener)
    end

    @log.debug "Found close of code span at %d: %p" % [@scanner.pos - len, codespan]

    # Chop the closing backticks off, strip surrounding whitespace and
    # encode what is left.
    codespan.slice!(-len, len)
    text << ("<code>%s</code>" % encode_code(codespan.strip, rs))
  end

  text
end
936 | |
937 | |
938 | # Next, handle inline images: ![alt text](url "optional title") |
939 | # Don't forget: encode * and _ |
# Inline images: ![alt text](url "optional title")
#
# The 'm' option lets '.' match newlines. (In Ruby an 's' option does not
# mean "dot matches newline"; it selects a fixed regexp encoding.)
InlineImageRegexp = %r{
  (                     # Whole match = $1
    !\[ (.*?) \]        # alt text = $2
    \([ ]*
    <?(\S+?)>?          # source url = $3
    [ ]*
    (?:                 #
      (["'])            # quote char = $4
      (.*?)             # title = $5
      \4                # matching quote
      [ ]*
    )?                  # title is optional
    \)
  )
}xm


# Reference-style images: ![alt text][id]
ReferenceImageRegexp = %r{
  (                     # Whole match = $1
    !\[ (.*?) \]        # Alt text = $2
    [ ]?                # Optional space
    (?:\n[ ]*)?         # One optional newline + spaces
    \[ (.*?) \]         # id = $3
  )
}xm
966 | |
967 | ### Turn image markup into image tags. |
def transform_images(str, rs)
  # Returns a copy of +str+ with reference-style (![alt][id]) and inline
  # (![alt](url "title")) image markup replaced by <img> tags. A
  # reference-style image whose id is not in rs.urls is left untouched.
  @log.debug " Transforming images"

  # Handle reference-style labeled images: ![alt text][id]
  references_done = str.gsub(ReferenceImageRegexp) do |match|
    found = Regexp.last_match
    whole, alt, linkid = found[1], found[2], found[3].downcase
    @log.debug "Matched %p" % match

    # Double quotes become entities so the text fits inside a
    # double-quoted attribute.
    alt = alt.gsub(/"/, '&quot;')

    # for shortcut links like ![this][].
    linkid = alt.downcase if linkid.empty?

    if rs.urls.key?(linkid)
      url = escape_md(rs.urls[linkid])
      @log.debug "Found url '%s' for linkid '%s' " % [url, linkid]

      # Build the tag
      result = %{<img src="%s" alt="%s"} % [url, alt]
      if rs.titles.key?(linkid)
        result += %{ title="%s"} % escape_md(rs.titles[linkid])
      end
      result += EmptyElementSuffix
    else
      result = whole
    end

    @log.debug "Replacing %p with %p" % [match, result]
    result
  end

  # Inline image style
  references_done.gsub(InlineImageRegexp) do |match|
    found = Regexp.last_match
    @log.debug "Found inline image %p" % match
    alt, title = found[2], found[5]
    url = escape_md(found[3])
    alt = alt.gsub(/"/, '&quot;')

    # Build the tag
    result = %{<img src="%s" alt="%s"} % [url, alt]
    if title
      result += %{ title="%s"} % escape_md(title.gsub(/"/, '&quot;'))
    end
    result += EmptyElementSuffix

    @log.debug "Replacing %p with %p" % [match, result]
    result
  end
end
1020 | |
1021 | |
# Regexp to match special characters in a code block
CodeEscapeRegexp = %r{( \* | _ | \{ | \} | \[ | \] | \\ )}x

### Escape any characters special to HTML and encode any characters special
### to Markdown in a copy of the given +str+ and return it.
###
### '&', '<' and '>' become their HTML entities (ampersands first, so the
### entities produced for the angle brackets are not escaped a second time),
### then every character matched by CodeEscapeRegexp is replaced by its
### EscapeTable :md5 value. The +rs+ argument is accepted but not used here.
def encode_code( str, rs )
  str.gsub( %r{&}, '&amp;' ).
    gsub( %r{<}, '&lt;' ).
    gsub( %r{>}, '&gt;' ).
    gsub( CodeEscapeRegexp ) {|match| EscapeTable[match][:md5]}
end
1033 | |
1034 | |
1035 | |
1036 | ################################################################# |
1037 | ### U T I L I T Y F U N C T I O N S |
1038 | ################################################################# |
1039 | |
### Return a copy of +str+ in which every '*' and every '_' has been
### replaced by the corresponding EscapeTable :md5 value. Asterisks are
### substituted first, underscores second; +str+ itself is not modified.
def escape_md( str )
  star_token       = EscapeTable['*'][:md5]
  underscore_token = EscapeTable['_'][:md5]

  without_stars = str.gsub( /\*/, star_token )
  without_stars.gsub( /_/, underscore_token )
end
1047 | |
1048 | |
# Matching constructs for tokenizing X/HTML
HTMLCommentRegexp  = %r{ <! ( -- .*? -- \s* )+ > }mx
XMLProcInstRegexp  = %r{ <\? .*? \?> }mx
MetaTag = Regexp::union( HTMLCommentRegexp, XMLProcInstRegexp )

HTMLTagOpenRegexp  = %r{ < [a-z/!$] [^<>]* }imx
HTMLTagCloseRegexp = %r{ > }x
HTMLTagPart = Regexp::union( HTMLTagOpenRegexp, HTMLTagCloseRegexp )

### Break the HTML source in +str+ into a series of tokens and return
### them. The tokens are just 2-element Array tuples with a type (:tag or
### :text) and the actual content. If this function is called with a block,
### the type and text parts of each token will be yielded to it one at a
### time as they are extracted.
###
### Comments and processing instructions are matched as a single :tag token;
### ordinary tags are matched allowing nested angle brackets. Raises a
### RuntimeError ("Malformed tag ...") if a tag that has been opened cannot
### be completed. The scanning state lives in @scanner, whose string is
### replaced with a copy of +str+.
def tokenize_html( str )
  tokens = []
  @scanner.string = str.dup

  until @scanner.eos?
    @log.debug "Scanning from %p" % @scanner.rest

    # Match comments and PIs without nesting
    if (( token = @scanner.scan(MetaTag) ))
      type = :tag

    # Do nested matching for HTML tags
    elsif (( token = @scanner.scan(HTMLTagOpenRegexp) ))
      # Scanner position just past the opening fragment; used for the
      # error message only.
      tagstart = @scanner.pos
      @log.debug " Found the start of a plain tag at %d" % tagstart

      # Start the token with the opening angle
      depth = 1
      type = :tag

      # Scan the rest of the tag, allowing unlimited nested <>s. If
      # no further opener or closer can be found before the tag is
      # closed, raise an error.
      while depth.nonzero?

        # Scan either an opener or a closer
        chunk = @scanner.scan( HTMLTagPart )
        unless chunk
          raise "Malformed tag at character %d: %p" %
            [ tagstart, token + @scanner.rest ]
        end

        @log.debug " Found another part of the tag at depth %d: %p" % [ depth, chunk ]

        token += chunk

        # If the last character of the token so far is a closing
        # angle bracket, decrement the depth. Otherwise increment
        # it for a nested tag.
        depth += ( token[-1, 1] == '>' ? -1 : 1 )
        @log.debug " Depth is now #{depth}"
      end

    # Match text segments
    else
      @log.debug " Looking for a chunk of text"
      type = :text

      # Scan forward, always matching at least one character to move
      # the pointer beyond any non-tag '<'. When nothing but '<'
      # characters remain, scan_until finds no match; consume them
      # directly so the loop always advances instead of spinning forever.
      token = @scanner.scan_until( /[^<]+/m ) || @scanner.scan( /<+/ )
    end

    @log.debug " type: %p, token: %p" % [ type, token ]

    # If a block is given, feed it one token at a time. Add the token to
    # the token list to be returned regardless.
    yield( type, token ) if block_given?
    tokens << [ type, token ]
  end

  return tokens
end
1128 | |
1129 | |
### Return a copy of +str+ with stray ampersands and opening angle brackets
### HTML-encoded.
###
### An '&' is turned into &amp; unless it already starts something that looks
### like an entity (&name; / &#123; / &#x1f;). A '<' is turned into &lt;
### unless it is followed by a letter, '/', '?', '$' or '!', i.e. unless it
### looks like the start of a tag, comment or processing instruction.
def encode_html( str )
  str.gsub( /&(?!#?[x]?(?:[0-9a-f]+|\w+);)/i, "&amp;" ).
    gsub( %r{<(?![a-z/?\$!])}i, "&lt;" )
end
1135 | |
1136 | |
1137 | ### Return one level of line-leading tabs or spaces from a copy of +str+ and |
1138 | ### return it. |
1139 | def outdent( str ) |
1140 | str.gsub( /^(\t|[ ]{1,#{TabWidth}})/, '') |
1141 | end |
1142 | |
1143 | end # class BlueCloth |
1144 |