Changesets can be listed by changeset number.
The Git repository is here.
- Revision:
- 344
- Log:
Massive changeset which brings the old, ROOL customised Instiki
version up to date, but without any ROOL customisations in this
latest checked-in version (which is 0.19.1). This is deliberate,
so that it's easy to see the changes made for the ROOL version
in a subsequent changeset. The 'app/views/shared' directory is not
part of Instiki but is kept to maintain the change history with
updated ROOL customisations, some of which involve the same files
in that same directory.
- Author:
- rool
- Date:
- Sat Mar 19 19:52:13 +0000 2011
- Size:
- 12116 Bytes
1 | module Sanitizer |
2 | |
3 | # This module provides sanitization of XHTML+MathML+SVG |
4 | # and of inline style attributes. |
5 | # |
6 | # Based heavily on Sam Ruby's code in the Universal FeedParser. |
7 | |
8 | require 'action_controller/vendor/html-scanner/html/tokenizer' |
9 | require 'node' |
10 | require 'instiki_stringsupport' |
11 | require 'set' |
12 | |
# Whitelists used by the sanitizer. Each list is built as a local Set and then
# published as a constant, but only if a constant of that name does not already
# exist, so a host application can pre-define any of them before this module
# is loaded.

# HTML elements that are passed through.
acceptable_elements = Set.new(%w[a abbr acronym address area article aside
  audio b big blockquote br button canvas caption center cite code
  col colgroup command datalist dd del details dfn dialog dir div dl dt
  em fieldset figcaption figure font footer form h1 h2 h3 h4 h5 h6 header
  hgroup hr i img input ins kbd label legend li map mark menu meter nav
  ol optgroup option p pre progress q rp rt ruby s samp section select small
  source span strike strong sub summary sup table tbody td textarea tfoot
  th thead time tr tt u ul var video wbr])

# MathML elements that are passed through.
mathml_elements = Set.new(%w[annotation annotation-xml maction math menclose merror
  mfrac mfenced mi mmultiscripts mn mo mover mpadded mphantom mprescripts mroot
  mrow mspace msqrt mstyle msub msubsup msup mtable mtd mtext mtr munder
  munderover none semantics])

# SVG elements that are passed through (names are case-sensitive).
svg_elements = Set.new(%w[a animate animateColor animateMotion animateTransform
  circle clipPath defs desc ellipse feGaussianBlur filter font-face
  font-face-name font-face-src foreignObject g glyph hkern linearGradient
  line marker mask metadata missing-glyph mpath path pattern polygon
  polyline radialGradient rect set stop svg switch text textPath title tspan use])

# HTML attributes that are kept. "optimum" (meter) and "pattern" (input) are
# two separate attributes; they were previously fused into the single bogus
# token "optimumpattern", so neither was actually allowed.
acceptable_attributes = Set.new(%w[abbr accept accept-charset accesskey action
  align alt autocomplete axis bgcolor border cellpadding cellspacing char charoff
  checked cite class clear cols colspan color compact contenteditable contextmenu
  controls coords datetime dir disabled draggable enctype face for formaction frame
  headers height high href hreflang hspace icon id ismap label list lang longdesc
  loop low max maxlength media method min multiple name nohref noshade nowrap open
  optimum pattern placeholder poster preload pubdate radiogroup readonly rel
  required rev reversed rows rowspan rules spellcheck scope
  selected shape size span src start step style summary tabindex target title
  type usemap valign value vspace width wrap xml:lang])

# MathML attributes that are kept.
mathml_attributes = Set.new(%w[actiontype align close
  columnalign columnlines columnspacing columnspan depth display
  displaystyle encoding equalcolumns equalrows fence fontstyle fontweight
  frame height linethickness lspace mathbackground mathcolor mathvariant
  maxsize minsize notation open other rowalign
  rowlines rowspacing rowspan rspace scriptlevel selection separator
  separators stretchy width xlink:href xlink:show xlink:type xmlns
  xmlns:xlink])

# SVG attributes that are kept.
svg_attributes = Set.new(%w[accent-height accumulate additive alphabetic
  arabic-form ascent attributeName attributeType baseProfile bbox begin
  by calcMode cap-height class clip-path clip-rule color
  color-interpolation-filters color-rendering
  content cx cy d dx dy descent display dur end fill fill-opacity fill-rule
  filterRes filterUnits font-family font-size font-stretch font-style
  font-variant font-weight from fx fy g1 g2 glyph-name gradientUnits
  hanging height horiz-adv-x horiz-origin-x id ideographic k keyPoints
  keySplines keyTimes lang marker-end marker-mid marker-start
  markerHeight markerUnits markerWidth maskContentUnits maskUnits
  mathematical max method min name offset opacity orient origin
  overline-position overline-thickness panose-1 path pathLength
  patternContentUnits patternTransform patternUnits points
  preserveAspectRatio primitiveUnits r refX refY repeatCount repeatDur
  requiredExtensions requiredFeatures restart rotate rx ry se:connector
  se:nonce slope spacing
  startOffset stdDeviation stemh stemv stop-color stop-opacity
  strikethrough-position strikethrough-thickness stroke stroke-dasharray
  stroke-dashoffset stroke-linecap stroke-linejoin stroke-miterlimit
  stroke-opacity stroke-width systemLanguage target text-anchor
  to transform type u1 u2 underline-position underline-thickness
  unicode unicode-range units-per-em values version viewBox
  visibility width widths x x-height x1 x2 xlink:actuate
  xlink:arcrole xlink:href xlink:role xlink:show xlink:title xlink:type
  xml:base xml:lang xml:space xmlns xmlns:xlink xmlns:se y y1 y2 zoomAndPan])

# Attributes whose value is a URI and therefore gets its scheme checked.
attr_val_is_uri = Set.new(%w[href src cite action formaction longdesc xlink:href xml:base])

# SVG attributes whose value may contain url(...) references.
svg_attr_val_allows_ref = Set.new(%w[clip-path color-profile cursor fill
  filter marker marker-start marker-mid marker-end mask stroke])

# SVG elements whose xlink:href must be a local fragment reference.
# "textPath" is listed with its real (case-sensitive) SVG spelling so the rule
# applies to the element admitted by svg_elements; the historical lower-case
# "textpath" entry is kept for backward compatibility.
svg_allow_local_href = Set.new(%w[altGlyph animate animateColor animateMotion
  animateTransform cursor feImage filter linearGradient pattern
  radialGradient textpath textPath tref set use])

# CSS properties allowed in style attributes.
acceptable_css_properties = Set.new(%w[azimuth background-color
  border-bottom-color border-collapse border-color border-left-color
  border-right-color border-top-color clear color cursor direction
  display elevation float font font-family font-size font-style
  font-variant font-weight height letter-spacing line-height overflow
  pause pause-after pause-before pitch pitch-range richness speak
  speak-header speak-numeral speak-punctuation speech-rate stress
  text-align text-decoration text-indent unicode-bidi vertical-align
  voice-family volume white-space width])

# Keywords allowed as values of the shorthand CSS properties.
acceptable_css_keywords = Set.new(%w[auto aqua black block blue bold both bottom
  brown center collapse dashed dotted fuchsia gray green !important
  italic left lime maroon medium none navy normal nowrap olive pointer
  purple red right solid silver teal top transparent underline white
  yellow])

# SVG presentation properties allowed in style attributes.
acceptable_svg_properties = Set.new(%w[fill fill-opacity fill-rule stroke
  stroke-width stroke-linecap stroke-linejoin stroke-opacity])

# URI schemes allowed in URI-valued attributes.
acceptable_protocols = Set.new(%w[ed2k ftp http https irc mailto news gopher nntp
  telnet webcal xmpp callto feed urn aim rsync tag ssh sftp rtsp afs])

# These two are guarded like every other constant below, so reloading the
# file does not trigger "already initialized constant" warnings.
SHORTHAND_CSS_PROPERTIES = Set.new(%w[background border margin padding]) unless defined?(SHORTHAND_CSS_PROPERTIES)
VOID_ELEMENTS = Set.new(%w[img br hr link meta area base basefont
  col frame input isindex param]) unless defined?(VOID_ELEMENTS)

ALLOWED_ELEMENTS = acceptable_elements + mathml_elements + svg_elements unless defined?(ALLOWED_ELEMENTS)
ALLOWED_ATTRIBUTES = acceptable_attributes + mathml_attributes + svg_attributes unless defined?(ALLOWED_ATTRIBUTES)
ALLOWED_CSS_PROPERTIES = acceptable_css_properties unless defined?(ALLOWED_CSS_PROPERTIES)
ALLOWED_CSS_KEYWORDS = acceptable_css_keywords unless defined?(ALLOWED_CSS_KEYWORDS)
ALLOWED_SVG_PROPERTIES = acceptable_svg_properties unless defined?(ALLOWED_SVG_PROPERTIES)
ALLOWED_PROTOCOLS = acceptable_protocols unless defined?(ALLOWED_PROTOCOLS)
ATTR_VAL_IS_URI = attr_val_is_uri unless defined?(ATTR_VAL_IS_URI)
SVG_ATTR_VAL_ALLOWS_REF = svg_attr_val_allows_ref unless defined?(SVG_ATTR_VAL_ALLOWS_REF)
SVG_ALLOW_LOCAL_HREF = svg_allow_local_href unless defined?(SVG_ALLOW_LOCAL_HREF)
123 | |
# Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and stripping out all
# attributes not in ALLOWED_ATTRIBUTES. Style attributes are parsed, and a restricted set,
# specified by ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
# Attributes in ATTR_VAL_IS_URI are scanned, and only uri schemes specified in
# ALLOWED_PROTOCOLS are allowed.
# Certain SVG attributes (SVG_ATTR_VAL_ALLOWS_REF) may take a url as a value. These are restricted to
# fragment-id's (in-document references). Certain SVG elements (SVG_ALLOW_LOCAL_HREF) allow href attributes
# which, again, are restricted to be fragment-id's.
#
# You can adjust what gets sanitized by defining these constants before this module is loaded.
#
# Input that is nil, empty, or contains no "<" is returned unchanged.
#
#   xhtml_sanitize('<script> do_nasty_stuff() </script>')
#    => &lt;script&gt; do_nasty_stuff() &lt;/script&gt;
#   xhtml_sanitize('<a href="javascript: sucker();">Click here for $100</a>')
#    => <a>Click here for $100</a>
def xhtml_sanitize(html)
  return html unless sanitizeable?(html)

  tokenizer = HTML::Tokenizer.new(html.to_utf8)
  results = []

  while (token = tokenizer.next)
    node = XHTML::Node.parse(nil, 0, 0, token, false)
    results << case node.tag?
               when true
                 if ALLOWED_ELEMENTS.include?(node.name)
                   # Whitelisted element: clean its attributes in place, keep the tag.
                   process_attributes_for node
                   node.to_s
                 else
                   # Disallowed element: neutralise the tag by escaping its angle
                   # brackets so it is shown as text instead of being interpreted.
                   node.to_s.gsub(/</, "&lt;").gsub(/>/, "&gt;")
                 end
               else
                 # Non-tag token: normalise entity escaping.
                 node.to_s.unescapeHTML.escapeHTML
               end
  end

  results.join
end
161 | |
# Tells whether +text+ needs sanitizing at all: it must be non-nil, non-empty
# and contain at least one "<" character.
def sanitizeable?(text)
  return false if text.nil? || text.empty?

  !text.index("<").nil?
end
165 | |
166 | protected |
167 | |
# Sanitizes the attributes of +node+ in place. Does nothing when the node has
# no attributes.
#
# For each attribute:
# * it is removed unless its value is a String and its name is in ALLOWED_ATTRIBUTES;
# * the value's entity escaping is normalised (unescape, then escape);
# * an xlink:href on an element in SVG_ALLOW_LOCAL_HREF is removed unless it is a
#   fragment reference (starts with "#");
# * an attribute in ATTR_VAL_IS_URI is removed when its value carries a URI scheme
#   that is not in ALLOWED_PROTOCOLS;
# * in attributes listed in SVG_ATTR_VAL_ALLOWS_REF, every url(...) that is not a
#   fragment reference is blanked out;
# * a style attribute is passed through sanitize_css.
def process_attributes_for(node)
  attributes = node.attributes
  return unless attributes

  # Walk a snapshot of the pairs so deleting entries does not disturb the
  # collection being iterated.
  attributes.to_a.each do |attr, val|
    unless val.is_a?(String) && ALLOWED_ATTRIBUTES.include?(attr)
      attributes.delete(attr)
      next
    end
    val = val.unescapeHTML.escapeHTML

    if attr == 'xlink:href' && SVG_ALLOW_LOCAL_HREF.include?(node.name) && val =~ /^\s*[^#\s]/m
      attributes.delete(attr)
      next
    end

    if ATTR_VAL_IS_URI.include?(attr)
      # Strip backticks, control characters, whitespace and the byte pairs
      # \302\200-\302\240 before looking for a scheme, so an obfuscated
      # "java\nscript:" is still recognised. The /n flag keeps this
      # byte-oriented pattern valid when the source encoding is UTF-8
      # (NOTE(review): assumes as_bytes yields a binary string - confirm).
      val_unescaped = val.unescapeHTML.as_bytes.gsub(/`|[\000-\040\177\s]+|\302[\200-\240]/n, '').downcase
      if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ && !ALLOWED_PROTOCOLS.include?(val_unescaped.split(':')[0])
        attributes.delete(attr)
        next
      end
    end

    # "[^)]*?" (rather than "+?") so that a one-character target such as
    # url(x) is blanked too; only url(#fragment) references survive.
    val = val.to_s.gsub(/url\s*\(\s*[^#\s][^)]*?\)/mi, ' ') if SVG_ATTR_VAL_ALLOWS_REF.include?(attr)
    val = sanitize_css(val) if attr == 'style'
    attributes[attr] = val
  end
end
190 | |
# Returns a cleaned copy of the inline CSS in +style+ (anything responding to
# to_s), or '' when the input does not look like plain CSS declarations.
#
# Processing steps:
# 1. url(...) expressions are removed.
# 2. The whole string must pass two "gauntlet" patterns: only a conservative
#    character repertoire, and a "prop: value;" shape. These are anchored with
#    \A and \z so that *every* line of a multi-line value is validated; with
#    ^ and $ a single harmless line was enough to let the rest through.
# 3. Each declaration is kept if its property is in ALLOWED_CSS_PROPERTIES or
#    ALLOWED_SVG_PROPERTIES, or if it is a SHORTHAND_CSS_PROPERTIES property
#    (matched on the part before the first "-") whose every value token is an
#    ALLOWED_CSS_KEYWORDS entry, a colour, or a short length.
#
# The constants are looked up through self.class, so they are resolved from
# the class that includes this module.
def sanitize_css(style)
  # disallow urls
  style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')

  # gauntlet
  return '' unless style =~ /\A([-:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*\z/
  return '' unless style =~ /\A\s*([-\w]+\s*:[^:;]*(;\s*|\z))*\z/

  clean = []
  style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop, val|
    next if val.empty?

    prop = prop.downcase
    if self.class.const_get("ALLOWED_CSS_PROPERTIES").include?(prop)
      clean << "#{prop}: #{val};"
    elsif self.class.const_get("SHORTHAND_CSS_PROPERTIES").include?(prop.split('-')[0])
      # Shorthand values are free-form, so vet every whitespace-separated token.
      tokens_ok = val.split.all? do |keyword|
        self.class.const_get("ALLOWED_CSS_KEYWORDS").include?(keyword) ||
          keyword =~ /\A(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)\z/
      end
      clean << "#{prop}: #{val};" if tokens_ok
    elsif self.class.const_get("ALLOWED_SVG_PROPERTIES").include?(prop)
      clean << "#{prop}: #{val};"
    end
  end

  clean.join(' ')
end
217 | |
# Sanitize a string, parsed using XHTML parsing rules. Reparse the result to
# ensure well-formedness.
#
# :call-seq:
#    safe_xhtml_sanitize(string)  -> string
#
# The input is run through String#purify and xhtml_sanitize, wrapped in an
# XHTML <div>, parsed with REXML and serialised again; the wrapper is then
# removed. If REXML cannot parse the sanitized markup, the sanitized string is
# returned with all markup escaped instead.
#
# +options+ is accepted for interface compatibility and is currently unused.
#
# Unless otherwise specified, the string is assumed to be utf-8 encoded.
#
# The string returned is utf-8 encoded. If you want, you can use iconv to convert it to some other encoding.
# (REXML trees are always utf-8 encoded.)
def safe_xhtml_sanitize(html, options = {})
  sanitized = xhtml_sanitize(html.purify)
  doc = REXML::Document.new("<div xmlns='http://www.w3.org/1999/xhtml'>#{sanitized}</div>")
  # REXML serialises an element without children as a self-closed <div .../>,
  # so accept that form as well; otherwise empty input would come back as the
  # wrapper element itself. For the self-closed form \1 is empty.
  doc.to_s.gsub(/\A<div xmlns='http:\/\/www\.w3\.org\/1999\/xhtml'(?:\/>|>(.*)<\/div>)\Z/m, '\1')
rescue REXML::ParseException
  sanitized.escapeHTML
end
235 | |
236 | end |