# # $Id: sanitize.rb 3 2005-04-05 12:51:14Z dwight $ # # Copyright (c) 2005 Dwight Shih # A derived work of the Perl version: # Copyright (c) 2002 Brad Choate, bradchoate.com # # Permission is hereby granted, free of charge, to # any person obtaining a copy of this software and # associated documentation files (the "Software"), to # deal in the Software without restriction, including # without limitation the rights to use, copy, modify, # merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to # whom the Software is furnished to do so, subject to # the following conditions: # # The above copyright notice and this permission # notice shall be included in all copies or # substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY # OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT # LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES # OR OTHER LIABILITY, WHETHER IN AN ACTION OF # CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF # OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR # OTHER DEALINGS IN THE SOFTWARE. # def sanitize( html, okTags='a href, b, br, i, p' ) # no closing tag necessary for these soloTags = ["br","hr"] # Build hash of allowed tags with allowed attributes tags = okTags.downcase().split(',').collect!{ |s| s.split(' ') } allowed = Hash.new tags.each do |s| key = s.shift allowed[key] = s end # Analyze all <> elements stack = Array.new result = html.gsub( /(<.*?>)/m ) do | element | if element =~ /\A<\/(\w+)/ then # tag = $1.downcase if allowed.include?(tag) && stack.include?(tag) then # If allowed and on the stack # Then pop down the stack top = stack.pop out = "" until top == tag do top = stack.pop out << "" end out end elsif element =~ /\A<(\w+)\s*\/>/ # tag = $1.downcase if allowed.include?(tag) then "<#{tag} />" end elsif element =~ /\A<(\w+)/ then # tag = $1.downcase if allowed.include?(tag) then if ! soloTags.include?(tag) then stack.push(tag) end if allowed[tag].length == 0 then # no allowed attributes "<#{tag}>" else # allowed attributes? out = "<#{tag}" while ( $' =~ /(\w+)=("[^"]+")/ ) attr = $1.downcase valu = $2 if allowed[tag].include?(attr) then out << " #{attr}=#{valu}" end end out << ">" end end end end # eat up unmatched leading > while result.sub!(/\A([^<]*)>/m) { $1 } do end # eat up unmatched trailing < while result.sub!(/<([^>]*)\Z/m) { $1 } do end # clean up the stack if stack.length > 0 then result << "" end result end