Changesets can be listed by changeset number.
The Git repository is here.
Changeset 357
Having seen problems with character set conversions, comment out swathes of
unused code which tries to introduce Ruby 1.9 and 1.8 compatibility and give
UTF-8 support throughout. Instead, rely on Rails UTF-8 extensions and other
pass-through behaviour. Fix an invalid use of "Integer($1).chr" which was
probably in fact the sole root cause of the problems seen during migration
of the I2 data (but leave the other changes present, since they don't hurt
and in some cases unnecessary calls have been made which can be removed).
- Committed by: rool
- Date: Saturday March 19 21:57:38 2011 (over 13 years ago)
Affected files:
- rool/rails/instiki/trunk/lib/instiki_stringsupport.rb (diff)
- rool/rails/instiki/trunk/lib/sanitizer.rb (diff)
rool/rails/instiki/trunk/lib/instiki_stringsupport.rb:
prev. | current | |
# Some useful additions to the String class | ||
2 | # | |
3 | # 2011-03-10 (ADH): Numerous changes. Whatever Instiki's trying to do, the | |
4 | # results often seem broken - it's possible to get a 500 | |
5 | # error in a diff view, for example, because UTF 8 byte | |
6 | # sequences are getting split mid-way through. Took out as | |
7 | # much custom code as possible and tried to lean on the | |
8 | # Rails multibyte support. We're using Rails 2.3.11 and | |
9 | # Ruby 1.8; this combination seems to work properly. | |
class String | ||
5 | ||
6 | ||
7 | ||
8 | ||
9 | ||
10 | ||
11 | ||
12 | ||
13 | ||
14 | | |
15 | | |
16 | | |
17 | ||
18 | | |
19 | | |
20 | | |
13 | def num_chars | |
14 | self.mb_chars.length | |
end | ||
23 | ||
24 | ||
25 | ||
26 | ||
27 | ||
28 | ||
29 | ||
30 | ||
31 | ||
32 | | |
33 | | |
34 | | |
35 | ||
36 | | |
37 | | |
38 | | |
17 | # # Return the number of unicode characters in a string | |
18 | # # | |
19 | # # :call-seq: | |
20 | # # string.num_chars -> integer | |
21 | # # | |
22 | # # Because Rails 2.3.5's String#mb_chars.length is broken, | |
23 | # # we provide this method. | |
24 | # #-- | |
25 | # if "".respond_to?(:force_encoding) | |
26 | # def num_chars | |
27 | # length | |
28 | # end | |
29 | # else | |
30 | # def num_chars | |
31 | # unpack('U*').length | |
32 | # end | |
33 | # end | |
34 | ||
35 | def as_bytes | |
36 | self.mb_chars | |
end | ||
41 | ||
42 | ||
43 | ||
44 | ||
45 | ||
46 | ||
47 | ||
48 | ||
49 | ||
50 | | |
51 | | |
52 | | |
53 | ||
54 | | |
55 | | |
56 | | |
39 | # #++ | |
40 | # # A method to allow byte-oriented operations in both Ruby 1.8 and Ruby 1.9 | |
41 | # # | |
42 | # # :call-seq: | |
43 | # # string.to_utf_8 -> string (with the encoding set to "ASCII-8BIT") | |
44 | # # | |
45 | # # Under 1.8, this is a NOOP. Under 1.9, it sets the encoding to "ASCII-8BIT" | |
46 | # #-- | |
47 | # if "".respond_to?(:force_encoding) | |
48 | # def as_bytes | |
49 | # force_encoding("ASCII-8BIT") | |
50 | # end | |
51 | # else | |
52 | # def as_bytes | |
53 | # self | |
54 | # end | |
55 | # end | |
56 | ||
57 | def as_utf8 | |
58 | self # Same as below, basically | |
end | ||
59 | ||
60 | ||
61 | ||
62 | ||
63 | ||
64 | ||
65 | ||
66 | ||
67 | ||
68 | | |
69 | | |
70 | | |
71 | | |
72 | ||
73 | | |
74 | | |
75 | | |
76 | | |
61 | # #++ | |
62 | # # A method to allow string-oriented operations in both Ruby 1.8 and Ruby 1.9 | |
63 | # # | |
64 | # # :call-seq: | |
65 | # # string.to_utf_8 -> string (with the encoding set to "UTF-8") | |
66 | # # | |
67 | # # Under 1.8, this is a NOOP. Under 1.9, it sets the encoding to "UTF-8" | |
68 | # #-- | |
69 | # if "".respond_to?(:force_encoding) | |
70 | # def as_utf8 | |
71 | # force_encoding("UTF-8") | |
72 | # end | |
73 | # else | |
74 | # def as_utf8 | |
75 | # self | |
76 | # end | |
77 | # end | |
78 | ||
79 | def purify | |
80 | # OK, this really doesn't do anything... It tolerates bad sequences. But | |
81 | # isn't that a good thing? We just pass bytes through. | |
82 | self.mb_chars.to_s | |
end | ||
79 | | |
80 | | |
81 | | |
82 | | |
85 | # #++ | |
86 | # # Take a string, and remove any invalid substrings, returning a valid utf-8 string. | |
87 | # # | |
88 | # # :call-seq: | |
89 | # # string.purify -> new_string | |
90 | # # | |
91 | # # returns a valid utf-8 string, purged of any subsequences of illegal bytes. | |
92 | # #-- | |
93 | # if "".respond_to?(:force_encoding) | |
94 | # def purify | |
95 | # text = self.dup.check_ncrs.as_utf8 | |
96 | # text.chars.collect{|c| c.as_bytes}.grep(UTF8_REGEX).join.as_utf8 | |
97 | # end | |
98 | # else | |
99 | # def purify | |
100 | # text = check_ncrs | |
101 | # text.split(//u).grep(UTF8_REGEX).join | |
102 | # end | |
103 | # end | |
104 | # | |
105 | # def check_ncrs | |
106 | # text = gsub(/&#[xX]([a-fA-F0-9]+);/) { |m| [$1.hex].pack('U*').as_bytes =~ UTF8_REGEX ? m : '' } | |
107 | # text.gsub(/&#(\d+);/) { |m| [$1.to_i].pack('U*').as_bytes =~ UTF8_REGEX ? m : '' } | |
108 | # end | |
109 | # | |
110 | # UTF8_REGEX = /\A( | |
111 | # [\x09\x0A\x0D\x20-\x7E] # ASCII | |
112 | # | [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte | |
113 | # | \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs | |
114 | # | [\xE1-\xEC\xEE][\x80-\xBF]{2} # straight 3-byte | |
115 | # | \xEF[\x80-\xBE]{2} # | |
116 | # | \xEF\xBF[\x80-\xBD] # excluding U+fffe and U+ffff | |
117 | # | \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates | |
118 | # | \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3 | |
119 | # | [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15 | |
120 | # | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16 | |
121 | # )*\Z/nx; | |
122 | # #++ | |
84 | | |
85 | | |
86 | | |
87 | | |
88 | | |
89 | | |
90 | | |
91 | | |
92 | | |
93 | | |
94 | | |
95 | | |
96 | ||
97 | | |
98 | ||
124 | # # Check whether a string is valid utf-8 | |
125 | # # | |
126 | # # :call-seq: | |
127 | # # string.is_utf8? -> boolean | |
128 | # # | |
129 | # # returns true if the sequence of bytes in string is valid utf-8 | |
130 | # #-- | |
131 | # def is_utf8? | |
132 | # #expand NCRs to utf-8 | |
133 | # text = self.check_ncrs.as_bytes | |
134 | # | |
135 | # # You might think this is faster, but it isn't | |
136 | # #pieces = self.split(/&#[xX]([a-fA-F0-9]+);/) | |
137 | # #1.step(pieces.length-1, 2) {|i| pieces[i] = [pieces[i].hex].pack('U*')} | |
138 | # #pieces = pieces.join.split(/&#(\d+);/) | |
139 | # #1.step(pieces.length-1, 2) {|i| pieces[i] = [pieces[i].to_i].pack('U*')} | |
140 | # #text = pieces.join | |
141 | # | |
142 | # #ensure the resulting string of bytes is valid utf-8 | |
143 | # text =~ UTF8_REGEX | |
144 | # end | |
# | ||
100 | ||
101 | ||
102 | ||
103 | ||
104 | ||
105 | | |
106 | | |
107 | | |
108 | | |
109 | | |
110 | | |
111 | | |
112 | | |
113 | | |
114 | | |
115 | | |
116 | | |
117 | | |
118 | | |
146 | # #:stopdoc: | |
147 | # | |
148 | # def blank? | |
149 | # self.dup.as_bytes !~ /\S/ | |
150 | # end | |
120 | ||
121 | ||
122 | | |
123 | | |
124 | | |
125 | ||
MATHML_ENTITIES = { | ||
'Alpha' => 'Α', | ||
'Beta' => 'Β', | ||
... | ... | |
'"' => '"', | ||
} | ||
TO_ESCAPE_PATTERN = Regexp.union(*TO_ESCAPE.keys) | ||
2317 | | |
2343 | ||
def escapeHTML | ||
self.gsub(TO_ESCAPE_PATTERN){|m| TO_ESCAPE[m]} | ||
end | ||
... | ... | |
when /\Aamp\z/ni then '&' | ||
when /\Agt\z/ni then '>' | ||
when /\Alt\z/ni then '<' | ||
2329 | | |
2330 | | |
2355 | when /\Aquot\z/ni then '"' | |
2356 | when /\Aapos\z/ni then "'" | |
when /\A#0*(\d+)\z/n then | ||
if Integer($1) < 256 | ||
2333 | | |
2359 | # 2011-03-10 (ADH): Wrong; in Ruby 1.8, there is no encoding support | |
2360 | # so this returns an ISO 8859-1 single byte value. | |
2361 | # In Ruby 1.9, Encoding::UTF8 would need to be | |
2362 | # specified, but isn't. | |
2363 | #Integer($1).chr | |
2364 | # | |
2365 | # Could do this: | |
2366 | # | |
2367 | # Iconv.iconv('utf-8', 'iso-8859-1', Integer($1).chr) | |
2368 | # | |
2369 | # ...but this is probably faster and copies the hex handling code | |
2370 | # below, which after experiments in "irb", does seem to work well: | |
2371 | [Integer($1)].pack("U") | |
else | ||
if Integer($1) < 1114111 | ||
[Integer($1)].pack("U") |
rool/rails/instiki/trunk/lib/sanitizer.rb:
prev. | current | |
node.attributes.delete attr; next | ||
end | ||
if ATTR_VAL_IS_URI.include?(attr) | ||
180 | | |
180 | # 2011-03-10 (ADH): Get rid of attempted UTF-8 hack; just rely on mb_chars | |
181 | val_unescaped = val.unescapeHTML.mb_chars.downcase.to_s | |
if val_unescaped =~ /^[a-z0-9][-+.a-z0-9]*:/ && !ALLOWED_PROTOCOLS.include?(val_unescaped.split(':')[0]) | ||
node.attributes.delete attr; next | ||
end |