Changesets can be listed by changeset number.
The Git repository is here.
- Revision:
- 86
- Log:
Initial import of I2, an Instiki clone.
- Author:
- adh
- Date:
- Mon Oct 16 10:40:36 +0100 2006
- Size:
- 13005 Bytes
1 | # heavily based off difflib.py - see that file for documentation |
2 | # ported from Python by Bill Atkins |
3 | |
4 | # This does not support all features offered by difflib; it |
5 | # implements only the subset of features necessary |
6 | # to support a Ruby version of HTML Differ. You're welcome to finish this off. |
7 | |
8 | # By default, String#each iterates by line. This isn't really appropriate |
9 | # for diff, so often a string will be split by // to get an array of one- |
10 | # character strings. |
11 | |
12 | # Some methods in Diff are untested and are not guaranteed to work. The |
13 | # methods in HTMLDiff and any methods it calls should work quite well. |
14 | |
15 | # changes by DenisMertz |
16 | # * main change: |
17 | # ** get the tag soup away |
18 | # the tag soup problem was first reported with <p> tags, but it appeared also with |
19 | # <li>, <ul> etc... tags |
20 | # this version should mostly fix these problems |
21 | # ** added a Builder class to manage the creation of the final htmldiff |
22 | # * minor changes: |
23 | # ** use symbols instead of string to represent opcodes |
24 | # ** small fix to html2list |
25 | # |
26 | |
27 | module Enumerable |
28 | def reduce(init) |
29 | result = init |
30 | each { |item| result = yield(result, item) } |
31 | result |
32 | end |
33 | end |
34 | |
35 | class Object |
36 | def nil_or_empty? |
37 | nil? or empty? |
38 | end |
39 | end |
40 | |
41 | module Diff |
42 | |
43 | module Utilities |
44 | def explode(sequence) |
45 | sequence.is_a?(String) ? sequence.split(//) : sequence |
46 | end |
47 | |
48 | def newline?(char) |
49 | %W(\n \r).include? char |
50 | end |
51 | |
52 | def tab?(char) |
53 | "\t" == char |
54 | end |
55 | |
56 | # XXX Could be more robust but unrecognized tags cause an infinite loop so |
57 | # better to be permissive |
58 | def open_tag?(char) |
59 | char =~ /\A<[^>]+>/ |
60 | end |
61 | |
62 | # See comment for open_tag? |
63 | def close_tag?(char) |
64 | char =~ %r!\A</[^>]+>! |
65 | end |
66 | |
67 | def end_of_tag?(char) |
68 | char == '>' |
69 | end |
70 | |
71 | def start_of_tag?(char) |
72 | char == '<' |
73 | end |
74 | |
75 | def html2list(x, use_brackets = false) |
76 | mode = :char |
77 | cur = '' |
78 | out = [] |
79 | |
80 | explode(x).each do |char| |
81 | case mode |
82 | when :tag |
83 | if end_of_tag? char |
84 | cur += use_brackets ? ']' : '>' |
85 | out.push cur |
86 | cur, mode = '', :char |
87 | else |
88 | cur += char |
89 | end |
90 | when :char |
91 | if start_of_tag? char |
92 | out.push cur |
93 | cur = use_brackets ? '[' : '<' |
94 | mode = :tag |
95 | elsif /\s/.match char |
96 | out.push cur + char |
97 | cur = '' |
98 | else |
99 | cur += char |
100 | end |
101 | end |
102 | end |
103 | |
104 | out.push(cur) |
105 | out.delete '' |
106 | out.map {|elt| newline?(elt) ? elt : elt.chomp} |
107 | end |
108 | end |
109 | |
110 | class SequenceMatcher |
111 | include Utilities |
112 | |
113 | def initialize(a = [''], b = [''], isjunk = nil, byline = false) |
114 | a, b = explode(a), explode(b) unless byline |
115 | @isjunk = isjunk || Proc.new {} |
116 | set_sequences a, b |
117 | end |
118 | |
119 | def set_sequences(a, b) |
120 | set_sequence_a a |
121 | set_sequence_b b |
122 | end |
123 | |
124 | def set_sequence_a(a) |
125 | @a = a |
126 | @matching_blocks = @opcodes = nil |
127 | end |
128 | |
129 | def set_sequence_b(b) |
130 | @b = b |
131 | @matching_blocks = @opcodes = nil |
132 | chain_b |
133 | end |
134 | |
135 | def chain_b |
136 | @fullbcount = nil |
137 | @b2j = {} |
138 | pophash = {} |
139 | junkdict = {} |
140 | |
141 | @b.each_with_index do |elt, idx| |
142 | if @b2j.has_key? elt |
143 | indices = @b2j[elt] |
144 | if @b.length >= 200 and indices.length * 100 > @b.length |
145 | pophash[elt] = 1 |
146 | indices.clear |
147 | else |
148 | indices.push idx |
149 | end |
150 | else |
151 | @b2j[elt] = [idx] |
152 | end |
153 | end |
154 | |
155 | pophash.each_key { |elt| @b2j.delete elt } |
156 | |
157 | unless @isjunk.nil? |
158 | [pophash, @b2j].each do |d| |
159 | d.each_key do |elt| |
160 | if @isjunk.call(elt) |
161 | junkdict[elt] = 1 |
162 | d.delete elt |
163 | end |
164 | end |
165 | end |
166 | end |
167 | |
168 | @isbjunk = junkdict.method(:has_key?) |
169 | @isbpopular = junkdict.method(:has_key?) |
170 | end |
171 | |
172 | def find_longest_match(a_low, a_high, b_low, b_high) |
173 | besti, bestj, bestsize = a_low, b_low, 0 |
174 | |
175 | j2len = {} |
176 | |
177 | (a_low..a_high).step do |i| |
178 | newj2len = {} |
179 | (@b2j[@a[i]] || []).each do |j| |
180 | next if j < b_low |
181 | break if j >= b_high |
182 | |
183 | k = newj2len[j] = (j2len[j - 1] || 0) + 1 |
184 | if k > bestsize |
185 | besti, bestj, bestsize = i - k + 1, j - k + 1, k |
186 | end |
187 | end |
188 | j2len = newj2len |
189 | end |
190 | |
191 | while besti > a_low and bestj > b_low and not @isbjunk.call(@b[bestj - 1]) and @a[besti - 1] == @b[bestj - 1] |
192 | besti, bestj, bestsize = besti - 1, bestj - 1, bestsize + 1 |
193 | end |
194 | |
195 | while besti + bestsize < a_high and bestj + bestsize < b_high and |
196 | not @isbjunk.call(@b[bestj + bestsize]) and |
197 | @a[besti + bestsize] == @b[bestj + bestsize] |
198 | bestsize += 1 |
199 | end |
200 | |
201 | while besti > a_low and bestj > b_low and @isbjunk.call(@b[bestj - 1]) and @a[besti - 1] == @b[bestj - 1] |
202 | besti, bestj, bestsize = besti - 1, bestj - 1, bestsize + 1 |
203 | end |
204 | |
205 | while besti + bestsize < a_high and bestj + bestsize < b_high and @isbjunk.call(@b[bestj+bestsize]) and |
206 | @a[besti+bestsize] == @b[bestj+bestsize] |
207 | bestsize += 1 |
208 | end |
209 | |
210 | [besti, bestj, bestsize] |
211 | end |
212 | |
213 | def get_matching_blocks |
214 | return @matching_blocks unless @matching_blocks.nil_or_empty? |
215 | |
216 | @matching_blocks = [] |
217 | size_of_a, size_of_b = @a.size, @b.size |
218 | match_block_helper(0, size_of_a, 0, size_of_b, @matching_blocks) |
219 | @matching_blocks.push [size_of_a, size_of_b, 0] |
220 | end |
221 | |
222 | def match_block_helper(a_low, a_high, b_low, b_high, answer) |
223 | i, j, k = x = find_longest_match(a_low, a_high, b_low, b_high) |
224 | unless k.zero? |
225 | match_block_helper(a_low, i, b_low, j, answer) if a_low < i and b_low < j |
226 | answer.push x |
227 | if i + k < a_high and j + k < b_high |
228 | match_block_helper(i + k, a_high, j + k, b_high, answer) |
229 | end |
230 | end |
231 | end |
232 | |
233 | def get_opcodes |
234 | return @opcodes unless @opcodes.nil_or_empty? |
235 | |
236 | i = j = 0 |
237 | @opcodes = answer = [] |
238 | get_matching_blocks.each do |ai, bj, size| |
239 | tag = if i < ai and j < bj |
240 | :replace |
241 | elsif i < ai |
242 | :delete |
243 | elsif j < bj |
244 | :insert |
245 | end |
246 | |
247 | answer.push [tag, i, ai, j, bj] if tag |
248 | i, j = ai + size, bj + size |
249 | answer.push [:equal, ai, i, bj, j] unless size.zero? |
250 | end |
251 | answer |
252 | end |
253 | |
254 | # XXX: untested |
255 | def get_grouped_opcodes(n = 3) |
256 | codes = get_opcodes |
257 | if codes.first.first == :equal |
258 | tag, i1, i2, j1, j2 = codes.first |
259 | codes[0] = tag, [i1, i2 - n].max, i2, [j1, j2-n].max, j2 |
260 | end |
261 | |
262 | if codes.last.first == :equal |
263 | tag, i1, i2, j1, j2 = codes.last |
264 | codes[-1] = tag, i1, min(i2, i1+n), j1, min(j2, j1+n) |
265 | end |
266 | |
267 | nn = n + n |
268 | group = [] |
269 | codes.each do |tag, i1, i2, j1, j2| |
270 | if tag == :equal and i2 - i1 > nn |
271 | group.push [tag, i1, [i2, i1 + n].min, j1, [j2, j1 + n].min] |
272 | yield group |
273 | group = [] |
274 | i1, j1 = [i1, i2-n].max, [j1, j2-n].max |
275 | group.push [tag, i1, i2, j1 ,j2] |
276 | end |
277 | end |
278 | yield group if group and group.size != 1 and group.first.first == :equal |
279 | end |
280 | |
281 | def ratio |
282 | matches = get_matching_blocks.reduce(0) do |sum, triple| |
283 | sum + triple.last |
284 | end |
285 | Diff.calculate_ratio(matches, @a.size + @b.size) |
286 | end |
287 | |
288 | def quick_ratio |
289 | if @fullbcount.nil_or_empty? |
290 | @fullbcount = {} |
291 | @b.each do |elt| |
292 | @fullbcount[elt] = (@fullbcount[elt] || 0) + 1 |
293 | end |
294 | end |
295 | |
296 | avail = {} |
297 | matches = 0 |
298 | @a.each do |elt| |
299 | numb = avail.has_key?(elt) ? avail[elt] : (@fullbcount[elt] || 0) |
300 | avail[elt] = numb - 1 |
301 | matches += 1 if numb > 0 |
302 | end |
303 | Diff.calculate_ratio matches, @a.size + @b.size |
304 | end |
305 | |
306 | def real_quick_ratio |
307 | size_of_a, size_of_b = @a.size, @b.size |
308 | Diff.calculate_ratio([size_of_a, size_of_b].min, size_of_a + size_of_b) |
309 | end |
310 | |
311 | protected :chain_b, :match_block_helper |
312 | end # end class SequenceMatcher |
313 | |
314 | class << self |
315 | def calculate_ratio(matches, length) |
316 | return 1.0 if length.zero? |
317 | 2.0 * matches / length |
318 | end |
319 | |
320 | # XXX: untested |
321 | def get_close_matches(word, possibilities, n = 3, cutoff = 0.6) |
322 | raise "n must be > 0: #{n}" unless n > 0 |
323 | raise "cutoff must be in (0.0..1.0): #{cutoff}" unless cutoff.between 0.0..1.0 |
324 | |
325 | result = [] |
326 | sequence_matcher = Diff::SequenceMatcher.new |
327 | sequence_matcher.set_sequence_b word |
328 | possibilities.each do |possibility| |
329 | sequence_matcher.set_sequence_a possibility |
330 | if sequence_matcher.real_quick_ratio >= cutoff and |
331 | sequence_matcher.quick_ratio >= cutoff and |
332 | sequence_matcher.ratio >= cutoff |
333 | result.push [sequence_matcher.ratio, possibility] |
334 | end |
335 | end |
336 | |
337 | unless result.nil_or_empty? |
338 | result.sort |
339 | result.reverse! |
340 | result = result[-n..-1] |
341 | end |
342 | result.map {|score, x| x } |
343 | end |
344 | |
345 | def count_leading(line, ch) |
346 | count, size = 0, line.size |
347 | count += 1 while count < size and line[count].chr == ch |
348 | count |
349 | end |
350 | end |
351 | end |
352 | |
353 | module HTMLDiff |
354 | include Diff |
355 | class Builder |
356 | VALID_METHODS = [:replace, :insert, :delete, :equal] |
357 | def initialize(a, b) |
358 | @a, @b = a, b |
359 | @content = [] |
360 | end |
361 | |
362 | def do_op(opcode) |
363 | @opcode = opcode |
364 | op = @opcode.first |
365 | raise NameError, "Invalid opcode '#{op}'" unless VALID_METHODS.include? op |
366 | send op |
367 | end |
368 | |
369 | def result |
370 | @content.join |
371 | end |
372 | |
373 | # These methods have to be called via do_op(opcode) so that @opcode is set properly |
374 | private |
375 | |
376 | def replace |
377 | delete('diffmod') |
378 | insert('diffmod') |
379 | end |
380 | |
381 | def insert(tagclass = 'diffins') |
382 | op_helper('ins', tagclass, @b[@opcode[3]...@opcode[4]]) |
383 | end |
384 | |
385 | def delete(tagclass = 'diffdel') |
386 | op_helper('del', tagclass, @a[@opcode[1]...@opcode[2]]) |
387 | end |
388 | |
389 | def equal |
390 | @content += @b[@opcode[3]...@opcode[4]] |
391 | end |
392 | |
393 | # Using this as op_helper would be equivalent to the first version of diff.rb by Bill Atkins |
394 | def op_helper_simple(tagname, tagclass, to_add) |
395 | @content << %(<#{tagname} class="#{tagclass}">) << to_add << %(</#{tagname}>) |
396 | end |
397 | |
398 | # Tries to put <p> tags or newline chars before the opening diff tags (<ins> or <del>) |
399 | # or after the ending diff tags. |
400 | # As a result the diff tags should be the "most inside" possible. |
401 | def op_helper(tagname, tagclass, to_add) |
402 | predicate_methods = [:tab?, :newline?, :close_tag?, :open_tag?] |
403 | content_to_skip = Proc.new do |item| |
404 | predicate_methods.any? {|predicate| HTMLDiff.send(predicate, item)} |
405 | end |
406 | |
407 | unless to_add.any? {|element| content_to_skip.call element} |
408 | @content << wrap_text(to_add, tagname, tagclass) |
409 | else |
410 | loop do |
411 | @content << to_add and break if to_add.all? {|element| content_to_skip.call element} |
412 | # We are outside of a diff tag |
413 | @content << to_add.shift while content_to_skip.call to_add.first |
414 | @content << %(<#{tagname} class="#{tagclass}">) |
415 | # We are inside a diff tag |
416 | @content << to_add.shift until content_to_skip.call to_add.first |
417 | @content << %(</#{tagname}>) |
418 | end |
419 | end |
420 | #remove_empty_diff(tagname, tagclass) |
421 | end |
422 | |
423 | def wrap_text(text, tagname, tagclass) |
424 | %(<#{tagname} class="#{tagclass}">#{text}</#{tagname}>) |
425 | end |
426 | |
427 | def remove_empty_diff(tagname, tagclass) |
428 | @content = @content[0...-2] if last_elements_empty_diff?(@content, tagname, tagclass) |
429 | end |
430 | |
431 | def last_elements_empty_diff?(content, tagname, tagclass) |
432 | content[-2] == %(<#{tagname} class="#{tagclass}">) and content.last == %(</#{tagname}>) |
433 | end |
434 | end |
435 | |
436 | class << self |
437 | include Diff::Utilities |
438 | |
439 | def diff(a, b) |
440 | a, b = html2list(explode(a)), html2list(explode(b)) |
441 | |
442 | out = Builder.new(a, b) |
443 | sequence_matcher = Diff::SequenceMatcher.new(a, b) |
444 | |
445 | sequence_matcher.get_opcodes.each {|opcode| out.do_op(opcode)} |
446 | |
447 | out.result |
448 | end |
449 | end |
450 | end |
451 | |
452 | if __FILE__ == $0 |
453 | if ARGV.size == 2 |
454 | puts HTMLDiff.diff(IO.read(ARGV.pop), IO.read(ARGV.pop)) |
455 | else |
456 | puts "Usage: html_diff file1 file2" |
457 | end |
458 | end |