Changesets can be listed by changeset number.
The Git repository is here.
- Revision:
- 118
- Log:
Controller integrated with HubSsoLib v0.1.0; see Changeset #113. Page
model now has special HubSsoLib tags but they're no use on any page
behavior that caches output. The news behavior has a new "https"
attribute supported on the inclusion tag to force URLs extracted from
feeds to the HTTPS protocol if the enclosing page is itself being
fetched using HTTPS. Fixed delivery path of administration interface
cookie that records the page tree state. Fixed a typo in the
database migration script. Updated the command run when header, footer
and sidebar snippets are changed.
- Author:
- adh
- Date:
- Fri Oct 27 17:19:47 +0100 2006
- Size:
- 16750 Bytes
1 | #~ Copyright (C) 2002 Jeff Schilling |
2 | |
3 | #~ This program is free software; you can redistribute it and/or |
4 | #~ modify it under the terms of the GNU General Public License |
5 | #~ as published by the Free Software Foundation; either version 2 |
6 | #~ of the License, or (at your option) any later version. |
7 | |
8 | #~ This program is distributed in the hope that it will be useful, |
9 | #~ but WITHOUT ANY WARRANTY; without even the implied warranty of |
10 | #~ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
11 | #~ GNU General Public License for more details. |
12 | |
13 | #~ You should have received a copy of the GNU General Public License |
14 | #~ along with this program; if not, write to the Free Software |
15 | #~ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
16 | |
17 | |
18 | |
19 | #~ """Ultra-liberal RSS parser |
20 | #~ Based on Mark Pilgrim's rssparser.py |
21 | |
22 | |
23 | |
24 | |
25 | require 'sgml-parser' |
26 | require 'cgi' |
27 | require 'time' |
28 | require 'uri' |
29 | require 'net/http' |
30 | require 'zlib' |
31 | |
# Liberal RSS parser, ported from Mark Pilgrim's Python rssparser.py.
#
# Feed text is pushed through the SGMLParser superclass; the start_*/end_*
# and unknown_* callbacks below collect element text into a channel hash
# and an array of item hashes, both keyed by element name.
#
# NOTE(review): the port passes 0/1 as the "expecting text" flag to push.
# In Ruby 0 is truthy, so pop records text for every element, including
# those pushed with 0. That is different from the Python original but it
# determines which keys appear in the results, so it is left unchanged
# here; confirm with callers before tightening it.
#
# NOTE(review): the port also carried Python-style alias assignments such
# as "@start_pubdate = @start_dc_date". They never registered a handler in
# Ruby (some of them raised or recursed forever) and have been removed.
# Registering real aliases would change the key under which those elements
# are stored, so that is left for a separate, deliberate change.
class RssParser < SGMLParser
  def initialize()
    super(nil)
    # Known namespace URI => prefix used when building handler names.
    @namespaces = {"http://backend.userland.com/rss" => "",
                   "http://backend.userland.com/rss2" => "",
                   "http://purl.org/rss/1.0/" => "",
                   "http://purl.org/rss/1.0/modules/textinput/" => "ti",
                   "http://purl.org/rss/1.0/modules/company/" => "co",
                   "http://purl.org/rss/1.0/modules/syndication/" => "sy",
                   "http://purl.org/dc/elements/1.1/" => "dc"}
    @_new_declname_match = Regexp.compile('[a-zA-Z][-_.a-zA-Z0-9:]*\s*')
    @short_weekdays = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
    @long_weekdays = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
    @months = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
  end

  # Replaces the five predefined XML entities in +data+ with the characters
  # they stand for. nil is treated as the empty string. "&amp;" is handled
  # last so that "&amp;lt;" becomes "&lt;" rather than "<".
  def decodeEntities(data)
    data = data || ''
    data = data.gsub('&lt;', '<')
    data = data.gsub('&gt;', '>')
    data = data.gsub('&quot;', '"')
    data = data.gsub('&apos;', "'")
    data = data.gsub('&amp;', '&')
    return data
  end

  # Clears all parse state so the instance can be reused for another feed.
  def reset
    super()
    @channel = {}
    @items = []
    @currentitem = []
    @elementstack = Array::new(0)
    @inchannel = 0
    @initem = 0
    @namespacemap = {}
  end

  # Opens +element+ on the element stack. Each entry is
  # [name, expectingText flag, collected text pieces].
  def push(element, expectingText)
    @elementstack.push([element, expectingText, []])
  end

  # Closes +element+ if it is the innermost open element and stores its
  # entity-decoded text on the current item (when inside an item) or on the
  # channel (when inside a channel).
  def pop(element)
    # An empty Array is truthy in Ruby, so emptiness is tested explicitly;
    # otherwise @elementstack[-1][0] raises on nil.
    return if @elementstack.nil? || @elementstack.empty?
    return if @elementstack[-1][0] != element
    element, expectingText, pieces = @elementstack.pop()
    # Only nil/false skip the store; 0 does not (see the class comment).
    return if not expectingText
    output = decodeEntities(pieces.join)
    if (@initem == 1)
      @items.last[element] = output
    elsif (@inchannel == 1)
      @channel[element] = output
    end
  end

  # Hash of channel-level element text collected so far.
  def channel
    return @channel
  end

  # Array of hashes, one per item seen so far.
  def items
    return @items
  end

  # Records a mapping for every "xmlns:<prefix>" attribute whose value is
  # one of the namespace URIs listed in @namespaces.
  # attrs is iterated as [name, value] pairs.
  def _addNamespaces(attrs)
    attrs.each { |prefix, value|
      # Must be a Regexp: String =~ String raises TypeError.
      next unless prefix =~ /^xmlns:/
      prefix = prefix[6..-1]
      if @namespaces.has_key?(value)
        @namespacemap[prefix] = @namespaces[value]
      end
    }
  end

  def start_channel(attrs)
    push('channel', 0)
    @inchannel = 1
  end

  def end_channel()
    pop('channel')
    @inchannel = 0
  end

  # Starts a new, empty item hash; subsequent element text is stored on it.
  def start_item(attrs)
    push('item', 0)
    @items.push({})
    @initem = 1
  end

  def end_item()
    pop('item')
    @initem = 0
  end

  # dc:language is stored under the key 'language'.
  def start_dc_language(attrs)
    push('language', 1)
  end

  def end_dc_language()
    pop('language')
  end

  # dc:creator is stored under the key 'creator'.
  def start_dc_creator(attrs)
    push('creator', 1)
  end

  def end_dc_creator()
    pop('creator')
  end

  # dc:rights is stored under the key 'rights'.
  def start_dc_rights(attrs)
    push('rights', 1)
  end

  def end_dc_rights()
    pop('rights')
  end

  # dc:date is stored under the key 'date'.
  def start_dc_date(attrs)
    push('date', 1)
  end

  def end_dc_date()
    pop('date')
  end

  # dc:subject is stored under the key 'category'.
  def start_dc_subject(attrs)
    push('category', 1)
  end

  def end_dc_subject()
    pop('category')
  end

  def start_link(attrs)
    # Evaluates to @inchannel (0 or 1); both are truthy in Ruby.
    push('link', (@inchannel or @initem))
  end

  def end_link()
    pop('link')
  end

  # Remembers whether this guid should also be used as the item's link.
  def start_guid(attrs)
    x = attrs.assoc('ispermalink')
    # assoc returns the [name, value] pair, so the value is x[1]. The
    # previous code compared the pair itself with "false", which never
    # matched, so isPermaLink="false" guids were still copied to 'link'.
    # NOTE(review): a guid without the attribute is still NOT treated as a
    # link, as before; RSS 2.0 defaults isPermaLink to true - confirm which
    # behaviour callers want.
    if x && x[1].to_s.downcase != "false"
      @guidislink = x
    else
      @guidislink = nil
    end
    push('guid', 1)
  end

  def end_guid()
    pop('guid')
    # Guard against a guid that appears outside any item.
    if @guidislink && !@items.empty?
      @items[-1]['link'] = @items[-1]['guid']
    end
  end

  def start_title(attrs)
    push('title', (@inchannel or @initem))
  end

  def start_description(attrs)
    push('description', (@inchannel or @initem))
  end

  # content:encoded is stored under the key 'content_encoded'.
  def start_content_encoded(attrs)
    push('content_encoded', 1)
  end

  def end_content_encoded()
    pop('content_encoded')
  end

  # Fallback for start tags without a dedicated handler. For a prefixed
  # tag ("dc:creator") the prefix is translated through the namespace map
  # and a start_<prefix>_<name> handler is invoked if this object has one;
  # otherwise the element is simply pushed.
  def unknown_starttag(tag, attrs)
    _addNamespaces(attrs)
    colonpos = tag.index(':')
    if colonpos
      prefix = tag[0...colonpos]
      suffix = tag[(colonpos + 1)..-1]
      prefix = @namespacemap.fetch(prefix, prefix)
      # A namespace mapped to "" is the default RSS namespace: no prefix
      # and no underscore, so "rss:title" resolves to start_title.
      prefix = prefix + '_' unless prefix.empty?
      methodname = 'start_' + prefix + suffix
      if respond_to?(methodname)
        return send(methodname, attrs)
      end
      return push(prefix + suffix, 0)
    end
    return push(tag, 0)
  end

  # Fallback for end tags without a dedicated handler; mirrors
  # unknown_starttag.
  def unknown_endtag(tag)
    colonpos = tag.index(':')
    if colonpos
      prefix = tag[0...colonpos]
      suffix = tag[(colonpos + 1)..-1]
      prefix = @namespacemap.fetch(prefix, prefix)
      prefix = prefix + '_' unless prefix.empty?
      methodname = 'end_' + prefix + suffix
      if respond_to?(methodname)
        return send(methodname)
      end
      return pop(prefix + suffix)
    end
    return pop(tag)
  end

  def handle_charref(ref)
    # Called for each character reference, e.g. for "&#160;" ref is "160".
    # Reconstruct the original character reference.
    return if @elementstack.nil? || @elementstack.empty?
    addData("&#" + ref.to_s + ";")
  end

  def handle_entityref(ref)
    # Called for each entity reference, e.g. for "&copy;" ref is "copy".
    # Reconstruct the original entity reference.
    return if @elementstack.nil? || @elementstack.empty?
    addData("&#{ref};")
  end

  def handle_data(text)
    # Called for each block of plain text, i.e. outside of any tag and
    # not containing any character or entity references.
    return if @elementstack.nil? || @elementstack.empty?
    addData(text)
  end

  # Appends +text+ to the innermost open element; ignored when no element
  # is open.
  def addData(text)
    top = @elementstack.last
    if (top && top[2])
      top[2] << (text)
    end
  end

  def handle_comment(text)
  end

  def handle_pi(text)
    # Called for each processing instruction, e.g. <?instruction>
  end

  def handle_decl(text)
    # Called for the DOCTYPE, if present, e.g.
    # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
    #     "http://www.w3.org/TR/html4/loose.dtd">
  end

  # Scans a declaration name starting exactly at offset +i+ of the raw
  # buffer. Returns [lower-cased name, offset after the name and trailing
  # whitespace], or [nil, -1] when the buffer ends before the name is
  # known to be complete.
  # NOTE(review): updatepos and error are assumed to be provided by the
  # SGMLParser superclass - confirm.
  def _scan_name(i, declstartpos)
    rawdata = @rawdata
    n = rawdata.length
    if i == n
      return nil, -1
    end
    # String#index finds the first match at or after i; requiring it to
    # start at i makes the match anchored. $~ holds the MatchData.
    pos = rawdata.index(@_new_declname_match, i)
    if pos == i
      s = $~[0]
      name = s.strip()
      if (i + s.length) == n
        return nil, -1 # end of buffer
      end
      return name.downcase, i + s.length
    else
      updatepos(declstartpos, i)
      error("expected name token")
    end
  end

  # Declaration handler that treats a CDATA section as character data.
  # Returns the offset just past the closing "]]>".
  # NOTE(review): everything else is delegated to the superclass, which is
  # assumed to define parse_declaration - confirm.
  def parse_declaration(i)
    # '<![CDATA[' is nine characters long.
    if @rawdata[i, 9] == '<![CDATA['
      k = @rawdata.index(']]>', i)
      if k.nil?
        # Unterminated section: consume the rest of the buffer.
        k = @rawdata.length
      end
      # HTML-escape (not URL-encode) the content so that pop's
      # decodeEntities restores it verbatim.
      handle_data(CGI::escapeHTML(@rawdata[(i + 9)...k]))
      return k + 3
    end
    return super(i)
  end

  # Parses +data+ and stores the outcome in +result+ under "channel" and
  # "items". Errors are printed rather than raised; in that case +result+
  # is returned without those keys being set.
  def parse(data, result={})
    begin
      reset()
      feed(data)
      result["channel"] = channel
      result["items"] = items
    rescue StandardError => bang
      print bang
      print_backtrace(bang)
    end
    result
  end

  # Writes the error message, its class and its backtrace to $stderr.
  def print_backtrace(err)
    # Object#type was removed in Ruby 1.9; #class works on every version.
    $stderr.print err.to_s, " (#{err.class})\n"
    (err.backtrace || []).each { |i| $stderr.puts i }
  end

end
362 | |
# Fetches a feed over HTTP with conditional-GET (ETag / Last-Modified)
# and gzip support.
class HttpGetter
  # Writes the error message, its class and its backtrace to $stderr.
  def print_backtrace(err)
    # Object#type was removed in Ruby 1.9; #class works on every version.
    $stderr.print err.to_s, " (#{err.class})\n"
    (err.backtrace || []).each { |i| $stderr.puts i }
  end

  # Returns the "etag" entry of +info+ (anything indexable by header
  # name), or nil when there is none.
  def get_etag(info)
    return info["etag"]
  end

  # Returns the "last-modified" entry of +info+ parsed into a Time, or nil
  # when the entry is absent or is not a valid HTTP date.
  def get_modified(info)
    last_modified = info["last-modified"]
    return last_modified unless last_modified

    begin
      return parse_http_date(last_modified)
    rescue ArgumentError
      # A malformed header must not abort the whole download.
      return nil
    end
  end

  # Formats +date+ as an RFC 1123 timestamp in GMT, as required for HTTP
  # headers. +date+ may be a Time (such as the value get_modified returns)
  # or any string Time.parse understands.
  def format_http_date(date)
    time = date.is_a?(Time) ? date : Time.parse(date.to_s)
    return time.httpdate
  end

  # Parses an HTTP date string into a Time. Raises ArgumentError when the
  # string is not a valid HTTP date.
  def parse_http_date(date)
    Time.httpdate(date)
  end

  # Performs a GET on the URL +source+ and returns the (gunzipped) body.
  #
  # etag / modified are sent as If-None-Match / If-Modified-Since; agent
  # and referrer as User-Agent / Referer. The response's ETag and
  # Last-Modified values (or, failing that, the ones passed in) are stored
  # in +result+ under "etag" and "modified".
  #
  # Errors are printed, not raised; nil is returned in that case.
  def readData(source, result, etag=nil, modified=nil, agent=nil, referrer=nil)
    begin
      headers = {}
      headers["If-None-Match"] = etag if etag
      headers["If-Modified-Since"] = format_http_date(modified) if modified
      headers["User-Agent"] = agent if agent
      # http://www.dictionary.com/search?q=referer
      headers["Referer"] = referrer if referrer
      headers["Accept-encoding"] = "gzip"
      uri = URI.parse(source)

      # Honour the port and scheme given in the URL.
      h = Net::HTTP.new(uri.host, uri.port)
      if uri.scheme == 'https'
        require 'net/https'
        h.use_ssl = true
      end
      # request_uri keeps the query string, which uri.path drops.
      path = uri.respond_to?(:request_uri) ? uri.request_uri : uri.path
      path = '/' if path.nil? || path.empty?

      resp , = h.get(path, headers)
      resp.each {|key, val| printf "%-14s = %-40.40s\n", key, val }

      data = getBody(resp)
      newEtag = get_etag(resp)
      if newEtag
        result["etag"] = newEtag
      elsif etag
        result["etag"] = etag
      end

      newModified = get_modified(resp)
      if newModified
        result["modified"] = newModified
      elsif modified
        result["modified"] = modified
      end

      return data
    rescue Timeout::Error => toe
      print "TimeoutError on #{source} : #{toe}"
      return nil
    rescue StandardError => bang
      print "ERROR: #{bang}"
      print_backtrace(bang)
      # Without this the method would return print_backtrace's value (the
      # backtrace Array) and the caller would treat it as feed data.
      return nil
    end
  end

  # Returns the body of +resp+, inflating it first when the response says
  # it is gzip-encoded. If inflating fails, the error is printed and the
  # body is returned as received.
  def getBody(resp)
    data = resp.body
    if data && resp['content-encoding'] == 'gzip'
      begin
        # abstract the differences between 1.7 and 1.6 w/o requiring the shim library
        begin
          require "stringio" if not defined? StringIO
          body = StringIO.new(data)
        rescue LoadError
          require "tempfile"
          body = Tempfile.new("CGI")
          body.binmode
          body.write(data)
          body.flush
          body.pos = 0
        end

        gzReader = Zlib::GzipReader.new(body)
        begin
          data = gzReader.read()
        ensure
          # Close the reader even when read raises.
          gzReader.close()
        end
      rescue StandardError => err
        print "Zlib error: #{err}\n"
      end
    end
    data
  end

end
498 | |
# Sample feeds fetched when the script is run without arguments.
urls = %w(
  http://www.pocketsoap.com/rssTests/rss1.0withModules.xml
  http://www.pocketsoap.com/rssTests/rss1.0withModulesNoDefNS.xml
  http://www.pocketsoap.com/rssTests/rss1.0withModulesNoDefNSLocalNameClash.xml
  http://www.pocketsoap.com/rssTests/rss2.0noNSwithModules.xml
  http://www.pocketsoap.com/rssTests/rss2.0noNSwithModulesLocalNameClash.xml
  http://www.pocketsoap.com/rssTests/rss2.0NSwithModules.xml
  http://www.pocketsoap.com/rssTests/rss2.0NSwithModulesNoDefNS.xml
  http://www.pocketsoap.com/rssTests/rss2.0NSwithModulesNoDefNSLocalNameClash.xml
)

# Command-line driver: fetch and parse either the URL given as the first
# argument or every sample feed above, dumping each result.
if __FILE__ == $0
  # Pretty-prints x when the pp library can be loaded, otherwise uses p.
  def printIt(x)
    begin
      require 'pp'
      pp x
    rescue LoadError
      p x
    end
  end

  p ARGV
  there = ARGV[0] ? [ARGV[0]] : urls

  r = RssParser::new()
  getter = HttpGetter.new()
  there.each do |url|
    print "#{url}"

    fetched = {}
    body = getter.readData(url, fetched)
    fetched = r.parse(body, fetched)
    printIt(fetched)
  end
end
537 | |
538 | |
539 | #~ Visit http://diveintomark.org/projects/rss_parser/ for the latest version |
540 | |
541 | #~ Handles RSS 0.9x and RSS 1.0 feeds |
542 | |
543 | #~ RSS 0.9x elements: |
544 | #~ - title, link, description, webMaster, managingEditor, language |
545 | #~ copyright, lastBuildDate, pubDate |
546 | |
547 | #~ RSS 1.0 elements: |
548 | #~ - dc:rights, dc:language, dc:creator, dc:date, dc:subject, |
549 | #~ content:encoded |
550 | |
551 | #~ Things it handles that choke other RSS parsers: |
552 | #~ - bastard combinations of RSS 0.9x and RSS 1.0 (most Movable Type feeds) |
553 | #~ - illegal XML characters (most Radio feeds) |
554 | #~ - naked and/or invalid HTML in description (The Register) |
555 | #~ - content:encoded in item element (Aaron Swartz) |
556 | #~ - guid in item element (Scripting News) |
557 | #~ - fullitem in item element (Jon Udell) |
558 | #~ - non-standard namespaces (BitWorking) |
559 | |
560 | #~ Requires Python 2.2 or later |
561 | #~ """ |
562 | |
563 | #~ __author__ = "Mark Pilgrim (f8dy@diveintomark.org)" |
564 | #~ __copyright__ = "Copyright 2002, Mark Pilgrim" |
565 | #~ __contributors__ = ["Jason Diamond (jason@injektilo.org)"] |
566 | #~ __license__ = "GPL" |
567 | #~ __history__ = """ |
568 | #~ 1.0 - 9/27/2002 - MAP - fixed namespace processing on prefixed RSS 2.0 elements, |
569 | #~ added Simon Fell's test suite |
570 | #~ 1.1 - 9/29/2002 - MAP - fixed infinite loop on incomplete CDATA sections |
571 | #~ 2.0 - 10/19/2002 |
572 | #~ JD - use inchannel to watch out for image and textinput elements which can |
573 | #~ also contain title, link, and description elements |
574 | #~ JD - check for isPermaLink="false" attribute on guid elements |
575 | #~ JD - replaced openAnything with open_resource supporting ETag and |
576 | #~ If-Modified-Since request headers |
577 | #~ JD - parse now accepts etag, modified, agent, and referrer optional |
578 | #~ arguments |
579 | #~ JD - modified parse to return a dictionary instead of a tuple so that any |
580 | #~ etag or modified information can be returned and cached by the caller |
581 | #~ 2.0.1 - 10/21/2002 - MAP - changed parse() so that if we don't get anything |
582 | #~ because of etag/modified, return the old etag/modified to the caller to |
583 | #~ indicate why nothing is being returned |
584 | #~ 2.0.2 - 10/21/2002 - JB - added the inchannel to the if statement, otherwise its |
585 | #~ useless. Fixes the problem JD was addressing by adding it. |
586 | #~ 2.1 - 11/14/2002 - MAP - added gzip support |
587 | #~ """ |