#!/bin/ruby
########################################################################
# File::    make-md5.rb
# (C)::     Hipposoft 2010
#
# Purpose:: Calculate new or updated MD5 checksums for files.
# ----------------------------------------------------------------------
#           11-Feb-2010 (ADH): Created.
########################################################################

# With no arguments there is nothing to process, so print the usage text
# and exit before loading any dependencies.

if ( ARGV.size == 0 )
  puts <<'USAGE'

Usage:
ruby make-md5.rb <path> [<path> ...]

Given one or more directories or files, scans the directories for
files without recursion and/or loads the file(s) and calculates MD5
checksums for them. A YAML configuration file at 'config/config.yml'
inside each directory given, or the directory containing each file
given, must be present in a format appropriate for the ROOL extended
Radiant directory listing parser.

Configuration file entries are found by looking up the YAML hash with
a key based on:

 * The leaf name up to but excluding the first '.' in the name
 * The entire leaf name

...in that order. Key lookup is case sensitive. The value found (if
any) is itself a hash into which keys 'md5' and 'md5_time' will be
added. A new entry in the configuration file consisting of just the
MD5 data will be added should there be no existing entry (a warning
is printed to stdout in such cases).

If MD5 data is already present then the checksum will only be
computed if 'md5_time' is older than the file datestamp at the time
the script runs.

USAGE

  exit
end

# Load data in chunks of this many bytes for MD5 calculation.
#
# NOTE(review): 1024768 is neither 1000 * 1000 nor 1024 * 1024; it may be a
# typo for 1048576. The value only affects chunking, not the checksum, so
# it is left as found.

BUFFER_SIZE = 1024768

# External dependencies

require 'find'
require 'yaml'
require 'digest/md5'

# =============================================================================
# Return an MD5 checksum in hex form for the file at the given pathname.
# Reads in chunks of BUFFER_SIZE bytes (by default) to avoid excessive RAM
# penalties for large files. Progress is reported on stdout: a message naming
# the file, then one '.' for every third chunk read, then a newline.
#
# path::        Pathname of the file to checksum.
# buffer_size:: Optional chunk size in bytes; defaults to BUFFER_SIZE.
#
# The file is opened in binary mode so that the checksum covers the exact
# bytes on disc regardless of platform line ending translation.
# =============================================================================
#
def calculate_md5_checksum( path, buffer_size = BUFFER_SIZE )

  # Distantly based upon:
  #
  #   http://snippets.dzone.com/posts/show/3349

  md5 = Digest::MD5.new

  File.open( path, 'rb' ) do | io |
    print "Calculating MD5 checksum for file '#{ File.basename( path ) }'"

    counter = 0

    while ( ! io.eof )
      putc '.' if ( ( counter += 1 ) % 3 == 0 )
      buffer = io.readpartial( buffer_size )
      md5.update( buffer )
    end

    puts
  end

  return md5.hexdigest
end

# =============================================================================
# Return a hash describing a file at a given pathname. The hash has symbol
# keys:
#
# :path::      The pathname exactly as passed in.
# :leaf::      The leaf name (File.basename of the path).
# :base_name:: The leaf name up to but excluding the first '.', or the whole
#              leaf name if it contains no '.' at all.
# :link::      A slice of the path followed by '?' and the modification time
#              in seconds.
# :mod::       The file modification time (a Time).
# :filetype::  Whatever follows the base name and optional version, or ''.
# :version::   A dotted run of digits found directly after the first '.',
#              without any trailing '.', or '' if there is none.
#
# path:: Pathname of an existing file (its modification time is read).
# =============================================================================
#
def get_file_description( path )

  # Parser: Various characters, a dot, then: one or more digits (0-9)
  # followed by an optional dot, repeated at least once, this whole
  # assembly optional, recording only the collection of digits and
  # dots, not individual digits-plus-dots sets ("(?:" => don't include
  # this group in the match data). Then zero or more other characters,
  # non-greedy.

  mod    = File.mtime( path )
  leaf   = File.basename( path )
  regexp = /^(.*?)\.((?:[0-9]+\.?)+)?(.*?)$/

  # The expression requires at least one '.', so a leaf name such as
  # 'README' produces no match at all. Treat the entire leaf as the base
  # name in that case rather than failing on a nil match.

  scanned   = leaf.scan( regexp )[ 0 ] || [ leaf, nil, nil ]

  base_name = scanned[ 0 ]
  version   = ( scanned[ 1 ] || '' ).chomp( '.' ) # May have trailing '.'
  filetype  = scanned[ 2 ] || ''

  # NOTE(review): the :link slice starts at an offset equal to the *leaf*
  # length, which looks like a leftover from code that stripped a base
  # directory prefix. Nothing in this script reads :link, so it is kept
  # unchanged - confirm against the directory listing parser before altering.

  return {
    :path      => path,
    :leaf      => leaf,
    :base_name => base_name,
    :link      => "#{ path[ leaf.length..-1 ] }?#{ mod.tv_sec }",
    :mod       => mod,
    :filetype  => filetype,
    :version   => version
  }
end

# =============================================================================
# Return an array of items describing a directory contents and optionally
# the contents of any subdirectories as a flat unsorted list. The last
# parameter is 'false' to avoid scanning to a level beyond the current
# directory.
# The first two parameters are concatenated to generate the
# path of the directory to scan.
#
# base::    Leading part of the directory path.
# dir::     Trailing part of the directory path (may be an empty string).
# recurse:: Optional; 'true' (default) to descend into subdirectories other
#           than 'CVS' and '.svn', 'false' to list only the files directly
#           inside the directory.
#
# Each item in the returned array is a hash from "get_file_description".
# =============================================================================
#
def recursive_directory_list( base, dir, recurse = true )

  # Distantly based upon:
  #
  #   http://www.oreillynet.com/onjava/blog/2006/03/recursive_directory_list_with.html

  excludes = [ 'CVS', '.svn' ]
  collect  = []
  root     = File.join( base, dir )
  first    = true

  Find.find( root ) do | path |
    unless ( FileTest.directory?( path ) )
      collect.push( get_file_description( path ) )
      next
    end

    if ( recurse )
      Find.prune if ( excludes.include?( File.basename( path ) ) )
    elsif ( first )
      first = false # Skip '.' / "this" directory, but do look inside it
    else
      Find.prune    # Don't descend into the directory
    end
  end

  return collect
end

# =============================================================================
# Load and return the configuration hash from the YAML file at the given
# pathname. An empty file yields an empty hash so that new entries can be
# added to it.
#
# The file holds 'md5_time' values written by this script as Time objects.
# From Psych 4 onwards "YAML.load_file" is a safe load which refuses to
# create Time instances, so the unrestricted loader is used where it exists.
# That matches the behaviour of "YAML.load_file" on older Ruby versions.
#
# NOTE(review): unrestricted loading assumes config.yml is a trusted, locally
# maintained file - do not point this script at untrusted configuration.
# =============================================================================
#
def load_config_data( config_path )
  data = if ( YAML.respond_to?( :unsafe_load_file ) )
    YAML.unsafe_load_file( config_path )
  else
    YAML.load_file( config_path )
  end

  return data || {}
end

# =============================================================================
# Bring the MD5 data in the given configuration hash up to date for every
# file description in the given collection. Entries are looked up by base
# name then by leaf name; a missing entry is created under the base name
# with a warning printed to stdout. The checksum is recomputed only if 'md5'
# or 'md5_time' is absent or 'md5_time' is older than the file's modification
# time. Returns 'true' if the configuration hash was altered, else 'false'.
# =============================================================================
#
def update_checksums( collection, config_data )
  changed = false

  collection.each do | description |
    leaf      = description[ :leaf      ]
    base_name = description[ :base_name ]
    last_mod  = description[ :mod       ]

    info = config_data[ base_name ] || config_data[ leaf ]

    if ( info.nil? )
      puts "WARNING: File '#{ leaf }' has no entry in config.yml"
      info = {}
      config_data[ base_name ] = info
    end

    if ( info[ 'md5' ].nil? || info[ 'md5_time' ].nil? || info[ 'md5_time' ] < last_mod )
      info[ 'md5_time' ] = last_mod
      info[ 'md5'      ] = calculate_md5_checksum( description[ :path ] )
      changed            = true
    else
      puts "Checksum for file '#{ leaf }' is up to date."
    end
  end

  return changed
end

# =============================================================================
# Write the given configuration hash to the YAML file at the given pathname,
# ordered by 'group' then by key, with a comment line heading each group.
# =============================================================================
#
def write_sorted_config( config_path, config_data )

  # Sorted hashes get converted to nested arrays of key/value pairs, so
  # in our case we get a file leaf/base name at index 0 and the data hash
  # at index 1 of each inner pair. Sort by group then base name.

  sorted_data = config_data.sort do | a, b |
    ( ( a[ 1 ][ 'group' ] || '' ) <=> ( b[ 1 ][ 'group' ] || '' ) ).nonzero? ||
    ( a[ 0 ] <=> b[ 0 ] )
  end

  # "YAML.dump" provides no sorting mechanism. Ruby 1.9 should help since
  # hash ordering is well defined, but for Ruby 1.8 with undefined order of
  # hash key enumeration, do it the hard way. Otherwise, the unsorted YAML
  # output is hard for humans to read or hand-modify.

  File.open( config_path, 'w' ) do | io |

    # Write the file header.

    io << "---\n"

    # Each individual YAML string is returned as an individual file and has
    # a file header line; output everything except this line.

    old_group = nil

    sorted_data.each do | base_name, data |
      group = data[ 'group' ] || ''

      if ( old_group != group )
        io << "\n\# #{ ( group.empty? ) ? 'Ungrouped' : group }\n\n"
        old_group = group
      end

      io << ( { base_name => data } ).to_yaml.sub( /.*?\n/, '' )
    end
  end
end

# =============================================================================
# Main processing loop.
# =============================================================================

ARGV.each do | path |

  # Either read the contents of a given directory without recursion and look
  # for a configuration folder inside it, or read a file and look for a
  # configuration folder inside the directory in which the file resides.

  puts "Processing '#{ path }'..."

  if ( File.directory?( path ) )
    collection = recursive_directory_list( path, '', false )
    config_dir = path
  else
    collection = [ get_file_description( path ) ]
    config_dir = File.dirname( path )
  end

  config_path = File.join( config_dir, 'config', 'config.yml' )
  puts "Using '#{ config_path }'"

  config_data = load_config_data( config_path )

  if ( update_checksums( collection, config_data ) )
    puts "Saving '#{ config_path }'"
    write_sorted_config( config_path, config_data )
  end

  puts "...Processing complete."
end

puts "Finished."