Changesets can be listed by changeset number.
The Git repository is here.
- Revision:
- 8
- Log:
Updated to HEAD revision from SVN repository for 22-Jul-2006 at
about 6:30pm.
- Author:
- adh
- Date:
- Sat Jul 22 19:30:50 +0100 2006
- Size:
- 36527 Bytes
1 | # Copyright (c) 2006 Kasper Weibel Nielsen-Refs, Thomas Lockney, Jens Krämer |
2 | # |
3 | # Permission is hereby granted, free of charge, to any person obtaining a copy |
4 | # of this software and associated documentation files (the "Software"), to deal |
5 | # in the Software without restriction, including without limitation the rights |
6 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
7 | # copies of the Software, and to permit persons to whom the Software is |
8 | # furnished to do so, subject to the following conditions: |
9 | # |
10 | # The above copyright notice and this permission notice shall be included in all |
11 | # copies or substantial portions of the Software. |
12 | # |
13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
15 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
18 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
19 | # SOFTWARE. |
20 | |
21 | require 'active_record' |
22 | require 'set' |
23 | |
24 | |
25 | # Yet another Ferret Mixin. |
26 | # |
27 | # This mixin adds full text search capabilities to any Rails model. |
28 | # |
29 | # It is heavily based on the original acts_as_ferret plugin done by |
30 | # Kasper Weibel and a modified version done by Thomas Lockney, which |
31 | # both can be found on |
32 | # http://ferret.davebalmain.com/trac/wiki/FerretOnRails |
33 | # |
34 | # usage: |
35 | # include the following in your model class (specifying the fields you want to get indexed): |
36 | # acts_as_ferret :fields => [ 'title', 'description' ] |
37 | # |
38 | # now you can use ModelClass.find_by_contents(query) to find instances of your model |
39 | # whose indexed fields match a given query. All query terms are required by default, but |
40 | # explicit OR queries are possible. This differs from the ferret default, but imho is the more |
41 | # often needed/expected behaviour (more query terms result in less results). |
42 | # |
43 | # Released under the MIT license. |
44 | # |
45 | # Authors: |
46 | # Kasper Weibel Nielsen-Refs (original author) |
47 | # Jens Kraemer <jk@jkraemer.net> |
48 | # |
49 | module FerretMixin |
50 | module Acts #:nodoc: |
51 | module ARFerret #:nodoc: |
52 | |
53 | # decorator that adds a total_hits accessor to search result arrays |
# Decorator around a search result collection that additionally exposes the
# total number of hits reported for the query (which may be larger than the
# number of results actually wrapped).
class SearchResults
  attr_reader :total_hits

  # results::    the collection being wrapped
  # total_hits:: total number of hits for the query
  def initialize(results, total_hits)
    @results = results
    @total_hits = total_hits
  end

  # Forwards every message this class does not implement itself to the
  # wrapped results collection.
  def method_missing(symbol, *args, &block)
    @results.send(symbol, *args, &block)
  end

  # Keeps respond_to? truthful about the methods method_missing delegates.
  def respond_to_missing?(symbol, include_private = false)
    @results.respond_to?(symbol, include_private) || super
  end
end
64 | |
# Creates +dir+ (including missing parent directories) unless it already
# exists as a directory.
def self.ensure_directory(dir)
  return if File.directory?(dir)
  FileUtils.mkdir_p(dir)
end
68 | |
69 | # make sure the default index base dir exists. by default, all indexes are created |
70 | # under RAILS_ROOT/index/RAILS_ENV |
# Prepares the default location for index files: makes sure that
# RAILS_ROOT/index and RAILS_ROOT/index/RAILS_ENV both exist and remembers
# the latter in @@index_dir.
def self.init_index_basedir
  parent_dir = "#{RAILS_ROOT}/index"
  ensure_directory(parent_dir)
  @@index_dir = "#{parent_dir}/#{RAILS_ENV}"
  ensure_directory(@@index_dir)
end

# module level accessor for the base directory (read by acts_as_ferret when
# computing the default :index_dir of a model)
mattr_accessor :index_dir
# set up the directories as soon as this file gets loaded
init_index_basedir
80 | |
# Inclusion hook: after the regular inclusion (super) has happened, +base+
# is extended with the class level helpers from ClassMethods.
def self.append_features(base)
  super(base)
  base.extend ClassMethods
end
85 | |
86 | # declare the class level helper methods |
87 | # which will load the relevant instance methods defined below when invoked |
88 | module ClassMethods |
89 | |
90 | # helper that defines a method that adds the given field to a lucene |
91 | # document instance |
# Registers +field+ in fields_for_ferret and defines an instance method
# named "<field>_to_ferret" which wraps the current value of that field in
# a Ferret::Document::Field.
#
# options:: may override the :store, :index, :term_vector, :binary and
#           :boost settings of the field; ignored unless it is a Hash.
def define_to_field_method(field, options = {})
  field_settings = {
    :store       => Ferret::Document::Field::Store::NO,
    :index       => Ferret::Document::Field::Index::TOKENIZED,
    :term_vector => Ferret::Document::Field::TermVector::NO,
    :binary      => false,
    :boost       => 1.0
  }
  field_settings.update(options) if options.is_a?(Hash)
  fields_for_ferret << field

  define_method("#{field}_to_ferret".to_sym) do
    # a field whose value cannot be retrieved is indexed as empty string
    val = begin
      content_for_field_name(field)
    rescue
      logger.debug("Error retrieving value for field #{field}: #{$!}")
      ''
    end
    logger.debug("Adding field #{field} with value '#{val}' to index")
    Ferret::Document::Field.new(
      field.to_s, val,
      *field_settings.values_at(:store, :index, :term_vector, :binary, :boost))
  end
end
118 | |
119 | # TODO: do we need to define this at this level ? Maybe it's |
120 | # sufficient to do this only in classes calling acts_as_ferret ? |
# Always answers false.
# NOTE(review): presumably consulted by the Rails class reloading
# mechanism - confirm against the Rails version in use.
def reloadable?
  false
end
122 | |
# registry of index instances (ferret_index stores them keyed by index
# directory)
@@ferret_indexes = {}
def ferret_indexes
  @@ferret_indexes
end

# registry of multi index instances (filled by multi_index)
@@multi_indexes = {}
def multi_indexes
  @@multi_indexes
end
128 | |
129 | # declares a class as ferret-searchable. |
130 | # |
131 | # options are: |
132 | # |
133 | # fields:: names all fields to include in the index. If not given, |
134 | # all attributes of the class will be indexed. You may also give |
135 | # symbols pointing to instance methods of your model here, i.e. |
136 | # to retrieve and index data from a related model. |
137 | # |
138 | # index_dir:: declares the directory where to put the index for this class. |
139 | # The default is RAILS_ROOT/index/RAILS_ENV/CLASSNAME. |
140 | # The index directory will be created if it doesn't exist. |
141 | # |
142 | # single_index:: set this to true to let this class use a Ferret |
143 | # index that is shared by all classes having :single_index set to true. |
144 | # :store_class_name is set to true implicitly, as well as index_dir, so |
145 | # don't bother setting these when using this option. the shared index |
146 | # will be located in index/<RAILS_ENV>/shared . |
147 | # |
148 | # store_class_name:: to make search across multiple models useful, set |
149 | # this to true. the model class name will be stored in a keyword field |
150 | # named class_name |
151 | # |
152 | # max_results:: number of results to retrieve for :num_docs => :all, |
153 | # default value is 1000 |
154 | # |
155 | # ferret_options may be: |
156 | # occur_default:: - whether query terms are required by |
157 | # default (the default), or not. Specify one of |
158 | # Ferret::Search::BooleanClause::Occur::MUST or |
159 | # Ferret::Search::BooleanClause::Occur::SHOULD |
160 | # |
161 | # analyzer:: the analyzer to use for query parsing (default: a new |
162 | # Ferret::Analysis::StandardAnalyzer instance) |
163 | # |
# Turns the calling class into a ferret-searchable model.
#
# options::        plugin settings (:fields, :index_dir, :store_class_name,
#                  :single_index, :max_results, :auto_index_update)
# ferret_options:: settings handed to Ferret (:occur_default, :analyzer,
#                  ...); :key, :path, :auto_flush and :create_if_missing are
#                  always overwritten by the plugin.
#
# Both arguments are ignored unless they are Hashes.
def acts_as_ferret(options={}, ferret_options={})
  # NOTE: the two locals below have to keep their names, the code eval'ed
  # further down refers to them.
  configuration = {
    :fields            => nil,
    :index_dir         => "#{FerretMixin::Acts::ARFerret::index_dir}/#{self.name.underscore}",
    :store_class_name  => false,
    :single_index      => false,
    :max_results       => 1000,
    :auto_index_update => true
  }
  configuration.update(options) if options.is_a?(Hash)

  # a shared index lives in a fixed directory and needs the class name
  # stored with every document
  if configuration[:single_index]
    configuration[:index_dir] = "#{FerretMixin::Acts::ARFerret::index_dir}/shared"
    configuration[:store_class_name] = true
  end

  ferret_configuration = {
    :occur_default        => Ferret::Search::BooleanClause::Occur::MUST,
    :handle_parse_errors  => true,
    :default_search_field => '*',
    :analyzer             => Ferret::Analysis::StandardAnalyzer.new
    # :wild_lower => true
  }
  ferret_configuration.update(ferret_options) if ferret_options.is_a?(Hash)

  # these properties are somewhat vital to the plugin and therefore always
  # win over whatever the user has specified
  document_key = configuration[:single_index] ? ['id', 'class_name'] : 'id'
  ferret_configuration.update(
    :key               => document_key,
    :path              => configuration[:index_dir],
    :auto_flush        => true,
    :create_if_missing => true
  )

  # evaluated as a string on purpose, so that the accessors and class
  # variables are set up in the context of the class calling acts_as_ferret
  class_eval <<-EOV
    include FerretMixin::Acts::ARFerret::InstanceMethods

    cattr_accessor :fields_for_ferret
    cattr_accessor :configuration
    cattr_accessor :ferret_configuration

    after_destroy :ferret_destroy

    @@fields_for_ferret = Array.new
    @@configuration = configuration
    @@ferret_configuration = ferret_configuration

    if configuration[:fields].respond_to?(:each_pair)
      configuration[:fields].each_pair do |key,val|
        define_to_field_method(key,val)
      end
    elsif configuration[:fields].respond_to?(:each)
      configuration[:fields].each do |field|
        define_to_field_method(field)
      end
    else
      @@fields_for_ferret = nil
    end
  EOV

  if configuration[:auto_index_update]
    # keep the index in sync automatically whenever a record gets created
    # or updated
    before_create :ferret_before_create
    before_update :ferret_before_update
    after_create  :ferret_create
    after_update  :ferret_update
  end

  FerretMixin::Acts::ARFerret.ensure_directory(configuration[:index_dir])
end
234 | |
# Directory holding the index of this class, i.e. the :index_dir entry of
# the plugin configuration.
def class_index_dir
  index_dir = configuration[:index_dir]
  index_dir
end
238 | |
239 | # rebuild the index from all data stored for this model. |
240 | # This is called automatically when no index exists yet. |
241 | # |
242 | # TODO: the automatic index initialization only works if |
243 | # every model class has its |
244 | # own index, otherwise the index will get populated only |
245 | # with instances from the first model loaded |
246 | # |
247 | # When calling this method manually, you can give any additional |
248 | # model classes that should also go into this index as parameters. |
249 | # Useful when using the :single_index option. |
# Rebuilds the index of this class from scratch, using all records found
# in the database.
#
# additional_models:: further model classes whose records should go into
#                     the same index (useful with :single_index).
#
# Records are indexed in batches of 1000 to limit memory consumption
# (fixes #24). A record that cannot be added is logged and skipped. The
# index is closed even if indexing is aborted by an exception.
def rebuild_index(*additional_models)
  last_index_update_file = "#{class_index_dir}/last_update.timestamp"
  # update timestamp
  File.open(last_index_update_file, 'w+') do |file|
    file.write(Time.now.to_i.to_s)
  end

  index = Ferret::Index::Index.new(ferret_configuration.merge(:create => true))
  begin
    batch_size = 1000
    (additional_models + [self]).each do |model|
      model.transaction do
        0.step(model.count, batch_size) do |offset|
          model.find(:all, :limit => batch_size, :offset => offset).each do |rec|
            begin
              index << rec.to_doc
            rescue StandardError => e
              # best effort: one broken record must not abort the rebuild
              logger.warn("could not add #{model.name} #{rec.id} to index: #{e}")
            end
          end
          index.flush
        end
      end
    end
    logger.debug("Created Ferret index in: #{class_index_dir}")
    index.optimize
  ensure
    index.close
  end
end
276 | |
277 | # Update the index for this model manually. This must be done periodically (e.g in a cronjob) |
278 | # if the option :auto_index_update is disabled. |
# Updates the index for this model manually. This must be done periodically
# (e.g. in a cronjob) if the option :auto_index_update is disabled.
#
# All records whose updated_at is newer than the time stored in
# last_update.timestamp get reindexed in batches of 1000. The timestamp file
# is only advanced (to the time this run started) after the run succeeded,
# so changes are picked up again by the next run if this one fails.
def manual_index_update
  last_index_update_file = "#{class_index_dir}/last_update.timestamp"
  # read timestamp, a missing file means "index everything"
  last_seconds = File.file?(last_index_update_file) ? File.read(last_index_update_file).to_i : 0
  last_index_update = Time.at(last_seconds)
  started_at = Time.now

  batch_size = 1000
  changed_records = self.count(['updated_at > ?', last_index_update])
  0.step(changed_records, batch_size) do |offset|
    self.find(:all, :conditions => ['updated_at > ?', last_index_update],
                    :limit => batch_size, :offset => offset).each do |rec|
      rec.ferret_before_update
      rec.ferret_update
    end
  end

  self.ferret_index.flush
  optimized = self.ferret_index.optimize

  # update timestamp
  File.open(last_index_update_file, 'w+') do |file|
    file.write(started_at.to_i.to_s)
  end
  optimized
end
300 | |
301 | # Retrieve the Ferret::Index::Index instance for this model class. |
302 | # |
303 | # Index instances are stored in a hash, using the index directory |
304 | # as the key. So model classes sharing a single index will share their |
305 | # Index object, too. |
# Index instance for this class, looked up in (and on first use added to)
# the ferret_indexes registry under the index directory of this class.
def ferret_index
  index_dir = class_index_dir
  ferret_indexes[index_dir] ||= create_index_instance
end
309 | |
310 | # creates a new Index::Index instance. Before that, a check is done |
311 | # to see if the index exists in the file system. If not, index rebuild |
312 | # from all model data retrieved by find(:all) is triggered. |
# Builds a new Ferret::Index::Index from ferret_configuration. If the index
# directory contains no 'segments' file yet, rebuild_index is run first.
def create_index_instance
  segments_file = "#{class_index_dir}/segments"
  rebuild_index unless File.file?(segments_file)
  Ferret::Index::Index.new(ferret_configuration)
end
317 | |
318 | # Finds instances by contents. Terms are ANDed by default, can be circumvented |
319 | # by using OR between terms. |
320 | # options: |
321 | # :first_doc - first hit to retrieve (useful for paging) |
322 | # :num_docs - number of hits to retrieve, or :all to retrieve |
323 | # max_results results, which by default is 1000 and can be changed in |
324 | # the call to acts_as_ferret or on demand like this: |
325 | # Model.configuration[:max_results] = 1000000 |
326 | # |
327 | # find_options is a hash passed on to active_record's find when |
328 | # retrieving the data from db, useful to i.e. prefetch relationships. |
329 | # |
330 | # this method returns a SearchResults instance, which really is an Array that has |
331 | # been decorated with a total_hits accessor that delivers the total |
332 | # number of hits (including those not fetched because of a low num_docs |
333 | # value). |
# Finds instances of this class by contents.
#
# q::            query string or Ferret query object
# options::      search options (:first_doc, :num_docs), handed on to
#                find_id_by_contents
# find_options:: handed on to ActiveRecord's find; :conditions may be a
#                String or an Array ([sql, bind values...]) and is ANDed
#                with the restriction to the ids found by Ferret.
#
# Returns a SearchResults instance. Results are ordered like the Ferret hits
# unless find_options[:order] is given. If the database lookup fails, the
# error is logged and an empty result (with the original total_hits) is
# returned.
#
# Raises ArgumentError for :conditions this method cannot combine.
def find_by_contents(q, options = {}, find_options = {})
  # handle shared index
  return single_index_find_by_contents(q, options, find_options) if configuration[:single_index]

  id_array = []
  id_positions = {}
  total_hits = find_id_by_contents(q, options) do |model, id, score|
    id_array << id
    # store index of this id for later ordering of results
    id_positions[id] = id_array.size
  end

  result = []
  unless id_array.empty?
    sql = "id in (?)"
    binds = [ id_array ]
    # combine our conditions with those given by user, if any
    custom = find_options[:conditions]
    if custom
      custom = custom.is_a?(Array) ? custom.dup : [ custom ]
      unless custom.first.is_a?(String)
        raise ArgumentError, "find_by_contents: :conditions must be a String or an Array starting with one"
      end
      # parentheses keep an 'or' in the user's condition from escaping the
      # id restriction
      sql = "#{sql} and (#{custom.shift})"
      binds.concat(custom)
    end

    begin
      # TODO: in case of STI AR will filter out hits from other
      # classes for us, but this
      # will lead to less results retrieved --> scoping of ferret query
      # to self.class is still needed.
      result = self.find(:all, find_options.merge(:conditions => [ sql ] + binds))
    rescue StandardError => e
      logger.debug "REBUILD YOUR INDEX! One of the id's didn't have an associated record: #{id_array.inspect} (#{e})"
      result = []
    end
  end

  # order results as they were found by ferret, unless an AR :order
  # option was given
  unless find_options[:order]
    result = result.sort_by { |record| id_positions[record.id] }
  end

  logger.debug "Query: #{q}\nResult id_array: #{id_array.inspect},\nresult: #{result}"
  return SearchResults.new(result, total_hits)
end
375 | |
376 | # determine all field names in the shared index |
# Determines all field names in the shared index (without 'id' and
# 'class_name'). The result is memoized, so +models+ only matters for the
# first call.
#
# models:: model classes whose content columns are used as a fallback when
#          the index reader offers no get_field_names method.
#
# The searcher opened for the lookup is closed again afterwards.
def single_index_field_names(models)
  @single_index_field_names ||= begin
    searcher = Ferret::Search::IndexSearcher.new(class_index_dir)
    begin
      if searcher.reader.respond_to?(:get_field_names)
        (searcher.reader.send(:get_field_names) - ['id', 'class_name']).to_a
      else
        puts <<-END
unable to retrieve field names for class #{self.name}, please
consider naming all indexed fields in your call to acts_as_ferret!
        END
        models.map { |m| m.content_columns.map { |col| col.name } }.flatten
      end
    ensure
      searcher.close
    end
  end
end
392 | |
393 | # TODO: check whether it is a Ferret bug that we have to build the |
394 | # queries ourselves like this - is it caused by the downcasing done by |
395 | # the query parser? - see whether it works with Ferret now (content_cols) |
396 | # and ask Dave for access to the query parser, or to the reader |
# find_by_contents for classes using the shared index.
#
# Unless options[:models] is :all, the query is restricted to documents
# whose class_name is that of this class or of one of the classes given in
# options[:models]. String queries are parsed with a query parser that knows
# all fields of the shared index.
#
# Neither the caller's options (including the :models array) nor the class
# wide ferret_configuration are modified.
#
# Returns a SearchResults instance; every hit is fetched by a find call on
# the class named in the hit's class_name field.
def single_index_find_by_contents(q, options = {}, find_options = {})
  result = []

  unless options[:models] == :all # search needs to be restricted by one or more class names
    # work on copies, the caller's hash and array stay untouched
    models = (options[:models] || []).dup
    # add this class to the list of given models
    models << self unless models.include?(self)
    options = options.merge(:models => models)

    # build query parser TODO: cache these somehow
    original_query = q
    if q.is_a? String
      parser_options = ferret_configuration.merge(:fields => single_index_field_names(models))
      qp = Ferret::QueryParser.new(ferret_configuration[:default_search_field], parser_options)
      original_query = qp.parse(q)
    end

    # (original query) AND (class_name:A OR class_name:B ...)
    q = Ferret::Search::BooleanQuery.new
    q.add_query(original_query, Ferret::Search::BooleanClause::Occur::MUST)
    model_query = Ferret::Search::BooleanQuery.new
    models.each do |model|
      model_query.add_query(
        Ferret::Search::TermQuery.new(Ferret::Index::Term.new('class_name', model.name)),
        Ferret::Search::BooleanClause::Occur::SHOULD)
    end
    q.add_query(model_query, Ferret::Search::BooleanClause::Occur::MUST)
  end

  total_hits = find_id_by_contents(q, options) do |model, id, score|
    result << Object.const_get(model).find(id, find_options.dup)
  end
  return SearchResults.new(result, total_hits)
end
protected :single_index_find_by_contents
432 | |
433 | # Finds instance model name, ids and scores by contents. |
434 | # Useful if you want to search across models |
435 | # Terms are ANDed by default, can be circumvented by using OR between terms. |
436 | # |
437 | # Example controller code (not tested): |
438 | # def multi_search(query) |
439 | # result = [] |
440 | # result << (Model1.find_id_by_contents query) |
441 | # result << (Model2.find_id_by_contents query) |
442 | # result << (Model3.find_id_by_contents query) |
443 | # result.flatten! |
444 | # result.sort! {|element| element[:score]} |
445 | # # Figure out for yourself how to retrieve and present the data from modelname and id |
446 | # end |
447 | # |
448 | # Note that the scores retrieved this way aren't normalized across |
449 | # indexes, so that the order of results after sorting by score will |
450 | # differ from the order you would get when running the same query |
451 | # on a single index containing all the data from Model1, Model2 |
452 | # and Model3. |
453 | # |
454 | # options: |
455 | # :first_doc - first hit to retrieve (useful for paging) |
456 | # :num_docs - number of hits to retrieve, or :all to retrieve |
457 | # max_results results, which by default is 1000 and can be changed in |
458 | # the call to acts_as_ferret or on demand like this: |
459 | # Model.configuration[:max_results] = 1000000 |
460 | # |
461 | # a block can be given too, it will be executed with every result: |
462 | # find_id_by_contents(q, options) do |model, id, score| |
463 | # id_array << id |
464 | # scores_by_id[id] = score |
465 | # end |
466 | # NOTE: in case a block is given, the total_hits value will be returned |
467 | # instead of the result list! |
468 | # |
# Runs +q+ against the index of this class.
#
# options:: search options handed to the index; :num_docs => :all is
#           replaced by the configured :max_results. The hash given by the
#           caller is not modified.
#
# Without a block an array of hashes ({ :model, :id, :score }) is returned,
# :id being the value as stored in the index. With a block, model name,
# id (converted with to_i) and score of every hit are yielded and the total
# number of hits is returned instead.
def find_id_by_contents(q, options = {})
  if options[:num_docs] == :all
    options = options.merge(:num_docs => configuration[:max_results])
  end
  result = []
  index = ferret_index
  hits = index.search(q, options)
  hits.each do |hit, score|
    doc = index[hit]
    # the class name is only available in the index if it has been stored
    model = configuration[:store_class_name] ? doc[:class_name] : name
    if block_given?
      yield model, doc[:id].to_i, score
    else
      # only collect result data if we intend to return it
      result << { :model => model, :id => doc[:id], :score => score }
    end
  end
  logger.debug "id_score_model array: #{result.inspect}"
  return block_given? ? hits.total_hits : result
end
487 | |
488 | # requires the store_class_name option of acts_as_ferret to be true |
489 | # for all models queried this way. |
490 | # |
491 | # TODO: not optimal as each instance is fetched in a db call of its |
492 | # own. |
# Searches the indexes of this class and of +additional_models+ and loads
# the record of every hit (one find call per hit).
#
# Returns a SearchResults instance wrapping the loaded records.
def multi_search(query, additional_models = [], options = {})
  records = []
  hit_count = id_multi_search(query, additional_models, options) do |class_name, id, _score|
    records << Object.const_get(class_name).find(id)
  end
  SearchResults.new(records, hit_count)
end
500 | |
501 | # returns an array of hashes, each containing :class_name, |
502 | # :id and :score for a hit. |
503 | # |
504 | # if a block is given, class_name, id and score of each hit will |
505 | # be yielded, and the total number of hits is returned. |
506 | # |
# Searches across the indexes of this class and of +additional_models+.
#
# Returns an array of hashes ({ :model, :id, :score }) with one entry per
# hit. If a block is given, class_name, id (converted with to_i) and score
# of each hit are yielded instead and the total number of hits is returned.
#
# options[:num_docs] == :all is replaced by the configured :max_results.
# Neither +additional_models+ nor +options+ are modified, so the same
# objects can safely be reused for several calls.
def id_multi_search(query, additional_models = [], options = {})
  if options[:num_docs] == :all
    options = options.merge(:num_docs => configuration[:max_results])
  end
  # this class always takes part in the search, but only once
  models = (additional_models + [self]).uniq
  searcher = multi_index(models)
  result = []
  hits = searcher.search(query, options)
  hits.each do |hit, score|
    doc = searcher.doc(hit)
    if block_given?
      yield doc[:class_name], doc[:id].to_i, score
    else
      result << { :model => doc[:class_name], :id => doc[:id], :score => score }
    end
  end
  return block_given? ? hits.total_hits : result
end
523 | |
524 | # returns a MultiIndex instance operating on a MultiReader |
# Returns the MultiIndex instance for the given model classes, creating and
# caching it in the multi_indexes registry on first use.
#
# The classes are ordered by name so that the same set of classes always
# maps to the same cache entry. The array passed in is left untouched. Names
# are joined with a separator so different sets cannot share a key.
def multi_index(model_classes)
  sorted_classes = model_classes.sort_by { |clazz| clazz.name }
  key = sorted_classes.map { |clazz| clazz.name }.join(',')
  multi_indexes[key] ||= MultiIndex.new(sorted_classes, ferret_configuration)
end
530 | |
531 | end |
532 | |
533 | |
534 | module InstanceMethods |
# Reader for the reindex flag of this record. The flag is kept in
# @ferret_reindex and is nil until one of the before_* callbacks below has
# been run for the record.
def reindex
  @ferret_reindex
end

# before_update callback: marks the record as in need of (re)indexing.
# Returns true, so the callback chain is never halted by it.
def ferret_before_update
  @ferret_reindex = true
end
alias :ferret_before_create :ferret_before_update
542 | |
543 | # add to index |
# after_create / after_update callback: adds this record's document to the
# index of its class if the reindex flag is set. The flag is set to true
# afterwards. Always returns true.
def ferret_create
  logger.debug "ferret_create/update: #{self.class.name} : #{id}"
  if @ferret_reindex
    self.class.ferret_index << to_doc
  end
  @ferret_reindex = true
  true
end
alias :ferret_update :ferret_create
551 | |
552 | # remove from index |
# after_destroy callback: deletes the document(s) of this record from the
# index. In a shared index the delete query additionally requires the
# class_name to match. Errors are logged as warnings, never raised. Always
# returns true.
def ferret_destroy
  logger.debug "ferret_destroy: #{self.class.name} : #{self.id}"
  begin
    must = Ferret::Search::BooleanClause::Occur::MUST
    id_query = Ferret::Search::TermQuery.new(Ferret::Index::Term.new('id', self.id.to_s))
    query =
      if self.class.configuration[:single_index]
        class_query = Ferret::Search::TermQuery.new(
          Ferret::Index::Term.new('class_name', self.class.name))
        combined = Ferret::Search::BooleanQuery.new
        combined.add_query(id_query, must)
        combined.add_query(class_query, must)
        combined
      else
        id_query
      end
    self.class.ferret_index.query_delete(query)
  rescue
    logger.warn("Could not find indexed value for this object: #{$!}")
  end
  true
end
570 | |
571 | # convert instance to ferret document |
# Converts this record into a Ferret document.
#
# The document always contains the stored, untokenized field 'id' (the
# record id as a String, matching the term used by the delete query). If
# configured via :store_class_name it also gets a stored, untokenized
# 'class_name' field. The content fields are either those named in
# fields_for_ferret (built by the generated <field>_to_ferret methods), or,
# if no fields were configured, all attributes except the id.
def to_doc
  logger.debug "creating doc for class: #{self.class.name}, id: #{id}"
  doc = Ferret::Document::Document.new
  # store the id of each item
  doc << Ferret::Document::Field.new("id", id.to_s,
    Ferret::Document::Field::Store::YES,
    Ferret::Document::Field::Index::UNTOKENIZED)
  # store the class name if configured to do so
  if configuration[:store_class_name]
    doc << Ferret::Document::Field.new("class_name", self.class.name,
      Ferret::Document::Field::Store::YES,
      Ferret::Document::Field::Index::UNTOKENIZED)
  end

  if fields_for_ferret
    # have user defined fields
    fields_for_ferret.each do |field|
      doc << send("#{field}_to_ferret")
    end
  else
    # take all fields
    attributes.each_pair do |key, val|
      # compare as strings: the id has already been added above and must
      # not end up in the document a second time
      next if key.to_s == 'id'
      logger.debug "add field #{key} with value #{val}"
      doc << Ferret::Document::Field.new(
        key,
        val.to_s,
        Ferret::Document::Field::Store::NO,
        Ferret::Document::Field::Index::TOKENIZED)
    end
  end
  return doc
end
607 | |
608 | # BIG TODO: this file really gets too big. need to refactor a bit... |
609 | # maybe extract the more like this stuff, could be useful somewhere |
610 | # else, too... |
611 | |
612 | |
613 | # returns other instances of this class, which have similar contents |
614 | # like this one. Basically works like this: find out n most interesting |
615 | # (i.e. characteristic) terms from this document, and then build a |
616 | # query from those which is run against the whole index. Which terms |
617 | # are interesting is decided on various criteria which can be |
618 | # influenced by the given options. |
619 | # |
620 | # The algorithm used here is a quite straight port of the MoreLikeThis class |
621 | # from Apache Lucene. |
622 | # |
623 | # options are: |
624 | # :field_names : Array of field names to use for similarity search (mandatory) |
625 | # :min_term_freq => 2, # Ignore terms with less than this frequency in the source doc. |
626 | # :min_doc_freq => 5, # Ignore words which do not occur in at least this many docs |
627 | # :min_word_length => nil, # Ignore words if less than this len (longer |
628 | # words tend to be more characteristic for the document they occur in). |
629 | # :max_word_length => nil, # Ignore words if greater than this len. |
630 | # :max_query_terms => 25, # maximum number of terms in the query built |
631 | # :max_num_tokens => 5000, # maximum number of tokens to examine in a |
632 | # single field |
633 | # :boost => false, # when true, a boost according to the |
634 | # relative score of a term is applied to this Term's TermQuery. |
635 | # :similarity => Ferret::Search::Similarity.default, # the similarity |
636 | # implementation to use |
637 | # :analyzer => Ferret::Analysis::StandardAnalyzer.new # the analyzer to |
638 | # use |
639 | # :append_to_query => nil # proc taking a query object as argument, which will be called after generating the query. can be used to further manipulate the query used to find related documents, i.e. to constrain the search to a given class in single table inheritance scenarios |
640 | # find_options : options handed over to find_by_contents |
# Returns other instances which have contents similar to this one: the most
# characteristic terms of this record's document are collected and turned
# into a query that is run via find_by_contents.
#
# options::      see the defaults below; :field_names is mandatory.
# find_options:: handed over to find_by_contents (as its second argument).
#
# Raises ArgumentError if no :field_names are given.
def more_like_this(options = {}, find_options = {})
  options = {
    :field_names     => nil,  # fields to use for the similarity search (mandatory)
    :min_term_freq   => 2,    # Ignore terms with less than this frequency in the source doc.
    :min_doc_freq    => 5,    # Ignore words which do not occur in at least this many docs
    :min_word_length => 0,    # Ignore words if less than this len. Default is not to ignore any words.
    :max_word_length => 0,    # Ignore words if greater than this len. Default is not to ignore any words.
    :max_query_terms => 25,   # maximum number of terms in the query built
    :max_num_tokens  => 5000, # maximum number of tokens to analyze when analyzing contents
    :boost           => false,
    :similarity      => Ferret::Search::Similarity.default,
    :analyzer        => Ferret::Analysis::StandardAnalyzer.new,
    :append_to_query => nil,
    :base_class      => self.class # class find_by_contents gets called on, useful in STI scenarios
  }.update(options)

  if options[:field_names].nil? || options[:field_names].empty?
    raise ArgumentError, "more_like_this: the :field_names option is mandatory"
  end

  index = self.class.ferret_index
  begin
    reader = index.send(:reader)
  rescue
    # ferret >=0.9, C-Version doesn't allow access to Index#reader
    reader = Ferret::Index::IndexReader.open(Ferret::Store::FSDirectory.new(self.class.class_index_dir, false))
  end

  # look up the document number once and hand it on
  doc_number = self.document_number
  term_freq_map = retrieve_terms(doc_number, reader, options)
  priority_queue = create_queue(term_freq_map, reader, options)
  query = create_query(priority_queue, options)
  options[:append_to_query].call(query) if options[:append_to_query]
  options[:base_class].find_by_contents(query, find_options)
end
670 | |
671 | |
# Builds the boolean query used by more_like_this.
#
# priority_queue:: array of items responding to to_term and score, sorted by
#                  ascending score; it is emptied from its end, i.e. best
#                  term first.
# options::        :max_query_terms limits the number of term clauses
#                  (nil or 0 means no limit), :boost => true boosts every
#                  term by its score relative to the best score.
#
# Every term is added as a SHOULD clause. Adding stops early if Ferret
# reports too many clauses. Finally this record itself is excluded through a
# MUST_NOT clause on its id.
def create_query(priority_queue, options={})
  query = Ferret::Search::BooleanQuery.new
  max_terms = options[:max_query_terms].to_i
  qterms = 0
  best_score = nil
  while (cur = priority_queue.pop)
    term_query = Ferret::Search::TermQuery.new(cur.to_term)

    if options[:boost]
      # boost term according to relative score
      # TODO untested
      best_score ||= cur.score
      term_query.boost = cur.score.to_f / best_score
    end
    begin
      query.add_query(term_query, Ferret::Search::BooleanClause::Occur::SHOULD)
    rescue Ferret::Search::BooleanQuery::TooManyClauses
      break
    end
    qterms += 1
    break if max_terms > 0 && qterms >= max_terms
  end
  # exclude ourselves
  own_id = Ferret::Index::Term.new('id', id.to_s)
  query.add_query(Ferret::Search::TermQuery.new(own_id),
                  Ferret::Search::BooleanClause::Occur::MUST_NOT)
  return query
end
699 | |
700 | |
# Looks up this record in the index of its class (query "id:<id>") and
# returns the first hit yielded by the search, or nil if there is no hit.
def document_number
  hits = self.class.ferret_index.search("id:#{id}")
  hits.each { |hit, score| return hit }
  nil
end
705 | |
706 | # creates a term/term_frequency map for terms from the fields |
707 | # given in options[:field_names] |
# Creates a term => frequency map for the terms of the fields named in
# options[:field_names].
#
# doc_number:: number of the document inside the index
# reader::     index reader used to fetch term vectors / the document
# options::    uses :field_names, :max_num_tokens, :analyzer, plus whatever
#              noise_word? looks at
#
# For every field the stored term vector is used if there is one. Otherwise
# the content stored in the index, or as a last resort the content taken
# from this instance, is run through the analyzer; at most :max_num_tokens
# tokens are examined per field. Noise words are left out.
def retrieve_terms(doc_number, reader, options)
  field_names = options[:field_names]
  max_num_tokens = options[:max_num_tokens]
  term_freq_map = Hash.new(0)
  doc = nil
  field_names.each do |field|
    term_freq_vector = reader.get_term_vector(doc_number, field)
    if term_freq_vector
      # use stored term vector
      # TODO untested
      term_freq_vector.terms.each_with_index do |term, i|
        term_freq_map[term] += term_freq_vector.freqs[i] unless noise_word?(term, options)
      end
    else
      # no term vector stored, but we might have stored the contents in
      # the index -> extract terms from there
      doc ||= reader.get_document(doc_number)
      # no term vector, no stored content, so try content from this instance
      content = doc[field] || content_for_field_name(field)
      token_count = 0

      # C-Ferret >=0.9 again, no #each in tokenstream :-(
      # to_s: a field without any content simply yields no tokens
      ts = options[:analyzer].token_stream(field, content.to_s)
      while (token = ts.next)
        break if (token_count += 1) > max_num_tokens
        text = token_text(token)
        next if noise_word?(text, options)
        term_freq_map[text] += 1
      end
    end
  end
  term_freq_map
end
744 | |
745 | # extract textual value of a token |
# Textual value of a token: token.text where available, token.term_text
# (ferret 0.3.2) otherwise.
def token_text(token)
  return token.text if token.respond_to?(:text)
  token.term_text
end
750 | |
751 | # create an ordered(by score) list of word,fieldname,score |
752 | # structures |
# Builds the list of FrequencyQueueItems (word, field name, score) for the
# given term => frequency map, sorted by ascending score.
#
# A word is left out if its frequency is below options[:min_term_freq], or
# if its largest document frequency over all options[:field_names] is 0 or
# below options[:min_doc_freq]. The score of a word is its term frequency
# multiplied by the idf which options[:similarity] computes from that
# document frequency; the field stored is the one with the largest document
# frequency.
def create_queue(term_freq_map, reader, options)
  similarity = options[:similarity]
  num_docs = reader.num_docs
  field_names = options[:field_names]
  items = []

  term_freq_map.each_pair do |word, tf|
    # words occurring too rarely in the source are of no interest
    next if options[:min_term_freq] && tf < options[:min_term_freq]

    # find the field in which the word occurs in most documents
    top_field = field_names.first
    doc_freq = 0
    field_names.each do |field_name|
      freq = reader.doc_freq(Ferret::Index::Term.new(field_name, word))
      next unless freq > doc_freq
      top_field, doc_freq = field_name, freq
    end

    # words occurring in too few docs are skipped as well
    next if options[:min_doc_freq] && doc_freq < options[:min_doc_freq]
    next if doc_freq == 0 # index update problem ?

    score = tf * similarity.idf(doc_freq, num_docs)
    items << FrequencyQueueItem.new(word, top_field, score)
  end

  items.sort { |a, b| a.score <=> b.score }
end
784 | |
# Tells whether +text+ should be ignored when collecting terms.
#
# A word is noise if it is shorter than options[:min_word_length], longer
# than options[:max_word_length] (a limit of nil or 0 disables the
# respective check), or if it is contained in options[:stop_words].
def noise_word?(text, options)
  len = text.length
  min_len = options[:min_word_length].to_i
  max_len = options[:max_word_length].to_i
  return true if min_len > 0 && len < min_len
  return true if max_len > 0 && len > max_len

  stop_words = options[:stop_words]
  stop_words ? stop_words.include?(text) : false
end
793 | |
# Content of the given field: the first non-nil/non-false value out of
# self[field], the instance variable named like the field, and the result
# of calling the method named like the field.
def content_for_field_name(field)
  value = self[field]
  value ||= instance_variable_get("@#{field.to_s}".to_sym)
  value || send(field.to_sym)
end
797 | |
798 | end |
799 | |
# Entry of the queue built by create_queue: a word, the name of a field and
# the score computed for the word.
class FrequencyQueueItem
  attr_reader :word, :field, :score

  def initialize(word, field, score)
    @word = word
    @field = field
    @score = score
  end

  # the Ferret term (field, word) represented by this item
  def to_term
    Ferret::Index::Term.new(field, word)
  end
end
809 | |
810 | end |
811 | end |
812 | end |
813 | |
814 | # reopen ActiveRecord and include all the above to make |
815 | # them available to all our models if they want it |
# mix the plugin into ActiveRecord::Base, which makes acts_as_ferret
# available to every model class
ActiveRecord::Base.send(:include, FerretMixin::Acts::ARFerret)
819 | |
820 | |
class Ferret::Index::MultiReader
  # True if every sub reader reports to be latest?, false as soon as one of
  # them does not - or if asking them raises an error.
  #
  # TODO: Exception handling added to resolve ticket #6.
  # It should be clarified whether this is a bug in Ferret
  # in which case a bug report should be posted on the Ferret Trac.
  def latest?
    @sub_readers.all? { |reader| reader.latest? }
  rescue
    false
  end
end
834 | |
835 | # END acts_as_ferret.rb |