RISC OS Open: Bugs and sources: Browsing repository

You are currently browsing the Subversion repository.
Changesets can be listed by changeset number.
The Git repository is here.

root
» rool
» rails
» rforum
» trunk
» vendor
» plugins
» acts_as_ferret
» lib
» acts_as_ferret.rb

Revision:: 8

Log:: Updated to HEAD revision from SVN repository for 22-Jul-2006 at
about 6:30pm.

Author:: adh

Date:: Sat Jul 22 19:30:50 +0100 2006

Size:: 36527 Bytes

Show revision log

1	# Copyright (c) 2006 Kasper Weibel Nielsen-Refs, Thomas Lockney, Jens Krämer
2	#
3	# Permission is hereby granted, free of charge, to any person obtaining a copy
4	# of this software and associated documentation files (the "Software"), to deal
5	# in the Software without restriction, including without limitation the rights
6	# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7	# copies of the Software, and to permit persons to whom the Software is
8	# furnished to do so, subject to the following conditions:
9	#
10	# The above copyright notice and this permission notice shall be included in all
11	# copies or substantial portions of the Software.
12	#
13	# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14	# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15	# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16	# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17	# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18	# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19	# SOFTWARE.
20
21	require 'active_record'
22	require 'set'
23
24
25	# Yet another Ferret Mixin.
26	#
27	# This mixin adds full text search capabilities to any Rails model.
28	#
29	# It is heavily based on the original acts_as_ferret plugin done by
30	# Kasper Weibel and a modified version done by Thomas Lockney, which
31	# both can be found on
32	# http://ferret.davebalmain.com/trac/wiki/FerretOnRails
33	#
34	# usage:
35	# include the following in your model class (specifying the fields you want to get indexed):
36	# acts_as_ferret :fields => [ 'title', 'description' ]
37	#
38	# now you can use ModelClass.find_by_contents(query) to find instances of your model
39	# whose indexed fields match a given query. All query terms are required by default, but
40	# explicit OR queries are possible. This differs from the ferret default, but imho is the more
41	# often needed/expected behaviour (more query terms result in fewer results).
42	#
43	# Released under the MIT license.
44	#
45	# Authors:
46	# Kasper Weibel Nielsen-Refs (original author)
47	# Jens Kraemer <jk@jkraemer.net>
48	#
49	module FerretMixin
50	module Acts #:nodoc:
51	module ARFerret #:nodoc:
52
# Decorator that wraps a search result collection and adds a total_hits
# reader. Every message this class does not define itself is forwarded to
# the wrapped results object.
class SearchResults
  # total number of hits reported by the search; may be larger than the
  # number of wrapped results
  attr_reader :total_hits

  # results::    the collection to wrap
  # total_hits:: overall hit count to expose alongside the results
  def initialize(results, total_hits)
    @results = results
    @total_hits = total_hits
  end

  # forward anything unknown to the wrapped results
  def method_missing(symbol, *args, &block)
    @results.send(symbol, *args, &block)
  end

  # Keep respond_to? in line with method_missing, so that duck-typing
  # checks such as respond_to?(:each) succeed for forwarded methods.
  def respond_to?(symbol, include_private = false)
    super || @results.respond_to?(symbol, include_private)
  end
end
64
# Create +dir+ (including any missing parent directories) unless it
# already exists as a directory.
def self.ensure_directory(dir)
  return if File.directory?(dir)
  FileUtils.mkdir_p(dir)
end
68
# Make sure the default index base directory exists. By default, all
# indexes are created under RAILS_ROOT/index/RAILS_ENV; that path is kept
# in the index_dir module attribute.
def self.init_index_basedir
  base = "#{RAILS_ROOT}/index"
  ensure_directory(base)
  @@index_dir = "#{base}/#{RAILS_ENV}"
  ensure_directory(@@index_dir)
end

# expose @@index_dir and create the directories as soon as this file is loaded
mattr_accessor :index_dir
init_index_basedir
80
# Inclusion hook: run the default behaviour, then extend the including
# class with the class-level helpers from ClassMethods.
def self.append_features(base)
  super(base)
  base.extend ClassMethods
end
85
86	# declare the class level helper methods
87	# which will load the relevant instance methods defined below when invoked
88	module ClassMethods
89
# Registers +field+ in fields_for_ferret and defines an instance method
# named "<field>_to_ferret" that builds the Ferret field for it.
#
# field::   name of the attribute / instance method supplying the content
# options:: Hash overriding the defaults below (:store, :index,
#           :term_vector, :binary, :boost); anything that is not a Hash
#           is ignored
def define_to_field_method(field, options = {})
  field_opts = {
    :store       => Ferret::Document::Field::Store::NO,
    :index       => Ferret::Document::Field::Index::TOKENIZED,
    :term_vector => Ferret::Document::Field::TermVector::NO,
    :binary      => false,
    :boost       => 1.0
  }
  field_opts.update(options) if options.is_a?(Hash)
  fields_for_ferret << field

  define_method(:"#{field}_to_ferret") do
    # fall back to an empty value if the content cannot be retrieved
    val = begin
      content_for_field_name(field)
    rescue
      logger.debug("Error retrieving value for field #{field}: #{$!}")
      ''
    end
    logger.debug("Adding field #{field} with value '#{val}' to index")
    Ferret::Document::Field.new(
      field.to_s, val,
      *field_opts.values_at(:store, :index, :term_vector, :binary, :boost)
    )
  end
end
118
# Always answers false.
#
# TODO: do we need to define this at this level ? Maybe it's
# sufficient to do this only in classes calling acts_as_ferret ?
def reloadable?
  false
end
122
# cache of index instances; ferret_index keys it by index directory
@@ferret_indexes = {}
def ferret_indexes
  @@ferret_indexes
end

# cache of multi-index instances; filled by multi_index
@@multi_indexes = {}
def multi_indexes
  @@multi_indexes
end
128
129	# declares a class as ferret-searchable.
130	#
131	# options are:
132	#
133	# fields:: names all fields to include in the index. If not given,
134	# all attributes of the class will be indexed. You may also give
135	# symbols pointing to instance methods of your model here, i.e.
136	# to retrieve and index data from a related model.
137	#
138	# index_dir:: declares the directory where to put the index for this class.
139	# The default is RAILS_ROOT/index/RAILS_ENV/CLASSNAME.
140	# The index directory will be created if it doesn't exist.
141	#
142	# single_index:: set this to true to let this class use a Ferret
143	# index that is shared by all classes having :single_index set to true.
144	# :store_class_name is set to true implicitly, as well as index_dir, so
145	# don't bother setting these when using this option. the shared index
146	# will be located in index/<RAILS_ENV>/shared .
147	#
148	# store_class_name:: to make search across multiple models useful, set
149	# this to true. the model class name will be stored in a keyword field
150	# named class_name
151	#
152	# max_results:: number of results to retrieve for :num_docs => :all,
153	# default value is 1000
154	#
155	# ferret_options may be:
156	# occur_default:: - whether query terms are required by
157	# default (the default), or not. Specify one of
158	# Ferret::Search::BooleanClause::Occur::MUST or
159	# Ferret::Search::BooleanClause::Occur::SHOULD
160	#
161	# analyzer:: the analyzer to use for query parsing (default: a new
162	# Ferret::Analysis::StandardAnalyzer instance)
163	#
# Declares the calling class as ferret-searchable.
#
# options::        Hash merged over the defaults in +configuration+ below.
#                  Besides the documented keys it also accepts
#                  :auto_index_update (default true), which controls whether
#                  the create/update callbacks are installed.
# ferret_options:: Hash merged over the defaults in +ferret_configuration+.
#
# Non-Hash arguments are ignored.
def acts_as_ferret(options={}, ferret_options={})
  # plugin-level defaults
  configuration = {
    :fields => nil,
    :index_dir => "#{FerretMixin::Acts::ARFerret::index_dir}/#{self.name.underscore}",
    :store_class_name => false,
    :single_index => false,
    :max_results => 1000,
    :auto_index_update => true
  }
  # defaults for the hash handed to Ferret (see create_index_instance)
  ferret_configuration = {
    :occur_default => Ferret::Search::BooleanClause::Occur::MUST,
    :handle_parse_errors => true,
    :default_search_field => '*',
    :analyzer => Ferret::Analysis::StandardAnalyzer.new,
    # :wild_lower => true
  }
  configuration.update(options) if options.is_a?(Hash)
  # apply appropriate settings for shared index
  if configuration[:single_index]
    configuration[:index_dir] = "#{FerretMixin::Acts::ARFerret::index_dir}/shared"
    configuration[:store_class_name] = true
  end
  ferret_configuration.update(ferret_options) if ferret_options.is_a?(Hash)
  # these properties are somewhat vital to the plugin and shouldn't
  # be overwritten by the user:
  ferret_configuration.update(
    :key => (configuration[:single_index] ? ['id', 'class_name'] : 'id'),
    :path => configuration[:index_dir],
    :auto_flush => true,
    :create_if_missing => true
  )

  # The string below is evaluated in the context of the calling class and
  # reads the two local hashes built above. It mixes in InstanceMethods,
  # stores the configuration in class-level accessors, registers the
  # destroy callback and defines one <field>_to_ferret method per
  # configured field (fields_for_ferret stays nil when no fields are given).
  class_eval <<-EOV
    include FerretMixin::Acts::ARFerret::InstanceMethods

    cattr_accessor :fields_for_ferret
    cattr_accessor :configuration
    cattr_accessor :ferret_configuration

    after_destroy :ferret_destroy

    @@fields_for_ferret = Array.new
    @@configuration = configuration
    @@ferret_configuration = ferret_configuration

    if configuration[:fields].respond_to?(:each_pair)
      configuration[:fields].each_pair do |key,val|
        define_to_field_method(key,val)
      end
    elsif configuration[:fields].respond_to?(:each)
      configuration[:fields].each do |field|
        define_to_field_method(field)
      end
    else
      @@fields_for_ferret = nil
    end
  EOV

  if configuration[:auto_index_update]
    # index will be updated automatically after a record is created/updated/destroyed
    class_eval <<-EOV
      before_create :ferret_before_create
      before_update :ferret_before_update
      after_create :ferret_create
      after_update :ferret_update
    EOV
  end

  # make sure the index directory for this class exists
  FerretMixin::Acts::ARFerret::ensure_directory configuration[:index_dir]
end
234
# directory holding the index of this class, as configured through
# acts_as_ferret
def class_index_dir
  self.configuration[:index_dir]
end
238
# Rebuild the index from all data stored for this model.
# This is called automatically when no index exists yet.
#
# TODO: the automatic index initialization only works if every model
# class has its own index, otherwise the index will get populated only
# with instances from the first model loaded
#
# When calling this method manually, you can give any additional
# model classes that should also go into this index as parameters.
# Useful when using the :single_index option.
#
# Records that cannot be converted or added are logged and skipped; the
# index is closed even if the rebuild is aborted by an error.
def rebuild_index(*additional_models)
  last_index_update_file = "#{class_index_dir}/last_update.timestamp"
  # update timestamp (read again by manual_index_update)
  File.open(last_index_update_file, 'w+') do |file|
    file.write(Time.now.to_i.to_s)
  end

  index = Ferret::Index::Index.new(ferret_configuration.merge(:create => true))
  begin
    batch_size = 1000
    (additional_models + [self]).each do |model|
      # index in batches of 1000 to limit memory consumption (fixes #24)
      # NOTE(review): the batches are fetched without an :order, so paging
      # relies on the database returning rows in a stable order -- confirm.
      model.transaction do
        0.step(model.count, batch_size) do |offset|
          model.find(:all, :limit => batch_size, :offset => offset).each do |rec|
            begin
              index << rec.to_doc
            rescue => e
              # best effort: keep going, but say which record failed and why
              logger.warn("could not add #{model.name} #{rec.id} to index: #{e}")
            end
          end
          index.flush
        end
      end
    end
    logger.debug("Created Ferret index in: #{class_index_dir}")
    index.optimize
  ensure
    index.close
  end
end
276
# Update the index for this model manually. This must be done
# periodically (e.g. in a cronjob) if the option :auto_index_update is
# disabled.
#
# Re-indexes every record whose updated_at is later than the timestamp
# stored in <index dir>/last_update.timestamp (epoch 0 if that file cannot
# be read), then flushes and optimizes the index.
def manual_index_update
  timestamp_file = "#{class_index_dir}/last_update.timestamp"
  # read timestamp
  stamp = begin
    File.read(timestamp_file).to_i
  rescue
    0
  end
  previous_update = Time.at(stamp)
  # update timestamp
  File.open(timestamp_file, 'w+') { |file| file.write(Time.now.to_i.to_s) }

  per_batch = 1000
  total = count(['updated_at > ?', previous_update])
  0.step(total, per_batch) do |offset|
    batch = find(:all,
                 :conditions => ['updated_at > ?', previous_update],
                 :limit => per_batch,
                 :offset => offset)
    batch.each do |rec|
      rec.ferret_before_update
      rec.ferret_update
    end
  end

  ferret_index.flush
  ferret_index.optimize
end
300
# Retrieve the Ferret::Index::Index instance for this model class.
#
# Instances are cached in ferret_indexes under the index directory, so
# model classes sharing a single index directory also share their Index
# object.
def ferret_index
  dir = class_index_dir
  ferret_indexes[dir] ||= create_index_instance
end
309
# Creates a new Index::Index instance. If no "segments" file exists in
# the index directory yet, rebuild_index is run first to populate the
# index from the stored model data.
def create_index_instance
  segments_file = "#{class_index_dir}/segments"
  rebuild_index if !File.file?(segments_file)
  Ferret::Index::Index.new(ferret_configuration)
end
317
# Finds instances by contents. Terms are ANDed by default, can be
# circumvented by using OR between terms.
#
# options:
# :first_doc - first hit to retrieve (useful for paging)
# :num_docs  - number of hits to retrieve, or :all to retrieve
#              max_results results, which by default is 1000 and can be
#              changed in the call to acts_as_ferret or on demand like this:
#              Model.configuration[:max_results] = 1000000
#
# find_options is a hash passed on to active_record's find when
# retrieving the data from db, useful to i.e. prefetch relationships.
# find_options[:conditions] may be a String or an Array
# ([sql_fragment, *bind_values]); it is ANDed, in parentheses, with the
# id restriction built from the ferret hits.
#
# This method returns a SearchResults instance, which really is an Array
# that has been decorated with a total_hits accessor that delivers the
# total number of hits (including those not fetched because of a low
# num_docs value). If loading the records fails, the error is logged and
# an empty result set is returned.
def find_by_contents(q, options = {}, find_options = {})
  # handle shared index
  return single_index_find_by_contents(q, options, find_options) if configuration[:single_index]

  id_array = []
  id_positions = {}
  total_hits = find_id_by_contents(q, options) do |model, id, score|
    id_array << id
    # store index of this id for later ordering of results
    id_positions[id] = id_array.size
  end

  result = []
  unless id_array.empty?
    begin
      # TODO: in case of STI AR will filter out hits from other classes
      # for us, but this will lead to less results retrieved --> scoping
      # of ferret query to self.class is still needed.
      conditions = [ "id in (?)", id_array ]
      # combine our conditions with those given by user, if any
      user_conditions = find_options[:conditions]
      if user_conditions
        extra = user_conditions.is_a?(Array) ? user_conditions.dup : [ user_conditions ]
        # parenthesize so an OR in the user's clause cannot escape the id filter
        conditions = [ "id in (?) and (#{extra.shift})", id_array ].concat(extra)
      end
      result = find(:all, find_options.merge(:conditions => conditions))
    rescue => e
      logger.debug "REBUILD YOUR INDEX! One of the id's didn't have an associated record: #{id_array}"
      logger.debug "find_by_contents could not load records: #{e}"
      # keep going with an empty result instead of failing on nil below
      result = []
    end
  end

  # order results as they were found by ferret, unless an AR :order
  # option was given
  unless find_options[:order]
    result.sort! { |a, b| id_positions[a.id] <=> id_positions[b.id] }
  end

  logger.debug "Query: #{q}\nResult id_array: #{id_array.inspect},\nresult: #{result}"
  SearchResults.new(result, total_hits)
end
375
# Determine all field names in the shared index. The answer is memoized
# in @single_index_field_names after the first call.
#
# models:: model classes whose content columns are used as a fallback
#          when the index reader cannot report its field names
def single_index_field_names(models)
  return @single_index_field_names if @single_index_field_names

  searcher = Ferret::Search::IndexSearcher.new(class_index_dir)
  names =
    if searcher.reader.respond_to?(:get_field_names)
      # id and class_name are bookkeeping fields, not searchable content
      (searcher.reader.send(:get_field_names) - ['id', 'class_name']).to_a
    else
      puts <<-END
unable to retrieve field names for class #{self.name}, please
consider naming all indexed fields in your call to acts_as_ferret!
      END
      models.map { |m| m.content_columns.map { |col| col.name } }.flatten
    end
  @single_index_field_names = names
end
392
# next steps: check whether it is a ferret bug that we have to build the
# queries ourselves like this - is it caused by the query parser's
# downcasing ? - see whether it works with ferret now (content_cols) and
# ask dave for access to the qp, or to the reader
#
# Variant of find_by_contents for classes using the shared index.
# Unless options[:models] is :all, the query is restricted to the class
# names of the given models plus this class.
#
# NOTE(review): this adds self to the caller's options[:models] array and
# writes :fields into the class-level ferret_configuration hash.
def single_index_find_by_contents(q, options = {}, find_options = {})
  result = []

  if options[:models] != :all # search needs to be restricted by one or more class names
    options[:models] ||= []
    # add this class to the list of given models
    options[:models] << self unless options[:models].include?(self)

    # build query parser TODO: cache these somehow
    user_query = q
    if q.is_a?(String)
      qp = Ferret::QueryParser.new(
        ferret_configuration[:default_search_field],
        ferret_configuration.update(:fields => single_index_field_names(options[:models]))
      )
      user_query = qp.parse(q)
    end

    # (user query) AND (class_name is one of the requested models)
    q = Ferret::Search::BooleanQuery.new
    q.add_query(user_query, Ferret::Search::BooleanClause::Occur::MUST)
    class_filter = Ferret::Search::BooleanQuery.new
    options[:models].each do |model|
      class_term = Ferret::Index::Term.new('class_name', model.name)
      class_filter.add_query(Ferret::Search::TermQuery.new(class_term),
                             Ferret::Search::BooleanClause::Occur::SHOULD)
    end
    q.add_query(class_filter, Ferret::Search::BooleanClause::Occur::MUST)
  end

  # each hit is loaded with a separate find on its own model class
  total_hits = find_id_by_contents(q, options) do |model, id, _score|
    result << Object.const_get(model).find(id, find_options.dup)
  end
  SearchResults.new(result, total_hits)
end
protected :single_index_find_by_contents
432
433	# Finds instance model name, ids and scores by contents.
434	# Useful if you want to search across models
435	# Terms are ANDed by default, can be circumvented by using OR between terms.
436	#
437	# Example controller code (not tested):
438	# def multi_search(query)
439	# result = []
440	# result << (Model1.find_id_by_contents query)
441	# result << (Model2.find_id_by_contents query)
442	# result << (Model3.find_id_by_contents query)
443	# result.flatten!
444	# result = result.sort_by {\|element\| element[:score]}
445	# # Figure out for yourself how to retrieve and present the data from modelname and id
446	# end
447	#
448	# Note that the scores retrieved this way aren't normalized across
449	# indexes, so that the order of results after sorting by score will
450	# differ from the order you would get when running the same query
451	# on a single index containing all the data from Model1, Model2
452	# and Model3
453	#
454	# options:
455	# :first_doc - first hit to retrieve (useful for paging)
456	# :num_docs - number of hits to retrieve, or :all to retrieve
457	# max_results results, which by default is 1000 and can be changed in
458	# the call to acts_as_ferret or on demand like this:
459	# Model.configuration[:max_results] = 1000000
460	#
461	# a block can be given too, it will be executed with every result:
462	# find_id_by_contents(q, options) do \|model, id, score\|
463	# id_array << id
464	# scores_by_id[id] = score
465	# end
466	# NOTE: in case a block is given, the total_hits value will be returned
467	# instead of the result list!
468	#
def find_id_by_contents(q, options = {})
  # :all means "up to the configured max_results"
  if options[:num_docs] == :all
    options[:num_docs] = configuration[:max_results]
  end

  collected = []
  idx = self.ferret_index
  hits = idx.search(q, options)
  hits.each do |hit, score|
    doc = idx[hit]
    # without a stored class_name every hit belongs to this class
    model = configuration[:store_class_name] ? doc[:class_name] : self.name
    if block_given?
      # the block receives the id as an Integer
      yield model, doc[:id].to_i, score
    else
      # only collect result data if we intend to return it
      collected << { :model => model, :id => doc[:id], :score => score }
    end
  end
  logger.debug "id_score_model array: #{collected.inspect}"

  if block_given?
    hits.total_hits
  else
    collected
  end
end
487
# Searches across this model and +additional_models+ and returns the
# matching records wrapped in a SearchResults.
#
# Requires the store_class_name option of acts_as_ferret to be true
# for all models queried this way.
#
# TODO: not optimal as each instance is fetched in a db call of its own.
def multi_search(query, additional_models = [], options = {})
  records = []
  hit_count = id_multi_search(query, additional_models, options) do |model, id, _score|
    records << Object.const_get(model).find(id)
  end
  SearchResults.new(records, hit_count)
end
500
# Searches the combined index of this class and +additional_models+.
#
# Returns an array of hashes, each containing :model (the stored class
# name), :id and :score for a hit.
#
# If a block is given, class_name, id (as Integer) and score of each hit
# will be yielded, and the total number of hits is returned.
#
# The +additional_models+ array passed by the caller is left untouched;
# this class is added to a copy (and not added twice).
def id_multi_search(query, additional_models = [], options = {})
  options[:num_docs] = configuration[:max_results] if options[:num_docs] == :all
  # work on a copy: appending to the caller's array would make repeated
  # calls accumulate duplicates and map to different multi index keys
  models = (additional_models + [self]).uniq
  searcher = multi_index(models)
  result = []
  hits = searcher.search(query, options)
  hits.each do |hit, score|
    doc = searcher.doc(hit)
    if block_given?
      yield doc[:class_name], doc[:id].to_i, score
    else
      result << { :model => doc[:class_name], :id => doc[:id], :score => score }
    end
  end
  block_given? ? hits.total_hits : result
end
523
# Returns the MultiIndex instance for the given model classes, creating
# and caching it in multi_indexes on first use.
#
# model_classes:: the classes whose indexes should be combined; the array
#                 itself is not modified
#
# The cache key is the comma-joined, sorted list of class names, so the
# order of +model_classes+ does not matter and different class lists
# cannot end up with the same key by plain name concatenation.
def multi_index(model_classes)
  sorted = model_classes.sort_by { |clazz| clazz.name }
  key = sorted.map { |clazz| clazz.name }.join(',')
  multi_indexes[key] ||= MultiIndex.new(sorted, ferret_configuration)
end
530
531	end
532
533
534	module InstanceMethods
# NOTE(review): this reader returns @reindex, which nothing in this file
# assigns; the flag used by the callbacks below is @ferret_reindex.
attr_reader :reindex
# NOTE(review): this runs in the module body, so it sets an instance
# variable on the InstanceMethods module object itself, not on the model
# instances that include it -- confirm whether that was intended.
@ferret_reindex = true
537
# before_update/before_create callback: flag this record for indexing by
# ferret_create / ferret_update. Returns true.
def ferret_before_update
  @ferret_reindex = true
end
alias ferret_before_create ferret_before_update
542
# Add this record to the index of its class, provided it has been flagged
# through @ferret_reindex. Afterwards the flag is set to true. Always
# returns true. Also available as ferret_update.
def ferret_create
  logger.debug "ferret_create/update: #{self.class.name} : #{self.id}"
  if @ferret_reindex
    self.class.ferret_index << self.to_doc
  end
  @ferret_reindex = true
  true
end
alias ferret_update ferret_create
551
# Remove this record from the index. Any error raised while doing so is
# logged as a warning and otherwise ignored. Always returns true.
def ferret_destroy
  logger.debug "ferret_destroy: #{self.class.name} : #{self.id}"
  begin
    query = Ferret::Search::TermQuery.new(Ferret::Index::Term.new('id', self.id.to_s))
    if self.class.configuration[:single_index]
      # in a shared index the id alone is ambiguous, so also match the class name
      must = Ferret::Search::BooleanClause::Occur::MUST
      combined = Ferret::Search::BooleanQuery.new
      combined.add_query(query, must)
      class_term = Ferret::Index::Term.new('class_name', self.class.name)
      combined.add_query(Ferret::Search::TermQuery.new(class_term), must)
      query = combined
    end
    self.class.ferret_index.query_delete(query)
  rescue
    logger.warn("Could not find indexed value for this object: #{$!}")
  end
  true
end
570
# Convert this instance into a Ferret document.
#
# The document always gets a stored, untokenized "id" field and, if
# configuration[:store_class_name] is set, a stored, untokenized
# "class_name" field. The content fields come from the <field>_to_ferret
# methods when fields were named in acts_as_ferret; otherwise every
# attribute except the id is added as an unstored, tokenized field.
def to_doc
  logger.debug "creating doc for class: #{self.class.name}, id: #{id}"
  # Churn through the complete Active Record and add it to the Ferret document
  doc = Ferret::Document::Document.new
  # store the id of each item
  doc << Ferret::Document::Field.new("id", id,
                                     Ferret::Document::Field::Store::YES,
                                     Ferret::Document::Field::Index::UNTOKENIZED)
  # store the class name if configured to do so
  if configuration[:store_class_name]
    doc << Ferret::Document::Field.new("class_name", self.class.name,
                                       Ferret::Document::Field::Store::YES,
                                       Ferret::Document::Field::Index::UNTOKENIZED) # have to tokenize to be able to use class_name field in queries ?!
  end
  # iterate through the fields and add them to the document
  if fields_for_ferret
    # have user defined fields
    fields_for_ferret.each do |field|
      doc << send("#{field}_to_ferret")
    end
  else
    # take all fields
    attributes.each_pair do |key, val|
      # The id was already added above. Compare as strings: attribute keys
      # are Strings in ActiveRecord, so the former `key == :id` test never
      # matched and the id was indexed a second time as a tokenized field.
      next if key.to_s == 'id'
      logger.debug "add field #{key} with value #{val}"
      doc << Ferret::Document::Field.new(
        key,
        val.to_s,
        Ferret::Document::Field::Store::NO,
        Ferret::Document::Field::Index::TOKENIZED)
    end
  end
  doc
end
607
608	# BIG TODO: this file really gets too big. need to refactor a bit...
609	# maybe extract the more like this stuff, could be useful somewhere
610	# else, too...
611
612
# Returns other instances of this class, which have similar contents
# like this one. Basically works like this: find out n most interesting
# (i.e. characteristic) terms from this document, and then build a
# query from those which is run against the whole index. Which terms
# are interesting is decided on various criteria which can be
# influenced by the given options.
#
# The algorithm used here is a quite straight port of the MoreLikeThis
# class from Apache Lucene.
#
# options are:
# :field_names     : Array of field names to use for similarity search
#                    (mandatory, ArgumentError is raised if missing)
# :min_term_freq   => 2, # Ignore terms with less than this frequency in the source doc.
# :min_doc_freq    => 5, # Ignore words which do not occur in at least this many docs
# :min_word_length => 0, # Ignore words if less than this len (longer
#                    words tend to be more characteristic for the document
#                    they occur in). 0 means no limit.
# :max_word_length => 0, # Ignore words if greater than this len. 0 means no limit.
# :max_query_terms => 25, # maximum number of terms in the query built
# :max_num_tokens  => 5000, # maximum number of tokens to examine in a single field
# :boost           => false, # when true, a boost according to the
#                    relative score of a term is applied to this Term's TermQuery.
# :similarity      => Ferret::Search::Similarity.default, # the similarity
#                    implementation to use
# :analyzer        => Ferret::Analysis::StandardAnalyzer.new # the analyzer to use
# :append_to_query => nil # proc taking a query object as argument, which
#                    will be called after generating the query. can be used
#                    to further manipulate the query used to find related
#                    documents, i.e. to constrain the search to a given
#                    class in single table inheritance scenarios
# :base_class      => self.class # class whose find_by_contents runs the query
# find_options : options handed over to find_by_contents
def more_like_this(options = {}, find_options = {})
  options = {
    :field_names => nil, # Default field names
    :min_term_freq => 2, # Ignore terms with less than this frequency in the source doc.
    :min_doc_freq => 5, # Ignore words which do not occur in at least this many docs
    :min_word_length => 0, # Ignore words if less than this len. Default is not to ignore any words.
    :max_word_length => 0, # Ignore words if greater than this len. Default is not to ignore any words.
    :max_query_terms => 25, # maximum number of terms in the query built
    :max_num_tokens => 5000, # maximum number of tokens to analyze when analyzing contents
    :boost => false,
    :similarity => Ferret::Search::Similarity.default,
    :analyzer => Ferret::Analysis::StandardAnalyzer.new,
    :append_to_query => nil,
    :base_class => self.class # base class to use for querying, useful in STI scenarios where BaseClass.find_by_contents can be used to retrieve results from other classes, too
  }.update(options)
  # fail early with a clear message instead of a NoMethodError on nil later on
  raise ArgumentError, 'more_like_this requires the :field_names option' if options[:field_names].nil?

  index = self.class.ferret_index
  begin
    reader = index.send(:reader)
  rescue
    # ferret >=0.9, C-Version doesn't allow access to Index#reader
    reader = Ferret::Index::IndexReader.open(Ferret::Store::FSDirectory.new(self.class.class_index_dir, false))
  end
  # look our document number up once and reuse it
  doc_number = document_number
  term_freq_map = retrieve_terms(doc_number, reader, options)
  priority_queue = create_queue(term_freq_map, reader, options)
  query = create_query(priority_queue, options)
  options[:append_to_query].call(query) if options[:append_to_query]
  options[:base_class].find_by_contents(query, find_options)
end
670
671
# Builds the boolean query used by more_like_this.
#
# priority_queue:: items responding to to_term and score; they are taken
#                  from the end of the queue via pop
# options::        :boost and :max_query_terms are read here
#
# Every popped term becomes an optional (SHOULD) clause, and the id of
# this record is excluded with a MUST_NOT clause.
def create_query(priority_queue, options={})
  query = Ferret::Search::BooleanQuery.new
  added = 0
  top_score = nil
  while (item = priority_queue.pop)
    term_query = Ferret::Search::TermQuery.new(item.to_term)

    if options[:boost]
      # boost term according to relative score
      # TODO untested
      top_score ||= item.score
      term_query.boost = item.score / top_score
    end

    begin
      query.add_query(term_query, Ferret::Search::BooleanClause::Occur::SHOULD)
    rescue Ferret::Search::BooleanQuery::TooManyClauses
      # the query is full, use what we have so far
      break
    end

    added += 1
    break if options[:max_query_terms] > 0 && added >= options[:max_query_terms]
  end

  # exclude ourselves
  own_id = Ferret::Index::Term.new('id', self.id.to_s)
  query.add_query(Ferret::Search::TermQuery.new(own_id),
                  Ferret::Search::BooleanClause::Occur::MUST_NOT)
  query
end
699
700
# Looks this record up in the index by its id and returns the first hit
# yielded by the search, or nil when the record is not in the index.
def document_number
  hits = self.class.ferret_index.search("id:#{id}")
  hits.each { |hit, _score| return hit }
  # nothing found: do not hand the (empty) hits collection back to the caller
  nil
end
705
# Creates a term => term_frequency map for the terms of the fields given
# in options[:field_names].
#
# doc_number:: index document number of this record
# reader::     index reader used to fetch term vectors / stored documents
# options::    reads :field_names, :max_num_tokens and :analyzer, and is
#              passed on to noise_word?
#
# Per field, the stored term vector is used when there is one. Otherwise
# the content stored in the index, or failing that the content of this
# instance, is run through the analyzer.
def retrieve_terms(doc_number, reader, options)
  field_names = options[:field_names]
  max_num_tokens = options[:max_num_tokens]
  term_freq_map = Hash.new(0)
  doc = nil
  field_names.each do |field|
    # use the document number we were given rather than searching the
    # index again for every field
    term_freq_vector = reader.get_term_vector(doc_number, field)
    if term_freq_vector
      # use stored term vector
      # TODO untested
      term_freq_vector.terms.each_with_index do |term, i|
        term_freq_map[term] += term_freq_vector.freqs[i] unless noise_word?(term, options)
      end
    else
      # no term vector stored, but we have stored the contents in the index
      # -> extract terms from there
      doc ||= reader.get_document(doc_number)
      # no term vector, no stored content, so try content from this instance
      content = doc[field] || content_for_field_name(field)
      token_count = 0

      # C-Ferret >=0.9 again, no #each in tokenstream :-(
      ts = options[:analyzer].token_stream(field, content)
      while (token = ts.next)
        break if (token_count += 1) > max_num_tokens
        text = token_text(token)
        next if noise_word?(text, options)
        term_freq_map[text] += 1
      end
    end
  end
  term_freq_map
end
744
# Extract the textual value of a token: #text when the token responds to
# it, #term_text otherwise (token.term_text is for ferret 0.3.2).
def token_text(token)
  if token.respond_to?(:text)
    token.text
  else
    token.term_text
  end
end
750
# Create a list of FrequencyQueueItem (word, field name, score), ordered
# by ascending score.
#
# term_freq_map:: word => term frequency in the source document
# reader::        index reader, asked for num_docs and doc_freq
# options::       reads :similarity, :field_names, :min_term_freq and
#                 :min_doc_freq
def create_queue(term_freq_map, reader, options)
  items = []
  similarity = options[:similarity]
  num_docs = reader.num_docs

  term_freq_map.each_pair do |word, tf|
    # filter out words that don't occur enough times in the source
    min_tf = options[:min_term_freq]
    next if min_tf && tf < min_tf

    # go through all the fields and find the largest document frequency
    best_field = options[:field_names].first
    best_freq = 0
    options[:field_names].each do |field_name|
      freq = reader.doc_freq(Ferret::Index::Term.new(field_name, word))
      next unless freq > best_freq
      best_field = field_name
      best_freq = freq
    end

    # filter out words that don't occur in enough docs
    min_df = options[:min_doc_freq]
    next if min_df && best_freq < min_df
    next if best_freq == 0 # index update problem ?

    score = tf * similarity.idf(best_freq, num_docs)
    items << FrequencyQueueItem.new(word, best_field, score)
  end

  items.sort! { |a, b| a.score <=> b.score }
  items
end
784
# Tells whether +text+ should be ignored when collecting terms.
#
# text::    the word to check
# options:: :min_word_length / :max_word_length - length limits, where 0
#           or nil means "no limit"; :stop_words - optional collection of
#           words to ignore
#
# Returns true if the word is shorter than the minimum, longer than the
# maximum or listed in options[:stop_words], false otherwise.
def noise_word?(text, options)
  len = text.length
  # nil.to_i is 0, so a missing limit behaves like "no limit"
  min_len = options[:min_word_length].to_i
  max_len = options[:max_word_length].to_i
  return true if min_len > 0 && len < min_len
  return true if max_len > 0 && len > max_len

  # look the word up in the stop word list itself, not in the options hash
  stop_words = options[:stop_words]
  stop_words ? stop_words.include?(text) : false
end
793
# Content to index for +field+: the first non-nil/non-false value out of
# self[field], the instance variable @<field>, and the result of calling
# the method named +field+.
def content_for_field_name(field)
  value = self[field]
  value ||= self.instance_variable_get(:"@#{field}")
  value || self.send(field.to_sym)
end
797
798	end
799
# Entry of the list built by create_queue: a word, the field it was
# found in and its score.
class FrequencyQueueItem
  attr_reader :word, :field, :score

  def initialize(word, field, score)
    @word = word
    @field = field
    @score = score
  end

  # the Ferret term for this word in its field
  def to_term
    Ferret::Index::Term.new(field, word)
  end
end
809
810	end
811	end
812	end
813
# mix all of the above into ActiveRecord::Base to make it available to
# every model that wants it
ActiveRecord::Base.send(:include, FerretMixin::Acts::ARFerret)
819
820
class Ferret::Index::MultiReader
  # true only if every sub reader reports latest?; false as soon as one
  # does not, or if asking them raises an error.
  #
  # TODO: Exception handling added to resolve ticket #6.
  # It should be clarified whether this is a bug in Ferret
  # in which case a bug report should be posted on the Ferret Trac.
  def latest?
    @sub_readers.all? { |reader| reader.latest? }
  rescue
    false
  end
end
834
835	# END acts_as_ferret.rb

Download in other formats:

raw | text

Download in other formats:

Search tickets

Social

ROOL Store

Donate! Why?

RISC OS IPR

Options

RSS feeds