IDZEBRA 2.2.8
ranksimilarity.c
Go to the documentation of this file.
1/* This file is part of the Zebra server.
2 Copyright (C) Index Data
3
4Zebra is free software; you can redistribute it and/or modify it under
5the terms of the GNU General Public License as published by the Free
6Software Foundation; either version 2, or (at your option) any later
7version.
8
9Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
10WARRANTY; without even the implied warranty of MERCHANTABILITY or
11FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12for more details.
13
14You should have received a copy of the GNU General Public License
15along with this program; if not, write to the Free Software
16Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17
18*/
19
20#if HAVE_CONFIG_H
21#include <config.h>
22#endif
23#include <stdio.h>
24#include <assert.h>
25#include <limits.h>
26#ifdef WIN32
27#include <io.h>
28#endif
29#if HAVE_UNISTD_H
30#include <unistd.h>
31#endif
32
33#include "index.h"
34#include "rank.h"
35
/* YAZ log level used by every handler in this file; it is looked up from
   the "rank-similarity" log module the first time create() runs. */
static int log_level = 0;
/* Non-zero once log_level has been looked up. */
static int log_initialized = 0;
38
42
73
94
95
96/* local clean-up function */
98{
99 int i;
100
101 for (i = 0; i < si->no_terms_query; i++)
102 {
103 si->entries[i].freq_term_docfield = 0;
104 }
105}
106
107
108/*
109 * create: Creates/Initialises this rank handler. This routine is
110 * called exactly once. The routine returns the class_handle.
111 */
112static void *create (ZebraHandle zh)
113{
114 struct ranksimilarity_class_info *ci =
115 (struct ranksimilarity_class_info *) xmalloc (sizeof(*ci));
116
117 if (!log_initialized)
118 {
119 log_level = yaz_log_module_level("rank-similarity");
120 log_initialized = 1;
121 }
122 yaz_log(log_level, "create()");
123 return 0;
124}
125
126/*
127 * destroy: Destroys this rank handler. This routine is called
128 * when the handler is no longer needed - i.e. when the server
129 * dies. The class_handle was previously returned by create.
130 */
131static void destroy (struct zebra_register *reg, void *class_handle)
132{
134 = (struct ranksimilarity_class_info *) class_handle;
135 yaz_log(log_level, "destroy()");
136 xfree (ci);
137}
138
139
145static void *begin (struct zebra_register *reg,
146 void *class_handle, RSET rset, NMEM nmem,
147 TERMID *terms, int numterms)
148{
149 struct ranksimilarity_set_info *si =
150 (struct ranksimilarity_set_info *) nmem_malloc (nmem, sizeof(*si));
151 int i;
152
153 yaz_log(log_level, "begin() numterms=%d", numterms);
154
155 /* setting database global statistics */
156 si->no_docs_database = -1; /* TODO */
157 si->no_terms_database = -1; /* TODO */
158
159 /* setting query statistics */
160 si->no_terms_query = numterms;
161 si->no_ranked_terms_query = 0;
162
163 /* setting internal data structures */
164 si->nmem=nmem;
165 si->entries = (struct ranksimilarity_term_info *)
166 nmem_malloc (si->nmem, sizeof(*si->entries)*numterms);
167
168 /* reset the counts for the next term */
170
171
172 /* looping all terms in a specific fieldindex of query */
173 for (i = 0; i < numterms; i++)
174 {
175 struct ord_list *ol = NULL;
176
177
178 /* adding to number of rank entries */
179 if (strncmp (terms[i]->flags, "rank,", 5))
180 {
181 si->entries[i].rank_flag = 0;
182 yaz_log(log_level, "begin() terms[%d]: '%s' flags=%s not ranked",
183 i, terms[i]->name, terms[i]->flags);
184 }
185 else
186 {
187 const char *cp = strstr(terms[i]->flags+4, ",w=");
188
189 zint no_docs_fieldindex = 0;
190 zint no_terms_fieldindex = 0;
191
192 yaz_log(log_level, "begin() terms[%d]: '%s' flags=%s",
193 i, terms[i]->name, terms[i]->flags);
194
195 (si->no_ranked_terms_query)++;
196 ol = terms[i]->ol;
197
198 si->entries[i].rank_flag = 1;
199 /* notice that the call to rset_count(rset) has he side-effect
200 of setting rset->hits_limit = rset_count(rset) ??? */
201 si->entries[i].freq_term_resset = rset_count(terms[i]->rset);
202 si->entries[i].no_docs_resset = terms[i]->rset->hits_count;
203
204
205 if (cp)
206 si->entries[i].fieldindex_weight = atoi (cp+3);
207 else
208 si->entries[i].fieldindex_weight = 34; /* sqrroot of 1000 */
209
210
211 /*
212 yaz_log(log_level, "begin() rset_count(terms[%d]->rset) = "
213 ZINT_FORMAT, i, rset_count(terms[i]->rset));
214 yaz_log(log_level, "begin() terms[%d]->rset->hits_limit = "
215 ZINT_FORMAT, i, terms[i]->rset->hits_limit);
216 yaz_log(log_level, "begin() terms[%d]->rset->hits_count = "
217 ZINT_FORMAT, i, terms[i]->rset->hits_count);
218 yaz_log(log_level, "begin() terms[%d]->rset->hits_round = "
219 ZINT_FORMAT, i, terms[i]->rset->hits_round);
220 yaz_log(log_level, "begin() terms[%d]->rset->hits_approx = %d",
221 i, terms[i]->rset->hits_approx);
222 */
223
224 /* looping indexes where term terms[i] is found */
225
226 for (; ol; ol = ol->next)
227 {
228 const char *index_type = 0;
229 const char *db = 0;
230 const char *string_index = 0;
231
233 ol->ord, &index_type, &db,
234 &string_index);
235
236 no_docs_fieldindex
238 no_terms_fieldindex
240
241 if (string_index)
242 yaz_log(log_level,
243 "begin() index: ord=%d type=%s db=%s str-index=%s",
244 ol->ord, index_type, db, string_index);
245 else
246 yaz_log(log_level,
247 "begin() index: ord=%d type=%s db=%s",
248 ol->ord, index_type, db);
249 }
250
251 si->entries[i].no_docs_fieldindex = no_docs_fieldindex;
252 si->entries[i].no_terms_fieldindex = no_terms_fieldindex;
253 }
254
255 si->entries[i].term = terms[i];
256 si->entries[i].term_index=i;
257
258 /* setting next entry in term */
259 terms[i]->rankpriv = &(si->entries[i]);
260 }
261
262 return si;
263}
264
265/*
266 * end: Terminates ranking process. Called after a result set
267 * has been ranked.
268 */
269static void end (struct zebra_register *reg, void *set_handle)
270{
271 yaz_log(log_level, "end()");
272}
273
274
275
281static void add (void *set_handle, int seqno, TERMID term)
282{
283 struct ranksimilarity_set_info *si
284 = (struct ranksimilarity_set_info *) set_handle;
285 struct ranksimilarity_term_info *ti;
286 assert(si);
287 if (!term)
288 {
289 /* yaz_log(log_level, "add() seqno=%d NULL term", seqno); */
290 return;
291 }
292
293 ti= (struct ranksimilarity_term_info *) term->rankpriv;
294 assert(ti);
295 si->last_pos = seqno;
296 ti->freq_term_docfield++;
297 /*yaz_log(log_level, "add() seqno=%d term=%s freq_term_docfield=%d",
298 seqno, term->name, ti->freq_term_docfield); */
299}
300
301/*
302 * calc: Called for each document in a result. This handler should
303 * produce a score based on previous call(s) to the add handler. The
304 * score should be between 0 and 1000. If score cannot be obtained
305 * -1 should be returned.
306 */
307static int calc (void *set_handle, zint sysno, zint staticrank,
308 int *stop_flag)
309{
310 int i, score = 0;
311 struct ranksimilarity_set_info *si
312 = (struct ranksimilarity_set_info *) set_handle;
313
314
315 yaz_log(log_level, "calc() sysno = " ZINT_FORMAT, sysno);
316 yaz_log(log_level, "calc() staticrank = " ZINT_FORMAT, staticrank);
317
318 yaz_log(log_level, "calc() si->no_terms_query = %d",
319 si->no_terms_query);
320 yaz_log(log_level, "calc() si->no_ranked_terms_query = %d",
322 yaz_log(log_level, "calc() si->no_docs_database = " ZINT_FORMAT,
323 si->no_docs_database);
324 yaz_log(log_level, "calc() si->no_terms_database = " ZINT_FORMAT,
326
327
328 if (!si->no_ranked_terms_query)
329 return -1; /* ranking not enabled for any terms */
330
331
332 /* if we set *stop_flag = 1, we stop processing (of result set list) */
333
334
335 /* here goes your formula to compute a scoring function */
336 /* you may use all the gathered statistics here */
337 for (i = 0; i < si->no_terms_query; i++)
338 {
339 yaz_log(log_level, "calc() entries[%d] termid %p",
340 i, si->entries[i].term);
341 if (si->entries[i].term){
342 yaz_log(log_level, "calc() entries[%d] term '%s' flags=%s",
343 i, si->entries[i].term->name, si->entries[i].term->flags);
344 yaz_log(log_level, "calc() entries[%d] rank_flag %d",
345 i, si->entries[i].rank_flag );
346 yaz_log(log_level, "calc() entries[%d] fieldindex_weight %d",
347 i, si->entries[i].fieldindex_weight );
348 yaz_log(log_level, "calc() entries[%d] freq_term_docfield %d",
349 i, si->entries[i].freq_term_docfield );
350 yaz_log(log_level, "calc() entries[%d] freq_term_resset " ZINT_FORMAT,
351 i, si->entries[i].freq_term_resset );
352 yaz_log(log_level, "calc() entries[%d] no_docs_resset " ZINT_FORMAT,
353 i, si->entries[i].no_docs_resset );
354 yaz_log(log_level, "calc() entries[%d] no_docs_fieldindex "
356 i, si->entries[i].no_docs_fieldindex );
357 yaz_log(log_level, "calc() entries[%d] no_terms_fieldindex "
359 i, si->entries[i].no_terms_fieldindex );
360 }
361 }
362
363
364 /* reset the counts for the next term */
366
367
368 /* staticrank = 0 is highest, MAXINT lowest */
369 if (staticrank >= INT_MAX)
370 score = 0;
371 else
372 { /* but score is reverse (logical) */
373 score = INT_MAX - CAST_ZINT_TO_INT(staticrank);
374 }
375
376
377 /* debugging statistics output */
378 yaz_log(log_level, "calc() statistics: score = %d", score);
379
380 return score;
381}
382
383/*
384 * Pseudo-meta code with sequence of calls as they occur in a
385 * server. Handlers are prefixed by --:
386 *
387 * server init
388 * -- create
389 * foreach search
390 * rank result set
391 * -- begin
392 * foreach record
393 * foreach word
394 * -- add
395 * -- calc
396 * -- end
397 * -- destroy
398 * server close
399 */
400
401static struct rank_control rank_control = {
402 "rank-similarity",
403 create,
404 destroy,
405 begin,
406 end,
407 calc,
408 add,
409};
410
412/*
413 * Local variables:
414 * c-basic-offset: 4
415 * c-file-style: "Stroustrup"
416 * indent-tabs-mode: nil
417 * End:
418 * vim: shiftwidth=4 tabstop=8 expandtab
419 */
420
static void ranksimilar_rec_reset(struct ranksimilarity_set_info *si)
static struct rank_control rank_control
static void end(struct zebra_register *reg, void *set_handle)
static void * create(ZebraHandle zh)
static int log_initialized
struct rank_control * rank_similarity_class
static void destroy(struct zebra_register *reg, void *class_handle)
static void * begin(struct zebra_register *reg, void *class_handle, RSET rset, NMEM nmem, TERMID *terms, int numterms)
static int log_level
static int calc(void *set_handle, zint sysno, zint staticrank, int *stop_flag)
static void add(void *set_handle, int seqno, TERMID term)
zint rset_count(RSET rs)
Estimates hit count for result set.
Definition rset.c:272
int ord
Definition rset.h:36
struct ord_list * next
Definition rset.h:37
struct ranksimilarity_term_info * entries
char * flags
Definition rset.h:52
struct ord_list * ol
Definition rset.h:64
char * name
Definition rset.h:51
RSET rset
Definition rset.h:60
void * rankpriv
Definition rset.h:61
Definition rset.h:151
zint hits_count
Definition rset.h:164
ZebraExplainInfo zei
Definition index.h:139
long zint
Zebra integer.
Definition util.h:66
#define ZINT_FORMAT
Definition util.h:72
#define CAST_ZINT_TO_INT(x)
Definition util.h:96
zint zebraExplain_ord_get_term_occurrences(ZebraExplainInfo zei, int ord)
Definition zinfo.c:1470
int zebraExplain_lookup_ord(ZebraExplainInfo zei, int ord, const char **index_type, const char **db, const char **string_index)
Definition zinfo.c:1478
zint zebraExplain_ord_get_doc_occurrences(ZebraExplainInfo zei, int ord)
Definition zinfo.c:1462