IDZEBRA 2.2.8
benchindex1.c
Go to the documentation of this file.
1/* This file is part of the Zebra server.
2 Copyright (C) Index Data
3
4Zebra is free software; you can redistribute it and/or modify it under
5the terms of the GNU General Public License as published by the Free
6Software Foundation; either version 2, or (at your option) any later
7version.
8
9Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
10WARRANTY; without even the implied warranty of MERCHANTABILITY or
11FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12for more details.
13
14You should have received a copy of the GNU General Public License
15along with this program; if not, write to the Free Software
16Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17
18*/
19
20#if HAVE_CONFIG_H
21#include <config.h>
22#endif
23#include <yaz/options.h>
24#include <ctype.h>
25#include <stdlib.h>
26#include <string.h>
27#include <yaz/log.h>
28#include <yaz/nmem.h>
29#include <yaz/xmalloc.h>
30#include <yaz/marcdisp.h>
31#include <yaz/timing.h>
32#include <it_key.h>
33#include <idzebra/isamb.h>
34#include <idzebra/dict.h>
35#include <assert.h>
36
38 NMEM nmem;
43 struct index_term **ar;
44 int round;
45};
46
47struct index_term {
48 const char *term;
53};
54
55struct index_block *index_block_new(int memory)
56{
57 struct index_block *b = xmalloc(sizeof(*b));
58 b->no_entries = 0;
59 b->current_max = memory * 1024 * 1024;
60 b->terms = 0;
61 b->nmem = nmem_create();
62 b->round = 0;
63 return b;
64}
65
67{
68 if (*bp)
69 {
70 nmem_destroy((*bp)->nmem);
71 xfree(*bp);
72 *bp = 0;
73 }
74}
75
76static int cmp_ar(const void *p1, const void *p2)
77{
78 struct index_term *t1 = *(struct index_term **) p1;
79 struct index_term *t2 = *(struct index_term **) p2;
80 int d = strcmp(t1->term, t2->term);
81 if (d)
82 return d;
83
84 if (t1->docid > t2->docid)
85 return 1;
86 else if (t1->docid < t2->docid)
87 return -1;
88 if (t1->seqno > t2->seqno)
89 return 1;
90 else if (t1->seqno < t2->seqno)
91 return -1;
92 return 0;
93}
94
95
96int code_read(void *vp, char **dst, int *insertMode)
97{
98 struct index_block *b = (struct index_block *)vp;
99 struct index_term *t;
100 struct it_key key;
101
102 if (b->current_entry >= b->no_entries)
103 return 0;
104
105 t = b->ar[b->current_entry];
106 b->current_entry++;
107
108 key.len = 3;
109 key.mem[0] = t->word_id;
110 key.mem[1] = t->docid;
111 key.mem[2] = t->seqno;
112 key.mem[3] = 0;
113
114 memcpy(*dst, &key, sizeof(key));
115
116 (*dst) += sizeof(key);
117 *insertMode = 1;
118#if 0
119 yaz_log(YLOG_LOG, "returning " ZINT_FORMAT " " ZINT_FORMAT "\n",
120 key.mem[0], key.mem[1]);
121#endif
122 return 1;
123}
124
126 int no_docs)
127{
128 struct index_term *t = b->terms;
129 int i;
130 int word_id_seq = 0;
131 int no_words = 0, no_new_words = 0;
132 const char *dict_info = 0;
133 ISAM_P isamc_p = 0;
134 yaz_timing_t tim_dict = 0;
135 yaz_timing_t tim_isamb = 0;
136 zint number_of_int_splits = isamb_get_int_splits(isb);
137 zint number_of_leaf_splits = isamb_get_leaf_splits(isb);
138 zint number_of_dict_splits = dict_get_no_split(dict);
139
140 b->ar = xmalloc(sizeof(*b->ar) * b->no_entries);
141 for (i = 0; i < b->no_entries; i++, t = t->next)
142 {
143 assert(t);
144 b->ar[i] = t;
145 }
146 assert(!t);
147
148 qsort(b->ar, b->no_entries, sizeof(*b->ar), cmp_ar);
149 tim_dict = yaz_timing_create();
150#if 0
151 for (i = 0; i < b->no_entries; i++)
152 {
153 printf("%s " ZINT_FORMAT " " ZINT_FORMAT "\n",
154 ar[i]->term, ar[i]->docid, ar[i]->seqno);
155 }
156#endif
157 dict_info = dict_lookup(dict, "_w");
158 if (dict_info)
159 {
160 assert(*dict_info == sizeof(word_id_seq));
161 memcpy(&word_id_seq, dict_info+1, sizeof(word_id_seq));
162 }
163
164 dict_info = dict_lookup(dict, "_i");
165 if (dict_info)
166 {
167 assert(*dict_info == sizeof(isamc_p));
168 memcpy(&isamc_p, dict_info+1, sizeof(isamc_p));
169 }
170
171 for (i = 0; i < b->no_entries; i++)
172 {
173 if (i > 0 && strcmp(b->ar[i-1]->term, b->ar[i]->term) == 0)
174 b->ar[i]->word_id = b->ar[i-1]->word_id;
175 else
176 {
177 const char *dict_info = dict_lookup(dict, b->ar[i]->term);
178 if (dict_info)
179 {
180 memcpy(&b->ar[i]->word_id, dict_info+1, sizeof(int));
181 }
182 else
183 {
184 word_id_seq++;
185 no_new_words++;
186 dict_insert(dict, b->ar[i]->term, sizeof(int), &word_id_seq);
187 b->ar[i]->word_id = word_id_seq;
188 }
189 no_words++;
190 }
191 }
192 dict_insert(dict, "_w", sizeof(word_id_seq), &word_id_seq);
193
194 yaz_timing_stop(tim_dict);
195 tim_isamb = yaz_timing_create();
196
197 b->current_entry = 0;
198
199 if (b->no_entries)
200 {
201 ISAMC_I isamc_i;
202
203 isamc_i.clientData = b;
204 isamc_i.read_item = code_read;
205
206 isamb_merge (isb, &isamc_p, &isamc_i);
207
208 assert(isamc_p);
209 dict_insert(dict, "_i", sizeof(isamc_p), &isamc_p);
210 }
211
212 yaz_timing_stop(tim_isamb);
213
214 number_of_int_splits = isamb_get_int_splits(isb) - number_of_int_splits;
215 number_of_leaf_splits = isamb_get_leaf_splits(isb) - number_of_leaf_splits;
216 number_of_dict_splits = dict_get_no_split(dict) - number_of_dict_splits;
217
218 if (b->round == 0)
219 {
220 printf("# run total dict-real user sys isam-real user sys "
221 " intsp leafsp docs postings words new d-spl\n");
222 }
223 b->round++;
224 printf("%5d %9.6f %9.6f %5.2f %5.2f %9.6f %5.2f %5.2f "
225 "%6" ZINT_FORMAT0 " %6" ZINT_FORMAT0
226 " %8d %8d %6d %6d" " %5" ZINT_FORMAT0 "\n",
227 b->round,
228 yaz_timing_get_real(tim_dict) + yaz_timing_get_real(tim_isamb),
229 yaz_timing_get_real(tim_dict),
230 yaz_timing_get_user(tim_dict),
231 yaz_timing_get_sys(tim_dict),
232 yaz_timing_get_real(tim_isamb),
233 yaz_timing_get_user(tim_isamb),
234 yaz_timing_get_sys(tim_isamb),
235 number_of_int_splits,
236 number_of_leaf_splits,
237 no_docs,
238 b->no_entries,
239 no_words,
240 no_new_words,
241 number_of_dict_splits
242 );
243 fflush(stdout);
244
245 xfree(b->ar);
246 b->ar = 0;
247 nmem_reset(b->nmem);
248 b->no_entries = 0;
249 b->terms = 0;
250
251 yaz_timing_destroy(&tim_isamb);
252 yaz_timing_destroy(&tim_dict);
253}
254
256 int no_docs)
257{
258 int total = nmem_total(b->nmem);
259 int max = b->current_max;
260 if (total > max)
261 {
262 index_block_flush(b, isb, dict, no_docs);
263 }
264}
265
267 const char *term, zint docid, zint seqno)
268{
269 struct index_term *t = nmem_malloc(b->nmem, sizeof(*t));
270 t->term = nmem_strdup(b->nmem, term);
271 t->docid = docid;
272 t->seqno = seqno;
273 t->next = b->terms;
274 b->terms = t;
275 b->no_entries++;
276}
277
278void index_term(struct index_block *b, const char *term,
280{
281#if 0
282 printf("%s " ZINT_FORMAT " " ZINT_FORMAT "\n", term,
283 docid, *seqno);
284#endif
286 (*seqno)++;
287}
288
289void index_wrbuf(struct index_block *b, WRBUF wrbuf, zint docid,
290 int subfield_char)
291{
292 int nl = 1;
293 const char *cp = wrbuf_buf(wrbuf);
294 char term[4096];
295 size_t sz = 0;
296 zint seqno = 0;
297
298 while (*cp)
299 {
300 if (nl)
301 {
302 int i;
303 if (cp[0] != ' ')
304 { /* skip field+indicator (e.g. 245 00) */
305 for (i = 0; i<6 && *cp; i++, cp++)
306 ;
307 }
308 else
309 { /* continuation line */
310 for (i = 0; i<4 && *cp; i++, cp++)
311 ;
312 }
313 }
314 nl = 0;
315 if (*cp == '\n')
316 {
317 if (sz)
318 {
319 index_term(b, term, docid, &seqno);
320 sz = 0;
321 }
322 nl = 1;
323 cp++;
324 }
325 else if (*cp == subfield_char && cp[1])
326 {
327 if (sz)
328 {
329 index_term(b, term, docid, &seqno);
330 sz = 0;
331 }
332 cp += 2;
333 }
334 else if (strchr("$*/-;,.:[]\"&(){} ", *cp))
335 {
336 if (sz)
337 {
338 index_term(b, term, docid, &seqno);
339 sz = 0;
340 }
341 cp++;
342 }
343 else
344 {
345 unsigned ch = *(const unsigned char *)cp;
346 if (sz < sizeof(term))
347 {
348 term[sz] = tolower(ch);
349 term[sz+1] = '\0';
350 sz++;
351 }
352 cp++;
353 }
354 }
355 if (sz)
356 index_term(b, term, docid, &seqno);
357}
358
360 Dict dict,
361 zint *docid_seq,
362 FILE *inf,
363 int memory)
364{
365 WRBUF wrbuf = wrbuf_alloc();
366 int no_docs = 0;
367 int new_rec = 1;
368 char line[4096];
369 struct index_block *b = index_block_new(memory);
370 while(fgets(line, sizeof(line)-1, inf))
371 {
372 if (line[0] == '$')
373 {
374 if (!new_rec)
375 new_rec = 1;
376 else
377 new_rec = 0;
378 continue;
379 }
380 if (new_rec)
381 {
382 (*docid_seq)++;
383 no_docs++;
384 index_block_check_flush(b, isb, dict, no_docs);
385 new_rec = 0;
386 }
387
388 if (line[0] == ' ')
389 {
390 /* continuation */
391 wrbuf_puts(wrbuf, line);
392 continue;
393 }
394 else
395 {
396 /* index existing buffer (if any) */
397 if (wrbuf_len(wrbuf))
398 {
399 index_wrbuf(b, wrbuf, *docid_seq, '*');
400 wrbuf_rewind(wrbuf);
401 }
402 if (line[0] != ' ' && line[1] != ' ' && line[2] != ' ' &&
403 line[3] == ' ')
404 {
405 /* normal field+indicator line */
406 wrbuf_puts(wrbuf, line);
407 }
408 }
409 }
410 if (wrbuf_len(wrbuf))
411 {
412 index_wrbuf(b, wrbuf, *docid_seq, '*');
413 wrbuf_rewind(wrbuf);
414 }
415 (*docid_seq)++;
416 no_docs++;
417 index_block_flush(b, isb, dict, no_docs);
419}
420
422 Dict dict,
423 zint *docid_seq,
424 FILE *inf,
425 int memory,
426 int verbose, int print_offset)
427{
428 yaz_marc_t mt = yaz_marc_create();
429 WRBUF wrbuf = wrbuf_alloc();
430 struct index_block *b = index_block_new(memory);
431 int no_docs = 0;
432
433 while (1)
434 {
435 size_t r;
436 char buf[100001];
437 int len, rlen;
438
439 r = fread (buf, 1, 5, inf);
440 if (r < 5)
441 {
442 if (r && print_offset && verbose)
443 printf ("<!-- Extra %ld bytes at end of file -->\n",
444 (long) r);
445 break;
446 }
447 while (*buf < '0' || *buf > '9')
448 {
449 int i;
450 long off = ftell(inf) - 5;
451 if (verbose || print_offset)
452 printf("<!-- Skipping bad byte %d (0x%02X) at offset "
453 "%ld (0x%lx) -->\n",
454 *buf & 0xff, *buf & 0xff,
455 off, off);
456 for (i = 0; i<4; i++)
457 buf[i] = buf[i+1];
458 r = fread(buf+4, 1, 1, inf);
459 if (r < 1)
460 break;
461 }
462 if (r < 1)
463 {
464 if (verbose || print_offset)
465 printf ("<!-- End of file with data -->\n");
466 break;
467 }
468 len = atoi_n(buf, 5);
469 if (len < 25 || len > 100000)
470 {
471 long off = ftell(inf) - 5;
472 printf("Bad Length %ld read at offset %ld (%lx)\n",
473 (long)len, (long) off, (long) off);
474 break;
475 }
476 rlen = len - 5;
477 r = fread (buf + 5, 1, rlen, inf);
478 if (r < rlen)
479 break;
480 yaz_marc_read_iso2709(mt, buf, len);
481
482 if (yaz_marc_write_line(mt, wrbuf))
483 break;
484
485 index_wrbuf(b, wrbuf, *docid_seq, '$');
486 wrbuf_rewind(wrbuf);
487 (*docid_seq)++;
488
489 no_docs++;
490 index_block_check_flush(b, isb, dict, no_docs);
491 }
492 index_block_flush(b, isb, dict, no_docs);
493 wrbuf_destroy(wrbuf);
494 yaz_marc_destroy(mt);
496}
497
/**
 * \brief Print the command line synopsis to stderr and exit with status 1.
 *
 * The synopsis lists every option accepted by main(), including -N.
 */
void exit_usage(void)
{
    fprintf(stderr, "benchindex1 [-t type] [-c d:i] [-m mem] [-i] [-N] [inputfile]\n");
    exit(1);
}
503
504int main(int argc, char **argv)
505{
506 BFiles bfs;
507 ISAMB isb_postings;
508 ISAMC_M method_postings;
509 Dict dict;
510 int ret;
511 int reset = 0;
512 char *arg;
513 int memory = 5;
514 int isam_cache_size = 40;
515 int dict_cache_size = 50;
516 const char *fname = 0;
517 FILE *inf = stdin;
518 yaz_timing_t tim = 0;
519 zint docid_seq = 1;
520 const char *dict_info;
521 const char *type = "iso2709";
522 int int_count_enable = 1;
523
524 while ((ret = options("im:t:c:N", argv, argc, &arg)) != -2)
525 {
526 switch(ret)
527 {
528 case 'm':
529 memory = atoi(arg);
530 break;
531 case 'i':
532 reset = 1;
533 break;
534 case 't':
535 if (!strcmp(arg, "iso2709"))
536 type = "iso2709";
537 else if (!strcmp(arg, "line"))
538 type = "line";
539 else
540 {
541 fprintf(stderr, "bad type: %s.\n", arg);
542 exit_usage();
543 }
544 break;
545 case 'c':
546 if (sscanf(arg, "%d:%d", &dict_cache_size, &isam_cache_size)
547 != 2)
548 {
549 fprintf(stderr, "bad cache sizes for -c\n");
550 exit_usage();
551 }
552 break;
553 case 0:
554 fname = arg;
555 break;
556 case 'N':
557 int_count_enable = 0;
558 break;
559 default:
560 fprintf(stderr, "bad option.\n");
561 exit_usage();
562 }
563 }
564
565 if (fname)
566 {
567 inf = fopen(fname, "rb");
568 if (!inf)
569 {
570 fprintf(stderr, "Cannot open %s\n", fname);
571 exit(1);
572 }
573 }
574 printf("# benchindex1 %s %s\n", __DATE__, __TIME__);
575 printf("# isam_cache_size = %d\n", isam_cache_size);
576 printf("# dict_cache_size = %d\n", dict_cache_size);
577 printf("# int_count_enable = %d\n", int_count_enable);
578 printf("# memory = %d\n", memory);
579
580 /* setup postings isamb attributes */
581 method_postings.compare_item = key_compare;
582 method_postings.log_item = key_logdump_txt;
583
584 method_postings.codec.start = iscz1_start;
585 method_postings.codec.decode = iscz1_decode;
586 method_postings.codec.encode = iscz1_encode;
587 method_postings.codec.stop = iscz1_stop;
588 method_postings.codec.reset = iscz1_reset;
589
590 method_postings.debug = 0;
591
592 /* create block system */
593 bfs = bfs_create(0, 0);
594 if (!bfs)
595 {
596 yaz_log(YLOG_WARN, "bfs_create failed");
597 exit(1);
598 }
599
600 if (reset)
601 bf_reset(bfs);
602
603 tim = yaz_timing_create();
604 /* create isam handle */
605 isb_postings = isamb_open (bfs, "isamb", isam_cache_size ? 1 : 0,
606 &method_postings, 0);
607 if (!isb_postings)
608 {
609 yaz_log(YLOG_WARN, "isamb_open failed");
610 exit(2);
611 }
612 isamb_set_cache_size(isb_postings, isam_cache_size);
613 isamb_set_int_count(isb_postings, int_count_enable);
614 dict = dict_open(bfs, "dict", dict_cache_size, 1, 0, 4096);
615
616 dict_info = dict_lookup(dict, "_s");
617 if (dict_info)
618 {
619 assert(*dict_info == sizeof(docid_seq));
620 memcpy(&docid_seq, dict_info+1, sizeof(docid_seq));
621 }
622
623 if (!strcmp(type, "iso2709"))
624 index_marc_from_file(isb_postings, dict, &docid_seq, inf, memory,
625 0 /* verbose */ , 0 /* print_offset */);
626 else if (!strcmp(type, "line"))
627 index_marc_line_records(isb_postings, dict, &docid_seq, inf, memory);
628
629 printf("# Total " ZINT_FORMAT " documents\n", docid_seq);
630 dict_insert(dict, "_s", sizeof(docid_seq), &docid_seq);
631
633 isamb_close(isb_postings);
634
635 if (fname)
636 fclose(inf);
637 /* exit block system */
638 bfs_destroy(bfs);
639 yaz_timing_stop(tim);
640
641 printf("# Total timings real=%8.6f user=%3.2f system=%3.2f\n",
642 yaz_timing_get_real(tim),
643 yaz_timing_get_user(tim),
644 yaz_timing_get_sys(tim));
645
646 yaz_timing_destroy(&tim);
647
648 exit(0);
649 return 0;
650}
651/*
652 * Local variables:
653 * c-basic-offset: 4
654 * c-file-style: "Stroustrup"
655 * indent-tabs-mode: nil
656 * End:
657 * vim: shiftwidth=4 tabstop=8 expandtab
658 */
659
static int cmp_ar(const void *p1, const void *p2)
Definition benchindex1.c:76
void exit_usage(void)
void index_block_add(struct index_block *b, const char *term, zint docid, zint seqno)
void index_block_flush(struct index_block *b, ISAMB isb, Dict dict, int no_docs)
int main(int argc, char **argv)
void index_marc_from_file(ISAMB isb, Dict dict, zint *docid_seq, FILE *inf, int memory, int verbose, int print_offset)
void index_marc_line_records(ISAMB isb, Dict dict, zint *docid_seq, FILE *inf, int memory)
void index_block_destroy(struct index_block **bp)
Definition benchindex1.c:66
void index_wrbuf(struct index_block *b, WRBUF wrbuf, zint docid, int subfield_char)
void index_block_check_flush(struct index_block *b, ISAMB isb, Dict dict, int no_docs)
int code_read(void *vp, char **dst, int *insertMode)
Definition benchindex1.c:96
struct index_block * index_block_new(int memory)
Definition benchindex1.c:55
void bf_reset(BFiles bfs)
Removes register and shadow completely.
Definition bfile.c:268
BFiles bfs_create(const char *spec, const char *base)
creates a Block files collection
Definition bfile.c:56
void bfs_destroy(BFiles bfiles)
destroys a block files handle
Definition bfile.c:73
Zebra dictionary.
zint dict_get_no_split(Dict dict)
get number of page split operations, since dict_open
Definition open.c:133
Dict dict_open(BFiles bfs, const char *name, int cache, int rw, int compact_flag, int page_size)
open dictionary
Definition open.c:50
int dict_insert(Dict dict, const char *p, int userlen, void *userinfo)
insert item into dictionary
Definition insert.c:439
char * dict_lookup(Dict dict, const char *p)
lookup item in dictionary
Definition lookup.c:100
int dict_close(Dict dict)
closes dictionary
Definition close.c:32
static Dict dict
Definition dicttest.c:35
void isamb_set_int_count(ISAMB b, int v)
Definition isamb.c:195
zint isamb_get_leaf_splits(ISAMB b)
Definition isamb.c:1664
ISAMB isamb_open(BFiles bfs, const char *name, int writeflag, ISAMC_M *method, int cache)
Definition isamb.c:351
zint isamb_get_int_splits(ISAMB b)
Definition isamb.c:1659
void isamb_close(ISAMB isamb)
Definition isamb.c:455
void isamb_set_cache_size(ISAMB b, int sz)
Definition isamb.c:200
void isamb_merge(ISAMB b, ISAM_P *pos, ISAMC_I *data)
Definition isamb.c:1266
zint ISAM_P
Definition isamc.h:28
void * iscz1_start(void)
Definition it_key.c:130
void iscz1_decode(void *vp, char **dst, const char **src)
Definition it_key.c:238
void iscz1_encode(void *vp, char **dst, const char **src)
Definition it_key.c:190
int key_compare(const void *p1, const void *p2)
Definition it_key.c:74
void key_logdump_txt(int logmask, const void *p, const char *txt)
Definition it_key.c:38
void iscz1_reset(void *vp)
Definition it_key.c:146
void iscz1_stop(void *p)
Definition it_key.c:155
static FILE * inf
Definition readfile.c:37
int(* read_item)(void *clientData, char **dst, int *insertMode)
Definition isamc.h:53
void * clientData
Definition isamc.h:54
int(* compare_item)(const void *a, const void *b)
Definition isamc.h:43
ISAM_CODEC codec
Definition isamc.h:46
void(* log_item)(int logmask, const void *p, const char *txt)
Definition isamc.h:44
int debug
Definition isamc.h:49
void(* decode)(void *p, char **dst, const char **src)
Definition isam-codec.h:26
void(* stop)(void *p)
Definition isam-codec.h:25
void *(* start)(void)
Definition isam-codec.h:24
void(* encode)(void *p, char **dst, const char **src)
Definition isam-codec.h:27
void(* reset)(void *p)
Definition isam-codec.h:28
struct index_term ** ar
Definition benchindex1.c:43
struct index_term * terms
Definition benchindex1.c:42
size_t current_entry
Definition benchindex1.c:40
size_t current_max
Definition benchindex1.c:41
const char * term
Definition benchindex1.c:48
struct index_term * next
Definition benchindex1.c:52
int len
Definition it_key.h:31
zint mem[IT_KEY_LEVEL_MAX]
Definition it_key.h:32
long zint
Zebra integer.
Definition util.h:66
#define ZINT_FORMAT
Definition util.h:72
#define ZINT_FORMAT0
Definition util.h:67