YAZ 5.37.0
stemwords.c
Go to the documentation of this file.
1/* This is a simple program which uses libstemmer to provide a command
2 * line interface for stemming using any of the algorithms provided.
3 */
4
5#include <stdio.h>
6#include <stdlib.h> /* for malloc, free */
7#include <string.h> /* for memmove */
8#include <ctype.h> /* for isupper, tolower */
9
10#include "libstemmer.h"
11
/* Name the program was invoked as: main() stores argv[0] here and usage()
 * prints it in the first line of the help text. */
12const char * progname;
/* Output layout read by stem_file(): 1 writes "word -> stem", 2 writes a
 * two-column layout, any other value writes the stem alone.  main() resets
 * this to 0 before parsing options, then -p sets 1 and -p2 sets 2. */
13static int pretty = 1;
14
15static void
16stem_file(struct sb_stemmer * stemmer, FILE * f_in, FILE * f_out)
17{
18#define INC 10
19 int lim = INC;
20 sb_symbol * b = (sb_symbol *) malloc(lim * sizeof(sb_symbol));
21
22 while(1) {
23 int ch = getc(f_in);
24 if (ch == EOF) {
25 free(b); return;
26 }
27 {
28 int i = 0;
29 int inlen = 0;
30 while(1) {
31 if (ch == '\n' || ch == EOF) break;
32 if (i == lim) {
33 sb_symbol * newb;
34 newb = (sb_symbol *)
35 realloc(b, (lim + INC) * sizeof(sb_symbol));
36 if (newb == 0) goto error;
37 b = newb;
38 lim = lim + INC;
39 }
40 /* Update count of utf-8 characters. */
41 if (ch < 0x80 || ch > 0xBF) inlen += 1;
42 /* force lower case: */
43 if (isupper(ch)) ch = tolower(ch);
44
45 b[i] = ch;
46 i++;
47 ch = getc(f_in);
48 }
49
50 {
51 const sb_symbol * stemmed = sb_stemmer_stem(stemmer, b, i);
52 if (stemmed == NULL)
53 {
54 fprintf(stderr, "Out of memory");
55 exit(1);
56 }
57 else
58 {
59 if (pretty == 1) {
60 fwrite(b, i, 1, f_out);
61 fputs(" -> ", f_out);
62 } else if (pretty == 2) {
63 fwrite(b, i, 1, f_out);
64 if (sb_stemmer_length(stemmer) > 0) {
65 int j;
66 if (inlen < 30) {
67 for (j = 30 - inlen; j > 0; j--)
68 fputs(" ", f_out);
69 } else {
70 fputs("\n", f_out);
71 for (j = 30; j > 0; j--)
72 fputs(" ", f_out);
73 }
74 }
75 }
76
77 fputs((char *)stemmed, f_out);
78 putc('\n', f_out);
79 }
80 }
81 }
82 }
83error:
84 if (b != 0) free(b);
85 return;
86}
87
91static void
92usage(int n)
93{
94 printf("usage: %s [-l <language>] [-i <input file>] [-o <output file>] [-c <character encoding>] [-p[2]] [-h]\n"
95 "\n"
96 "The input file consists of a list of words to be stemmed, one per\n"
97 "line. Words should be in lower case, but (for English) A-Z letters\n"
98 "are mapped to their a-z equivalents anyway. If omitted, stdin is\n"
99 "used.\n"
100 "\n"
101 "If -c is given, the argument is the character encoding of the input\n"
102 "and output files. If it is omitted, the UTF-8 encoding is used.\n"
103 "\n"
104 "If -p is given the output file consists of each word of the input\n"
105 "file followed by \"->\" followed by its stemmed equivalent.\n"
106 "If -p2 is given the output file is a two column layout containing\n"
107 "the input words in the first column and the stemmed eqivalents in\n"
108 "the second column.\n"
109 "Otherwise, the output file consists of the stemmed words, one per\n"
110 "line.\n"
111 "\n"
112 "-h displays this help\n",
113 progname);
114 exit(n);
115}
116
117int
118main(int argc, char * argv[])
119{
120 char * in = 0;
121 char * out = 0;
122 FILE * f_in;
123 FILE * f_out;
124 struct sb_stemmer * stemmer;
125
126 char * language = "english";
127 char * charenc = NULL;
128
129 char * s;
130 int i = 1;
131 pretty = 0;
132
133 progname = argv[0];
134
135 while(i < argc) {
136 s = argv[i++];
137 if (s[0] == '-') {
138 if (strcmp(s, "-o") == 0) {
139 if (i >= argc) {
140 fprintf(stderr, "%s requires an argument\n", s);
141 exit(1);
142 }
143 out = argv[i++];
144 } else if (strcmp(s, "-i") == 0) {
145 if (i >= argc) {
146 fprintf(stderr, "%s requires an argument\n", s);
147 exit(1);
148 }
149 in = argv[i++];
150 } else if (strcmp(s, "-l") == 0) {
151 if (i >= argc) {
152 fprintf(stderr, "%s requires an argument\n", s);
153 exit(1);
154 }
155 language = argv[i++];
156 } else if (strcmp(s, "-c") == 0) {
157 if (i >= argc) {
158 fprintf(stderr, "%s requires an argument\n", s);
159 exit(1);
160 }
161 charenc = argv[i++];
162 } else if (strcmp(s, "-p2") == 0) {
163 pretty = 2;
164 } else if (strcmp(s, "-p") == 0) {
165 pretty = 1;
166 } else if (strcmp(s, "-h") == 0) {
167 usage(0);
168 } else {
169 fprintf(stderr, "option %s unknown\n", s);
170 usage(1);
171 }
172 } else {
173 fprintf(stderr, "unexpected parameter %s\n", s);
174 usage(1);
175 }
176 }
177
178 /* prepare the files */
179 f_in = (in == 0) ? stdin : fopen(in, "r");
180 if (f_in == 0) {
181 fprintf(stderr, "file %s not found\n", in);
182 exit(1);
183 }
184 f_out = (out == 0) ? stdout : fopen(out, "w");
185 if (f_out == 0) {
186 fprintf(stderr, "file %s cannot be opened\n", out);
187 exit(1);
188 }
189
190 /* do the stemming process: */
191 stemmer = sb_stemmer_new(language, charenc);
192 if (stemmer == 0) {
193 if (charenc == NULL) {
194 fprintf(stderr, "language `%s' not available for stemming\n", language);
195 exit(1);
196 } else {
197 fprintf(stderr, "language `%s' not available for stemming in encoding `%s'\n", language, charenc);
198 exit(1);
199 }
200 }
201 stem_file(stemmer, f_in, f_out);
202 sb_stemmer_delete(stemmer);
203
204 if (in != 0) (void) fclose(f_in);
205 if (out != 0) (void) fclose(f_out);
206
207 return 0;
208}
209
void * malloc(YYSIZE_T)
void free(void *)
unsigned char sb_symbol
Definition libstemmer.h:8
int sb_stemmer_length(struct sb_stemmer *stemmer)
Definition libstemmer.c:92
struct sb_stemmer * sb_stemmer_new(const char *algorithm, const char *charenc)
Definition libstemmer.c:35
void sb_stemmer_delete(struct sb_stemmer *stemmer)
Definition libstemmer.c:67
const sb_symbol * sb_stemmer_stem(struct sb_stemmer *stemmer, const sb_symbol *word, int size)
Definition libstemmer.c:77
int main(int argc, char *argv[])
Definition stemwords.c:118
static void usage(int n)
Definition stemwords.c:92
static void stem_file(struct sb_stemmer *stemmer, FILE *f_in, FILE *f_out)
Definition stemwords.c:16
static int pretty
Definition stemwords.c:13
const char * progname
Definition stemwords.c:12
#define INC