YAZ 5.35.1
stemwords.c
Go to the documentation of this file.
1/* This is a simple program which uses libstemmer to provide a command
2 * line interface for stemming using any of the algorithms provided.
3 */
4
5#include <stdio.h>
6#include <stdlib.h> /* for malloc, free */
7#include <string.h> /* for memmove */
8#include <ctype.h> /* for isupper, tolower */
9
10#include "libstemmer.h"
11
/* Name the program was invoked as; main() sets it from argv[0] and usage()
 * prints it in the synopsis line. */
12const char *progname;
/* Output layout read by stem_file(): 1 prints "word -> stem", 2 prints a
 * two-column layout, any other value prints the stem alone.  main() resets
 * this to 0 before parsing options; -p selects 1 and -p2 selects 2. */
13static int pretty = 1;
14
15static void
16stem_file (struct sb_stemmer *stemmer, FILE *f_in, FILE *f_out)
17{
18#define INC 10
19 int lim = INC;
20 sb_symbol *b = (sb_symbol *) malloc (lim * sizeof (sb_symbol));
21
22 while (1)
23 {
24 int ch = getc (f_in);
25 if (ch == EOF)
26 {
27 free (b);
28 return;
29 }
30 {
31 int i = 0;
32 int inlen = 0;
33 while (1)
34 {
35 if (ch == '\n' || ch == EOF)
36 break;
37 if (i == lim)
38 {
39 sb_symbol *newb;
40 newb = (sb_symbol *)
41 realloc (b, (lim + INC) * sizeof (sb_symbol));
42 if (newb == 0)
43 goto error;
44 b = newb;
45 lim = lim + INC;
46 }
47 /* Update count of utf-8 characters. */
48 if (ch < 0x80 || ch > 0xBF)
49 inlen += 1;
50 /* force lower case: */
51 if (isupper (ch))
52 ch = tolower (ch);
53
54 b[i] = ch;
55 i++;
56 ch = getc (f_in);
57 }
58
59 {
60 const sb_symbol *stemmed = sb_stemmer_stem (stemmer, b, i);
61 if (stemmed == NULL)
62 {
63 fprintf (stderr, "Out of memory");
64 exit (1);
65 }
66 else
67 {
68 if (pretty == 1)
69 {
70 fwrite (b, i, 1, f_out);
71 fputs (" -> ", f_out);
72 }
73 else if (pretty == 2)
74 {
75 fwrite (b, i, 1, f_out);
76 if (sb_stemmer_length (stemmer) > 0)
77 {
78 int j;
79 if (inlen < 30)
80 {
81 for (j = 30 - inlen; j > 0; j--)
82 fputs (" ", f_out);
83 }
84 else
85 {
86 fputs ("\n", f_out);
87 for (j = 30; j > 0; j--)
88 fputs (" ", f_out);
89 }
90 }
91 }
92
93 fputs ((char *) stemmed, f_out);
94 putc ('\n', f_out);
95 }
96 }
97 }
98 }
99error:
100 if (b != 0)
101 free (b);
102 return;
103}
104
108static void
109usage (int n)
110{
111 printf
112 ("usage: %s [-l <language>] [-i <input file>] [-o <output file>] [-c <character encoding>] [-p[2]] [-h]\n"
113 "\n"
114 "The input file consists of a list of words to be stemmed, one per\n"
115 "line. Words should be in lower case, but (for English) A-Z letters\n"
116 "are mapped to their a-z equivalents anyway. If omitted, stdin is\n"
117 "used.\n" "\n"
118 "If -c is given, the argument is the character encoding of the input\n"
119 "and output files. If it is omitted, the UTF-8 encoding is used.\n" "\n"
120 "If -p is given the output file consists of each word of the input\n"
121 "file followed by \"->\" followed by its stemmed equivalent.\n"
122 "If -p2 is given the output file is a two column layout containing\n"
123 "the input words in the first column and the stemmed eqivalents in\n"
124 "the second column.\n"
125 "Otherwise, the output file consists of the stemmed words, one per\n"
126 "line.\n" "\n" "-h displays this help\n", progname);
127 exit (n);
128}
129
130int
131main (int argc, char *argv[])
132{
133 char *in = 0;
134 char *out = 0;
135 FILE *f_in;
136 FILE *f_out;
137 struct sb_stemmer *stemmer;
138
139 char *language = "english";
140 char *charenc = NULL;
141
142 char *s;
143 int i = 1;
144 pretty = 0;
145
146 progname = argv[0];
147
148 while (i < argc)
149 {
150 s = argv[i++];
151 if (s[0] == '-')
152 {
153 if (strcmp (s, "-o") == 0)
154 {
155 if (i >= argc)
156 {
157 fprintf (stderr, "%s requires an argument\n", s);
158 exit (1);
159 }
160 out = argv[i++];
161 }
162 else if (strcmp (s, "-i") == 0)
163 {
164 if (i >= argc)
165 {
166 fprintf (stderr, "%s requires an argument\n", s);
167 exit (1);
168 }
169 in = argv[i++];
170 }
171 else if (strcmp (s, "-l") == 0)
172 {
173 if (i >= argc)
174 {
175 fprintf (stderr, "%s requires an argument\n", s);
176 exit (1);
177 }
178 language = argv[i++];
179 }
180 else if (strcmp (s, "-c") == 0)
181 {
182 if (i >= argc)
183 {
184 fprintf (stderr, "%s requires an argument\n", s);
185 exit (1);
186 }
187 charenc = argv[i++];
188 }
189 else if (strcmp (s, "-p2") == 0)
190 {
191 pretty = 2;
192 }
193 else if (strcmp (s, "-p") == 0)
194 {
195 pretty = 1;
196 }
197 else if (strcmp (s, "-h") == 0)
198 {
199 usage (0);
200 }
201 else
202 {
203 fprintf (stderr, "option %s unknown\n", s);
204 usage (1);
205 }
206 }
207 else
208 {
209 fprintf (stderr, "unexpected parameter %s\n", s);
210 usage (1);
211 }
212 }
213
214 /* prepare the files */
215 f_in = (in == 0) ? stdin : fopen (in, "r");
216 if (f_in == 0)
217 {
218 fprintf (stderr, "file %s not found\n", in);
219 exit (1);
220 }
221 f_out = (out == 0) ? stdout : fopen (out, "w");
222 if (f_out == 0)
223 {
224 fprintf (stderr, "file %s cannot be opened\n", out);
225 exit (1);
226 }
227
228 /* do the stemming process: */
229 stemmer = sb_stemmer_new (language, charenc);
230 if (stemmer == 0)
231 {
232 if (charenc == NULL)
233 {
234 fprintf (stderr, "language `%s' not available for stemming\n",
235 language);
236 exit (1);
237 }
238 else
239 {
240 fprintf (stderr,
241 "language `%s' not available for stemming in encoding `%s'\n",
242 language, charenc);
243 exit (1);
244 }
245 }
246 stem_file (stemmer, f_in, f_out);
247 sb_stemmer_delete (stemmer);
248
249 if (in != 0)
250 (void) fclose (f_in);
251 if (out != 0)
252 (void) fclose (f_out);
253
254 return 0;
255}
void * malloc(YYSIZE_T)
void free(void *)
unsigned char sb_symbol
Definition libstemmer.h:8
int sb_stemmer_length(struct sb_stemmer *stemmer)
Definition libstemmer.c:105
struct sb_stemmer * sb_stemmer_new(const char *algorithm, const char *charenc)
Definition libstemmer.c:40
void sb_stemmer_delete(struct sb_stemmer *stemmer)
Definition libstemmer.c:77
const sb_symbol * sb_stemmer_stem(struct sb_stemmer *stemmer, const sb_symbol *word, int size)
Definition libstemmer.c:89
int main(int argc, char *argv[])
Definition stemwords.c:131
static void usage(int n)
Definition stemwords.c:109
static void stem_file(struct sb_stemmer *stemmer, FILE *f_in, FILE *f_out)
Definition stemwords.c:16
static int pretty
Definition stemwords.c:13
const char * progname
Definition stemwords.c:12
#define INC