YAZ  5.34.0
stemwords.c
Go to the documentation of this file.
1 /* This is a simple program which uses libstemmer to provide a command
2  * line interface for stemming using any of the algorithms provided.
3  */
4 
5 #include <stdio.h>
6 #include <stdlib.h> /* for malloc, free */
7 #include <string.h> /* for memmove */
8 #include <ctype.h> /* for isupper, tolower */
9 
10 #include "libstemmer.h"
11 
/* Program name interpolated into the usage text; main() assigns it from
 * argv[0] before parsing any options. */
12 const char * progname;
/* Output layout selector read by stem_file():
 *   1 - each input word, then " -> ", then its stem;
 *   2 - two-column layout with the stem padded out to column 30;
 *   any other value - the stem only.
 * main() resets this to 0 before option parsing, then -p selects 1 and
 * -p2 selects 2. */
13 static int pretty = 1;
14 
15 static void
16 stem_file(struct sb_stemmer * stemmer, FILE * f_in, FILE * f_out)
17 {
18 #define INC 10
19  int lim = INC;
20  sb_symbol * b = (sb_symbol *) malloc(lim * sizeof(sb_symbol));
21 
22  while(1) {
23  int ch = getc(f_in);
24  if (ch == EOF) {
25  free(b); return;
26  }
27  {
28  int i = 0;
29  int inlen = 0;
30  while(1) {
31  if (ch == '\n' || ch == EOF) break;
32  if (i == lim) {
33  sb_symbol * newb;
34  newb = (sb_symbol *)
35  realloc(b, (lim + INC) * sizeof(sb_symbol));
36  if (newb == 0) goto error;
37  b = newb;
38  lim = lim + INC;
39  }
40  /* Update count of utf-8 characters. */
41  if (ch < 0x80 || ch > 0xBF) inlen += 1;
42  /* force lower case: */
43  if (isupper(ch)) ch = tolower(ch);
44 
45  b[i] = ch;
46  i++;
47  ch = getc(f_in);
48  }
49 
50  {
51  const sb_symbol * stemmed = sb_stemmer_stem(stemmer, b, i);
52  if (stemmed == NULL)
53  {
54  fprintf(stderr, "Out of memory");
55  exit(1);
56  }
57  else
58  {
59  if (pretty == 1) {
60  fwrite(b, i, 1, f_out);
61  fputs(" -> ", f_out);
62  } else if (pretty == 2) {
63  fwrite(b, i, 1, f_out);
64  if (sb_stemmer_length(stemmer) > 0) {
65  int j;
66  if (inlen < 30) {
67  for (j = 30 - inlen; j > 0; j--)
68  fputs(" ", f_out);
69  } else {
70  fputs("\n", f_out);
71  for (j = 30; j > 0; j--)
72  fputs(" ", f_out);
73  }
74  }
75  }
76 
77  fputs((char *)stemmed, f_out);
78  putc('\n', f_out);
79  }
80  }
81  }
82  }
83 error:
84  if (b != 0) free(b);
85  return;
86 }
87 
91 static void
92 usage(int n)
93 {
94  printf("usage: %s [-l <language>] [-i <input file>] [-o <output file>] [-c <character encoding>] [-p[2]] [-h]\n"
95  "\n"
96  "The input file consists of a list of words to be stemmed, one per\n"
97  "line. Words should be in lower case, but (for English) A-Z letters\n"
98  "are mapped to their a-z equivalents anyway. If omitted, stdin is\n"
99  "used.\n"
100  "\n"
101  "If -c is given, the argument is the character encoding of the input\n"
102  "and output files. If it is omitted, the UTF-8 encoding is used.\n"
103  "\n"
104  "If -p is given the output file consists of each word of the input\n"
105  "file followed by \"->\" followed by its stemmed equivalent.\n"
106  "If -p2 is given the output file is a two column layout containing\n"
107  "the input words in the first column and the stemmed eqivalents in\n"
108  "the second column.\n"
109  "Otherwise, the output file consists of the stemmed words, one per\n"
110  "line.\n"
111  "\n"
112  "-h displays this help\n",
113  progname);
114  exit(n);
115 }
116 
117 int
118 main(int argc, char * argv[])
119 {
120  char * in = 0;
121  char * out = 0;
122  FILE * f_in;
123  FILE * f_out;
124  struct sb_stemmer * stemmer;
125 
126  char * language = "english";
127  char * charenc = NULL;
128 
129  char * s;
130  int i = 1;
131  pretty = 0;
132 
133  progname = argv[0];
134 
135  while(i < argc) {
136  s = argv[i++];
137  if (s[0] == '-') {
138  if (strcmp(s, "-o") == 0) {
139  if (i >= argc) {
140  fprintf(stderr, "%s requires an argument\n", s);
141  exit(1);
142  }
143  out = argv[i++];
144  } else if (strcmp(s, "-i") == 0) {
145  if (i >= argc) {
146  fprintf(stderr, "%s requires an argument\n", s);
147  exit(1);
148  }
149  in = argv[i++];
150  } else if (strcmp(s, "-l") == 0) {
151  if (i >= argc) {
152  fprintf(stderr, "%s requires an argument\n", s);
153  exit(1);
154  }
155  language = argv[i++];
156  } else if (strcmp(s, "-c") == 0) {
157  if (i >= argc) {
158  fprintf(stderr, "%s requires an argument\n", s);
159  exit(1);
160  }
161  charenc = argv[i++];
162  } else if (strcmp(s, "-p2") == 0) {
163  pretty = 2;
164  } else if (strcmp(s, "-p") == 0) {
165  pretty = 1;
166  } else if (strcmp(s, "-h") == 0) {
167  usage(0);
168  } else {
169  fprintf(stderr, "option %s unknown\n", s);
170  usage(1);
171  }
172  } else {
173  fprintf(stderr, "unexpected parameter %s\n", s);
174  usage(1);
175  }
176  }
177 
178  /* prepare the files */
179  f_in = (in == 0) ? stdin : fopen(in, "r");
180  if (f_in == 0) {
181  fprintf(stderr, "file %s not found\n", in);
182  exit(1);
183  }
184  f_out = (out == 0) ? stdout : fopen(out, "w");
185  if (f_out == 0) {
186  fprintf(stderr, "file %s cannot be opened\n", out);
187  exit(1);
188  }
189 
190  /* do the stemming process: */
191  stemmer = sb_stemmer_new(language, charenc);
192  if (stemmer == 0) {
193  if (charenc == NULL) {
194  fprintf(stderr, "language `%s' not available for stemming\n", language);
195  exit(1);
196  } else {
197  fprintf(stderr, "language `%s' not available for stemming in encoding `%s'\n", language, charenc);
198  exit(1);
199  }
200  }
201  stem_file(stemmer, f_in, f_out);
202  sb_stemmer_delete(stemmer);
203 
204  if (in != 0) (void) fclose(f_in);
205  if (out != 0) (void) fclose(f_out);
206 
207  return 0;
208 }
209 
void * malloc(YYSIZE_T)
void free(void *)
unsigned char sb_symbol
Definition: libstemmer.h:7
int sb_stemmer_length(struct sb_stemmer *stemmer)
Definition: libstemmer.c:92
struct sb_stemmer * sb_stemmer_new(const char *algorithm, const char *charenc)
Definition: libstemmer.c:35
const sb_symbol * sb_stemmer_stem(struct sb_stemmer *stemmer, const sb_symbol *word, int size)
Definition: libstemmer.c:77
void sb_stemmer_delete(struct sb_stemmer *stemmer)
Definition: libstemmer.c:67
int main(int argc, char *argv[])
Definition: stemwords.c:118
static void usage(int n)
Definition: stemwords.c:92
static void stem_file(struct sb_stemmer *stemmer, FILE *f_in, FILE *f_out)
Definition: stemwords.c:16
static int pretty
Definition: stemwords.c:13
const char * progname
Definition: stemwords.c:12
#define INC