metaproxy 1.22.1
html_parser.cpp
Go to the documentation of this file.
1/* This file is part of Metaproxy.
2 Copyright (C) Index Data
3
4Metaproxy is free software; you can redistribute it and/or modify it under
5the terms of the GNU General Public License as published by the Free
6Software Foundation; either version 2, or (at your option) any later
7version.
8
9Metaproxy is distributed in the hope that it will be useful, but WITHOUT ANY
10WARRANTY; without even the implied warranty of MERCHANTABILITY or
11FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12for more details.
13
14You should have received a copy of the GNU General Public License
15along with this program; if not, write to the Free Software
16Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17*/
18
19#include "config.hpp"
20#include "html_parser.hpp"
21
22#include <assert.h>
23#include <string.h>
24#include <stdlib.h>
25#include <ctype.h>
26#include <stdio.h>
27#include <yaz/matchstr.h>
28
29#define SPACECHR " \t\r\n\f"
30
31// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html
32
33namespace metaproxy_1 {
35 friend class HTMLParser;
36 public:
37 void parse_str(HTMLParserEvent &event, const char *cp);
38 void tagText(HTMLParserEvent &event,
39 const char *text_start, const char *text_end);
40 int tagEnd(HTMLParserEvent &event,
41 const char *tag, int tag_len, const char *cp);
42 int tagAttrs(HTMLParserEvent &event,
43 const char *name, int len,
44 const char *cp);
46 const char *cp, int *attr_len,
47 const char **value, int *val_len, int *tr);
48 Rep();
49 ~Rep();
51 bool nest;
52 };
53}
54
55namespace mp = metaproxy_1;
56
57mp::HTMLParser::Rep::Rep()
58{
59 m_verbose = 0;
60 nest = true;
61}
62
63mp::HTMLParser::Rep::~Rep()
64{
65}
66
67mp::HTMLParser::HTMLParser() : m_p(new Rep)
68{
69}
70
71mp::HTMLParser::~HTMLParser()
72{
73}
74
75void mp::HTMLParser::set_verbose(int v)
76{
77 m_p->m_verbose = v;
78}
79
80
81void mp::HTMLParser::parse(mp::HTMLParserEvent & event, const char *str) const
82{
83 m_p->parse_str(event, str);
84}
85
// Return 1 if c is an ASCII letter (a-z or A-Z), otherwise 0.
// Unlike isalpha() the result does not depend on the locale and any int,
// including negative values, is a valid argument.
static int isAlpha(int c)
{
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
}
90
// Return the number of leading characters of cp that are HTML white space
// (any character in SPACECHR). Scanning stops at the terminating NUL.
static int skipSpace(const char *cp)
{
    return static_cast<int>(strspn(cp, SPACECHR));
}
98
// Return the length of the tag or attribute name at the start of cp.
// A name ends at the first white space character, '/', '>', '<' or '=',
// or at the terminating NUL.
static int skipName(const char *cp)
{
    return static_cast<int>(strcspn(cp, SPACECHR "/><="));
}
106
107int mp::HTMLParser::Rep::skipAttribute(HTMLParserEvent &event,
108 const char *cp, int *attr_len,
109 const char **value, int *val_len,
110 int *tr)
111{
112 int v0, v1;
113 int i = skipName(cp);
114 *attr_len = i;
115 *value = NULL;
116 if (!i)
117 return skipSpace(cp);
118 i += skipSpace(cp + i);
119 if (cp[i] == '=')
120 {
121 i++;
122 i += skipSpace(cp + i);
123 if (cp[i] == '\"' || cp[i] == '\'')
124 {
125 *tr = cp[i];
126 v0 = ++i;
127 while (cp[i] != *tr && cp[i])
128 i++;
129 v1 = i;
130 if (cp[i])
131 i++;
132 }
133 else
134 {
135 *tr = 0;
136 v0 = i;
137 while (cp[i] && !strchr(SPACECHR ">", cp[i]))
138 i++;
139 v1 = i;
140 }
141 *value = cp + v0;
142 *val_len = v1 - v0;
143 i += skipSpace(cp + i);
144 }
145 return i;
146}
147
148int mp::HTMLParser::Rep::tagAttrs(HTMLParserEvent &event,
149 const char *name, int len,
150 const char *cp)
151{
152 int i = skipSpace(cp);
153 while (cp[i] && !strchr("/><", cp[i]))
154 {
155 const char *attr_name = cp + i;
156 int attr_len;
157 const char *value;
158 int val_len;
159 int tr;
160 char x[2];
161 int nor = skipAttribute(event, cp+i, &attr_len, &value, &val_len, &tr);
162 if (!nor)
163 break;
164 i += nor;
165
166 x[0] = tr;
167 x[1] = 0;
168 if (m_verbose)
169 {
170 printf("------ attr %.*s", attr_len, attr_name);
171 if (value)
172 printf("=%.*s", val_len, value);
173 printf("\n");
174 }
175 event.attribute(name, len, attr_name, attr_len, value, val_len, x);
176 }
177 return i;
178}
179
180int mp::HTMLParser::Rep::tagEnd(HTMLParserEvent &event,
181 const char *tag, int tag_len, const char *cp)
182{
183 int i = 0;
184 int close_it = 0;
185 for (; cp[i] && !strchr("/><", cp[i]); i++)
186 ;
187 if (i > 0)
188 {
189 if (m_verbose)
190 printf("------ text %.*s\n", i, cp);
191 event.text(cp, i);
192 }
193 if (cp[i] == '/')
194 {
195 close_it = 1;
196 i++;
197 }
198 if (cp[i] == '>')
199 {
200 if (m_verbose)
201 printf("------ any tag %s %.*s\n",
202 close_it ? "close" : "end", tag_len, tag);
203 event.anyTagEnd(tag, tag_len, close_it);
204 i++;
205 }
206 return i;
207}
208
209void mp::HTMLParser::Rep::tagText(HTMLParserEvent &event,
210 const char *text_start, const char *text_end)
211{
212 if (text_end - text_start) //got text to flush
213 {
214 if (m_verbose)
215 printf("------ text %.*s\n",
216 (int) (text_end - text_start), text_start);
217 event.text(text_start, text_end-text_start);
218 }
219}
220
221void mp::HTMLParser::Rep::parse_str(HTMLParserEvent &event, const char *cp)
222{
223 const char *text_start = cp;
224 while (*cp)
225 {
226 if (*cp++ != '<')
227 continue;
228
229 if (nest && *cp == '!')
230 {
231 int i;
232 tagText(event, text_start, cp - 1);
233 if (cp[1] == '-' && cp[2] == '-')
234 {
235 for (i = 3; cp[i]; i++)
236 if (cp[i] == '-' && cp[i+1] == '-' && cp[i+2] == '>')
237 {
238 i+= 2;
239 event.openTagStart(cp, i);
240 break;
241 }
242 }
243 else
244 {
245 for (i = 1; cp[i] && cp[i] != '>'; i++)
246 ;
247 event.openTagStart(cp, i);
248 }
249 if (m_verbose)
250 printf("------ dtd %.*s\n", i, cp);
251 i += tagEnd(event, cp, i, cp + i);
252 cp += i;
253 text_start = cp;
254 }
255 else if (nest && *cp == '?')
256 {
257 int i;
258 tagText(event, text_start, cp - 1);
259 for (i = 1; cp[i] && cp[i] != '>'; i++)
260 ;
261 event.openTagStart(cp, i);
262 if (m_verbose)
263 printf("------ pi %.*s\n", i, cp);
264 i += tagEnd(event, cp, i, cp + i);
265 cp += i;
266 text_start = cp;
267 }
268 else if (*cp == '/' && isAlpha(cp[1]))
269 {
270 int i;
271
272 i = skipName(++cp);
273
274 if (!nest)
275 {
276 if (i == 6 && !yaz_strncasecmp(cp, "script", i))
277 {
278 int ws = skipSpace(cp + 6);
279 if (cp[ws + 6] == '>')
280 nest = true; /* really terminated */
281 }
282 if (!nest)
283 continue;
284 }
285 tagText(event, text_start, cp - 2);
286 event.closeTag(cp, i);
287 if (m_verbose)
288 printf("------ tag close %.*s\n", i, cp);
289 i += tagEnd(event, cp, i, cp + i);
290 cp += i;
291 text_start = cp;
292 }
293 else if (nest && isAlpha(*cp))
294 {
295 int i, j;
296 tagText(event, text_start, cp - 1);
297 i = skipName(cp);
298 event.openTagStart(cp, i);
299 if (m_verbose)
300 printf("------ tag open %.*s\n", i, cp);
301 j = tagAttrs(event, cp, i, cp + i);
302 j += tagEnd(event, cp, i, cp + i + j);
303
304 if (i == 6 && !yaz_strncasecmp(cp, "script", i))
305 nest = false;
306
307 cp += i + j;
308 text_start = cp;
309 }
310 }
311 tagText(event, text_start, cp);
312}
313
314mp::HTMLParserEvent::~HTMLParserEvent()
315{
316}
317
318/*
319 * Local variables:
320 * c-basic-offset: 4
321 * c-file-style: "Stroustrup"
322 * indent-tabs-mode: nil
323 * End:
324 * vim: shiftwidth=4 tabstop=8 expandtab
325 */
326
int tagEnd(HTMLParserEvent &event, const char *tag, int tag_len, const char *cp)
void tagText(HTMLParserEvent &event, const char *text_start, const char *text_end)
int tagAttrs(HTMLParserEvent &event, const char *name, int len, const char *cp)
void parse_str(HTMLParserEvent &event, const char *cp)
int skipAttribute(HTMLParserEvent &event, const char *cp, int *attr_len, const char **value, int *val_len, int *tr)
static int skipName(const char *cp)
#define SPACECHR
static int isAlpha(int c)
static int skipSpace(const char *cp)