extractjson

extract embedded JSON metadata from HTML pages
git clone git://git.codemadness.org/extractjson
Log | Files | Refs | README | LICENSE

extractjson.c (7744B)


      1 #include <ctype.h>
      2 #include <errno.h>
      3 #include <stdio.h>
      4 #include <stdlib.h>
      5 #include <string.h>
      6 #include <strings.h>
      7 
      8 #define GETNEXT getnext
      9 
     10 #define ISALPHA(c) ((((unsigned)c) | 32) - 'a' < 26)
     11 #define ISSPACE(c) ((c) == ' ' || ((((unsigned)c) - '\t') < 5))
     12 
     13 typedef struct xmlparser {
     14 	/* current tag */
     15 	char tag[1024];
     16 	size_t taglen;
     17 	/* current tag is in shortform ? <tag /> */
     18 	int isshorttag;
     19 	/* current attribute name */
     20 	char name[1024];
     21 	/* data buffer used for tag data, cdata and attribute data */
     22 	char data[BUFSIZ];
     23 } XMLParser;
     24 
     25 static XMLParser parser;
     26 static int isjson;
     27 static const char *ignorestate, *endtag;
     28 static int (*getnext)(void) = getchar;
     29 
     30 /* ignore parsing all HTML data inside <script> tags, because they may contain
     31    characters such as '<' and '>' */
     32 static int
     33 getnext_json(void)
     34 {
     35 	int c;
     36 
     37 	if ((c = getchar()) == EOF)
     38 		return EOF;
     39 
     40 	if (tolower(c) == tolower((unsigned char)*ignorestate)) {
     41 		ignorestate++;
     42 		if (*ignorestate == '\0') {
     43 			getnext = getchar; /* restore */
     44 			putchar('\n');
     45 			isjson = 0;
     46 			return c;
     47 		}
     48 
     49 	} else {
     50 		ignorestate = endtag;
     51 		if (c != '\r' && c != '\n')
     52 			putchar(c);
     53 	}
     54 
     55 	return ' ';
     56 }
     57 
     58 static void
     59 xmlattr(XMLParser *x, const char *t, size_t tl, const char *a, size_t al,
     60         const char *v, size_t vl)
     61 {
     62 	if (!strcasecmp(t, "script") &&
     63 	    !strcasecmp(a, "type")  &&
     64 	    (strstr(v, "application/json") ||
     65 	    strstr(v, "application/ld+json") ||
     66 	    strstr(v, "text/json")))
     67 		isjson = 1;
     68 }
     69 
     70 static void
     71 xmltagstartparsed(XMLParser *x, const char *t, size_t tl, int isshort)
     72 {
     73 	if (!strcasecmp(t, "script") && isjson) {
     74 		ignorestate = endtag = "</script>";
     75 		getnext = getnext_json;
     76 		return;
     77 	}
     78 }
     79 
     80 static void
     81 xml_parseattrs(XMLParser *x)
     82 {
     83 	size_t namelen = 0, valuelen;
     84 	int c, endsep, endname = 0, valuestart = 0;
     85 
     86 	while ((c = GETNEXT()) != EOF) {
     87 		if (ISSPACE(c)) {
     88 			if (namelen)
     89 				endname = 1;
     90 			continue;
     91 		} else if (c == '?')
     92 			; /* ignore */
     93 		else if (c == '=') {
     94 			x->name[namelen] = '\0';
     95 			valuestart = 1;
     96 			endname = 1;
     97 		} else if (namelen && ((endname && !valuestart && ISALPHA(c)) || (c == '>' || c == '/'))) {
     98 			/* attribute without value */
     99 			xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0);
    100 			x->name[namelen] = '\0';
    101 			endname = 0;
    102 			x->name[0] = c;
    103 			namelen = 1;
    104 		} else if (namelen && valuestart) {
    105 			/* attribute with value */
    106 			valuelen = 0;
    107 			if (c == '\'' || c == '"') {
    108 				endsep = c;
    109 			} else {
    110 				endsep = ' '; /* ISSPACE() */
    111 				goto startvalue;
    112 			}
    113 
    114 			while ((c = GETNEXT()) != EOF) {
    115 startvalue:
    116 				if (c == '&') { /* entities */
    117 					x->data[valuelen] = '\0';
    118 					/* call data function with data before entity if there is data */
    119 					if (valuelen)
    120 						xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
    121 					x->data[0] = c;
    122 					valuelen = 1;
    123 					while ((c = GETNEXT()) != EOF) {
    124 						if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c))))
    125 							break;
    126 						if (valuelen < sizeof(x->data) - 1)
    127 							x->data[valuelen++] = c;
    128 						else {
    129 							/* entity too long for buffer, handle as normal data */
    130 							x->data[valuelen] = '\0';
    131 							xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
    132 							x->data[0] = c;
    133 							valuelen = 1;
    134 							break;
    135 						}
    136 						if (c == ';') {
    137 							x->data[valuelen] = '\0';
    138 							valuelen = 0;
    139 							break;
    140 						}
    141 					}
    142 				} else if (c != endsep && !(endsep == ' ' && (c == '>' || ISSPACE(c)))) {
    143 					if (valuelen < sizeof(x->data) - 1) {
    144 						x->data[valuelen++] = c;
    145 					} else {
    146 						x->data[valuelen] = '\0';
    147 						xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
    148 						x->data[0] = c;
    149 						valuelen = 1;
    150 					}
    151 				}
    152 				if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c)))) {
    153 					x->data[valuelen] = '\0';
    154 					xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
    155 					break;
    156 				}
    157 			}
    158 			namelen = endname = valuestart = 0;
    159 		} else if (namelen < sizeof(x->name) - 1) {
    160 			x->name[namelen++] = c;
    161 		}
    162 		if (c == '>') {
    163 			break;
    164 		} else if (c == '/') {
    165 			x->isshorttag = 1;
    166 			x->name[0] = '\0';
    167 			namelen = 0;
    168 		}
    169 	}
    170 }
    171 
    172 static void
    173 xml_parsecomment(XMLParser *x)
    174 {
    175 	int c, i = 0;
    176 
    177 	while ((c = GETNEXT()) != EOF) {
    178 		if (c == '-') {
    179 			if (++i > 2)
    180 				i = 2;
    181 			continue;
    182 		} else if (c == '>' && i == 2) {
    183 			return;
    184 		} else if (i) {
    185 			i = 0;
    186 		}
    187 	}
    188 }
    189 
    190 static void
    191 xml_parsecdata(XMLParser *x)
    192 {
    193 	size_t datalen = 0, i = 0;
    194 	int c;
    195 
    196 	while ((c = GETNEXT()) != EOF) {
    197 		if (c == ']') {
    198 			if (++i > 2)
    199 				i = 2;
    200 			continue;
    201 		} else if (c == '>' && i == 2) {
    202 			return;
    203 		} else if (i) {
    204 			i = 0;
    205 		}
    206 
    207 		if (datalen < sizeof(x->data) - 1) {
    208 			x->data[datalen++] = c;
    209 		} else {
    210 			x->data[datalen] = '\0';
    211 			x->data[0] = c;
    212 			datalen = 1;
    213 		}
    214 	}
    215 }
    216 
    217 static void
    218 xml_parse(XMLParser *x)
    219 {
    220 	size_t datalen, tagdatalen;
    221 	int c, isend;
    222 
    223 	while ((c = GETNEXT()) != EOF && c != '<')
    224 		; /* skip until < */
    225 
    226 	while (c != EOF) {
    227 		if (c == '<') { /* parse tag */
    228 			if ((c = GETNEXT()) == EOF)
    229 				return;
    230 
    231 			if (c == '!') { /* cdata and comments */
    232 				for (tagdatalen = 0; (c = GETNEXT()) != EOF;) {
    233 					/* NOTE: sizeof(x->data) must be at least sizeof("[CDATA[") */
    234 					if (tagdatalen <= sizeof("[CDATA[") - 1)
    235 						x->data[tagdatalen++] = c;
    236 					if (c == '>')
    237 						break;
    238 					else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
    239 							(x->data[0] == '-')) {
    240 						xml_parsecomment(x);
    241 						break;
    242 					} else if (c == '[') {
    243 						if (tagdatalen == sizeof("[CDATA[") - 1 &&
    244 						    !strncmp(x->data, "[CDATA[", tagdatalen)) {
    245 							xml_parsecdata(x);
    246 							break;
    247 						}
    248 					}
    249 				}
    250 			} else {
    251 				/* normal tag (open, short open, close), processing instruction. */
    252 				x->tag[0] = c;
    253 				x->taglen = 1;
    254 				x->isshorttag = isend = 0;
    255 
    256 				/* treat processing instruction as shorttag, don't strip "?" prefix. */
    257 				if (c == '?') {
    258 					x->isshorttag = 1;
    259 				} else if (c == '/') {
    260 					if ((c = GETNEXT()) == EOF)
    261 						return;
    262 					x->tag[0] = c;
    263 					isend = 1;
    264 				}
    265 
    266 				while ((c = GETNEXT()) != EOF) {
    267 					if (c == '/')
    268 						x->isshorttag = 1; /* short tag */
    269 					else if (c == '>' || ISSPACE(c)) {
    270 						x->tag[x->taglen] = '\0';
    271 						if (isend) { /* end tag, starts with </ */
    272 							while (c != '>' && c != EOF) /* skip until > */
    273 								c = GETNEXT();
    274 							x->tag[0] = '\0';
    275 							x->taglen = 0;
    276 						} else {
    277 							/* start tag */
    278 							if (ISSPACE(c))
    279 								xml_parseattrs(x);
    280 							xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag);
    281 						}
    282 						/* call tagend for shortform or processing instruction */
    283 						if (x->isshorttag) {
    284 							x->tag[0] = '\0';
    285 							x->taglen = 0;
    286 						}
    287 						break;
    288 					} else if (x->taglen < sizeof(x->tag) - 1)
    289 						x->tag[x->taglen++] = c; /* NOTE: tag name truncation */
    290 				}
    291 			}
    292 		} else {
    293 			/* parse tag data */
    294 			datalen = 0;
    295 			while ((c = GETNEXT()) != EOF) {
    296 				if (c == '&') {
    297 					if (datalen)
    298 						x->data[datalen] = '\0';
    299 					x->data[0] = c;
    300 					datalen = 1;
    301 					while ((c = GETNEXT()) != EOF) {
    302 						if (c == '<')
    303 							break;
    304 						if (datalen < sizeof(x->data) - 1)
    305 							x->data[datalen++] = c;
    306 						else {
    307 							/* entity too long for buffer, handle as normal data */
    308 							x->data[datalen] = '\0';
    309 							x->data[0] = c;
    310 							datalen = 1;
    311 							break;
    312 						}
    313 						if (c == ';') {
    314 							x->data[datalen] = '\0';
    315 							datalen = 0;
    316 							break;
    317 						}
    318 					}
    319 				} else if (c != '<') {
    320 					if (datalen < sizeof(x->data) - 1) {
    321 						x->data[datalen++] = c;
    322 					} else {
    323 						x->data[datalen] = '\0';
    324 						x->data[0] = c;
    325 						datalen = 1;
    326 					}
    327 				}
    328 				if (c == '<') {
    329 					x->data[datalen] = '\0';
    330 					break;
    331 				}
    332 			}
    333 		}
    334 	}
    335 }
    336 
    337 int
    338 main(void)
    339 {
    340 	xml_parse(&parser);
    341 
    342 	return 0;
    343 }