extractjson.c (7744B)
#include <ctype.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>

/* all parser input goes through the `getnext` function pointer so that the
   input source can be swapped while inside a JSON <script> element */
#define GETNEXT getnext

/* locale-independent ASCII classification; arguments are fully parenthesized
   so that expressions can be passed safely */
#define ISALPHA(c) ((((unsigned)(c)) | 32) - 'a' < 26)
#define ISSPACE(c) ((c) == ' ' || ((((unsigned)(c)) - '\t') < 5))

typedef struct xmlparser {
	/* current tag */
	char tag[1024];
	size_t taglen;
	/* current tag is in shortform ? <tag /> */
	int isshorttag;
	/* current attribute name */
	char name[1024];
	/* data buffer used for tag data, cdata and attribute data */
	char data[BUFSIZ];
} XMLParser;

/* end tag that terminates raw JSON mode (compared case-insensitively) */
static const char scriptend[] = "</script>";
/* input bytes that matched a prefix of the end tag so far, kept with their
   original case so they can be written out if the match fails */
static char matched[sizeof(scriptend)];

static XMLParser parser;
/* set by xmlattr() when the current <script> tag has a JSON "type" value */
static int isjson;
/* endtag: start of the end tag string, ignorestate: next character to match */
static const char *ignorestate, *endtag;
static int (*getnext)(void) = getchar;

/*
 * Input function used while inside a JSON <script> element.
 *
 * Reads one byte from stdin and copies the script content to stdout (CR and
 * LF are dropped) instead of handing it to the parser, because the content
 * may contain characters such as '<' and '>'. The parser receives ' ' for
 * every consumed byte. When the complete end tag has been seen a newline is
 * written, `getnext` is restored to getchar, `isjson` is cleared and the final
 * matching byte is returned to the parser.
 *
 * Bytes that match only a prefix of the end tag are content: they are
 * buffered and written out when the match fails (or at EOF), and the byte
 * that broke the match is re-tested against the start of the end tag.
 *
 * Returns EOF at end of input.
 */
static int
getnext_json(void)
{
	size_t n;
	int c;

	n = (size_t)(ignorestate - endtag);

	if ((c = getchar()) == EOF) {
		/* unterminated script: the pending prefix was content */
		if (n)
			fwrite(matched, 1, n, stdout);
		ignorestate = endtag;
		return EOF;
	}

	if (tolower(c) != tolower((unsigned char)*ignorestate)) {
		/* match failed: flush the partially matched prefix */
		if (n)
			fwrite(matched, 1, n, stdout);
		ignorestate = endtag;
		n = 0;

		/* the current byte may itself start a new match */
		if (tolower(c) != tolower((unsigned char)*ignorestate)) {
			if (c != '\r' && c != '\n')
				putchar(c);
			return ' ';
		}
	}

	/* c matches the next character of the end tag */
	if (n < sizeof(matched))
		matched[n] = (char)c;
	ignorestate++;
	if (*ignorestate == '\0') {
		getnext = getchar; /* restore */
		putchar('\n');
		isjson = 0;
		return c;
	}

	return ' ';
}

/*
 * Attribute callback: t is the tag name, a the attribute name and v the
 * attribute value (or a part of it), each NUL-terminated; tl, al and vl are
 * their lengths.
 *
 * Sets `isjson` when a "script" tag has a "type" attribute whose value
 * contains one of the JSON media types below. Tag and attribute names are
 * compared case-insensitively, the value is matched case-sensitively.
 */
static void
xmlattr(XMLParser *x, const char *t, size_t tl, const char *a, size_t al,
        const char *v, size_t vl)
{
	(void)x;
	(void)tl;
	(void)al;
	(void)vl;

	if (!strcasecmp(t, "script") &&
	    !strcasecmp(a, "type") &&
	    (strstr(v, "application/json") ||
	     strstr(v, "application/ld+json") ||
	     strstr(v, "text/json")))
		isjson = 1;
}

/*
 * Start tag callback, called after the tag name and attributes were parsed.
 * For a <script> tag flagged as JSON the input function is switched to
 * getnext_json() so the element content is copied out until the end tag.
 */
static void
xmltagstartparsed(XMLParser *x, const char *t, size_t tl, int isshort)
{
	(void)x;
	(void)tl;
	(void)isshort;

	if (!strcasecmp(t, "script") && isjson) {
		ignorestate = endtag = scriptend;
		getnext = getnext_json;
		return;
	}
}

/*
 * Parse the attributes of the current tag up to and including the closing
 * '>' (or EOF). xmlattr() is called for every attribute; a value that
 * contains an entity or does not fit in x->data is reported in several calls.
 * A '/' marks the tag as short form.
 */
static void
xml_parseattrs(XMLParser *x)
{
	size_t namelen = 0, valuelen;
	int c, endsep, endname = 0, valuestart = 0;

	while ((c = GETNEXT()) != EOF) {
		if (ISSPACE(c)) {
			if (namelen)
				endname = 1;
			continue;
		} else if (c == '?')
			; /* ignore */
		else if (c == '=') {
			x->name[namelen] = '\0';
			valuestart = 1;
			endname = 1;
		} else if (namelen && ((endname && !valuestart && ISALPHA(c)) || (c == '>' || c == '/'))) {
			/* attribute without value: terminate the name before
			   the callback reads it as a string */
			x->name[namelen] = '\0';
			xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0);
			endname = 0;
			x->name[0] = c;
			namelen = 1;
		} else if (namelen && valuestart) {
			/* attribute with value */
			valuelen = 0;
			if (c == '\'' || c == '"') {
				endsep = c;
			} else {
				endsep = ' '; /* ISSPACE() */
				/* unquoted: c is already the first value byte */
				goto startvalue;
			}

			while ((c = GETNEXT()) != EOF) {
startvalue:
				if (c == '&') { /* entities */
					x->data[valuelen] = '\0';
					/* call data function with data before entity if there is data */
					if (valuelen)
						xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
					x->data[0] = c;
					valuelen = 1;
					while ((c = GETNEXT()) != EOF) {
						if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c))))
							break;
						if (valuelen < sizeof(x->data) - 1)
							x->data[valuelen++] = c;
						else {
							/* entity too long for buffer, handle as normal data */
							x->data[valuelen] = '\0';
							xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
							x->data[0] = c;
							valuelen = 1;
							break;
						}
						if (c == ';') {
							/* complete entity: it is not passed to xmlattr() */
							x->data[valuelen] = '\0';
							valuelen = 0;
							break;
						}
					}
				} else if (c != endsep && !(endsep == ' ' && (c == '>' || ISSPACE(c)))) {
					if (valuelen < sizeof(x->data) - 1) {
						x->data[valuelen++] = c;
					} else {
						/* buffer full: report this part and start over */
						x->data[valuelen] = '\0';
						xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
						x->data[0] = c;
						valuelen = 1;
					}
				}
				if (c == endsep || (endsep == ' ' && (c == '>' || ISSPACE(c)))) {
					/* end of value */
					x->data[valuelen] = '\0';
					xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen);
					break;
				}
			}
			namelen = endname = valuestart = 0;
		} else if (namelen < sizeof(x->name) - 1) {
			x->name[namelen++] = c;
		}
		if (c == '>') {
			break;
		} else if (c == '/') {
			x->isshorttag = 1;
			x->name[0] = '\0';
			namelen = 0;
		}
	}
}

/*
 * Skip the remainder of a comment: consume input until a '>' that directly
 * follows at least two '-' characters, or EOF.
 */
static void
xml_parsecomment(XMLParser *x)
{
	int c, i = 0; /* i: number of consecutive '-' seen, capped at 2 */

	(void)x;

	while ((c = GETNEXT()) != EOF) {
		if (c == '-') {
			if (++i > 2)
				i = 2;
			continue;
		} else if (c == '>' && i == 2) {
			return;
		} else if (i) {
			i = 0;
		}
	}
}

/*
 * Consume a CDATA section: read until a '>' that directly follows at least
 * two ']' characters, or EOF. The content is collected in x->data, which is
 * restarted when full; it is not passed on anywhere.
 */
static void
xml_parsecdata(XMLParser *x)
{
	size_t datalen = 0, i = 0; /* i: number of consecutive ']' seen, capped at 2 */
	int c;

	while ((c = GETNEXT()) != EOF) {
		if (c == ']') {
			if (++i > 2)
				i = 2;
			continue;
		} else if (c == '>' && i == 2) {
			return;
		} else if (i) {
			i = 0;
		}

		if (datalen < sizeof(x->data) - 1) {
			x->data[datalen++] = c;
		} else {
			x->data[datalen] = '\0';
			x->data[0] = c;
			datalen = 1;
		}
	}
}

/*
 * Main parse loop: reads input through GETNEXT() until EOF, alternating
 * between tags (comments, CDATA, start/end tags, processing instructions)
 * and the data between them. xmltagstartparsed() is called for every start
 * tag, after xml_parseattrs() when the tag name is followed by whitespace.
 */
static void
xml_parse(XMLParser *x)
{
	size_t datalen, tagdatalen;
	int c, isend;

	while ((c = GETNEXT()) != EOF && c != '<')
		; /* skip until < */

	while (c != EOF) {
		if (c == '<') { /* parse tag */
			if ((c = GETNEXT()) == EOF)
				return;

			if (c == '!') { /* cdata and comments */
				for (tagdatalen = 0; (c = GETNEXT()) != EOF;) {
					/* NOTE: sizeof(x->data) must be at least sizeof("[CDATA[") */
					if (tagdatalen <= sizeof("[CDATA[") - 1)
						x->data[tagdatalen++] = c;
					if (c == '>')
						break;
					else if (c == '-' && tagdatalen == sizeof("--") - 1 &&
					         (x->data[0] == '-')) {
						xml_parsecomment(x);
						break;
					} else if (c == '[') {
						if (tagdatalen == sizeof("[CDATA[") - 1 &&
						    !strncmp(x->data, "[CDATA[", tagdatalen)) {
							xml_parsecdata(x);
							break;
						}
					}
				}
			} else {
				/* normal tag (open, short open, close), processing instruction. */
				x->tag[0] = c;
				x->taglen = 1;
				x->isshorttag = isend = 0;

				/* treat processing instruction as shorttag, don't strip "?" prefix. */
				if (c == '?') {
					x->isshorttag = 1;
				} else if (c == '/') {
					if ((c = GETNEXT()) == EOF)
						return;
					x->tag[0] = c;
					isend = 1;
				}

				while ((c = GETNEXT()) != EOF) {
					if (c == '/')
						x->isshorttag = 1; /* short tag */
					else if (c == '>' || ISSPACE(c)) {
						x->tag[x->taglen] = '\0';
						if (isend) { /* end tag, starts with </ */
							while (c != '>' && c != EOF) /* skip until > */
								c = GETNEXT();
							x->tag[0] = '\0';
							x->taglen = 0;
						} else {
							/* start tag */
							if (ISSPACE(c))
								xml_parseattrs(x);
							xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag);
						}
						/* reset the tag for shortform or processing instruction */
						if (x->isshorttag) {
							x->tag[0] = '\0';
							x->taglen = 0;
						}
						break;
					} else if (x->taglen < sizeof(x->tag) - 1)
						x->tag[x->taglen++] = c; /* NOTE: tag name truncation */
				}
			}
		} else {
			/* parse tag data: collected in x->data until the next '<' */
			datalen = 0;
			while ((c = GETNEXT()) != EOF) {
				if (c == '&') {
					if (datalen)
						x->data[datalen] = '\0';
					x->data[0] = c;
					datalen = 1;
					while ((c = GETNEXT()) != EOF) {
						if (c == '<')
							break;
						if (datalen < sizeof(x->data) - 1)
							x->data[datalen++] = c;
						else {
							/* entity too long for buffer, handle as normal data */
							x->data[datalen] = '\0';
							x->data[0] = c;
							datalen = 1;
							break;
						}
						if (c == ';') {
							x->data[datalen] = '\0';
							datalen = 0;
							break;
						}
					}
				} else if (c != '<') {
					if (datalen < sizeof(x->data) - 1) {
						x->data[datalen++] = c;
					} else {
						x->data[datalen] = '\0';
						x->data[0] = c;
						datalen = 1;
					}
				}
				if (c == '<') {
					x->data[datalen] = '\0';
					break;
				}
			}
		}
	}
}

/*
 * Parse HTML/XML from stdin and write the content of each JSON <script>
 * element to stdout, one per line. Returns 0 on success and 1 when reading
 * stdin or writing stdout failed.
 */
int
main(void)
{
	xml_parse(&parser);

	if (ferror(stdin)) {
		fputs("read error: <stdin>\n", stderr);
		return 1;
	}
	/* flush explicitly so that a failed write is not silently lost at exit */
	if (fflush(stdout) || ferror(stdout)) {
		fputs("write error: <stdout>\n", stderr);
		return 1;
	}

	return 0;
}