cgi-bin/tek/lib/luahtml.c
author Timm S. Mueller <tmueller@neoscientists.org>
Fri, 24 Oct 2008 01:35:27 +0200
changeset 251 2de5931b723d
parent 241 c6c81629f54e
permissions -rw-r--r--
tek.os.posix has been moved to tek.lib.posix, module initialization procedure
simplified, added statvfs() under Linux
     1 
     2 /*
     3 **	tek.lib.luahtml - Lua+HTML parser/decoding library
     4 **	Written by Timm S. Mueller <tmueller at neoscientists.org>
     5 **	See copyright notice in COPYRIGHT
     6 */
     7 
     8 #include <ctype.h>
     9 #include <stdlib.h>
    10 #include <string.h>
    11 #include <lua.h>
    12 #include <lualib.h>
    13 #include <lauxlib.h>
    14 
    15 
    16 #define topfile(L)	((FILE **)luaL_checkudata(L, 1, LUA_FILEHANDLE))
    17 
    18 
    19 static FILE *tofile (lua_State *L) {
    20   FILE **f = topfile(L);
    21   if (*f == NULL)
    22     luaL_error(L, "attempt to use a closed file");
    23   return *f;
    24 }
    25 
    26 
    27 static unsigned char *encodeutf8(unsigned char *buf, int c)
    28 {
    29 	if (c < 128)
    30 	{
    31 		*buf++ = c;
    32 	}
    33 	else if (c < 2048)
    34 	{
    35 		*buf++ = 0xc0 + (c >> 6);
    36 		*buf++ = 0x80 + (c & 0x3f);
    37 	}
    38 	else if (c < 65536)
    39 	{
    40 		*buf++ = 0xe0 + (c >> 12);
    41 		*buf++ = 0x80 + ((c & 0xfff) >> 6);
    42 		*buf++ = 0x80 + (c & 0x3f);
    43 	}
    44 	else if (c < 2097152)
    45 	{
    46 		*buf++ = 0xf0 + (c >> 18);
    47 		*buf++ = 0x80 + ((c & 0x3ffff) >> 12);
    48 		*buf++ = 0x80 + ((c & 0xfff) >> 6);
    49 		*buf++ = 0x80 + (c & 0x3f);
    50 	}
    51 	else if (c < 67108864)
    52 	{
    53 		*buf++ = 0xf8 + (c >> 24);
    54 		*buf++ = 0x80 + ((c & 0xffffff) >> 18);
    55 		*buf++ = 0x80 + ((c & 0x3ffff) >> 12);
    56 		*buf++ = 0x80 + ((c & 0xfff) >> 6);
    57 		*buf++ = 0x80 + (c & 0x3f);
    58 	}
    59 	else
    60 	{
    61 		*buf++ = 0xfc + (c >> 30);
    62 		*buf++ = 0x80 + ((c & 0x3fffffff) >> 24);
    63 		*buf++ = 0x80 + ((c & 0xffffff) >> 18);
    64 		*buf++ = 0x80 + ((c & 0x3ffff) >> 12);
    65 		*buf++ = 0x80 + ((c & 0xfff) >> 6);
    66 		*buf++ = 0x80 + (c & 0x3f);
    67 	}
    68 	return buf;
    69 }
    70 
    71 
    72 struct utf8reader
    73 {
    74 	int (*readchar)(struct utf8reader *);
    75 	int accu, numa, min, bufc;
    76 	const unsigned char *src;
    77 	size_t srclen;
    78 	FILE *file;
    79 	void *udata;
    80 };
    81 
    82 
    83 static int readstring(struct utf8reader *rd)
    84 {
    85 	if (rd->srclen == 0)
    86 		return -1;
    87 	rd->srclen--;
    88 	return *rd->src++;
    89 }
    90 
    91 
    92 static int readfile(struct utf8reader *rd)
    93 {
    94 	return fgetc(rd->file);
    95 }
    96 
    97 
    98 static int readutf8(struct utf8reader *rd)
    99 {
   100 	int c;
   101 	for (;;)
   102 	{
   103 		if (rd->bufc >= 0)
   104 		{
   105 			c = rd->bufc;
   106 			rd->bufc = -1;
   107 		}
   108 		else
   109 			c = rd->readchar(rd);
   110 
   111 		if (c < 0)
   112 			return c;
   113 
   114 		if (c == 254 || c == 255)
   115 			break;
   116 
   117 		if (c < 128)
   118 		{
   119 			if (rd->numa > 0)
   120 			{
   121 				rd->bufc = c;
   122 				break;
   123 			}
   124 			return c;
   125 		}
   126 		else if (c < 192)
   127 		{
   128 			if (rd->numa == 0)
   129 				break;
   130 			rd->accu <<= 6;
   131 			rd->accu += c - 128;
   132 			rd->numa--;
   133 			if (rd->numa == 0)
   134 			{
   135 				if (rd->accu == 0 || rd->accu < rd->min ||
   136 					(rd->accu >= 55296 && rd->accu <= 57343))
   137 					break;
   138 				c = rd->accu;
   139 				rd->accu = 0;
   140 				return c;
   141 			}
   142 		}
   143 		else
   144 		{
   145 			if (rd->numa > 0)
   146 			{
   147 				rd->bufc = c;
   148 				break;
   149 			}
   150 
   151 			if (c < 224)
   152 			{
   153 				rd->min = 128;
   154 				rd->accu = c - 192;
   155 				rd->numa = 1;
   156 			}
   157 			else if (c < 240)
   158 			{
   159 				rd->min = 2048;
   160 				rd->accu = c - 224;
   161 				rd->numa = 2;
   162 			}
   163 			else if (c < 248)
   164 			{
   165 				rd->min = 65536;
   166 				rd->accu = c - 240;
   167 				rd->numa = 3;
   168 			}
   169 			else if (c < 252)
   170 			{
   171 				rd->min = 2097152;
   172 				rd->accu = c - 248;
   173 				rd->numa = 4;
   174 			}
   175 			else
   176 			{
   177 				rd->min = 67108864;
   178 				rd->accu = c - 252;
   179 				rd->numa = 5;
   180 			}
   181 		}
   182 	}
   183 	/* bad char */
   184 	rd->accu = 0;
   185 	rd->numa = 0;
   186 	return 65533;
   187 }
   188 
   189 
   190 typedef enum { PARSER_UNDEF = -1, PARSER_HTML, PARSER_OPEN1, PARSER_OPEN2,
   191 PARSER_CODE, PARSER_VAR, PARSER_CLOSE } parser_state_t;
   192 
   193 
   194 static unsigned char *outchar(lua_State *L, unsigned char *buf, parser_state_t state, int c)
   195 {
   196 	if (state == PARSER_HTML)
   197 	{
   198 		if (c > 127 || c == '[' || c == ']')
   199 			return buf + sprintf((char *) buf, "&#%02d;", c);
   200 	}
   201 	else if (state == PARSER_CODE)
   202 	{
   203 		if (c > 127)
   204 			return encodeutf8(buf, c);
   205 	}
   206 	else if (c > 127)
   207 		luaL_error(L, "Non-ASCII character outside code or HTML context");
   208 
   209 	*buf++ = c;
   210 	return buf;
   211 }
   212 
   213 
/*
**	Context handed to readparsed() through lua_load(): the output buffer,
**	the template parser state and the UTF-8 decoder feeding it.
*/
struct readdata
{
	/* buffer including " " .. outfunc .. "(": */
	/* (the space behind that prefix is reused as scratch area for the
	** pieces returned by readparsed()) */
	unsigned char buf0[256];
	/* pointer into buf0 past outfunc: */
	unsigned char *buf;
	/* html+lua parser state: */
	parser_state_t state;
	/* utf8 reader state: */
	struct utf8reader utf8;
};
   225 
   226 
   227 static const char *readparsed(lua_State *L, void *udata, size_t *sz)
   228 {
   229 	struct readdata *rd = udata;
   230 	parser_state_t news = rd->state;
   231 	int c;
   232 
   233 	while ((c = readutf8(&rd->utf8)) >= 0)
   234 	{
   235 		switch (news)
   236 		{
   237 			case PARSER_UNDEF:
   238 				if (c == '<')
   239 				{
   240 					news = PARSER_OPEN1;
   241 					continue;
   242 				}
   243 				rd->state = PARSER_HTML;
   244 				rd->buf[0] = '[';
   245 				rd->buf[1] = '[';
   246 				*sz = outchar(L, rd->buf + 2, rd->state, c) - rd->buf0;
   247 				return (char *) rd->buf0;
   248 
   249 			case PARSER_HTML:
   250 				if (c == '<')
   251 				{
   252 					news = PARSER_OPEN1;
   253 					continue;
   254 				}
   255 				break;
   256 
   257 			case PARSER_OPEN1:
   258 				if (c == '%')
   259 				{
   260 					news = PARSER_OPEN2;
   261 					continue;
   262 				}
   263 				rd->buf[0] = '<';
   264 				rd->buf[1] = c;
   265 				*sz = 2;
   266 				return (char *) rd->buf;
   267 
   268 			case PARSER_OPEN2:
   269 				if (c == '=')
   270 				{
   271 					if (rd->state == PARSER_UNDEF)
   272 					{
   273 						rd->state = PARSER_VAR;
   274 						*sz = rd->buf - rd->buf0;
   275 						return (char *) rd->buf0;
   276 					}
   277 					rd->state = PARSER_VAR;
   278 					strcpy((char *) rd->buf, "]])");
   279 					memcpy(rd->buf + 3, rd->buf0, rd->buf - rd->buf0);
   280 					*sz = 3 + rd->buf - rd->buf0;
   281 					return (char *) rd->buf;
   282 				}
   283 
   284 				if (rd->state == PARSER_UNDEF)
   285 					rd->state = PARSER_CODE;
   286 				else
   287 				{
   288 					rd->state = PARSER_CODE;
   289 					rd->buf[0] = ']';
   290 					rd->buf[1] = ']';
   291 					rd->buf[2] = ')';
   292 					rd->buf[3] = ' ';
   293 					rd->buf[4] = c;
   294 					*sz = 5;
   295 					return (char *) rd->buf;
   296 				}
   297 				break;
   298 
   299 			case PARSER_VAR:
   300 			case PARSER_CODE:
   301 				if (c == '%')
   302 				{
   303 					news = PARSER_CLOSE;
   304 					continue;
   305 				}
   306 				break;
   307 
   308 			case PARSER_CLOSE:
   309 				if (c == '>')
   310 				{
   311 					size_t len;
   312 					if (rd->state == PARSER_CODE)
   313 					{
   314 						rd->state = PARSER_HTML;
   315 						rd->buf[0] = '[';
   316 						rd->buf[1] = '[';
   317 						*sz = rd->buf + 2 - rd->buf0;
   318 						return (char *) rd->buf0;
   319 					}
   320 					rd->state = PARSER_HTML;
   321 					strcpy((char *) rd->buf, " or \"nil\")");
   322 					memcpy(rd->buf + 10, rd->buf0, rd->buf - rd->buf0);
   323 					len = 10 + rd->buf - rd->buf0;
   324 					rd->buf[len++] = '[';
   325 					rd->buf[len++] = '[';
   326 					*sz = len;
   327 					return (char *) rd->buf;
   328 				}
   329 				rd->buf[0] = '%';
   330 				rd->buf[1] = c;
   331 				*sz = 2;
   332 				return (char *) rd->buf;
   333 		}
   334 
   335 		*sz = outchar(L, rd->buf, rd->state, c) - rd->buf;
   336 		return (char *) rd->buf;
   337 	}
   338 
   339 	rd->state = PARSER_UNDEF;
   340 	if (news == PARSER_HTML)
   341 	{
   342 		*sz = 4;
   343 		return "]]) ";
   344 	}
   345 
   346 	return NULL;
   347 }
   348 
   349 
   350 static int load(lua_State *L)
   351 {
   352 	struct readdata rd;
   353 	const char *outfunc = lua_tostring(L, 2);
   354 	const char *chunkname = lua_tostring(L, 3);
   355 	int res;
   356 
   357 	if (lua_isuserdata(L, 1))
   358 	{
   359 		rd.utf8.file = tofile(L);
   360 		rd.utf8.readchar = readfile;
   361 	}
   362 	else
   363 	{
   364 		rd.utf8.src = (unsigned char *) lua_tolstring(L, 1, &rd.utf8.srclen);
   365 		rd.utf8.readchar = readstring;
   366 	}
   367 
   368 	rd.utf8.accu = 0;
   369 	rd.utf8.numa = 0;
   370 	rd.utf8.bufc = -1;
   371 
   372 	rd.state = PARSER_UNDEF;
   373 	strcpy((char *) rd.buf0, " ");
   374 	strcat((char *) rd.buf0, outfunc);
   375 	strcat((char *) rd.buf0, "(");
   376 	rd.buf = rd.buf0 + strlen((char *) rd.buf0);
   377 
   378 	res = lua_load(L, readparsed, &rd, chunkname);
   379 	if (res == 0)
   380 		return 1;
   381 
   382 	lua_pushnil(L);
   383 	lua_insert(L, -2);
   384 	/* nil, message on stack */
   385 	return 2;
   386 }
   387 
   388 
   389 /*****************************************************************************/
   390 
   391 
   392 static int encodeform(lua_State *L)
   393 {
   394 	struct readdata rd;
   395 	char sbuf[16];
   396 	luaL_Buffer b;
   397 	int c;
   398 
   399 	if (lua_isuserdata(L, 1))
   400 	{
   401 		rd.utf8.file = tofile(L);
   402 		rd.utf8.readchar = readfile;
   403 	}
   404 	else
   405 	{
   406 		rd.utf8.src = (unsigned char *) lua_tolstring(L, 1, &rd.utf8.srclen);
   407 		rd.utf8.readchar = readstring;
   408 	}
   409 
   410 	luaL_buffinit(L, &b);
   411 
   412 	rd.utf8.accu = 0;
   413 	rd.utf8.numa = 0;
   414 	rd.utf8.bufc = -1;
   415 
   416 	while ((c = readutf8(&rd.utf8)) >= 0)
   417 	{
   418 		switch(c)
   419 		{
   420 			case 34:
   421 				luaL_addlstring(&b, "&quot;", 6);
   422 				break;
   423 			case 38:
   424 				luaL_addlstring(&b, "&amp;", 5);
   425 				break;
   426 			case 60:
   427 				luaL_addlstring(&b, "&lt;", 4);
   428 				break;
   429 			case 62:
   430 				luaL_addlstring(&b, "&gt;", 4);
   431 				break;
   432 			default:
   433 				if (c == 91 || c == 93 || c > 126)
   434 				{
   435 					sprintf(sbuf, "&#%03d;", c);
   436 					luaL_addstring(&b, sbuf);
   437 				}
   438 				else
   439 					luaL_addchar(&b, c);
   440 		}
   441 	}
   442 
   443 	luaL_pushresult(&b);
   444 	return 1;
   445 }
   446 
   447 
   448 /*****************************************************************************/
   449 
   450 
/*
**	Functions exported to Lua by this module; the list is terminated by a
**	{ NULL, NULL } entry.
*/
static const luaL_Reg lib[] =
{
	{ "load", load },
	{ "encodeform", encodeform },
	{ NULL, NULL }
};
   457 
   458 
/*
**	Module entry point: registers the functions listed in lib under the
**	name "tek.lib.luahtml" using luaL_register(). Returns 0, i.e. no
**	values are returned to the caller on the Lua stack.
*/
int luaopen_tek_lib_luahtml(lua_State *L)
{
	luaL_register(L, "tek.lib.luahtml", lib);
	return 0;
}