summaryrefslogtreecommitdiff
path: root/source4/lib/appweb/ejs-2.0/exml/exmlParser.c
diff options
context:
space:
mode:
Diffstat (limited to 'source4/lib/appweb/ejs-2.0/exml/exmlParser.c')
-rw-r--r--source4/lib/appweb/ejs-2.0/exml/exmlParser.c752
1 files changed, 752 insertions, 0 deletions
diff --git a/source4/lib/appweb/ejs-2.0/exml/exmlParser.c b/source4/lib/appweb/ejs-2.0/exml/exmlParser.c
new file mode 100644
index 0000000000..14871411a6
--- /dev/null
+++ b/source4/lib/appweb/ejs-2.0/exml/exmlParser.c
@@ -0,0 +1,752 @@
+/*
+ * exmlParser.c -- A simple SAX style XML parser
+ */
+
+/********************************* Description ********************************/
+/*
+ * This is a recursive descent parser for XML text files. It is a one-pass
+ * simple parser that invokes a user supplied callback for key tokens in the
+ * XML file. The user supplies a read function so that XML files can be parsed
+ * from disk or in-memory.
+ */
+/********************************** Includes **********************************/
+
+#include "exml.h"
+
+/****************************** Forward Declarations **************************/
+/* MOB -- FIX */
+#if BLD_FEATURE_EXML || 1
+
+static int parseNext(Exml *xp, int state);
+static ExmlToken getToken(Exml *xp, int state);
+static int getNextChar(Exml *xp);
+static int scanFor(Exml *xp, char *str);
+static int putLastChar(Exml *xp, int c);
+static void error(Exml *xp, char *fmt, ...);
+static void trimToken(Exml *xp);
+
+/************************************ Code ************************************/
+
+/*
+ * Create a new XML parser object allocated from the memory context "ctx".
+ *
+ * initialSize and maxSize are passed straight to mprCreateBuf for the token
+ * buffer; the input buffer is always created with EXML_BUFSIZE for both.
+ *
+ * Returns the new parser, or 0 if the parser or either of its buffers could
+ * not be allocated.
+ */
+Exml *exmlOpen(MprCtx ctx, int initialSize, int maxSize)
+{
+    Exml    *xp;
+
+    xp = mprAllocTypeZeroed(ctx, Exml);
+    if (xp == 0) {
+        return 0;
+    }
+
+    xp->inBuf = mprCreateBuf(xp, EXML_BUFSIZE, EXML_BUFSIZE);
+    xp->tokBuf = mprCreateBuf(xp, initialSize, maxSize);
+
+    if (xp->inBuf == 0 || xp->tokBuf == 0) {
+        /*
+         * Both buffers were created with xp as their context, so freeing xp
+         * is presumed to release whichever one did get allocated.
+         * NOTE(review): confirm against the MPR allocator.
+         */
+        mprFree(xp);
+        return 0;
+    }
+
+    return xp;
+}
+
+/******************************************************************************/
+
+/*
+ * Dispose of a parser. The parser object itself is handed to mprFree; xp
+ * must not be used after this call.
+ */
+void exmlClose(Exml *xp)
+{
+    mprAssert(xp);
+    mprFree(xp);
+}
+
+/******************************************************************************/
+
+/*
+ * Install the callback that parseNext invokes for each parse event
+ * (new element, attribute, element data, comment, ...).
+ */
+void exmlSetParserHandler(Exml *xp, ExmlHandler h)
+{
+    mprAssert(xp);
+    xp->handler = h;
+}
+
+/******************************************************************************/
+
+/*
+ * Define where the parser reads its input from. "s" is the read function
+ * called whenever the input buffer runs dry and "arg" is passed back to it
+ * unchanged on every call.
+ */
+void exmlSetInputStream(Exml *xp, ExmlInputStream s, void *arg)
+{
+    mprAssert(xp);
+
+    xp->inputArg = arg;
+    xp->readFn = s;
+}
+
+/******************************************************************************/
+/*
+ * Store a caller supplied pointer in the parser. It can be read back with
+ * exmlGetParseArg.
+ */
+
+void exmlSetParseArg(Exml *xp, void *parseArg)
+{
+    mprAssert(xp);
+    xp->parseArg = parseArg;
+}
+
+/******************************************************************************/
+/*
+ * Get the parse arg previously stored with exmlSetParseArg
+ */
+
+void *exmlGetParseArg(Exml *xp)
+{
+    void    *arg;
+
+    mprAssert(xp);
+
+    arg = xp->parseArg;
+    return arg;
+}
+
+/******************************************************************************/
+/*
+ * Parse the XML input. This simply starts the recursive descent parser in
+ * the EXML_BEGIN state and returns its result; a negative value means the
+ * parse failed.
+ */
+
+int exmlParse(Exml *xp)
+{
+    int     rc;
+
+    mprAssert(xp);
+
+    rc = parseNext(xp, EXML_BEGIN);
+    return rc;
+}
+
+/******************************************************************************/
+/*
+ * XML parser. This is a recursive descent parser: each invocation starts in
+ * "state" and recurses with EXML_AFTER_LS whenever a "<" opens a nested
+ * element, comment, CDATA section or processing instruction.
+ *
+ * Return a negative value for errors, 0 for EOF and 1 if a construct was
+ * consumed and there may still be more data to parse.
+ *
+ * tname and aname are private copies of the current element and attribute
+ * names. They are released on every exit path via the "exit" label.
+ */
+
+static int parseNext(Exml *xp, int state)
+{
+    ExmlHandler handler;
+    ExmlToken   token;
+    MprBuf      *tokBuf;
+    char        *tname, *aname;
+    int         rc;
+
+    mprAssert(state >= 0);
+
+    tokBuf = xp->tokBuf;
+    handler = xp->handler;
+    tname = aname = 0;
+    rc = 0;
+
+    /*
+     * In this parse loop, the state is never assigned EOF or ERR. In
+     * such cases we always return EOF or ERR.
+     */
+    while (1) {
+
+        token = getToken(xp, state);
+
+        if (token == TOKEN_TOO_BIG) {
+            error(xp, "XML token is too big");
+            goto err;
+        }
+
+        switch (state) {
+        case EXML_BEGIN:        /* ------------------------------------------ */
+            /*
+             * Expect to get an element, comment or processing instruction
+             */
+            switch (token) {
+            case TOKEN_EOF:
+                /*
+                 * Clean end of input at the top level. Report EOF as 0
+                 * rather than leaking the "1" left over from the last
+                 * nested construct.
+                 */
+                rc = 0;
+                goto exit;
+
+            case TOKEN_LS:
+                /*
+                 * Recurse to handle the new element, comment etc.
+                 * On failure the nested call has already set the message.
+                 */
+                rc = parseNext(xp, EXML_AFTER_LS);
+                if (rc < 0) {
+                    goto exit;
+                }
+                break;
+
+            default:
+                error(xp, "Syntax error");
+                goto err;
+            }
+            break;
+
+        case EXML_AFTER_LS:     /* ------------------------------------------ */
+            switch (token) {
+            case TOKEN_COMMENT:
+                state = EXML_COMMENT;
+                rc = (*handler)(xp, state, "!--", 0, mprGetBufStart(tokBuf));
+                if (rc < 0) {
+                    goto err;
+                }
+                rc = 1;
+                goto exit;
+
+            case TOKEN_CDATA:
+                /*
+                 * The handler is given the name "!--" for CDATA as well;
+                 * it must use the state to tell the two apart.
+                 */
+                state = EXML_CDATA;
+                rc = (*handler)(xp, state, "!--", 0, mprGetBufStart(tokBuf));
+                if (rc < 0) {
+                    goto err;
+                }
+                rc = 1;
+                goto exit;
+
+            case TOKEN_INSTRUCTIONS:
+                /* Just ignore processing instructions */
+                rc = 1;
+                goto exit;
+
+            case TOKEN_TEXT:
+                /*
+                 * Element name. Copy it as tokBuf is reused for each token.
+                 */
+                state = EXML_NEW_ELT;
+                tname = mprStrdup(xp, mprGetBufStart(tokBuf));
+                if (tname == 0) {
+                    rc = MPR_ERR_MEMORY;
+                    goto exit;
+                }
+                rc = (*handler)(xp, state, tname, 0, 0);
+                if (rc < 0) {
+                    goto err;
+                }
+                break;
+
+            default:
+                error(xp, "Syntax error");
+                goto err;
+            }
+            break;
+
+        case EXML_NEW_ELT:      /* ------------------------------------------ */
+            /*
+             * We have seen the opening "<element" for a new element and have
+             * not yet seen the terminating ">" of the opening element.
+             */
+            switch (token) {
+            case TOKEN_TEXT:
+                /*
+                 * Must be an attribute name. Release the name of any
+                 * previous attribute of this element before copying.
+                 */
+                mprFree(aname);
+                aname = mprStrdup(xp, mprGetBufStart(tokBuf));
+                if (aname == 0) {
+                    rc = MPR_ERR_MEMORY;
+                    goto exit;
+                }
+                token = getToken(xp, state);
+                if (token != TOKEN_EQ) {
+                    error(xp, "Missing assignment for attribute \"%s\"", aname);
+                    goto err;
+                }
+
+                token = getToken(xp, state);
+                if (token != TOKEN_TEXT) {
+                    error(xp, "Missing value for attribute \"%s\"", aname);
+                    goto err;
+                }
+                state = EXML_NEW_ATT;
+                rc = (*handler)(xp, state, tname, aname,
+                    mprGetBufStart(tokBuf));
+                if (rc < 0) {
+                    goto err;
+                }
+                state = EXML_NEW_ELT;
+                break;
+
+            case TOKEN_GR:
+                /*
+                 * This is ">" the termination of the opening element
+                 */
+                if (*tname == '\0') {
+                    error(xp, "Missing element name");
+                    goto err;
+                }
+
+                /*
+                 * Tell the user that the opening element is now complete
+                 */
+                state = EXML_ELT_DEFINED;
+                rc = (*handler)(xp, state, tname, 0, 0);
+                if (rc < 0) {
+                    goto err;
+                }
+                state = EXML_ELT_DATA;
+                break;
+
+            case TOKEN_SLASH_GR:
+                /*
+                 * If we see a "/>" then this is a solo element
+                 */
+                if (*tname == '\0') {
+                    error(xp, "Missing element name");
+                    goto err;
+                }
+                state = EXML_SOLO_ELT_DEFINED;
+                rc = (*handler)(xp, state, tname, 0, 0);
+                if (rc < 0) {
+                    goto err;
+                }
+                rc = 1;
+                goto exit;
+
+            default:
+                error(xp, "Syntax error");
+                goto err;
+            }
+            break;
+
+        case EXML_ELT_DATA:     /* -------------------------------------- */
+            /*
+             * We have seen the full opening element "<name ...>" and now
+             * await data or another element.
+             */
+            if (token == TOKEN_LS) {
+                /*
+                 * Recurse to handle the new element, comment etc.
+                 */
+                rc = parseNext(xp, EXML_AFTER_LS);
+                if (rc < 0) {
+                    goto exit;
+                }
+                break;
+
+            } else if (token == TOKEN_LS_SLASH) {
+                state = EXML_END_ELT;
+                break;
+
+            } else if (token != TOKEN_TEXT) {
+                /*
+                 * Typically EOF before the closing element was seen
+                 */
+                error(xp, "Syntax error");
+                goto err;
+            }
+            if (mprGetBufLength(tokBuf) > 0) {
+                /*
+                 * Pass the data between the element to the user
+                 */
+                rc = (*handler)(xp, state, tname, 0, mprGetBufStart(tokBuf));
+                if (rc < 0) {
+                    goto err;
+                }
+            }
+            break;
+
+        case EXML_END_ELT:      /* -------------------------------------- */
+            if (token != TOKEN_TEXT) {
+                error(xp, "Missing closing element name for \"%s\"", tname);
+                goto err;
+            }
+            /*
+             * The closing element name must match the opening element name
+             */
+            if (strcmp(tname, mprGetBufStart(tokBuf)) != 0) {
+                error(xp,
+                    "Closing element name \"%s\" does not match on line %d "
+                    "opening name \"%s\"",
+                    mprGetBufStart(tokBuf), xp->lineNumber, tname);
+                goto err;
+            }
+            rc = (*handler)(xp, state, tname, 0, 0);
+            if (rc < 0) {
+                goto err;
+            }
+            if (getToken(xp, state) != TOKEN_GR) {
+                error(xp, "Syntax error");
+                goto err;
+            }
+            /*
+             * Element complete. Leave via "exit" so the name copies are
+             * released.
+             */
+            rc = 1;
+            goto exit;
+
+        case EXML_EOF:          /* ---------------------------------------------- */
+            goto exit;
+
+        case EXML_ERR:          /* ---------------------------------------------- */
+        default:
+            goto err;
+        }
+    }
+    mprAssert(0);
+
+err:
+    rc = -1;
+
+exit:
+    mprFree(tname);
+    mprFree(aname);
+
+    return rc;
+}
+
+/******************************************************************************/
+/*
+ * Lexical analyser for XML. Return the next token, reading input as required.
+ * It uses a one character look-ahead with push back (see putLastChar).
+ * The text of a token is left NUL terminated in the tokBuf parser buffer on
+ * exit.
+ *
+ * Two states get special treatment:
+ *   EXML_ELT_DATA  everything up to the next "<" is returned as one
+ *                  TOKEN_TEXT. If it is all white space the buffer is
+ *                  emptied so the caller sees a zero length token.
+ *   EXML_AFTER_LS  the token just after "<" is examined to distinguish
+ *                  element names, processing instructions, comments and
+ *                  CDATA sections.
+ *
+ * Returns TOKEN_EOF when the input is exhausted, TOKEN_TOO_BIG when a
+ * character cannot be added to tokBuf and TOKEN_ERR for malformed input.
+ */
+
+static ExmlToken getToken(Exml *xp, int state)
+{
+    MprBuf      *tokBuf;
+    uchar       *cp;
+    int         c, rc;
+
+    tokBuf = xp->tokBuf;
+
+    mprAssert(state >= 0);
+
+    if ((c = getNextChar(xp)) < 0) {
+        return TOKEN_EOF;
+    }
+    mprFlushBuf(tokBuf);
+
+    /*
+     * Special case parsing for names and for element data. We do this for
+     * performance so we can return to the caller the largest token possible
+     */
+    if (state == EXML_ELT_DATA) {
+        /*
+         * Read all the data up to the start of the closing element "<" or the
+         * start of a sub-element.
+         */
+        if (c == '<') {
+            if ((c = getNextChar(xp)) < 0) {
+                return TOKEN_EOF;
+            }
+            if (c == '/') {
+                return TOKEN_LS_SLASH;
+            }
+            putLastChar(xp, c);
+            return TOKEN_LS;
+        }
+        do {
+            if (mprPutCharToBuf(tokBuf, c) < 0) {
+                return TOKEN_TOO_BIG;
+            }
+            if ((c = getNextChar(xp)) < 0) {
+                return TOKEN_EOF;
+            }
+        } while (c != '<');
+
+        /*
+         * Put back the last look-ahead character
+         */
+        putLastChar(xp, c);
+
+        /*
+         * Terminate the token explicitly: the scan below and the caller both
+         * treat the buffer contents as a C string.
+         */
+        mprAddNullToBuf(tokBuf);
+
+        /*
+         * If all white space, then zero the token buffer
+         */
+        for (cp = tokBuf->start; *cp; cp++) {
+            if (!isspace(*cp)) {
+                return TOKEN_TEXT;
+            }
+        }
+        mprFlushBuf(tokBuf);
+        return TOKEN_TEXT;
+    }
+
+    while (1) {
+        switch (c) {
+        case ' ':
+        case '\n':
+        case '\t':
+        case '\r':
+            /* Skip white space between tokens */
+            break;
+
+        case '<':
+            if ((c = getNextChar(xp)) < 0) {
+                return TOKEN_EOF;
+            }
+            if (c == '/') {
+                return TOKEN_LS_SLASH;
+            }
+            putLastChar(xp, c);
+            return TOKEN_LS;
+
+        case '=':
+            return TOKEN_EQ;
+
+        case '>':
+            return TOKEN_GR;
+
+        case '/':
+            if ((c = getNextChar(xp)) < 0) {
+                return TOKEN_EOF;
+            }
+            if (c == '>') {
+                return TOKEN_SLASH_GR;
+            }
+            return TOKEN_ERR;
+
+        case '\"':
+        case '\'':
+            /* Remember which quote opened the value so we stop on its twin */
+            xp->quoteChar = c;
+            /* Fall through */
+
+        default:
+            /*
+             * We handle element names, attribute names and attribute values
+             * here. We do NOT handle data between elements here. Read the
+             * token. Stop on white space or a closing element ">"
+             */
+            if (xp->quoteChar) {
+                /* Quoted value: take everything up to the matching quote */
+                if ((c = getNextChar(xp)) < 0) {
+                    return TOKEN_EOF;
+                }
+                while (c != xp->quoteChar) {
+                    if (mprPutCharToBuf(tokBuf, c) < 0) {
+                        return TOKEN_TOO_BIG;
+                    }
+                    if ((c = getNextChar(xp)) < 0) {
+                        return TOKEN_EOF;
+                    }
+                }
+                xp->quoteChar = 0;
+
+            } else {
+                while (!isspace(c) && c != '>' && c != '/' && c != '=') {
+                    if (mprPutCharToBuf(tokBuf, c) < 0) {
+                        return TOKEN_TOO_BIG;
+                    }
+                    if ((c = getNextChar(xp)) < 0) {
+                        return TOKEN_EOF;
+                    }
+                }
+                /* The delimiter belongs to the next token */
+                putLastChar(xp, c);
+            }
+            if (mprGetBufLength(tokBuf) <= 0) {
+                return TOKEN_ERR;
+            }
+            mprAddNullToBuf(tokBuf);
+
+            if (state == EXML_AFTER_LS) {
+                /*
+                 * If we are just inside an element "<", then analyze what we
+                 * have to see if we have an element name, instruction or
+                 * comment. Tokbuf will hold "?" for instructions or "!--"
+                 * for comments.
+                 */
+                if (mprLookAtNextCharInBuf(tokBuf) == '?') {
+                    /* Just ignore processing instructions */
+                    rc = scanFor(xp, "?>");
+                    if (rc < 0) {
+                        return TOKEN_TOO_BIG;
+                    } else if (rc == 0) {
+                        return TOKEN_ERR;
+                    }
+                    return TOKEN_INSTRUCTIONS;
+
+                } else if (mprLookAtNextCharInBuf(tokBuf) == '!') {
+                    /*
+                     * Discard the lead-in ("![CDATA[" or "!--") and let
+                     * scanFor collect the body up to the terminator.
+                     */
+                    if (strcmp((char*) tokBuf->start, "![CDATA[") == 0) {
+                        mprFlushBuf(tokBuf);
+                        rc = scanFor(xp, "]]>");
+                        if (rc < 0) {
+                            return TOKEN_TOO_BIG;
+                        } else if (rc == 0) {
+                            return TOKEN_ERR;
+                        }
+                        return TOKEN_CDATA;
+
+                    } else {
+                        mprFlushBuf(tokBuf);
+                        rc = scanFor(xp, "-->");
+                        if (rc < 0) {
+                            return TOKEN_TOO_BIG;
+                        } else if (rc == 0) {
+                            return TOKEN_ERR;
+                        }
+                        return TOKEN_COMMENT;
+                    }
+                }
+            }
+            trimToken(xp);
+            return TOKEN_TEXT;
+        }
+        if ((c = getNextChar(xp)) < 0) {
+            return TOKEN_EOF;
+        }
+    }
+
+    /* Should never get here */
+    mprAssert(0);
+    return TOKEN_ERR;
+}
+
+/******************************************************************************/
+/*
+ * Helper for scanFor. "matched" leading characters of str have just been
+ * seen in the input and were followed by the character c, which is not
+ * str[matched]. Return how many leading characters of str are still matched
+ * by the tail of the input, i.e. the longest k <= matched such that
+ * str[0..k) equals the last k characters of (str[0..matched) + c).
+ */
+static int scanOverlap(const char *str, int matched, int c)
+{
+    int     k, i;
+
+    for (k = matched; k > 0; k--) {
+        if (str[k - 1] != c) {
+            continue;
+        }
+        for (i = 0; i < k - 1; i++) {
+            if (str[i] != str[matched - (k - 1) + i]) {
+                break;
+            }
+        }
+        if (i == k - 1) {
+            return k;
+        }
+    }
+    return 0;
+}
+
+/*
+ * Scan for a pattern. Consume input up to and including the pattern. Every
+ * character consumed is appended to tokBuf; when the pattern is found it is
+ * removed from the end of tokBuf and the token is trimmed and terminated.
+ *
+ * Return 1 if the pattern was found, 0 if the input ended first and < 0 if
+ * tokBuf could not accept another character.
+ *
+ * A mismatch does not restart the match from scratch: input such as "--->"
+ * must still be recognised as ending with "-->".
+ */
+
+static int scanFor(Exml *xp, char *str)
+{
+    MprBuf  *tokBuf;
+    int     c, matched;
+
+    mprAssert(str);
+
+    tokBuf = xp->tokBuf;
+    matched = 0;
+
+    while (str[matched] != '\0') {
+        if ((c = getNextChar(xp)) < 0) {
+            return 0;
+        }
+        if (tokBuf) {
+            if (mprPutCharToBuf(tokBuf, c) < 0) {
+                return -1;
+            }
+        }
+        if (c == str[matched]) {
+            matched++;
+        } else {
+            matched = scanOverlap(str, matched, c);
+        }
+    }
+
+    /*
+     * Remove the pattern from the tokBuf
+     */
+    if (tokBuf) {
+        mprAdjustBufEnd(tokBuf, -(int) strlen(str));
+        trimToken(xp);
+    }
+    return 1;
+}
+
+/******************************************************************************/
+/*
+ * Get another character. We read and buffer blocks of data if we need more
+ * data to parse.
+ *
+ * Returns the character as a value in the range 0..255, or -1 when the read
+ * function supplies no more data. Callers treat any negative value as end
+ * of input, so the byte is deliberately widened as unsigned: with a plain
+ * (possibly signed) char, bytes >= 0x80 would be mistaken for EOF.
+ */
+
+static int getNextChar(Exml *xp)
+{
+    MprBuf  *inBuf;
+    int     c, l;
+
+    inBuf = xp->inBuf;
+    if (mprGetBufLength(inBuf) <= 0) {
+        /*
+         * Flush to reset the buffer's start/end pointers to the start of
+         * the buffer so we can do a maximal read
+         */
+        mprFlushBuf(inBuf);
+        l = (xp->readFn)(xp, xp->inputArg, mprGetBufStart(inBuf),
+            mprGetBufLinearSpace(inBuf));
+        if (l <= 0) {
+            return -1;
+        }
+        mprAdjustBufEnd(inBuf, l);
+    }
+
+    /*
+     * The buffer is known to hold at least one byte here, so the narrowing
+     * cast cannot hide an "empty buffer" result.
+     */
+    c = (uchar) mprGetCharFromBuf(inBuf);
+
+    if (c == '\n') {
+        xp->lineNumber++;
+    }
+    return c;
+}
+
+/******************************************************************************/
+/*
+ * Push a look-ahead character back onto the front of the input buffer so the
+ * next getNextChar returns it again. The line counter is wound back when a
+ * newline is returned. Returns 0 on success, -1 if the insert failed.
+ */
+
+static int putLastChar(Exml *xp, int c)
+{
+    if (mprInsertCharToBuf(xp->inBuf, (char) c) >= 0) {
+        if (c == '\n') {
+            xp->lineNumber--;
+        }
+        return 0;
+    }
+
+    /* The insert is not expected to fail */
+    mprAssert(0);
+    return -1;
+}
+
+/******************************************************************************/
+/*
+ * Record a parse error. The printf style message is formatted and stored in
+ * xp->errMsg as "XML error: <message>\nAt line <n>\n", replacing any earlier
+ * message. Retrieve it with exmlGetErrorMsg.
+ */
+
+static void error(Exml *xp, char *fmt, ...)
+{
+    va_list     args;
+    char        *buf;
+
+    mprAssert(fmt);
+
+    /*
+     * Start from a known value in case the formatter fails without storing
+     * a result. NOTE(review): confirm what mprAllocVsprintf does on failure.
+     */
+    buf = 0;
+
+    va_start(args, fmt);
+    mprAllocVsprintf(MPR_LOC_ARGS(xp), &buf, MPR_MAX_STRING, fmt, args);
+    va_end(args);
+
+    /*
+     * MOB need to add the failing line text and a pointer to which column
+     */
+    mprFree(xp->errMsg);
+
+    /*
+     * Do not leave a dangling pointer behind if the allocation below fails;
+     * exmlGetErrorMsg treats 0 as "no message".
+     */
+    xp->errMsg = 0;
+
+    mprAllocSprintf(MPR_LOC_ARGS(xp), &xp->errMsg, MPR_MAX_STRING,
+        "XML error: %s\nAt line %d\n", (buf != 0) ? buf : fmt,
+        xp->lineNumber);
+
+    mprFree(buf);
+}
+
+/******************************************************************************/
+/*
+ * Strip white space from the end of the current token and then NUL
+ * terminate it so it can be used as a C string.
+ */
+
+static void trimToken(Exml *xp)
+{
+    MprBuf  *tokBuf;
+
+    tokBuf = xp->tokBuf;
+
+    for (;;) {
+        if (!isspace(mprLookAtLastCharInBuf(tokBuf))) {
+            break;
+        }
+        /* Drop one trailing white space character */
+        mprAdjustBufEnd(tokBuf, -1);
+    }
+    mprAddNullToBuf(tokBuf);
+}
+
+/******************************************************************************/
+
+/*
+ * Return the message recorded by the last parse error, or an empty string
+ * if no error has been recorded. Never returns a null pointer.
+ */
+const char *exmlGetErrorMsg(Exml *xp)
+{
+    return (xp->errMsg != 0) ? xp->errMsg : "";
+}
+
+/******************************************************************************/
+
+/*
+ * Return the parser's current line counter. The counter is incremented for
+ * each newline character consumed from the input.
+ */
+int exmlGetLineNumber(Exml *xp)
+{
+    int     line;
+
+    line = xp->lineNumber;
+    return line;
+}
+
+/******************************************************************************/
+#else
+
+/* Placeholder so this file is not empty when the parser is compiled out */
+void exmlParserDummy()
+{
+}
+#endif /* BLD_FEATURE_EXML */
+
+/*
+ * Local variables:
+ * tab-width: 4
+ * c-basic-offset: 4
+ * End:
+ * vim:tw=78
+ * vim600: sw=4 ts=4 fdm=marker
+ * vim<600: sw=4 ts=4
+ */