diff options
Diffstat (limited to 'source4/lib/appweb/ejs-2.0/exml/exmlParser.c')
-rw-r--r-- | source4/lib/appweb/ejs-2.0/exml/exmlParser.c | 752 |
1 files changed, 752 insertions, 0 deletions
diff --git a/source4/lib/appweb/ejs-2.0/exml/exmlParser.c b/source4/lib/appweb/ejs-2.0/exml/exmlParser.c new file mode 100644 index 0000000000..14871411a6 --- /dev/null +++ b/source4/lib/appweb/ejs-2.0/exml/exmlParser.c @@ -0,0 +1,752 @@ +/* + * exml.c -- A simple SAX style XML parser + */ + +/********************************* Description ********************************/ +/* + * This is a recursive descent parser for XML text files. It is a one-pass + * simple parser that invokes a user supplied callback for key tokens in the + * XML file. The user supplies a read function so that XML files can be parsed + * from disk or in-memory. + */ +/********************************** Includes **********************************/ + +#include "exml.h" + +/****************************** Forward Declarations **************************/ +/* MOB -- FIX */ +#if BLD_FEATURE_EXML || 1 + +static int parseNext(Exml *xp, int state); +static ExmlToken getToken(Exml *xp, int state); +static int getNextChar(Exml *xp); +static int scanFor(Exml *xp, char *str); +static int putLastChar(Exml *xp, int c); +static void error(Exml *xp, char *fmt, ...); +static void trimToken(Exml *xp); + +/************************************ Code ************************************/ + +Exml *exmlOpen(MprCtx ctx, int initialSize, int maxSize) +{ + Exml *xp; + + xp = mprAllocTypeZeroed(ctx, Exml); + + xp->inBuf = mprCreateBuf(xp, EXML_BUFSIZE, EXML_BUFSIZE); + xp->tokBuf = mprCreateBuf(xp, initialSize, maxSize); + + return xp; +} + +/******************************************************************************/ + +void exmlClose(Exml *xp) +{ + mprAssert(xp); + + mprFree(xp); +} + +/******************************************************************************/ + +void exmlSetParserHandler(Exml *xp, ExmlHandler h) +{ + mprAssert(xp); + + xp->handler = h; +} + +/******************************************************************************/ + +void exmlSetInputStream(Exml *xp, ExmlInputStream s, void *arg) +{ + mprAssert(xp); + + xp->readFn = s; + xp->inputArg = arg; +} + +/******************************************************************************/ +/* + * Set the parse arg + */ + +void exmlSetParseArg(Exml *xp, void *parseArg) +{ + mprAssert(xp); + + xp->parseArg = parseArg; +} + +/******************************************************************************/ +/* + * Set the parse arg + */ + +void *exmlGetParseArg(Exml *xp) +{ + mprAssert(xp); + + return xp->parseArg; +} + +/******************************************************************************/ +/* + * Parse an XML file. Return 0 for success, -1 for error. + */ + +int exmlParse(Exml *xp) +{ + mprAssert(xp); + + return parseNext(xp, EXML_BEGIN); +} + +/******************************************************************************/ +/* + * XML parser. This is a recursive descent parser. Return -1 for errors, 0 for + * EOF and 1 if there is still more data to parse. + */ + +static int parseNext(Exml *xp, int state) +{ + ExmlHandler handler; + ExmlToken token; + MprBuf *tokBuf; + char *tname, *aname; + int rc; + + mprAssert(state >= 0); + + tokBuf = xp->tokBuf; + handler = xp->handler; + tname = aname = 0; + rc = 0; + + /* + * In this parse loop, the state is never assigned EOF or ERR. In + * such cases we always return EOF or ERR. + */ + while (1) { + + token = getToken(xp, state); + + if (token == TOKEN_TOO_BIG) { + error(xp, "XML token is too big"); + goto err; + } + + switch (state) { + case EXML_BEGIN: /* ------------------------------------------ */ + /* + * Expect to get an element, comment or processing instruction + */ + switch (token) { + case TOKEN_EOF: + goto exit; + + case TOKEN_LS: + /* + * Recurse to handle the new element, comment etc. + */ + rc = parseNext(xp, EXML_AFTER_LS); + if (rc < 0) { + goto exit; + } + break; + + default: + error(xp, "Syntax error"); + goto err; + } + break; + + case EXML_AFTER_LS: /* ------------------------------------------ */ + switch (token) { + case TOKEN_COMMENT: + state = EXML_COMMENT; + rc = (*handler)(xp, state, "!--", 0, mprGetBufStart(tokBuf)); + if (rc < 0) { + goto err; + } + rc = 1; + goto exit; + + case TOKEN_CDATA: + state = EXML_CDATA; + rc = (*handler)(xp, state, "!--", 0, mprGetBufStart(tokBuf)); + if (rc < 0) { + goto err; + } + rc = 1; + goto exit; + + case TOKEN_INSTRUCTIONS: + /* Just ignore processing instructions */ + rc = 1; + goto exit; + + case TOKEN_TEXT: + state = EXML_NEW_ELT; + tname = mprStrdup(xp, mprGetBufStart(tokBuf)); + if (tname == 0) { + rc = MPR_ERR_MEMORY; + goto exit; + } + rc = (*handler)(xp, state, tname, 0, 0); + if (rc < 0) { + goto err; + } + break; + + default: + error(xp, "Syntax error"); + goto err; + } + break; + + case EXML_NEW_ELT: /* ------------------------------------------ */ + /* + * We have seen the opening "<element" for a new element and have + * not yet seen the terminating ">" of the opening element. + */ + switch (token) { + case TOKEN_TEXT: + /* + * Must be an attribute name + */ + aname = mprStrdup(xp, mprGetBufStart(tokBuf)); + token = getToken(xp, state); + if (token != TOKEN_EQ) { + error(xp, "Missing assignment for attribute \"%s\"", aname); + goto err; + } + + token = getToken(xp, state); + if (token != TOKEN_TEXT) { + error(xp, "Missing value for attribute \"%s\"", aname); + goto err; + } + state = EXML_NEW_ATT; + rc = (*handler)(xp, state, tname, aname, + mprGetBufStart(tokBuf)); + if (rc < 0) { + goto err; + } + state = EXML_NEW_ELT; + break; + + case TOKEN_GR: + /* + * This is ">" the termination of the opening element + */ + if (*tname == '\0') { + error(xp, "Missing element name"); + goto err; + } + + /* + * Tell the user that the opening element is now complete + */ + state = EXML_ELT_DEFINED; + rc = (*handler)(xp, state, tname, 0, 0); + if (rc < 0) { + goto err; + } + state = EXML_ELT_DATA; + break; + + case TOKEN_SLASH_GR: + /* + * If we see a "/>" then this is a solo element + */ + if (*tname == '\0') { + error(xp, "Missing element name"); + goto err; + } + state = EXML_SOLO_ELT_DEFINED; + rc = (*handler)(xp, state, tname, 0, 0); + if (rc < 0) { + goto err; + } + rc = 1; + goto exit; + + default: + error(xp, "Syntax error"); + goto err; + } + break; + + case EXML_ELT_DATA: /* -------------------------------------- */ + /* + * We have seen the full opening element "<name ...>" and now + * await data or another element. + */ + if (token == TOKEN_LS) { + /* + * Recurse to handle the new element, comment etc. + */ + rc = parseNext(xp, EXML_AFTER_LS); + if (rc < 0) { + goto exit; + } + break; + + } else if (token == TOKEN_LS_SLASH) { + state = EXML_END_ELT; + break; + + } else if (token != TOKEN_TEXT) { + goto err; + } + if (mprGetBufLength(tokBuf) > 0) { + /* + * Pass the data between the element to the user + */ + rc = (*handler)(xp, state, tname, 0, mprGetBufStart(tokBuf)); + if (rc < 0) { + goto err; + } + } + break; + + case EXML_END_ELT: /* -------------------------------------- */ + if (token != TOKEN_TEXT) { + error(xp, "Missing closing element name for \"%s\"", tname); + goto err; + } + /* + * The closing element name must match the opening element name + */ + if (strcmp(tname, mprGetBufStart(tokBuf)) != 0) { + error(xp, + "Closing element name \"%s\" does not match on line %d" + "opening name \"%s\"", + mprGetBufStart(tokBuf), xp->lineNumber, tname); + goto err; + } + rc = (*handler)(xp, state, tname, 0, 0); + if (rc < 0) { + goto err; + } + if (getToken(xp, state) != TOKEN_GR) { + error(xp, "Syntax error"); + goto err; + } + return 1; + + case EXML_EOF: /* ---------------------------------------------- */ + goto exit; + + case EXML_ERR: /* ---------------------------------------------- */ + default: + goto err; + } + } + mprAssert(0); + +err: + rc = -1; + +exit: + mprFree(tname); + mprFree(aname); + + return rc; +} + +/******************************************************************************/ +/* + * Lexical analyser for XML. Return the next token reading input as required. + * It uses a one token look ahead and push back mechanism (LAR1 parser). + * Text token identifiers are left in the tokBuf parser buffer on exit. + * This Lex has special cases for the states EXML_ELT_DATA where we + * have an optimized read of element data, and EXML_AFTER_LS where we + * distinguish between element names, processing instructions and comments. + */ + +static ExmlToken getToken(Exml *xp, int state) +{ + MprBuf *tokBuf, *inBuf; + uchar *cp; + int c, rc; + + tokBuf = xp->tokBuf; + inBuf = xp->inBuf; + + mprAssert(state >= 0); + + if ((c = getNextChar(xp)) < 0) { + return TOKEN_EOF; + } + mprFlushBuf(tokBuf); + + /* + * Special case parsing for names and for element data. We do this for + * performance so we can return to the caller the largest token possible + */ + if (state == EXML_ELT_DATA) { + /* + * Read all the data up to the start of the closing element "<" or the + * start of a sub-element. + */ +#if UNUSED + while (isspace(c)) { + if ((c = getNextChar(xp)) < 0) { + return TOKEN_EOF; + } + } +#endif + if (c == '<') { + if ((c = getNextChar(xp)) < 0) { + return TOKEN_EOF; + } + if (c == '/') { + return TOKEN_LS_SLASH; + } + putLastChar(xp, c); + return TOKEN_LS; + } + do { + if (mprPutCharToBuf(tokBuf, c) < 0) { + return TOKEN_TOO_BIG; + } + if ((c = getNextChar(xp)) < 0) { + return TOKEN_EOF; + } + } while (c != '<'); + + /* + * Put back the last look-ahead character + */ + putLastChar(xp, c); + + /* + * If all white space, then zero the token buffer + */ + for (cp = tokBuf->start; *cp; cp++) { + if (!isspace(*cp)) { + return TOKEN_TEXT; + } + } + mprFlushBuf(tokBuf); + return TOKEN_TEXT; + } + + while (1) { + switch (c) { + case ' ': + case '\n': + case '\t': + case '\r': + break; + + case '<': + if ((c = getNextChar(xp)) < 0) { + return TOKEN_EOF; + } + if (c == '/') { + return TOKEN_LS_SLASH; + } + putLastChar(xp, c); + return TOKEN_LS; + + case '=': + return TOKEN_EQ; + + case '>': + return TOKEN_GR; + + case '/': + if ((c = getNextChar(xp)) < 0) { + return TOKEN_EOF; + } + if (c == '>') { + return TOKEN_SLASH_GR; + } + return TOKEN_ERR; + + case '\"': + case '\'': + xp->quoteChar = c; + /* Fall through */ + + default: + /* + * We handle element names, attribute names and attribute values + * here. We do NOT handle data between elements here. Read the + * token. Stop on white space or a closing element ">" + */ + if (xp->quoteChar) { + if ((c = getNextChar(xp)) < 0) { + return TOKEN_EOF; + } + while (c != xp->quoteChar) { + if (mprPutCharToBuf(tokBuf, c) < 0) { + return TOKEN_TOO_BIG; + } + if ((c = getNextChar(xp)) < 0) { + return TOKEN_EOF; + } + } + xp->quoteChar = 0; + + } else { + while (!isspace(c) && c != '>' && c != '/' && c != '=') { + if (mprPutCharToBuf(tokBuf, c) < 0) { + return TOKEN_TOO_BIG; + } + if ((c = getNextChar(xp)) < 0) { + return TOKEN_EOF; + } + } + putLastChar(xp, c); + } + if (mprGetBufLength(tokBuf) <= 0) { + return TOKEN_ERR; + } + mprAddNullToBuf(tokBuf); + + if (state == EXML_AFTER_LS) { + /* + * If we are just inside an element "<", then analyze what we + * have to see if we have an element name, instruction or + * comment. Tokbuf will hold "?" for instructions or "!--" + * for comments. + */ + if (mprLookAtNextCharInBuf(tokBuf) == '?') { + /* Just ignore processing instructions */ + rc = scanFor(xp, "?>"); + if (rc < 0) { + return TOKEN_TOO_BIG; + } else if (rc == 0) { + return TOKEN_ERR; + } + return TOKEN_INSTRUCTIONS; + + } else if (mprLookAtNextCharInBuf(tokBuf) == '!') { + /* + * First discard the comment leadin "!--" and eat leading + * white space. + */ + if (strcmp((char*) tokBuf->start, "![CDATA[") == 0) { + mprFlushBuf(tokBuf); +#if UNUSED + c = mprLookAtNextCharInBuf(inBuf); + while (isspace(c)) { + if ((c = getNextChar(xp)) < 0) { + return TOKEN_EOF; + } + c = mprLookAtNextCharInBuf(inBuf); + } +#endif + rc = scanFor(xp, "]]>"); + if (rc < 0) { + return TOKEN_TOO_BIG; + } else if (rc == 0) { + return TOKEN_ERR; + } + return TOKEN_CDATA; + + } else { + mprFlushBuf(tokBuf); +#if UNUSED + c = mprLookAtNextCharInBuf(inBuf); + while (isspace(c)) { + if ((c = getNextChar(xp)) < 0) { + return TOKEN_EOF; + } + c = mprLookAtNextCharInBuf(inBuf); + } +#endif + rc = scanFor(xp, "-->"); + if (rc < 0) { + return TOKEN_TOO_BIG; + } else if (rc == 0) { + return TOKEN_ERR; + } + return TOKEN_COMMENT; + } + } + } + trimToken(xp); + return TOKEN_TEXT; + } + if ((c = getNextChar(xp)) < 0) { + return TOKEN_EOF; + } + } + + /* Should never get here */ + mprAssert(0); + return TOKEN_ERR; +} + +/******************************************************************************/ +/* + * Scan for a pattern. Eat and discard input up to the pattern. Return 1 if + * the pattern was found, return 0 if not found. Return < 0 on errors. + */ + +static int scanFor(Exml *xp, char *str) +{ + MprBuf *tokBuf; + char *cp; + int c; + + mprAssert(str); + + tokBuf = xp->tokBuf; + + while (1) { + for (cp = str; *cp; cp++) { + if ((c = getNextChar(xp)) < 0) { + return 0; + } + if (tokBuf) { + if (mprPutCharToBuf(tokBuf, c) < 0) { + return -1; + } + } + if (c != *cp) { + break; + } + } + if (*cp == '\0') { + /* + * Remove the pattern from the tokBuf + */ + if (tokBuf) { + mprAdjustBufEnd(tokBuf, -(int) strlen(str)); + trimToken(xp); + } + return 1; + } + } +} + +/******************************************************************************/ +/* + * Get another character. We read and buffer blocks of data if we need more + * data to parse. + */ + +static int getNextChar(Exml *xp) +{ + MprBuf *inBuf; + char c; + int l; + + inBuf = xp->inBuf; + if (mprGetBufLength(inBuf) <= 0) { + /* + * Flush to reset the servp/endp pointers to the start of the buffer + * so we can do a maximal read + */ + mprFlushBuf(inBuf); + l = (xp->readFn)(xp, xp->inputArg, mprGetBufStart(inBuf), + mprGetBufLinearSpace(inBuf)); + if (l <= 0) { + return -1; + } + mprAdjustBufEnd(inBuf, l); + } + c = mprGetCharFromBuf(inBuf); + + if (c == '\n') { + xp->lineNumber++; + } + return c; +} + +/******************************************************************************/ +/* + * Put back a character in the input buffer + */ + +static int putLastChar(Exml *xp, int c) +{ + if (mprInsertCharToBuf(xp->inBuf, (char) c) < 0) { + mprAssert(0); + return -1; + } + if (c == '\n') { + xp->lineNumber--; + } + return 0; +} + +/******************************************************************************/ +/* + * Output a parse message + */ + +static void error(Exml *xp, char *fmt, ...) +{ + va_list args; + char *buf; + + mprAssert(fmt); + + va_start(args, fmt); + mprAllocVsprintf(MPR_LOC_ARGS(xp), &buf, MPR_MAX_STRING, fmt, args); + va_end(args); + + /* + * MOB need to add the failing line text and a pointer to which column + */ + mprFree(xp->errMsg); + mprAllocSprintf(MPR_LOC_ARGS(xp), &xp->errMsg, MPR_MAX_STRING, + "XML error: %s\nAt line %d\n", buf, xp->lineNumber); + + mprFree(buf); +} + +/******************************************************************************/ +/* + * Remove trailing whitespace in a token and ensure it is terminated with + * a NULL for easy parsing + */ + +static void trimToken(Exml *xp) +{ + while (isspace(mprLookAtLastCharInBuf(xp->tokBuf))) { + mprAdjustBufEnd(xp->tokBuf, -1); + } + mprAddNullToBuf(xp->tokBuf); +} + +/******************************************************************************/ + +const char *exmlGetErrorMsg(Exml *xp) +{ + if (xp->errMsg == 0) { + return ""; + } + return xp->errMsg; +} + +/******************************************************************************/ + +int exmlGetLineNumber(Exml *xp) +{ + return xp->lineNumber; +} + +/******************************************************************************/ +#else + +void exmlParserDummy() {} +#endif /* BLD_FEATURE_EXML */ + +/* + * Local variables: + * tab-width: 4 + * c-basic-offset: 4 + * End: + * vim:tw=78 + * vim600: sw=4 ts=4 fdm=marker + * vim<600: sw=4 ts=4 + */ |