/*
 *  exml.c -- A simple SAX style XML parser
 */

/********************************* Description ********************************/
/*
 *  This is a recursive descent parser for XML text files. It is a one-pass
 *  simple parser that invokes a user supplied callback for key tokens in the
 *  XML file. The user supplies a read function so that XML files can be parsed
 *  from disk or in-memory.
 */

/********************************** Includes **********************************/

#include    "exml.h"

/****************************** Forward Declarations **************************/
/* MOB -- FIX */
#if BLD_FEATURE_EXML || 1

static int          parseNext(Exml *xp, int state);
static ExmlToken    getToken(Exml *xp, int state);
static int          getNextChar(Exml *xp);
static int          scanFor(Exml *xp, char *str);
static int          putLastChar(Exml *xp, int c);
static void         error(Exml *xp, char *fmt, ...);
static void         trimToken(Exml *xp);

/************************************ Code ************************************/
/*
 *  Create a parser instance.
 *
 *  The parser object is allocated (zeroed) from "ctx". Two buffers are then
 *  created with the parser itself as their allocation context: an input
 *  buffer created with EXML_BUFSIZE for both size arguments, and a token
 *  buffer created with the caller's initialSize and maxSize.
 *
 *  Returns the new parser, or 0 if the parser or either buffer could not be
 *  allocated.
 */
Exml *exmlOpen(MprCtx ctx, int initialSize, int maxSize)
{
    Exml    *xp;

    xp = mprAllocTypeZeroed(ctx, Exml);
    if (xp == 0) {
        return 0;
    }

    xp->inBuf = mprCreateBuf(xp, EXML_BUFSIZE, EXML_BUFSIZE);
    xp->tokBuf = mprCreateBuf(xp, initialSize, maxSize);

    if (xp->inBuf == 0 || xp->tokBuf == 0) {
        /*
         *  Same release path as exmlClose. NOTE(review): this presumes that
         *  freeing xp also releases buffers allocated with xp as context --
         *  confirm against the MPR allocator.
         */
        mprFree(xp);
        return 0;
    }
    return xp;
}

/******************************************************************************/
/*
 *  Destroy a parser created by exmlOpen. Only xp itself is freed explicitly.
 */
void exmlClose(Exml *xp)
{
    mprAssert(xp);

    mprFree(xp);
}

/******************************************************************************/
/*
 *  Define the callback that the parser invokes for each parse event.
 */
void exmlSetParserHandler(Exml *xp, ExmlHandler h)
{
    mprAssert(xp);

    xp->handler = h;
}

/******************************************************************************/
/*
 *  Define the read function used to fetch more input, together with the
 *  argument that is handed back to it on every call.
 */
void exmlSetInputStream(Exml *xp, ExmlInputStream s, void *arg)
{
    mprAssert(xp);

    xp->readFn = s;
    xp->inputArg = arg;
}

/******************************************************************************/
/*
 *  Set the parse arg. The value is only stored here; it can be read back
 *  with exmlGetParseArg.
 */
void exmlSetParseArg(Exml *xp, void *parseArg)
{
    mprAssert(xp);

    xp->parseArg = parseArg;
}

/******************************************************************************/
/*
Set the parse arg */ void *exmlGetParseArg(Exml *xp) { mprAssert(xp); return xp->parseArg; } /******************************************************************************/ /* * Parse an XML file. Return 0 for success, -1 for error. */ int exmlParse(Exml *xp) { mprAssert(xp); return parseNext(xp, EXML_BEGIN); } /******************************************************************************/ /* * XML parser. This is a recursive descent parser. Return -1 for errors, 0 for * EOF and 1 if there is still more data to parse. */ static int parseNext(Exml *xp, int state) { ExmlHandler handler; ExmlToken token; MprBuf *tokBuf; char *tname, *aname; int rc; mprAssert(state >= 0); tokBuf = xp->tokBuf; handler = xp->handler; tname = aname = 0; rc = 0; /* * In this parse loop, the state is never assigned EOF or ERR. In * such cases we always return EOF or ERR. */ while (1) { token = getToken(xp, state); if (token == TOKEN_TOO_BIG) { error(xp, "XML token is too big"); goto err; } switch (state) { case EXML_BEGIN: /* ------------------------------------------ */ /* * Expect to get an element, comment or processing instruction */ switch (token) { case TOKEN_EOF: goto exit; case TOKEN_LS: /* * Recurse to handle the new element, comment etc. 
*/ rc = parseNext(xp, EXML_AFTER_LS); if (rc < 0) { goto exit; } break; default: error(xp, "Syntax error"); goto err; } break; case EXML_AFTER_LS: /* ------------------------------------------ */ switch (token) { case TOKEN_COMMENT: state = EXML_COMMENT; rc = (*handler)(xp, state, "!--", 0, mprGetBufStart(tokBuf)); if (rc < 0) { goto err; } rc = 1; goto exit; case TOKEN_CDATA: state = EXML_CDATA; rc = (*handler)(xp, state, "!--", 0, mprGetBufStart(tokBuf)); if (rc < 0) { goto err; } rc = 1; goto exit; case TOKEN_INSTRUCTIONS: /* Just ignore processing instructions */ rc = 1; goto exit; case TOKEN_TEXT: state = EXML_NEW_ELT; tname = mprStrdup(xp, mprGetBufStart(tokBuf)); if (tname == 0) { rc = MPR_ERR_MEMORY; goto exit; } rc = (*handler)(xp, state, tname, 0, 0); if (rc < 0) { goto err; } break; default: error(xp, "Syntax error"); goto err; } break; case EXML_NEW_ELT: /* ------------------------------------------ */ /* * We have seen the opening "" of the opening element. */ switch (token) { case TOKEN_TEXT: /* * Must be an attribute name */ aname = mprStrdup(xp, mprGetBufStart(tokBuf)); token = getToken(xp, state); if (token != TOKEN_EQ) { error(xp, "Missing assignment for attribute \"%s\"", aname); goto err; } token = getToken(xp, state); if (token != TOKEN_TEXT) { error(xp, "Missing value for attribute \"%s\"", aname); goto err; } state = EXML_NEW_ATT; rc = (*handler)(xp, state, tname, aname, mprGetBufStart(tokBuf)); if (rc < 0) { goto err; } state = EXML_NEW_ELT; break; case TOKEN_GR: /* * This is ">" the termination of the opening element */ if (*tname == '\0') { error(xp, "Missing element name"); goto err; } /* * Tell the user that the opening element is now complete */ state = EXML_ELT_DEFINED; rc = (*handler)(xp, state, tname, 0, 0); if (rc < 0) { goto err; } state = EXML_ELT_DATA; break; case TOKEN_SLASH_GR: /* * If we see a "/>" then this is a solo element */ if (*tname == '\0') { error(xp, "Missing element name"); goto err; } state = EXML_SOLO_ELT_DEFINED; 
rc = (*handler)(xp, state, tname, 0, 0); if (rc < 0) { goto err; } rc = 1; goto exit; default: error(xp, "Syntax error"); goto err; } break; case EXML_ELT_DATA: /* -------------------------------------- */ /* * We have seen the full opening element "" and now * await data or another element. */ if (token == TOKEN_LS) { /* * Recurse to handle the new element, comment etc. */ rc = parseNext(xp, EXML_AFTER_LS); if (rc < 0) { goto exit; } break; } else if (token == TOKEN_LS_SLASH) { state = EXML_END_ELT; break; } else if (token != TOKEN_TEXT) { goto err; } if (mprGetBufLength(tokBuf) > 0) { /* * Pass the data between the element to the user */ rc = (*handler)(xp, state, tname, 0, mprGetBufStart(tokBuf)); if (rc < 0) { goto err; } } break; case EXML_END_ELT: /* -------------------------------------- */ if (token != TOKEN_TEXT) { error(xp, "Missing closing element name for \"%s\"", tname); goto err; } /* * The closing element name must match the opening element name */ if (strcmp(tname, mprGetBufStart(tokBuf)) != 0) { error(xp, "Closing element name \"%s\" does not match on line %d" "opening name \"%s\"", mprGetBufStart(tokBuf), xp->lineNumber, tname); goto err; } rc = (*handler)(xp, state, tname, 0, 0); if (rc < 0) { goto err; } if (getToken(xp, state) != TOKEN_GR) { error(xp, "Syntax error"); goto err; } return 1; case EXML_EOF: /* ---------------------------------------------- */ goto exit; case EXML_ERR: /* ---------------------------------------------- */ default: goto err; } } mprAssert(0); err: rc = -1; exit: mprFree(tname); mprFree(aname); return rc; } /******************************************************************************/ /* * Lexical analyser for XML. Return the next token reading input as required. * It uses a one token look ahead and push back mechanism (LAR1 parser). * Text token identifiers are left in the tokBuf parser buffer on exit. 
* This Lex has special cases for the states EXML_ELT_DATA where we * have an optimized read of element data, and EXML_AFTER_LS where we * distinguish between element names, processing instructions and comments. */ static ExmlToken getToken(Exml *xp, int state) { MprBuf *tokBuf, *inBuf; uchar *cp; int c, rc; tokBuf = xp->tokBuf; inBuf = xp->inBuf; mprAssert(state >= 0); if ((c = getNextChar(xp)) < 0) { return TOKEN_EOF; } mprFlushBuf(tokBuf); /* * Special case parsing for names and for element data. We do this for * performance so we can return to the caller the largest token possible */ if (state == EXML_ELT_DATA) { /* * Read all the data up to the start of the closing element "<" or the * start of a sub-element. */ #if UNUSED while (isspace(c)) { if ((c = getNextChar(xp)) < 0) { return TOKEN_EOF; } } #endif if (c == '<') { if ((c = getNextChar(xp)) < 0) { return TOKEN_EOF; } if (c == '/') { return TOKEN_LS_SLASH; } putLastChar(xp, c); return TOKEN_LS; } do { if (mprPutCharToBuf(tokBuf, c) < 0) { return TOKEN_TOO_BIG; } if ((c = getNextChar(xp)) < 0) { return TOKEN_EOF; } } while (c != '<'); /* * Put back the last look-ahead character */ putLastChar(xp, c); /* * If all white space, then zero the token buffer */ for (cp = tokBuf->start; *cp; cp++) { if (!isspace(*cp)) { return TOKEN_TEXT; } } mprFlushBuf(tokBuf); return TOKEN_TEXT; } while (1) { switch (c) { case ' ': case '\n': case '\t': case '\r': break; case '<': if ((c = getNextChar(xp)) < 0) { return TOKEN_EOF; } if (c == '/') { return TOKEN_LS_SLASH; } putLastChar(xp, c); return TOKEN_LS; case '=': return TOKEN_EQ; case '>': return TOKEN_GR; case '/': if ((c = getNextChar(xp)) < 0) { return TOKEN_EOF; } if (c == '>') { return TOKEN_SLASH_GR; } return TOKEN_ERR; case '\"': case '\'': xp->quoteChar = c; /* Fall through */ default: /* * We handle element names, attribute names and attribute values * here. We do NOT handle data between elements here. Read the * token. 
Stop on white space or a closing element ">" */ if (xp->quoteChar) { if ((c = getNextChar(xp)) < 0) { return TOKEN_EOF; } while (c != xp->quoteChar) { if (mprPutCharToBuf(tokBuf, c) < 0) { return TOKEN_TOO_BIG; } if ((c = getNextChar(xp)) < 0) { return TOKEN_EOF; } } xp->quoteChar = 0; } else { while (!isspace(c) && c != '>' && c != '/' && c != '=') { if (mprPutCharToBuf(tokBuf, c) < 0) { return TOKEN_TOO_BIG; } if ((c = getNextChar(xp)) < 0) { return TOKEN_EOF; } } putLastChar(xp, c); } if (mprGetBufLength(tokBuf) <= 0) { return TOKEN_ERR; } mprAddNullToBuf(tokBuf); if (state == EXML_AFTER_LS) { /* * If we are just inside an element "<", then analyze what we * have to see if we have an element name, instruction or * comment. Tokbuf will hold "?" for instructions or "!--" * for comments. */ if (mprLookAtNextCharInBuf(tokBuf) == '?') { /* Just ignore processing instructions */ rc = scanFor(xp, "?>"); if (rc < 0) { return TOKEN_TOO_BIG; } else if (rc == 0) { return TOKEN_ERR; } return TOKEN_INSTRUCTIONS; } else if (mprLookAtNextCharInBuf(tokBuf) == '!') { /* * First discard the comment leadin "!--" and eat leading * white space. 
*/ if (strcmp((char*) tokBuf->start, "![CDATA[") == 0) { mprFlushBuf(tokBuf); #if UNUSED c = mprLookAtNextCharInBuf(inBuf); while (isspace(c)) { if ((c = getNextChar(xp)) < 0) { return TOKEN_EOF; } c = mprLookAtNextCharInBuf(inBuf); } #endif rc = scanFor(xp, "]]>"); if (rc < 0) { return TOKEN_TOO_BIG; } else if (rc == 0) { return TOKEN_ERR; } return TOKEN_CDATA; } else { mprFlushBuf(tokBuf); #if UNUSED c = mprLookAtNextCharInBuf(inBuf); while (isspace(c)) { if ((c = getNextChar(xp)) < 0) { return TOKEN_EOF; } c = mprLookAtNextCharInBuf(inBuf); } #endif rc = scanFor(xp, "-->"); if (rc < 0) { return TOKEN_TOO_BIG; } else if (rc == 0) { return TOKEN_ERR; } return TOKEN_COMMENT; } } } trimToken(xp); return TOKEN_TEXT; } if ((c = getNextChar(xp)) < 0) { return TOKEN_EOF; } } /* Should never get here */ mprAssert(0); return TOKEN_ERR; } /******************************************************************************/ /* * Scan for a pattern. Eat and discard input up to the pattern. Return 1 if * the pattern was found, return 0 if not found. Return < 0 on errors. */ static int scanFor(Exml *xp, char *str) { MprBuf *tokBuf; char *cp; int c; mprAssert(str); tokBuf = xp->tokBuf; while (1) { for (cp = str; *cp; cp++) { if ((c = getNextChar(xp)) < 0) { return 0; } if (tokBuf) { if (mprPutCharToBuf(tokBuf, c) < 0) { return -1; } } if (c != *cp) { break; } } if (*cp == '\0') { /* * Remove the pattern from the tokBuf */ if (tokBuf) { mprAdjustBufEnd(tokBuf, -(int) strlen(str)); trimToken(xp); } return 1; } } } /******************************************************************************/ /* * Get another character. We read and buffer blocks of data if we need more * data to parse. 
*/ static int getNextChar(Exml *xp) { MprBuf *inBuf; char c; int l; inBuf = xp->inBuf; if (mprGetBufLength(inBuf) <= 0) { /* * Flush to reset the servp/endp pointers to the start of the buffer * so we can do a maximal read */ mprFlushBuf(inBuf); l = (xp->readFn)(xp, xp->inputArg, mprGetBufStart(inBuf), mprGetBufLinearSpace(inBuf)); if (l <= 0) { return -1; } mprAdjustBufEnd(inBuf, l); } c = mprGetCharFromBuf(inBuf); if (c == '\n') { xp->lineNumber++; } return c; } /******************************************************************************/ /* * Put back a character in the input buffer */ static int putLastChar(Exml *xp, int c) { if (mprInsertCharToBuf(xp->inBuf, (char) c) < 0) { mprAssert(0); return -1; } if (c == '\n') { xp->lineNumber--; } return 0; } /******************************************************************************/ /* * Output a parse message */ static void error(Exml *xp, char *fmt, ...) { va_list args; char *buf; mprAssert(fmt); va_start(args, fmt); mprAllocVsprintf(MPR_LOC_ARGS(xp), &buf, MPR_MAX_STRING, fmt, args); va_end(args); /* * MOB need to add the failing line text and a pointer to which column */ mprFree(xp->errMsg); mprAllocSprintf(MPR_LOC_ARGS(xp), &xp->errMsg, MPR_MAX_STRING, "XML error: %s\nAt line %d\n", buf, xp->lineNumber); mprFree(buf); } /******************************************************************************/ /* * Remove trailing whitespace in a token and ensure it is terminated with * a NULL for easy parsing */ static void trimToken(Exml *xp) { while (isspace(mprLookAtLastCharInBuf(xp->tokBuf))) { mprAdjustBufEnd(xp->tokBuf, -1); } mprAddNullToBuf(xp->tokBuf); } /******************************************************************************/ const char *exmlGetErrorMsg(Exml *xp) { if (xp->errMsg == 0) { return ""; } return xp->errMsg; } /******************************************************************************/ int exmlGetLineNumber(Exml *xp) { return xp->lineNumber; } 
/******************************************************************************/ #else void exmlParserDummy() {} #endif /* BLD_FEATURE_EXML */ /* * Local variables: * tab-width: 4 * c-basic-offset: 4 * End: * vim:tw=78 * vim600: sw=4 ts=4 fdm=marker * vim<600: sw=4 ts=4 */