Diffstat (limited to 'webapps/qooxdoo-0.6.3-sdk/frontend/framework/tool/modules/tokenizer.py')
-rwxr-xr-x | webapps/qooxdoo-0.6.3-sdk/frontend/framework/tool/modules/tokenizer.py | 349 |
1 files changed, 349 insertions, 0 deletions
diff --git a/webapps/qooxdoo-0.6.3-sdk/frontend/framework/tool/modules/tokenizer.py b/webapps/qooxdoo-0.6.3-sdk/frontend/framework/tool/modules/tokenizer.py
new file mode 100755
index 0000000000..2f8e40436b
--- /dev/null
+++ b/webapps/qooxdoo-0.6.3-sdk/frontend/framework/tool/modules/tokenizer.py
@@ -0,0 +1,349 @@
+#!/usr/bin/env python
+
+import sys, string, re, optparse
+import config, filetool, comment
+
+R_WHITESPACE = re.compile(r"(\s+)")
+R_NONWHITESPACE = re.compile("\S+")
+R_NUMBER = re.compile("^[0-9]+")
+R_NEWLINE = re.compile(r"(\n)")
+
+# Ideas from: http://www.regular-expressions.info/examplesprogrammer.html
+# Multicomment RegExp inspired by: http://ostermiller.org/findcomment.html
+
+# builds regexp strings
+S_STRING_A = "'[^'\\\n]*(\\.|\n[^'\\\n]*)*'"
+S_STRING_B = '"[^"\\\n]*(\\.|\n[^"\\\n]*)*"'
+
+S_FLOAT = "([0-9]+\.[0-9]+)"
+
+S_OPERATORS_2 = r"(==)|(!=)|(\+\+)|(--)|(-=)|(\+=)|(\*=)|(/=)|(%=)|(&&)|(\|\|)|(\>=)|(\<=)|(>>)|(<<)|(\^\|)|(\|=)|(\^=)|(&=)|(::)|(\.\.)"
+S_OPERATORS_3 = r"(===)|(!==)|(\<\<=)|(\>\>=)|(\>\>\>)"
+S_OPERATORS_4 = r"(\>\>\>=)"
+S_OPERATORS = "(" + S_OPERATORS_4 + "|" + S_OPERATORS_3 + "|" + S_OPERATORS_2 + ")"
+
+S_REGEXP = "(\/[^\t\n\r\f\v\/]+?\/[mgi]*)"
+S_REGEXP_A = "\.(match|search|split)\s*\(\s*\(*\s*" + S_REGEXP + "\s*\)*\s*\)"
+S_REGEXP_B = "\.(replace)\s*\(\s*\(*\s*" + S_REGEXP + "\s*\)*\s*?,?"
+S_REGEXP_C = "\s*\(*\s*" + S_REGEXP + "\)*\.(test|exec)\s*\(\s*"
+S_REGEXP_D = "(:|=|\?)\s*\(*\s*" + S_REGEXP + "\s*\)*"
+S_REGEXP_ALL = S_REGEXP_A + "|" + S_REGEXP_B + "|" + S_REGEXP_C + "|" + S_REGEXP_D
+
+S_ALL = "(" + comment.S_BLOCK_COMMENT + "|" + comment.S_INLINE_COMMENT + "|" + S_STRING_A + "|" + S_STRING_B + "|" + S_REGEXP_ALL + "|" + S_FLOAT + "|" + S_OPERATORS + ")"
+
+# compile regexp strings
+R_STRING_A = re.compile("^" + S_STRING_A + "$")
+R_STRING_B = re.compile("^" + S_STRING_B + "$")
+R_FLOAT = re.compile("^" + S_FLOAT + "$")
+R_OPERATORS = re.compile(S_OPERATORS)
+R_REGEXP = re.compile(S_REGEXP)
+R_REGEXP_A = re.compile(S_REGEXP_A)
+R_REGEXP_B = re.compile(S_REGEXP_B)
+R_REGEXP_C = re.compile(S_REGEXP_C)
+R_REGEXP_D = re.compile(S_REGEXP_D)
+R_ALL = re.compile(S_ALL)
+
+
+
+
+parseLine = 1
+parseColumn = 1
+parseUniqueId = ""
+
+
+
+def protectEscape(s):
+    return s.replace("\\\\", "__$ESCAPE0$__").replace("\\\"", "__$ESCAPE1$__").replace("\\\'", "__$ESCAPE2__").replace("\/", "__$ESCAPE3__").replace("\!", "__$ESCAPE4__")
+
+
+
+def recoverEscape(s):
+    return s.replace("__$ESCAPE0$__", "\\\\").replace("__$ESCAPE1$__", "\\\"").replace("__$ESCAPE2__", "\\'").replace("__$ESCAPE3__", "\/").replace("__$ESCAPE4__", "\!")
+
+
+
+def parseElement(element):
+    global parseUniqueId
+    global parseLine
+    global parseColumn
+
+    if config.JSPROTECTED.has_key(element):
+        # print "PROTECTED: %s" % PROTECTED[content]
+        obj = { "type" : "protected", "detail" : config.JSPROTECTED[element], "source" : element, "line" : parseLine, "column" : parseColumn, "id" : parseUniqueId }
+
+    elif element in config.JSBUILTIN:
+        # print "BUILTIN: %s" % content
+        obj = { "type" : "builtin", "detail" : "", "source" : element, "line" : parseLine, "column" : parseColumn, "id" : parseUniqueId }
+
+    elif R_NUMBER.search(element):
+        # print "NUMBER: %s" % content
+        obj = { "type" : "number", "detail" : "int", "source" : element, "line" : parseLine, "column" : parseColumn, "id" : parseUniqueId }
+
+    elif element.startswith("_"):
+        # print "PRIVATE NAME: %s" % content
+        obj = { "type" : "name", "detail" : "private", "source" : element, "line" : parseLine, "column" : parseColumn, "id" : parseUniqueId }
+
+    elif len(element) > 0:
+        # print "PUBLIC NAME: %s" % content
+        obj = { "type" : "name", "detail" : "public", "source" : element, "line" : parseLine, "column" : parseColumn, "id" : parseUniqueId }
+
+    parseColumn += len(element)
+
+    return obj
+
+
+def parsePart(part):
+    global parseUniqueId
+    global parseLine
+    global parseColumn
+
+    tokens = []
+    element = ""
+
+    for line in R_NEWLINE.split(part):
+        if line == "\n":
+            tokens.append({ "type" : "eol", "source" : "", "detail" : "", "line" : parseLine, "column" : parseColumn, "id" : parseUniqueId })
+            parseColumn = 1
+            parseLine += 1
+
+        else:
+            for item in R_WHITESPACE.split(line):
+                if item == "":
+                    continue
+
+                if not R_NONWHITESPACE.search(item):
+                    parseColumn += len(item)
+                    continue
+
+                # print "ITEM: '%s'" % item
+
+                for char in item:
+                    # work on single character tokens, otherwise concat to a bigger element
+                    if config.JSTOKENS.has_key(char):
+                        # convert existing element
+                        if element != "":
+                            if R_NONWHITESPACE.search(element):
+                                tokens.append(parseElement(element))
+
+                            element = ""
+
+                        # add character to token list
+                        tokens.append({ "type" : "token", "detail" : config.JSTOKENS[char], "source" : char, "line" : parseLine, "column" : parseColumn, "id" : parseUniqueId })
+                        parseColumn += 1
+
+                    else:
+                        element += char
+
+                # convert remaining stuff to tokens
+                if element != "":
+                    if R_NONWHITESPACE.search(element):
+                        tokens.append(parseElement(element))
+
+                    element = ""
+
+    return tokens
+
+
+
+def parseFragmentLead(content, fragment, tokens):
+    pos = content.find(fragment)
+
+    if pos > 0:
+        tokens.extend(parsePart(recoverEscape(content[0:pos])))
+
+    return content[pos+len(fragment):]
+
+
+
+def hasLeadingContent(tokens):
+    pos = len(tokens) - 1
+    while pos > 0:
+        if tokens[pos]["type"] == "eol":
+            break
+
+        else:
+            return True
+
+    return False
+
+
+
+
+
+def parseStream(content, uniqueId=""):
+    # make global variables available
+    global parseLine
+    global parseColumn
+    global parseUniqueId
+
+    # reset global stuff
+    parseColumn = 1
+    parseLine = 1
+    parseUniqueId = uniqueId
+
+    # prepare storage
+    tokens = []
+    content = protectEscape(content)
+
+    # print " * searching for patterns..."
+    all = R_ALL.findall(content)
+
+    # print " * structuring..."
+    for item in all:
+        fragment = item[0]
+
+        # print "Found: '%s'" % fragment
+
+        if comment.R_BLOCK_COMMENT.match(fragment):
+            source = recoverEscape(fragment)
+            format = comment.getFormat(source)
+            multiline = comment.isMultiLine(source)
+
+            # print "Type:MultiComment"
+            content = parseFragmentLead(content, fragment, tokens)
+
+            atBegin = not hasLeadingContent(tokens)
+            if re.compile("^\s*\n").search(content):
+                atEnd = True
+            else:
+                atEnd = False
+
+            # print "Begin: %s, End: %s" % (atBegin, atEnd)
+
+            # Fixing source content
+            if atBegin:
+                source = comment.outdent(source, parseColumn - 1)
+
+            source = comment.correct(source)
+
+            connection = "before"
+
+            if atEnd and not atBegin:
+                connection = "after"
+            else:
+                connection = "before"
+
+            tokens.append({ "type" : "comment", "detail" : format, "multiline" : multiline, "connection" : connection, "source" : source, "id" : parseUniqueId, "line" : parseLine, "column" : parseColumn, "begin" : atBegin, "end" : atEnd })
+            parseLine += len(fragment.split("\n")) - 1
+
+        elif comment.R_INLINE_COMMENT.match(fragment):
+            # print "Type:SingleComment"
+            source = recoverEscape(fragment)
+            content = parseFragmentLead(content, fragment, tokens)
+
+            atBegin = hasLeadingContent(tokens)
+            atEnd = True
+
+            if atBegin:
+                connection = "after"
+            else:
+                connection = "before"
+
+            source = comment.correct(source)
+
+            tokens.append({ "type" : "comment", "detail" : "inline", "multiline" : False, "connection" : connection, "source" : source, "id" : parseUniqueId, "line" : parseLine, "column" : parseColumn, "begin" : atBegin, "end" : atEnd })
+
+        elif R_STRING_A.match(fragment):
+            # print "Type:StringA: %s" % fragment
+            content = parseFragmentLead(content, fragment, tokens)
+            tokens.append({ "type" : "string", "detail" : "singlequotes", "source" : recoverEscape(fragment)[1:-1].replace("\\\n",""), "id" : parseUniqueId, "line" : parseLine, "column" : parseColumn })
+
+        elif R_STRING_B.match(fragment):
+            # print "Type:StringB: %s" % fragment
+            content = parseFragmentLead(content, fragment, tokens)
+            tokens.append({ "type" : "string", "detail" : "doublequotes", "source" : recoverEscape(fragment)[1:-1].replace("\\\n",""), "id" : parseUniqueId, "line" : parseLine, "column" : parseColumn })
+
+        elif R_FLOAT.match(fragment):
+            # print "Type:Float: %s" % fragment
+            content = parseFragmentLead(content, fragment, tokens)
+            tokens.append({ "type" : "number", "detail" : "float", "source" : fragment, "id" : parseUniqueId, "line" : parseLine, "column" : parseColumn })
+
+        elif R_OPERATORS.match(fragment):
+            # print "Type:Operator: %s" % fragment
+            content = parseFragmentLead(content, fragment, tokens)
+            tokens.append({ "type" : "token", "detail" : config.JSTOKENS[fragment], "source" : fragment, "id" : parseUniqueId, "line" : parseLine, "column" : parseColumn })
+
+        else:
+            fragresult = R_REGEXP.search(fragment)
+
+            if fragresult:
+                # print "Type:RegExp: %s" % fragresult.group(0)
+
+                if R_REGEXP_A.match(fragment) or R_REGEXP_B.match(fragment) or R_REGEXP_C.match(fragment) or R_REGEXP_D.match(fragment):
+                    content = parseFragmentLead(content, fragresult.group(0), tokens)
+                    tokens.append({ "type" : "regexp", "detail" : "", "source" : recoverEscape(fragresult.group(0)), "id" : parseUniqueId, "line" : parseLine, "column" : parseColumn })
+
+                else:
+                    print "Bad regular expression: %s" % fragresult.group(0)
+
+            else:
+                print "Type:None!"
+
+    tokens.extend(parsePart(recoverEscape(content)))
+    tokens.append({ "type" : "eof", "source" : "", "detail" : "", "id" : parseUniqueId, "line" : parseLine, "column" : parseColumn })
+
+    return tokens
+
+
+
+def parseFile(fileName, uniqueId="", encoding="utf-8"):
+    return parseStream(filetool.read(fileName, encoding), uniqueId)
+
+
+
+
+def convertTokensToString(tokens):
+    tokenizedString = ""
+
+    for token in tokens:
+        tokenizedString += "%s%s" % (token, "\n")
+
+    return tokenizedString
+
+
+
+
+
+def main():
+    parser = optparse.OptionParser()
+
+    parser.add_option("-w", "--write", action="store_true", dest="write", default=False, help="Writes file to incoming fileName + EXTENSION.")
+    parser.add_option("-e", "--extension", dest="extension", metavar="EXTENSION", help="The EXTENSION to use", default=".tokenized")
+    parser.add_option("--encoding", dest="encoding", default="utf-8", metavar="ENCODING", help="Defines the encoding expected for input files.")
+
+    (options, args) = parser.parse_args()
+
+    if len(args) == 0:
+        print "Needs one or more arguments (files) to tokenize!"
+        sys.exit(1)
+
+    for fileName in args:
+        if options.write:
+            print "Compiling %s => %s%s" % (fileName, fileName, options.extension)
+        else:
+            print "Compiling %s => stdout" % fileName
+
+        tokenString = convertTokensToString(parseFile(fileName, "", options.encoding))
+
+        if options.write:
+            filetool.save(fileName + options.extension, tokenString, options.encoding)
+
+        else:
+            try:
+                print tokenString
+
+            except UnicodeEncodeError:
+                print " * Could not encode result to ascii. Use '-w' instead."
+                sys.exit(1)
+
+
+
+
+if __name__ == '__main__':
+    try:
+        main()
+
+    except KeyboardInterrupt:
+        print
+        print " * Keyboard Interrupt"
+        sys.exit(1)
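Note: the sketch below is not part of the commit; it is a minimal usage example based only on the functions added above, assuming a Python 2 interpreter and that the sibling modules config, filetool and comment from the same tool/modules directory are importable. The input file name source/Application.js is hypothetical.

    # Command-line use (options defined in main() above):
    #   python tokenizer.py --encoding utf-8 -w source/Application.js
    # with -w, the token stream is written to source/Application.js.tokenized
    # (input name plus the default ".tokenized" extension).

    import tokenizer

    # parseFile(fileName, uniqueId="", encoding="utf-8") returns a flat list of
    # token dicts carrying "type", "detail", "source", "line", "column" and "id".
    tokens = tokenizer.parseFile("source/Application.js", "Application", "utf-8")

    # e.g. list all private identifiers found by the tokenizer
    for token in tokens:
        if token["type"] == "name" and token["detail"] == "private":
            print "private name %s at line %s" % (token["source"], token["line"])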