#!/usr/bin/env python

import sys, string, re, optparse
import config, filetool, comment

# Basic lexical helpers used by parsePart()/parseElement().
# All patterns are raw strings so that regex escapes such as \s and \S are
# handed to the regex engine untouched instead of relying on Python leaving
# unknown string escapes alone.
R_WHITESPACE = re.compile(r"(\s+)")     # capturing group: split() keeps the whitespace runs
R_NONWHITESPACE = re.compile(r"\S+")    # at least one non-whitespace character
R_NUMBER = re.compile(r"^[0-9]+")       # element starts with one or more digits
R_NEWLINE = re.compile(r"(\n)")         # capturing group: split() keeps each "\n" as its own item

# Ideas from: http://www.regular-expressions.info/examplesprogrammer.html
# Multicomment RegExp inspired by: http://ostermiller.org/findcomment.html

# builds regexp strings
#
# NOTE: most of these are NON-raw Python literals, so Python processes the
# string escapes first ("\\" becomes one backslash, "\n" a real newline
# character) and only then does the regex engine see the pattern.

# Single- and double-quoted string literals. The body may continue across
# newline characters (second alternative inside the group). Escaped quotes
# are not handled here; parseStream() runs protectEscape() on the content
# before these patterns are applied.
S_STRING_A = "'[^'\\\n]*(\\.|\n[^'\\\n]*)*'"
S_STRING_B = '"[^"\\\n]*(\\.|\n[^"\\\n]*)*"'

# Decimal number with digits on both sides of the dot.
S_FLOAT = "([0-9]+\.[0-9]+)"

# Multi-character operators, grouped by length (2, 3 and 4 characters).
S_OPERATORS_2 = r"(==)|(!=)|(\+\+)|(--)|(-=)|(\+=)|(\*=)|(/=)|(%=)|(&&)|(\|\|)|(\>=)|(\<=)|(>>)|(<<)|(\^\|)|(\|=)|(\^=)|(&=)|(::)|(\.\.)"
S_OPERATORS_3 = r"(===)|(!==)|(\<\<=)|(\>\>=)|(\>\>\>)"
S_OPERATORS_4 = r"(\>\>\>=)"
# Alternation is tried left to right, so the longer operators are listed
# first and win over their shorter prefixes.
S_OPERATORS = "(" + S_OPERATORS_4 + "|" + S_OPERATORS_3 + "|" + S_OPERATORS_2 + ")"

# A regular expression literal: "/", one or more characters that are neither
# tab, newline, carriage return, form feed, vertical tab nor "/" (matched
# lazily), a closing "/", and optional m/g/i flags.
S_REGEXP = "(\/[^\t\n\r\f\v\/]+?\/[mgi]*)"
# Contexts in which such a literal is accepted:
#   A: as the argument of .match( / .search( / .split(
#   B: as the first argument of .replace(
#   C: directly followed by .test( or .exec(
#   D: after ":", "=" or "?"
S_REGEXP_A = "\.(match|search|split)\s*\(\s*\(*\s*" + S_REGEXP + "\s*\)*\s*\)"
S_REGEXP_B = "\.(replace)\s*\(\s*\(*\s*" + S_REGEXP + "\s*\)*\s*?,?"
S_REGEXP_C = "\s*\(*\s*" + S_REGEXP + "\)*\.(test|exec)\s*\(\s*"
S_REGEXP_D = "(:|=|\?)\s*\(*\s*" + S_REGEXP + "\s*\)*"
S_REGEXP_ALL = S_REGEXP_A + "|" + S_REGEXP_B + "|" + S_REGEXP_C + "|" + S_REGEXP_D

# Everything parseStream() extracts up front, wrapped in one outer group so
# that the first element of every findall() result is the whole fragment.
# Order of the alternatives: block comment, inline comment, strings, regexp
# contexts, float, operators.
S_ALL = "(" + comment.S_BLOCK_COMMENT + "|" + comment.S_INLINE_COMMENT + "|" + S_STRING_A + "|" + S_STRING_B + "|" + S_REGEXP_ALL + "|" + S_FLOAT + "|" + S_OPERATORS + ")"

# compile regexp strings
# The string and float patterns are anchored with ^...$ because they are used
# to classify a complete fragment; the remaining patterns are unanchored.
R_STRING_A = re.compile("^" + S_STRING_A + "$")
R_STRING_B = re.compile("^" + S_STRING_B + "$")
R_FLOAT = re.compile("^" + S_FLOAT + "$")
R_OPERATORS = re.compile(S_OPERATORS)
R_REGEXP = re.compile(S_REGEXP)
R_REGEXP_A = re.compile(S_REGEXP_A)
R_REGEXP_B = re.compile(S_REGEXP_B)
R_REGEXP_C = re.compile(S_REGEXP_C)
R_REGEXP_D = re.compile(S_REGEXP_D)
R_ALL = re.compile(S_ALL)


# Module-wide parse position (both 1-based) and the id stamped onto every
# token. parseStream() resets all three at the start of each run.
parseLine, parseColumn, parseUniqueId = 1, 1, ""


def protectEscape(s):
  """Replace backslash escape sequences in s with placeholder markers.

  The substitutions are applied in the listed order; the doubled backslash
  has to go first so that it is not misread as the start of another escape.
  recoverEscape() performs the reverse mapping.
  """
  substitutions = (
    ("\\\\", "__$ESCAPE0$__"),
    ("\\\"", "__$ESCAPE1$__"),
    ("\\'", "__$ESCAPE2__"),
    ("\\/", "__$ESCAPE3__"),
    ("\\!", "__$ESCAPE4__"),
  )

  for sequence, placeholder in substitutions:
    s = s.replace(sequence, placeholder)

  return s


def recoverEscape(s):
  """Turn the placeholder markers written by protectEscape() back into the
  original backslash escape sequences."""
  restorations = (
    ("__$ESCAPE0$__", "\\\\"),
    ("__$ESCAPE1$__", "\\\""),
    ("__$ESCAPE2__", "\\'"),
    ("__$ESCAPE3__", "\\/"),
    ("__$ESCAPE4__", "\\!"),
  )

  for placeholder, sequence in restorations:
    s = s.replace(placeholder, sequence)

  return s


def parseElement(element):
  """Classify one non-empty element and return its token dictionary.

  The checks run in this order: protected word (config.JSPROTECTED), builtin
  (config.JSBUILTIN), number (element starts with a digit), private name
  (leading underscore), public name (anything else).

  The token records the current parse position and id; afterwards the global
  column is advanced by the length of the element.

  Raises ValueError for an empty element. Previously that case fell through
  every branch and failed with an UnboundLocalError on return.
  """
  global parseUniqueId
  global parseLine
  global parseColumn

  if element in config.JSPROTECTED:
    kind = "protected"
    detail = config.JSPROTECTED[element]

  elif element in config.JSBUILTIN:
    kind = "builtin"
    detail = ""

  elif R_NUMBER.search(element):
    kind = "number"
    detail = "int"

  elif element.startswith("_"):
    kind = "name"
    detail = "private"

  elif len(element) > 0:
    kind = "name"
    detail = "public"

  else:
    raise ValueError("parseElement() needs a non-empty element")

  obj = { "type" : kind, "detail" : detail, "source" : element, "line" : parseLine, "column" : parseColumn, "id" : parseUniqueId }

  parseColumn += len(element)

  return obj


def parsePart(part):
  """Tokenize a piece of plain source text and return the list of tokens.

  The text is split at newlines and then at whitespace runs. Every newline
  yields an "eol" token, resets the global column to 1 and increments the
  global line. Whitespace only advances the column. Within a non-whitespace
  item every character listed in config.JSTOKENS becomes its own "token"
  entry; the characters in between are collected and classified by
  parseElement().
  """
  global parseUniqueId
  global parseLine
  global parseColumn

  tokens = []

  for line in R_NEWLINE.split(part):
    if line == "\n":
      tokens.append({ "type" : "eol", "source" : "", "detail" : "", "line" : parseLine, "column" : parseColumn, "id" : parseUniqueId })
      parseColumn = 1
      parseLine += 1
      continue

    for item in R_WHITESPACE.split(line):
      if item == "":
        continue

      if not R_NONWHITESPACE.search(item):
        # pure whitespace: only moves the column
        parseColumn += len(item)
        continue

      # An item is a run without any whitespace, so the collected element
      # can never be blank and needs no further whitespace check.
      element = ""

      for char in item:
        if char in config.JSTOKENS:
          # a single character token ends the element collected so far
          if element != "":
            tokens.append(parseElement(element))
            element = ""

          tokens.append({ "type" : "token", "detail" : config.JSTOKENS[char], "source" : char, "line" : parseLine, "column" : parseColumn, "id" : parseUniqueId })
          parseColumn += 1

        else:
          element += char

      # convert whatever is left at the end of the item
      if element != "":
        tokens.append(parseElement(element))

  return tokens


def parseFragmentLead(content, fragment, tokens):
  """Tokenize the text in front of fragment and return the text behind it.

  Looks up the first occurrence of fragment in content. Any text before it
  is un-escaped, run through parsePart() and appended to tokens (in place).
  The return value is the part of content following the fragment.

  If fragment does not occur, find() yields -1 and the result is simply
  content[len(fragment) - 1:]; no error is raised.
  """
  start = content.find(fragment)

  if start > 0:
    lead = content[:start]
    tokens.extend(parsePart(recoverEscape(lead)))

  end = start + len(fragment)
  return content[end:]


def hasLeadingContent(tokens):
  """Return True if the most recent token is something other than "eol".

  parseStream() uses this to tell whether a comment is preceded by other
  content on its line. An empty token list has no leading content.

  The former implementation was a loop that never advanced and was guarded
  by `pos > 0`, so a list holding exactly one non-eol token was wrongly
  reported as having no leading content.
  """
  if not tokens:
    return False

  return tokens[-1]["type"] != "eol"


def _advancePosition(text):
  """Move the global parse position past text, which was consumed verbatim."""
  global parseLine
  global parseColumn

  breaks = text.count("\n")

  if breaks:
    parseLine += breaks
    # the column restarts behind the last newline inside the text
    parseColumn = len(text) - text.rfind("\n")
  else:
    parseColumn += len(text)


def parseStream(content, uniqueId=""):
  """Tokenize a complete piece of source text and return the token list.

  content is the source text; uniqueId is stored in the "id" entry of every
  token. The module-wide parse position is reset first, so the reported
  lines and columns are relative to the beginning of content.

  Processing happens in two layers: comments, strings, regular expression
  literals, floats and multi-character operators are located up front with
  R_ALL; the text between two such fragments is handed to parsePart() via
  parseFragmentLead(). The list always ends with an "eof" token.

  The parse position is advanced past every fragment, including the
  newlines inside multi-line strings, so that the tokens that follow get
  the right line and column.
  """
  # make global variables available
  global parseLine
  global parseColumn
  global parseUniqueId

  # reset global stuff
  parseColumn = 1
  parseLine = 1
  parseUniqueId = uniqueId

  # prepare storage
  tokens = []
  content = protectEscape(content)

  # matches when only whitespace follows up to the end of the current line
  restOfLineEmpty = re.compile(r"^\s*\n")

  for item in R_ALL.findall(content):
    # R_ALL has one outer group, so the first entry is the whole fragment
    fragment = item[0]

    if comment.R_BLOCK_COMMENT.match(fragment):
      raw = recoverEscape(fragment)
      commentFormat = comment.getFormat(raw)
      multiline = comment.isMultiLine(raw)

      content = parseFragmentLead(content, fragment, tokens)

      atBegin = not hasLeadingContent(tokens)
      atEnd = restOfLineEmpty.search(content) is not None

      # Fixing source content
      source = raw
      if atBegin:
        source = comment.outdent(source, parseColumn - 1)

      source = comment.correct(source)

      if atEnd and not atBegin:
        connection = "after"
      else:
        connection = "before"

      tokens.append({ "type" : "comment", "detail" : commentFormat, "multiline" : multiline, "connection" : connection, "source" : source, "id" : parseUniqueId, "line" : parseLine, "column" : parseColumn, "begin" : atBegin, "end" : atEnd })
      _advancePosition(raw)

    elif comment.R_INLINE_COMMENT.match(fragment):
      raw = recoverEscape(fragment)
      content = parseFragmentLead(content, fragment, tokens)

      # NOTE(review): unlike the block comment branch, "begin" is True here
      # when there IS leading content. Kept unchanged because consumers of
      # the token list may rely on it - confirm before flipping.
      atBegin = hasLeadingContent(tokens)
      atEnd = True

      if atBegin:
        connection = "after"
      else:
        connection = "before"

      source = comment.correct(raw)

      tokens.append({ "type" : "comment", "detail" : "inline", "multiline" : False, "connection" : connection, "source" : source, "id" : parseUniqueId, "line" : parseLine, "column" : parseColumn, "begin" : atBegin, "end" : atEnd })
      _advancePosition(raw)

    elif R_STRING_A.match(fragment):
      content = parseFragmentLead(content, fragment, tokens)
      raw = recoverEscape(fragment)
      # strip the quotes and join backslash-continued lines
      tokens.append({ "type" : "string", "detail" : "singlequotes", "source" : raw[1:-1].replace("\\\n",""), "id" : parseUniqueId, "line" : parseLine, "column" : parseColumn })
      _advancePosition(raw)

    elif R_STRING_B.match(fragment):
      content = parseFragmentLead(content, fragment, tokens)
      raw = recoverEscape(fragment)
      # strip the quotes and join backslash-continued lines
      tokens.append({ "type" : "string", "detail" : "doublequotes", "source" : raw[1:-1].replace("\\\n",""), "id" : parseUniqueId, "line" : parseLine, "column" : parseColumn })
      _advancePosition(raw)

    elif R_FLOAT.match(fragment):
      content = parseFragmentLead(content, fragment, tokens)
      tokens.append({ "type" : "number", "detail" : "float", "source" : fragment, "id" : parseUniqueId, "line" : parseLine, "column" : parseColumn })
      _advancePosition(fragment)

    elif R_OPERATORS.match(fragment):
      content = parseFragmentLead(content, fragment, tokens)
      tokens.append({ "type" : "token", "detail" : config.JSTOKENS[fragment], "source" : fragment, "id" : parseUniqueId, "line" : parseLine, "column" : parseColumn })
      _advancePosition(fragment)

    else:
      fragresult = R_REGEXP.search(fragment)

      if fragresult:
        literal = fragresult.group(0)

        if R_REGEXP_A.match(fragment) or R_REGEXP_B.match(fragment) or R_REGEXP_C.match(fragment) or R_REGEXP_D.match(fragment):
          # Only the literal itself is consumed here; the surrounding call
          # syntax stays in content and is tokenized as ordinary text.
          content = parseFragmentLead(content, literal, tokens)
          raw = recoverEscape(literal)
          tokens.append({ "type" : "regexp", "detail" : "", "source" : raw, "id" : parseUniqueId, "line" : parseLine, "column" : parseColumn })
          _advancePosition(raw)

        else:
          print("Bad regular expression: %s" % literal)

      else:
        print("Type:None!")

  # whatever follows the last fragment is plain text
  tokens.extend(parsePart(recoverEscape(content)))
  tokens.append({ "type" : "eof", "source" : "", "detail" : "", "id" : parseUniqueId, "line" : parseLine, "column" : parseColumn })

  return tokens


def parseFile(fileName, uniqueId="", encoding="utf-8"):
  """Read fileName through filetool.read() using the given encoding and
  return the tokens parseStream() produces for its content."""
  content = filetool.read(fileName, encoding)
  return parseStream(content, uniqueId)


def convertTokensToString(tokens):
  """Return the string form of every token, one per line.

  Each token is formatted with "%s" and followed by a newline; an empty
  token list yields an empty string. The pieces are joined in one pass
  instead of growing a string with repeated concatenation.
  """
  # (token,) keeps a token from being mistaken for the argument tuple/mapping
  return "".join(["%s\n" % (token,) for token in tokens])


def main():
  """Command line entry point: tokenize every file given as an argument.

  With -w/--write the result is saved next to the input file using the
  configured extension; otherwise it is printed to stdout. Exits with
  status 1 when no file is given or when the result cannot be printed.
  """
  parser = optparse.OptionParser()

  parser.add_option(
    "-w", "--write", action="store_true", dest="write", default=False,
    help="Writes file to incoming fileName + EXTENSION.")
  parser.add_option(
    "-e", "--extension", dest="extension", metavar="EXTENSION",
    help="The EXTENSION to use", default=".tokenized")
  parser.add_option(
    "--encoding", dest="encoding", default="utf-8", metavar="ENCODING",
    help="Defines the encoding expected for input files.")

  (options, args) = parser.parse_args()

  if len(args) == 0:
    print("Needs one or more arguments (files) to tokenize!")
    sys.exit(1)

  for fileName in args:
    # announce the target before the (possibly slow) tokenizing starts
    if options.write:
      print("Compiling %s => %s%s" % (fileName, fileName, options.extension))
    else:
      print("Compiling %s => stdout" % fileName)

    tokenString = convertTokensToString(parseFile(fileName, "", options.encoding))

    if options.write:
      filetool.save(fileName + options.extension, tokenString, options.encoding)
      continue

    try:
      print(tokenString)

    except UnicodeEncodeError:
      print("  * Could not encode result to ascii. Use '-w' instead.")
      sys.exit(1)


# Run the command line interface when executed as a script; Ctrl-C ends the
# run with a short notice and exit status 1 instead of a traceback.
if __name__ == "__main__":
  try:
    main()

  except KeyboardInterrupt:
    print("")
    print("  * Keyboard Interrupt")
    sys.exit(1)