1 files changed, 547 insertions, 0 deletions
diff --git a/lib/dnspython/dns/tokenizer.py b/lib/dnspython/dns/tokenizer.py
new file mode 100644
index 0000000000..4f68a2a495
--- /dev/null
+++ b/lib/dnspython/dns/tokenizer.py
@@ -0,0 +1,547 @@
+# Copyright (C) 2003-2007, 2009, 2010 Nominum, Inc.
+#
+# Permission to use, copy, modify, and distribute this software and its
+# documentation for any purpose with or without fee is hereby granted,
+# provided that the above copyright notice and this permission notice
+# appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND NOMINUM DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL NOMINUM BE LIABLE FOR
+# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
+# OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+"""Tokenize DNS master file format"""
+
+import cStringIO
+import sys
+
+import dns.exception
+import dns.name
+import dns.ttl
+
+_DELIMITERS = {
+    ' ' : True,
+    '\t' : True,
+    '\n' : True,
+    ';' : True,
+    '(' : True,
+    ')' : True,
+    '"' : True }
+
+_QUOTING_DELIMITERS = { '"' : True }
+
+EOF = 0
+EOL = 1
+WHITESPACE = 2
+IDENTIFIER = 3
+QUOTED_STRING = 4
+COMMENT = 5
+DELIMITER = 6
+
+class UngetBufferFull(dns.exception.DNSException):
+    """Raised when an attempt is made to unget a token when the unget
+    buffer is full."""
+    pass
+
+class Token(object):
+    """A DNS master file format token.
+
+    @ivar ttype: The token type
+    @type ttype: int
+    @ivar value: The token value
+    @type value: string
+    @ivar has_escape: Does the token value contain escapes?
+    @type has_escape: bool
+    """
+
+    def __init__(self, ttype, value='', has_escape=False):
+        """Initialize a token instance.
+
+        @param ttype: The token type
+        @type ttype: int
+        @ivar value: The token value
+        @type value: string
+        @ivar has_escape: Does the token value contain escapes?
+        @type has_escape: bool
+        """
+        self.ttype = ttype
+        self.value = value
+        self.has_escape = has_escape
+
+    def is_eof(self):
+        return self.ttype == EOF
+
+    def is_eol(self):
+        return self.ttype == EOL
+
+    def is_whitespace(self):
+        return self.ttype == WHITESPACE
+
+    def is_identifier(self):
+        return self.ttype == IDENTIFIER
+
+    def is_quoted_string(self):
+        return self.ttype == QUOTED_STRING
+
+    def is_comment(self):
+        return self.ttype == COMMENT
+
+    def is_delimiter(self):
+        return self.ttype == DELIMITER
+
+    def is_eol_or_eof(self):
+        return (self.ttype == EOL or self.ttype == EOF)
+
+    def __eq__(self, other):
+        if not isinstance(other, Token):
+            return False
+        return (self.ttype == other.ttype and
+                self.value == other.value)
+
+    def __ne__(self, other):
+        if not isinstance(other, Token):
+            return True
+        return (self.ttype != other.ttype or
+                self.value != other.value)
+
+    def __str__(self):
+        return '%d "%s"' % (self.ttype, self.value)
+
+    def unescape(self):
+        if not self.has_escape:
+            return self
+        unescaped = ''
+        l = len(self.value)
+        i = 0
+        while i < l:
+            c = self.value[i]
+            i += 1
+            if c == '\\':
+                if i >= l:
+                    raise dns.exception.UnexpectedEnd
+                c = self.value[i]
+                i += 1
+                if c.isdigit():
+                    if i >= l:
+                        raise dns.exception.UnexpectedEnd
+                    c2 = self.value[i]
+                    i += 1
+                    if i >= l:
+                        raise dns.exception.UnexpectedEnd
+                    c3 = self.value[i]
+                    i += 1
+                    if not (c2.isdigit() and c3.isdigit()):
+                        raise dns.exception.SyntaxError
+                    c = chr(int(c) * 100 + int(c2) * 10 + int(c3))
+            unescaped += c
+        return Token(self.ttype, unescaped)
+
+    # compatibility for old-style tuple tokens
+
+    def __len__(self):
+        return 2
+
+    def __iter__(self):
+        return iter((self.ttype, self.value))
+
+    def __getitem__(self, i):
+        if i == 0:
+            return self.ttype
+        elif i == 1:
+            return self.value
+        else:
+            raise IndexError
+
+class Tokenizer(object):
+    """A DNS master file format tokenizer.
+
+    A token is a (type, value) tuple, where I{type} is an int, and
+    I{value} is a string.  The valid types are EOF, EOL, WHITESPACE,
+    IDENTIFIER, QUOTED_STRING, COMMENT, and DELIMITER.
+
+    @ivar file: The file to tokenize
+    @type file: file
+    @ivar ungotten_char: The most recently ungotten character, or None.
+    @type ungotten_char: string
+    @ivar ungotten_token: The most recently ungotten token, or None.
+    @type ungotten_token: (int, string) token tuple
+    @ivar multiline: The current multiline level.  This value is increased
+    by one every time a '(' delimiter is read, and decreased by one every time
+    a ')' delimiter is read.
+    @type multiline: int
+    @ivar quoting: This variable is true if the tokenizer is currently
+    reading a quoted string.
+    @type quoting: bool
+    @ivar eof: This variable is true if the tokenizer has encountered EOF.
+    @type eof: bool
+    @ivar delimiters: The current delimiter dictionary.
+    @type delimiters: dict
+    @ivar line_number: The current line number
+    @type line_number: int
+    @ivar filename: A filename that will be returned by the L{where} method.
+    @type filename: string
+    """
+
+    def __init__(self, f=sys.stdin, filename=None):
+        """Initialize a tokenizer instance.
+
+        @param f: The file to tokenize.  The default is sys.stdin.
+        This parameter may also be a string, in which case the tokenizer
+        will take its input from the contents of the string.
+        @type f: file or string
+        @param filename: the name of the filename that the L{where} method
+        will return.
+        @type filename: string
+        """
+
+        if isinstance(f, str):
+            f = cStringIO.StringIO(f)
+            if filename is None:
+                filename = '<string>'
+        else:
+            if filename is None:
+                if f is sys.stdin:
+                    filename = '<stdin>'
+                else:
+                    filename = '<file>'
+        self.file = f
+        self.ungotten_char = None
+        self.ungotten_token = None
+        self.multiline = 0
+        self.quoting = False
+        self.eof = False
+        self.delimiters = _DELIMITERS
+        self.line_number = 1
+        self.filename = filename
+
+    def _get_char(self):
+        """Read a character from input.
+        @rtype: string
+        """
+
+        if self.ungotten_char is None:
+            if self.eof:
+                c = ''
+            else:
+                c = self.file.read(1)
+                if c == '':
+                    self.eof = True
+                elif c == '\n':
+                    self.line_number += 1
+        else:
+            c = self.ungotten_char
+            self.ungotten_char = None
+        return c
+
+    def where(self):
+        """Return the current location in the input.
+
+        @rtype: (string, int) tuple.  The first item is the filename of
+        the input, the second is the current line number.
+        """
+
+        return (self.filename, self.line_number)
+
+    def _unget_char(self, c):
+        """Unget a character.
+
+        The unget buffer for characters is only one character large; it is
+        an error to try to unget a character when the unget buffer is not
+        empty.
+
+        @param c: the character to unget
+        @type c: string
+        @raises UngetBufferFull: there is already an ungotten char
+        """
+
+        if not self.ungotten_char is None:
+            raise UngetBufferFull
+        self.ungotten_char = c
+
+    def skip_whitespace(self):
+        """Consume input until a non-whitespace character is encountered.
+
+        The non-whitespace character is then ungotten, and the number of
+        whitespace characters consumed is returned.
+
+        If the tokenizer is in multiline mode, then newlines are whitespace.
+
+        @rtype: int
+        """
+
+        skipped = 0
+        while True:
+            c = self._get_char()
+            if c != ' ' and c != '\t':
+                if (c != '\n') or not self.multiline:
+                    self._unget_char(c)
+                    return skipped
+            skipped += 1
+
+    def get(self, want_leading = False, want_comment = False):
+        """Get the next token.
+
+        @param want_leading: If True, return a WHITESPACE token if the
+        first character read is whitespace.  The default is False.
+        @type want_leading: bool
+        @param want_comment: If True, return a COMMENT token if the
+        first token read is a comment.  The default is False.
+        @type want_comment: bool
+        @rtype: Token object
+        @raises dns.exception.UnexpectedEnd: input ended prematurely
+        @raises dns.exception.SyntaxError: input was badly formed
+        """
+
+        if not self.ungotten_token is None:
+            token = self.ungotten_token
+            self.ungotten_token = None
+            if token.is_whitespace():
+                if want_leading:
+                    return token
+            elif token.is_comment():
+                if want_comment:
+                    return token
+            else:
+                return token
+        skipped = self.skip_whitespace()
+        if want_leading and skipped > 0:
+            return Token(WHITESPACE, ' ')
+        token = ''
+        ttype = IDENTIFIER
+        has_escape = False
+        while True:
+            c = self._get_char()
+            if c == '' or c in self.delimiters:
+                if c == '' and self.quoting:
+                    raise dns.exception.UnexpectedEnd
+                if token == '' and ttype != QUOTED_STRING:
+                    if c == '(':
+                        self.multiline += 1
+                        self.skip_whitespace()
+                        continue
+                    elif c == ')':
+                        if not self.multiline > 0:
+                            raise dns.exception.SyntaxError
+                        self.multiline -= 1
+                        self.skip_whitespace()
+                        continue
+                    elif c == '"':
+                        if not self.quoting:
+                            self.quoting = True
+                            self.delimiters = _QUOTING_DELIMITERS
+                            ttype = QUOTED_STRING
+                            continue
+                        else:
+                            self.quoting = False
+                            self.delimiters = _DELIMITERS
+                            self.skip_whitespace()
+                            continue
+                    elif c == '\n':
+                        return Token(EOL, '\n')
+                    elif c == ';':
+                        while 1:
+                            c = self._get_char()
+                            if c == '\n' or c == '':
+                                break
+                            token += c
+                        if want_comment:
+                            self._unget_char(c)
+                            return Token(COMMENT, token)
+                        elif c == '':
+                            if self.multiline:
+                                raise dns.exception.SyntaxError('unbalanced parentheses')
+                            return Token(EOF)
+                        elif self.multiline:
+                            self.skip_whitespace()
+                            token = ''
+                            continue
+                        else:
+                            return Token(EOL, '\n')
+                    else:
+                        # This code exists in case we ever want a
+                        # delimiter to be returned.  It never produces
+                        # a token currently.
+                        token = c
+                        ttype = DELIMITER
+                else:
+                    self._unget_char(c)
+                break
+            elif self.quoting:
+                if c == '\\':
+                    c = self._get_char()
+                    if c == '':
+                        raise dns.exception.UnexpectedEnd
+                    if c.isdigit():
+                        c2 = self._get_char()
+                        if c2 == '':
+                            raise dns.exception.UnexpectedEnd
+                        c3 = self._get_char()
+                        if c == '':
+                            raise dns.exception.UnexpectedEnd
+                        if not (c2.isdigit() and c3.isdigit()):
+                            raise dns.exception.SyntaxError
+                        c = chr(int(c) * 100 + int(c2) * 10 + int(c3))
+                elif c == '\n':
+                    raise dns.exception.SyntaxError('newline in quoted string')
+            elif c == '\\':
+                #
+                # It's an escape.  Put it and the next character into
+                # the token; it will be checked later for goodness.
+                #
+                token += c
+                has_escape = True
+                c = self._get_char()
+                if c == '' or c == '\n':
+                    raise dns.exception.UnexpectedEnd
+            token += c
+        if token == '' and ttype != QUOTED_STRING:
+            if self.multiline:
+                raise dns.exception.SyntaxError('unbalanced parentheses')
+            ttype = EOF
+        return Token(ttype, token, has_escape)
+
+    def unget(self, token):
+        """Unget a token.
+
+        The unget buffer for tokens is only one token large; it is
+        an error to try to unget a token when the unget buffer is not
+        empty.
+
+        @param token: the token to unget
+        @type token: Token object
+        @raises UngetBufferFull: there is already an ungotten token
+        """
+
+        if not self.ungotten_token is None:
+            raise UngetBufferFull
+        self.ungotten_token = token
+
+    def next(self):
+        """Return the next item in an iteration.
+        @rtype: (int, string)
+        """
+
+        token = self.get()
+        if token.is_eof():
+            raise StopIteration
+        return token
+
+    def __iter__(self):
+        return self
+
+    # Helpers
+
+    def get_int(self):
+        """Read the next token and interpret it as an integer.
+
+        @raises dns.exception.SyntaxError:
+        @rtype: int
+        """
+
+        token = self.get().unescape()
+        if not token.is_identifier():
+            raise dns.exception.SyntaxError('expecting an identifier')
+        if not token.value.isdigit():
+            raise dns.exception.SyntaxError('expecting an integer')
+        return int(token.value)
+
+    def get_uint8(self):
+        """Read the next token and interpret it as an 8-bit unsigned
+        integer.
+
+        @raises dns.exception.SyntaxError:
+        @rtype: int
+        """
+
+        value = self.get_int()
+        if value < 0 or value > 255:
+            raise dns.exception.SyntaxError('%d is not an unsigned 8-bit integer' % value)
+        return value
+
+    def get_uint16(self):
+        """Read the next token and interpret it as a 16-bit unsigned
+        integer.
+
+        @raises dns.exception.SyntaxError:
+        @rtype: int
+        """
+
+        value = self.get_int()
+        if value < 0 or value > 65535:
+            raise dns.exception.SyntaxError('%d is not an unsigned 16-bit integer' % value)
+        return value
+
+    def get_uint32(self):
+        """Read the next token and interpret it as a 32-bit unsigned
+        integer.
+
+        @raises dns.exception.SyntaxError:
+        @rtype: int
+        """
+
+        token = self.get().unescape()
+        if not token.is_identifier():
+            raise dns.exception.SyntaxError('expecting an identifier')
+        if not token.value.isdigit():
+            raise dns.exception.SyntaxError('expecting an integer')
+        value = long(token.value)
+        if value < 0 or value > 4294967296L:
+            raise dns.exception.SyntaxError('%d is not an unsigned 32-bit integer' % value)
+        return value
+
+    def get_string(self, origin=None):
+        """Read the next token and interpret it as a string.
+
+        @raises dns.exception.SyntaxError:
+        @rtype: string
+        """
+
+        token = self.get().unescape()
+        if not (token.is_identifier() or token.is_quoted_string()):
+            raise dns.exception.SyntaxError('expecting a string')
+        return token.value
+
+    def get_identifier(self, origin=None):
+        """Read the next token and raise an exception if it is not an identifier.
+
+        @raises dns.exception.SyntaxError:
+        @rtype: string
+        """
+
+        token = self.get().unescape()
+        if not token.is_identifier():
+            raise dns.exception.SyntaxError('expecting an identifier')
+        return token.value
+
+    def get_name(self, origin=None):
+        """Read the next token and interpret it as a DNS name.
+
+        @raises dns.exception.SyntaxError:
+        @rtype: dns.name.Name object"""
+
+        token = self.get()
+        if not token.is_identifier():
+            raise dns.exception.SyntaxError('expecting an identifier')
+        return dns.name.from_text(token.value, origin)
+
+    def get_eol(self):
+        """Read the next token and raise an exception if it isn't EOL or
+        EOF.
+
+        @raises dns.exception.SyntaxError:
+        @rtype: string
+        """
+
+        token = self.get()
+        if not token.is_eol_or_eof():
+            raise dns.exception.SyntaxError('expected EOL or EOF, got %d "%s"' % (token.ttype, token.value))
+        return token.value
+
+    def get_ttl(self):
+        token = self.get().unescape()
+        if not token.is_identifier():
+            raise dns.exception.SyntaxError('expecting an identifier')
+        return dns.ttl.from_text(token.value)