commit bec75cc31cee86eb7fece599abca75450f6869df
parent 8067a7ad116baace1dbd9537bbb8cc66023b870a
Author: Fredrik Knutsen <moi@knutsen.co>
Date: Sat, 13 Jul 2019 20:02:53 +0100
Added foundations for the parser; need to implement some form of precedence parsing next.
Diffstat:
10 files changed, 597 insertions(+), 38 deletions(-)
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,9 +1,20 @@
[package]
name = "valhalla"
+description = "Valhalla Language frontend, parser and AST compiler."
+homepage = "https://knutsen.co"
+repository = "https://github.com/Demonstrandum/valhalla"
+documentation = "https://github.com/Demonstrandum/valhalla"
+keywords = ["set-theory", "programming", "language", "parser", "compiler"]
+categories = ["parser-implementations", "parsing", "encoding", "command-line-interface"]
+license = "GPL-3.0"
+license-file = "LICENSE.md"
+readme = "README.md"
version = "0.1.0"
authors = ["Demonstrandum <moi@knutsen.co>"]
edition = "2018"
[dependencies]
lazy_static = "1.3.0"
-regex = "1"-
\ No newline at end of file
+regex = "1"
+snailquote = "0.2.0"
+unicode-width = "0.1.5"
diff --git a/src/main.rs b/src/main.rs
@@ -1,7 +1,15 @@
+//! Crate responsible for parsing and compiling
+//! the generated AST to Brokkr-bytecode for the
+//! Valhalla set theoretic programming language.
+
+/// Syntax submodule, responsible for lexical analysis,
+/// parsing and static analysis.
mod syntax;
+
fn main() {
println!("\nTill Valhalla!\n");
syntax::parse_file("./test.vh");
}
+
diff --git a/src/syntax/ast.rs b/src/syntax/ast.rs
@@ -0,0 +1,233 @@
+use std::convert::TryFrom;
+use std::fmt;
+
+/// Identifier node, representing a name
+/// bound to a stored value.
+pub struct IdentNode {
+ /// The name of the identifier.
+ pub value : String
+}
+
+/// Different possible number types in the language.
+/// Max size is determined by max pointer size.
+#[derive(PartialEq, Debug)]
+pub enum Numerics {
+ /// Naturals are unsigned ints.
+ Natural(usize),
+ /// Integers are signed.
+ Integer(isize),
+ /// Reals are represented as a double.
+ Real(f64)
+}
+
+/// Parses a radix-prefixed string (e.g. "0x1f") with the specified radix into an ast::Numerics.
+fn parse_with_radix(neg : bool, s : &str, radix : u32) -> Numerics {
+ let unsigned = usize::from_str_radix(s.get(2..).unwrap(), radix).unwrap();
+ if neg {
+ return Numerics::Integer(-(unsigned as isize));
+ }
+ return Numerics::Natural(unsigned);
+}
+
+/// Converts primitive types into ast::Numerics.
+pub trait ToNumeric { fn to_numeric(&self) -> Numerics; }
+impl ToNumeric for &str {
+ fn to_numeric(&self) -> Numerics {
+ let mut test_str = self.clone().to_ascii_lowercase();
+
+ let is_neg = self.starts_with('-');
+ if is_neg { test_str = test_str.get(1..).unwrap().to_string(); }
+
+ return match test_str.get(0..2) {
+ Some("0x") => parse_with_radix(is_neg, &test_str, 16),
+ Some("0o") => parse_with_radix(is_neg, &test_str, 8),
+ Some("0b") => parse_with_radix(is_neg, &test_str, 2),
+ Some(_) => {
+ let exp_notation : Vec<&str> = test_str.split('e').collect();
+ let mantissa : &str = exp_notation.get(0).unwrap();
+ let mut exponent : &str = exp_notation.get(1).unwrap_or(&"0");
+ if exponent.is_empty() { exponent = "0"; }
+ let exponent : i32 = exponent.parse().unwrap();
+
+ if mantissa.contains('.') || exponent < 0 {
+ let mut number = mantissa.parse::<f64>().unwrap() * 10f64.powi(exponent);
+ if is_neg { number *= -1f64; }
+ return Numerics::Real(number);
+ }
+
+ let number : usize = mantissa.parse().unwrap();
+ if is_neg {
+ return Numerics::Integer(-(number as isize) * 10isize.pow(exponent as u32));
+ }
+ return Numerics::Natural(number * 10usize.pow(exponent as u32));
+ }
+ None => {
+ if is_neg {
+ return Numerics::Integer(-test_str.parse::<isize>().unwrap());
+ }
+ Numerics::Natural(test_str.parse::<usize>().unwrap())
+ }
+ };
+ }
+}
+
+impl ToNumeric for usize {
+ fn to_numeric(&self) -> Numerics { Numerics::Natural(*self) }
+}
+impl ToNumeric for u32 {
+ fn to_numeric(&self) -> Numerics { Numerics::Natural(*self as usize) }
+}
+impl ToNumeric for isize {
+ fn to_numeric(&self) -> Numerics {
+ if *self > 0 { return Numerics::Natural(*self as usize); }
+ Numerics::Integer(*self)
+ }
+}
+impl ToNumeric for i32 {
+ fn to_numeric(&self) -> Numerics {
+ if *self > 0 { return Numerics::Natural(*self as usize); }
+ Numerics::Integer(*self as isize)
+ }
+}
+impl ToNumeric for f64 {
+ fn to_numeric(&self) -> Numerics { Numerics::Real(*self) }
+}
+
+impl fmt::Display for Numerics {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ let printable = match self {
+ Numerics::Natural(n) => n.to_string(),
+ Numerics::Integer(n) => n.to_string(),
+ Numerics::Real(n) => n.to_string(),
+ };
+ write!(f, "{}", printable)
+ }
+}
+
+/// Node that represents a number.
+pub struct NumNode {
+ /// Holds the numeric value.
+ pub value : Numerics
+}
+
+
+/// Node for holding strings.
+pub struct StrNode {
+ /// Contents of the utf-8 string.
+ pub value : String
+}
+
+/// Symbol Node.
+pub struct SymNode {
+ /// Value/name stored as a string,
+ /// excluding the leading colon (:).
+ pub value : String
+}
+
+/// Call Node has a pointer to the callee node
+/// and a list of operand nodes.
+pub struct CallNode {
+ /// Pointer to heap allocated calling node.
+ pub callee : Box<Nodes>,
+ /// Pointer to list of operand nodes.
+ pub operands : Vec<Nodes>
+}
+
+/// Represents a block of code / compound statements
+/// in order of when they will be executed.
+pub struct BlockNode {
+ /// Pointer to list of nodes in the code block.
+ pub statements : Vec<Nodes>
+}
+
+/// All node types.
+pub enum Nodes {
+ Ident(IdentNode),
+ Num(NumNode),
+ Str(StrNode),
+ Sym(SymNode),
+ Call(CallNode),
+ Block(BlockNode)
+}
+
+
+impl fmt::Display for Nodes {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ let printable = match self {
+ Nodes::Ident(node) => format!("%ident {{ :value \"{}\" }}", node.value),
+ Nodes::Num(node) => format!("%num {{ :value {} }}", node.value),
+ Nodes::Str(node) => format!("%str {{ :value \"{}\" }}", node.value),
+ Nodes::Sym(node) => format!("%sym {{ :value \"{}\" }}", node.value),
+ Nodes::Call(node) => format!("%call {{ :callee \"{}\" }}", node.callee),
+ Nodes::Block(_) => String::from("%block { ... }"),
+ };
+ write!(f, "{}", printable)
+ }
+}
+
+macro_rules! unwrap_enum {
+ ($e:expr, $m:path) => {
+ match $e {
+ $m(inner) => Some(&inner),
+ _ => None
+ }
+ };
+}
+
+
+impl Nodes {
+ pub fn ident(&self) -> Option<&IdentNode> { unwrap_enum!(self, Nodes::Ident) }
+ pub fn num(&self) -> Option<&NumNode> { unwrap_enum!(self, Nodes::Num) }
+ pub fn str(&self) -> Option<&StrNode> { unwrap_enum!(self, Nodes::Str) }
+ pub fn sym(&self) -> Option<&SymNode> { unwrap_enum!(self, Nodes::Sym) }
+ pub fn call(&self) -> Option<&CallNode> { unwrap_enum!(self, Nodes::Call) }
+ pub fn block(&self) -> Option<&BlockNode> { unwrap_enum!(self, Nodes::Block) }
+
+ pub fn is_atomic(&self) -> bool {
+ match self {
+ Nodes::Ident(_) => true,
+ Nodes::Num(_) => true,
+ Nodes::Str(_) => true,
+ Nodes::Sym(_) => true,
+ Nodes::Call(_) => false,
+ Nodes::Block(_) => false,
+ }
+ }
+}
+
+impl IdentNode {
+ pub fn new(value : &str) -> Nodes { Nodes::Ident(IdentNode { value: value.to_string() }) }
+}
+
+impl NumNode {
+ pub fn new<Num : ToNumeric>(number : Num) -> Nodes {
+ let value = number.to_numeric();
+ Nodes::Num(NumNode { value })
+ }
+}
+
+impl StrNode {
+ pub fn new(value : &str) -> Nodes { Nodes::Str(StrNode { value: value.to_string() }) }
+}
+
+impl SymNode {
+ pub fn new(value : &str) -> Nodes { Nodes::Sym(SymNode { value: value.to_string() }) }
+}
+
+
+/// Root branch of the AST.
+pub struct Root {
+ pub branches : Vec<Nodes>
+}
+
+impl Root {
+ pub fn new() -> Self {
+ Root { branches: Vec::new() }
+ }
+}
+impl fmt::Display for Root {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ let str_mapped : Vec<String> = self.branches.iter().map(Nodes::to_string).collect();
+ write!(f, "%root{{\n {}\n}}", str_mapped.join(",\n "))
+ }
+}+
\ No newline at end of file
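A sketch of how the new ast module composes, not part of this commit: atomic nodes are built through the `new` constructors, gathered under a `Root`, and dumped through the Display impls. It assumes placement inside the crate, e.g. as a test at the bottom of src/syntax/ast.rs.

#[cfg(test)]
mod display_smoke_test {
    use super::*;

    #[test]
    fn builds_and_prints_a_tree() {
        let mut root = Root::new();
        root.branches.push(IdentNode::new("print"));
        root.branches.push(NumNode::new("0x1f")); // classified as Numerics::Natural(31)
        root.branches.push(StrNode::new("hi"));
        // Prints roughly:
        //   %root{
        //    %ident { :value "print" },
        //    %num { :value 31 },
        //    %str { :value "hi" }
        //   }
        println!("{}", root);
        assert_eq!(root.branches.len(), 3);
    }
}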
diff --git a/src/syntax/internal_macros.rs b/src/syntax/internal_macros.rs
diff --git a/src/syntax/lexer.rs b/src/syntax/lexer.rs
@@ -6,13 +6,21 @@ use super::location;
use lazy_static::lazy_static;
use regex::Regex;
+use unicode_width::UnicodeWidthChar;
+use unicode_width::UnicodeWidthStr;
+
macro_rules! re {
($string:expr) => {
Regex::new($string).unwrap()
};
}
-trait RegexExt { fn first_match(&self, string : &str) -> Option<String>; }
+/// Extension trait that picks out the first Regex match,
+/// returning an Option<String> with the captured text.
+trait RegexExt {
+ /// Gets first match in string.
+ fn first_match(&self, string : &str) -> Option<String>;
+}
impl RegexExt for Regex {
fn first_match(&self, string : &str) -> Option<String> {
let cap = self.captures(string);
@@ -28,12 +36,14 @@ impl RegexExt for Regex {
}
}
-const IDENT_CHARS : &str = r"\p{L}\?\!\'\-\_";
+/// All chars that may constitute an ident.
+const IDENT_CHARS : &str = r"\p{L}\?!'\-_";
lazy_static! {
static ref OP : Regex = re!(r"\A([\+\.\*\|\\/\&%\$\^\~><=¬@\-]+)");
- static ref IDENT : Regex = re!(&format!(r"\A([{id}][{id}\p{{N}}]+)", id=IDENT_CHARS));
- static ref NUM : Regex = re!(r"\A(\-?(?:(?:[0-9]+(?:\.[0-9]+)?(?:e[+-]?[0-9]+)?)|(?:0x[0-9a-f]+)|(?:0b[01]+)|(?:0o[0-7]+)))");
+ static ref IDENT : Regex = re!(&format!(r"\A([{id}][{id}\p{{N}}]*)", id=IDENT_CHARS));
+ static ref SYM : Regex = re!(&format!(r"\A(:[{id}\p{{N}}]+)", id=IDENT_CHARS));
+ static ref NUM : Regex = re!(r"\A(\-?(?:(?:0[xX][0-9a-f]+)|(?:0[bB][01]+)|(?:0[Oo][0-7]+)|(?:(?:[0-9]+(?:\.[0-9]+)?(?:e[\+\-]?[0-9]+)?))))");
}
macro_rules! try_match {
@@ -41,11 +51,10 @@ macro_rules! try_match {
$reg:expr, $token_type:expr,
$current_char:expr, $line:expr, $col:expr) => {
if let Some(matched) = $reg.first_match($partial) {
- let span = matched.chars().count() as u32;
+ let span = matched.width() as u32;
$stream.push(Token::new(
$token_type, &matched,
- location::new($line, $col, span)
- ));
+ location::new($line, $col, span)));
$current_char += matched.len();
$col += span;
continue;
@@ -53,18 +62,129 @@ macro_rules! try_match {
};
}
-pub fn lex(string : String) -> Vec<Token> {
+/// Takes a piece of code (as a &str) and returns
+/// the generated token-stream (as a Vec<Token>).
+pub fn lex(string : &str) -> Vec<Token> {
let mut token_stream : Vec<Token> = Vec::new();
let mut current_char = 0;
- let string_size = string.len();
+ let string_size = string.bytes().count();
let mut partial : &str;
let mut line = 1;
let mut col = 1;
while current_char < string_size {
- partial = &string[current_char..];
+ if let Some(slice) = &string.get(current_char..) {
+ partial = slice;
+ } else { // Not on boundary yet.
+ current_char += 1;
+ continue;
+ }
+
+ let maybe_vec = &partial.get(0..2).unwrap_or("");
+ let vec_brack = match maybe_vec {
+ &"[|" => Some(TokenType::LVec),
+ &"|]" => Some(TokenType::RVec),
+ _ => None
+ };
+ if let Some(tt) = vec_brack {
+ token_stream.push(Token::new(
+ tt, maybe_vec,
+ location::new(line, col, 2)));
+ col += 2;
+ current_char += 2;
+ continue;
+ }
+
+ let first_char = partial.chars().nth(0)
+ .expect("Empty program was trying to be lexed."); // This should't happen.
+
+ let single_char_token = match first_char {
+ '(' => Some(TokenType::LParen),
+ ')' => Some(TokenType::RParen),
+ '[' => Some(TokenType::LBrack),
+ ']' => Some(TokenType::RBrack),
+ '{' => Some(TokenType::LBrace),
+ '}' => Some(TokenType::RBrace),
+ '\n' | ';' => Some(TokenType::Term),
+ _ => None
+ };
+
+ if let Some(tt) = single_char_token {
+ token_stream.push(Token::new(
+ tt, &first_char.to_string(),
+ location::new(line, col, 1)));
+ if first_char == '\n' {
+ line += 1;
+ col = 1;
+ } else {
+ col += 1;
+ }
+ current_char += 1;
+ continue;
+ }
+
+ if first_char == '"' {
+ let mut contents = String::new();
+
+ let mut eos = false;
+ let mut i = 1;
+ let old_col = col;
+ while !eos { // Spaghet
+ if let Some(character) = partial.chars().nth(i) {
+ if character == '"' {
+ current_char += 1;
+ col += 1;
+ eos = true;
+ } else if character == '\\' {
+ if let Some(next) = partial.chars().nth(i + 1) {
+ let escaped : String = match next {
+ '\\' => String::from("\\"),
+ 'r' => String::from("\r"),
+ 'n' => String::from("\n"),
+ 't' => String::from("\t"),
+ 'b' => String::from("\x08"),
+ '0' => String::from("\0"),
+ 'x' => {
+ if let Some(code) = partial.get((current_char + 2)..(current_char + 4)) {
+ i += 2;
+ col += 2;
+ current_char += 2;
+ (u8::from_str_radix(code, 16).expect("Malformed hex.") as char).to_string()
+ } else { String::new() }
+ }
+ c => c.to_string()
+ };
+ i += 1;
+ col += 1;
+ current_char += 1;
+ contents.push_str(&escaped);
+ continue;
+ } else {
+ eos = true;
+ // Error: Unexpected EOS!
+ }
+ } else {
+ contents.push(character);
+ i += 1;
+ col += character.width().unwrap_or(2) as u32;
+ current_char += character.len_utf8();
+ continue;
+ }
+ } else {
+ eos = true;
+ // Error: Unexpected EOS!
+ }
+ i += 1;
+ current_char += 1;
+ col += 1;
+ }
+ token_stream.push(Token::new(
+ TokenType::Str, &contents,
+ location::new(line, old_col, col - old_col)));
+ continue;
+ }
try_match!(token_stream, partial,
NUM, TokenType::Num,
@@ -74,14 +194,20 @@ pub fn lex(string : String) -> Vec<Token> {
OP, TokenType::Op,
current_char, line, col);
- if partial.chars().nth(0).unwrap() == '\n' {
- line += 1;
- col = 1;
- current_char += 1;
- continue;
- }
+ try_match!(token_stream, partial,
+ IDENT, TokenType::Ident,
+ current_char, line, col);
+
+ try_match!(token_stream, partial,
+ SYM, TokenType::Sym,
+ current_char, line, col);
+
current_char += 1;
if partial.is_char_boundary(0) { col += 1 }
}
+
+ token_stream.push(Token::new(
+ TokenType::EOF, "\0",
+ location::new(line, col, 1)));
token_stream
}
\ No newline at end of file
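A hedged sketch of a unit test that could sit at the bottom of src/syntax/lexer.rs (none ships with this commit), exercising the new bracket, terminator, identifier and symbol handling plus the trailing EOF token.

#[cfg(test)]
mod lexer_smoke_test {
    use super::*;

    #[test]
    fn lexes_brackets_idents_and_symbols() {
        let toks = lex("[|1 2|]\nfoo :bar");
        // Expected classes, in order: LVec, Num, Num, RVec, Term, Ident, Sym, EOF.
        assert!(toks[0].class == TokenType::LVec);
        assert!(toks[4].class == TokenType::Term);
        assert!(toks.iter().any(|t| t.class == TokenType::Sym && t.string == ":bar"));
        assert!(toks.last().unwrap().class == TokenType::EOF);
    }
}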
diff --git a/src/syntax/location.rs b/src/syntax/location.rs
@@ -1,14 +1,15 @@
+/// Holds line, column and span of a lexical token.
pub struct Loc {
+ /// Line number.
pub line : u32,
+ /// Column number.
pub col : u32,
+ /// Span/Width (in characters) of token.
pub span : u32,
}
+/// Construct new Loc structure.
pub fn new(line : u32, col : u32, span : u32) -> Loc {
- Loc {
- line: line,
- col: col,
- span: span
- }
+ Loc { line, col, span }
}
diff --git a/src/syntax/mod.rs b/src/syntax/mod.rs
@@ -1,17 +1,34 @@
+/// location tracks the line and column position
+/// of lexical tokens, as well as their span.
mod location;
+
+/// Provides token classes and methods.
mod token;
+/// Abstract Syntax Tree nodes and methods.
+mod ast;
+
+/// Lexer splits code up into a token-stream
+/// of relevant lexical tokens, making the
+/// parsing step a lot easier.
pub mod lexer;
+
+/// Converts a token-stream into a nested AST.
pub mod parser;
use std::fs;
use token::ShowStream;
+/// Parses a given file, calling various methods from
+/// the `syntax` sub-module.
pub fn parse_file(filename : &str) {
let code = fs::read_to_string(filename)
.expect("Could not open file for reading.");
println!("Code:\n{}\n", code);
- let stream = lexer::lex(code);
+ let stream = lexer::lex(&code);
println!("Stream:\n{}\n", stream.to_string());
+
+ let tree = parser::parse(stream);
+ println!("AST:\n{}\n", tree)
}
\ No newline at end of file
diff --git a/src/syntax/parser.rs b/src/syntax/parser.rs
@@ -0,0 +1,77 @@
+use super::token;
+use super::ast;
+
+use token::{Token, TokenType};
+use ast::{Numerics, Nodes};
+
+pub fn parse(stream : Vec<Token>) -> ast::Root {
+ let mut tree = ast::Root::new();
+
+ for token in stream {
+ if token.is_atomic() {
+ tree.branches.push(atom(&token));
+ }
+ }
+
+ tree
+}
+
+fn atom(token : &Token) -> Nodes {
+ match token.class {
+ TokenType::Ident => ast::IdentNode::new(&token.string),
+ TokenType::Op => ast::IdentNode::new(&token.string),
+ TokenType::Num => ast::NumNode::new(&*token.string),
+ TokenType::Str => ast::StrNode::new(&token.string),
+ _ => panic!("Passed non-atomic token to `atom` parser.")
+ }
+}
+
+#[cfg(test)]
+mod test {
+ use super::*;
+
+ #[test]
+ fn numeric_parsing() {
+ assert_eq!(ast::NumNode::new(2).num().unwrap().value, Numerics::Natural(2usize));
+ assert_eq!(ast::NumNode::new(2usize).num().unwrap().value, Numerics::Natural(2usize));
+ assert_eq!(ast::NumNode::new(2u32).num().unwrap().value, Numerics::Natural(2usize));
+ assert_eq!(ast::NumNode::new(2i32).num().unwrap().value, Numerics::Natural(2usize));
+
+ assert_eq!(ast::NumNode::new(-2).num().unwrap().value, Numerics::Integer(-2isize));
+ assert_eq!(ast::NumNode::new(-2i32).num().unwrap().value, Numerics::Integer(-2isize));
+ assert_eq!(ast::NumNode::new(-2isize).num().unwrap().value, Numerics::Integer(-2isize));
+
+ assert_eq!(ast::NumNode::new(-2.62).num().unwrap().value, Numerics::Real(-2.62f64));
+ assert_eq!(ast::NumNode::new(2.62).num().unwrap().value, Numerics::Real(2.62f64));
+
+ assert_eq!(ast::NumNode::new("2").num().unwrap().value, Numerics::Natural(2));
+ assert_eq!(ast::NumNode::new("325").num().unwrap().value, Numerics::Natural(325));
+ assert_eq!(ast::NumNode::new("0b01010110").num().unwrap().value, Numerics::Natural(0b01010110));
+ assert_eq!(ast::NumNode::new("0o721").num().unwrap().value, Numerics::Natural(0o721));
+ assert_eq!(ast::NumNode::new("0xfa").num().unwrap().value, Numerics::Natural(0xfa));
+ assert_eq!(ast::NumNode::new("0xf").num().unwrap().value, Numerics::Natural(0xf));
+ assert_eq!(ast::NumNode::new("2.672").num().unwrap().value, Numerics::Real(2.672));
+ assert_eq!(ast::NumNode::new("2.672e3").num().unwrap().value, Numerics::Real(2672.0));
+ assert_eq!(ast::NumNode::new("2.672e+16").num().unwrap().value, Numerics::Real(2.672 * 10f64.powf(16f64)));
+ assert_eq!(ast::NumNode::new("2.672e-10").num().unwrap().value, Numerics::Real(2.672 * 10f64.powf(-10f64)));
+ assert_eq!(ast::NumNode::new("67e-4").num().unwrap().value, Numerics::Real(0.0067));
+ assert_eq!(ast::NumNode::new("67e+10").num().unwrap().value, Numerics::Natural(670000000000));
+ assert_eq!(ast::NumNode::new("-2").num().unwrap().value, Numerics::Integer(-2));
+ assert_eq!(ast::NumNode::new("-325").num().unwrap().value, Numerics::Integer(-325));
+ assert_eq!(ast::NumNode::new("-0b01010110").num().unwrap().value, Numerics::Integer(-0b01010110));
+ assert_eq!(ast::NumNode::new("-0o721").num().unwrap().value, Numerics::Integer(-0o721));
+ assert_eq!(ast::NumNode::new("-0xfa").num().unwrap().value, Numerics::Integer(-250));
+ assert_eq!(ast::NumNode::new("-0xf").num().unwrap().value, Numerics::Integer(-15));
+ assert_eq!(ast::NumNode::new("-2.672").num().unwrap().value, Numerics::Real(-2.672));
+ assert_eq!(ast::NumNode::new("-2.672e3").num().unwrap().value, Numerics::Real(-2672.0));
+ assert_eq!(ast::NumNode::new("-2.672e+16").num().unwrap().value, Numerics::Real(-26720000000000000.0));
+ assert_eq!(ast::NumNode::new("-2.672e-10").num().unwrap().value, Numerics::Real(-0.0000000002672));
+ assert_eq!(ast::NumNode::new("-67e-4").num().unwrap().value, Numerics::Real(-0.0067));
+ assert_eq!(ast::NumNode::new("-67e+10").num().unwrap().value, Numerics::Integer(-670000000000));
+
+ let s : String = String::from("-6e12");
+ let num = ast::NumNode::new(&*s);
+
+ assert_eq!(num.num().unwrap().value, Numerics::Integer(-6000000000000));
+ }
+}+
\ No newline at end of file
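One possible shape for the precedence step mentioned in the commit message: a Pratt-style binding-power loop over the flat token stream. Nothing below exists in this commit; the `Parser` cursor and the binding-power table are invented for illustration, and the sketch assumes it would live in src/syntax/parser.rs next to `atom`.

struct Parser { tokens : Vec<Token>, index : usize }

impl Parser {
    fn peek(&self) -> Option<&Token> { self.tokens.get(self.index) }

    fn next_token(&mut self) -> Option<&Token> {
        let i = self.index;
        self.index += 1;
        self.tokens.get(i)
    }

    /// Left binding power; higher binds tighter (table is hypothetical).
    fn lbp(op : &str) -> u8 {
        match op {
            "+" | "-" => 10,
            "*" | "/" => 20,
            _ => 0
        }
    }

    /// Parses an expression, folding infix operators into Call nodes.
    fn expression(&mut self, min_bp : u8) -> Nodes {
        let mut left = atom(self.next_token().expect("Unexpected EOF."));
        loop {
            let op = match self.peek() {
                Some(t) if t.class == TokenType::Op => t.string.clone(),
                _ => break
            };
            let bp = Self::lbp(&op);
            if bp <= min_bp { break; } // weaker operator: let the caller fold it
            self.index += 1;           // consume the operator token
            let right = self.expression(bp);
            left = Nodes::Call(ast::CallNode {
                callee: Box::new(ast::IdentNode::new(&op)),
                operands: vec![left, right]
            });
        }
        left
    }
}

With a table like the one above, calling expression(0) on the tokens for `1 + 2 * 3` would fold into Call nodes equivalent to (+ 1 (* 2 3)), left-associating operators of equal precedence.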
diff --git a/src/syntax/token.rs b/src/syntax/token.rs
@@ -1,49 +1,133 @@
use std::fmt;
use super::location;
+use snailquote::escape;
+use unicode_width::UnicodeWidthStr;
+
+/// Contains all possible types/classes of
+/// lexical tokens.
+#[derive(PartialEq)]
pub enum TokenType {
+ /// Identifiers: variables, function names, etc.
Ident,
+ /// Numerics, anything that directly represents a number.
Num,
+ /// Any operators; similar to idents but lexed differently.
Op,
+ /// Symbols; like elements of enums, they begin with a colon.
Sym,
+ /// Strings, enclosed by double quotes ("...").
Str,
+ /// Left Parenthesis.
+ LParen,
+ /// Right Parenthesis.
+ RParen,
+ /// Left Square Bracket.
+ LBrack,
+ /// Right Square Bracket.
+ RBrack,
+ /// Left curly-brace.
+ LBrace,
+ /// Right curly-brace.
+ RBrace,
+ /// Left vector-list bracket.
+ LVec,
+ /// Right vector-list bracket.
+ RVec,
+ /// Terminator, something that ends a line.
+ /// Either a semi-colon (;) or a new-line (\n).
+ Term,
+ /// End Of File, last token in the stream.
+ EOF,
}
impl fmt::Display for TokenType {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let printable = match *self {
- TokenType::Ident => "Identifier",
- TokenType::Num => "Numeric",
- TokenType::Op => "Operator",
- TokenType::Sym => "Symbol",
- TokenType::Str => "String"
+ TokenType::Ident => "Identifier",
+ TokenType::Num => "Numeric",
+ TokenType::Op => "Operator",
+ TokenType::Sym => "Symbol",
+ TokenType::Str => "String",
+ TokenType::LParen => "L-Paren",
+ TokenType::RParen => "R-Paren",
+ TokenType::LBrack => "L-Bracket",
+ TokenType::RBrack => "R-Bracket",
+ TokenType::LBrace => "L-Brace",
+ TokenType::RBrace => "R-Brace",
+ TokenType::LVec => "L-Vector",
+ TokenType::RVec => "R-Vector",
+ TokenType::Term => "Terminator",
+ TokenType::EOF => "End-Of-File",
};
write!(f, "{}", printable)
}
}
+/// Token structure, an individual lexical token,
+/// represented by its type/class, what it was written as
+/// in the program, and its location in the code.
pub struct Token {
+ /// What type/class of token it is.
pub class : TokenType,
+ /// What string the token matched with.
pub string : String,
+ /// Where the token is in the code.
pub location : location::Loc,
}
impl Token {
+ /// Constructs a new Token structure.
pub fn new(class : TokenType, string : &str, loc : location::Loc) -> Token {
- Token { class: class, string: String::from(string), location: loc }
+ Token { class, string: String::from(string), location: loc }
+ }
+
+ /// Checks if the token represents an atomic datum.
+ pub fn is_atomic(&self) -> bool {
+ match self.class {
+ TokenType::Ident => true,
+ TokenType::Num => true,
+ TokenType::Op => true,
+ TokenType::Sym => true,
+ TokenType::Str => true,
+ TokenType::LParen => false,
+ TokenType::RParen => false,
+ TokenType::LBrack => false,
+ TokenType::RBrack => false,
+ TokenType::LBrace => false,
+ TokenType::RBrace => false,
+ TokenType::LVec => false,
+ TokenType::RVec => false,
+ TokenType::Term => false,
+ TokenType::EOF => false,
+ }
}
+ /// String representation of the token.
pub fn to_string(&self) -> String {
- String::from(format!("[ {class}: \"{rep}\" ({l}, {c}) ]",
- class=self.class, rep=self.string,
- l=self.location.line, c=self.location.col))
+ let mut escaped = escape(&self.string.to_string()).into_owned();
+ if !escaped.ends_with('"') {
+ escaped = format!("\"{}\"", escaped);
+ }
+
+ format!("[ {class}:{spaces1}{rep}{spaces2}({l}, {c}):{span} ]",
+ class=self.class, rep=escaped,
+ spaces1=" ".repeat(12 - self.class.to_string().width()),
+ spaces2=" ".repeat(50 - escaped.width()),
+ l=self.location.line, c=self.location.col,
+ span=self.location.span)
}
}
-pub trait ShowStream { fn to_string(&self) -> String; }
+/// Allows for a custom string representation for the
+/// token-stream as a whole.
+pub trait ShowStream {
+ /// String representation of token-stream.
+ fn to_string(&self) -> String;
+}
impl ShowStream for Vec<Token> {
fn to_string(&self) -> String {
- let lines : Vec<String> = self.into_iter().map(|t| t.to_string()).collect();
- format!("[ {} ]", lines.join("\n "))
+ let lines : Vec<String> = self.iter().map(Token::to_string).collect();
+ format!("[ {} ]", lines.join(",\n "))
}
}
\ No newline at end of file
diff --git a/test.vh b/test.vh
@@ -1 +1,2 @@
-1 + 2 * 3 + -4 - 5
\ No newline at end of file
+漢字 = "hello漢字漢字 world"
+ 漢字漢字 漢字v
\ No newline at end of file