valhallac

Compiler for a set-theoretic programming language.
git clone git://git.knutsen.co/valhallac
Log | Files | Refs | README | LICENSE

commit bec75cc31cee86eb7fece599abca75450f6869df
parent 8067a7ad116baace1dbd9537bbb8cc66023b870a
Author: Fredrik Knutsen <moi@knutsen.co>
Date:   Sat, 13 Jul 2019 20:02:53 +0100

Added foundations for parser, need to implement some form of precedence parsing now.

Diffstat:
MCargo.toml | 14++++++++++++--
Msrc/main.rs | 8++++++++
Asrc/syntax/ast.rs | 234+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Dsrc/syntax/internal_macros.rs | 0
Msrc/syntax/lexer.rs | 158+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------
Msrc/syntax/location.rs | 11++++++-----
Msrc/syntax/mod.rs | 19++++++++++++++++++-
Msrc/syntax/parser.rs | 78++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Msrc/syntax/token.rs | 108++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------
Mtest.vh | 5+++--
10 files changed, 597 insertions(+), 38 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml @@ -1,9 +1,20 @@ [package] name = "valhalla" +description = "Valhalla Language frontend, parser and AST compiler." +homepage = "https://knutsen.co" +repository = "https://github.com/Demonstrandum/valhalla" +documentation = "https://github.com/Demonstrandum/valhalla" +keywords = ["set-theory", "programming", "language", "parser", "compiler"] +categories = ["parser-implementations", "parsing", "encoding", "command-line-interface"] +license = "GPL-3.0" +license-file = "LICENSE.md" +readme = "README.md" version = "0.1.0" authors = ["Demonstrandum <moi@knutsen.co>"] edition = "2018" [dependencies] lazy_static = "1.3.0" -regex = "1"- \ No newline at end of file +regex = "1" +snailquote = "0.2.0" +unicode-width = "0.1.5" diff --git a/src/main.rs b/src/main.rs @@ -1,7 +1,15 @@ +//! Crate responsible for parsing and compiling +//! the generated AST to Brokkr-bytecode for the +//! Valhalla set theoretic programming language. + +/// Syntax submodule, responsible for lexical analysis, +/// parsing and static analysis. mod syntax; + fn main() { println!("\nTill Valhalla!\n"); syntax::parse_file("./test.vh"); } + diff --git a/src/syntax/ast.rs b/src/syntax/ast.rs @@ -0,0 +1,233 @@ +use std::convert::TryFrom; +use std::fmt; + +/// Identifiers, node representing a name that +/// will represent a value stored. +pub struct IdentNode { + /// The name of the identifer. + pub value : String +} + +/// Different types of possible number types in the langauge. +/// Max size is determined by max pointer size. +#[derive(PartialEq, Debug)] +pub enum Numerics { + /// Naturals are unsigned ints. + Natural(usize), + /// Integers are signed. + Integer(isize), + /// Reals are represented as a double. + Real(f64) +} + +/// Parse a string of more than two chars with a specified radix, into an ast::Numeric. 
+fn parse_with_radix(neg : bool, s : &str, radix : u32) -> Numerics { + let unsigned = usize::from_str_radix(s.get(2..).unwrap(), radix).unwrap(); + if neg { + return Numerics::Integer(-(unsigned as isize)); + } + return Numerics::Natural(unsigned); +} + +/// Converts primitive types into ast::Numerics. +pub trait ToNumeric { fn to_numeric(&self) -> Numerics; } +impl ToNumeric for &str { + fn to_numeric(&self) -> Numerics { + let mut test_str = self.clone().to_ascii_lowercase(); + + let is_neg = self.starts_with('-'); + if is_neg { test_str = test_str.get(1..).unwrap().to_string(); } + + return match test_str.get(0..2) { + Some("0x") => parse_with_radix(is_neg, &test_str, 16), + Some("0o") => parse_with_radix(is_neg, &test_str, 8), + Some("0b") => parse_with_radix(is_neg, &test_str, 2), + Some(_) => { + let exp_notation : Vec<&str> = test_str.split('e').collect(); + let mantissa : &str = exp_notation.get(0).unwrap(); + let mut exponent : &str = exp_notation.get(1).unwrap_or(&"0"); + if exponent.is_empty() { exponent = "0"; } + let exponent : i32 = exponent.parse().unwrap(); + + if mantissa.contains('.') || exponent < 0 { + let mut number = mantissa.parse::<f64>().unwrap() * 10f64.powi(exponent); + if is_neg { number *= -1f64; } + return Numerics::Real(number); + } + + let number : usize = mantissa.parse().unwrap(); + if is_neg { + return Numerics::Integer(-(number as isize) * 10isize.pow(exponent as u32)); + } + return Numerics::Natural(number * 10usize.pow(exponent as u32)); + } + None => { + if is_neg { + return Numerics::Integer(-test_str.parse::<isize>().unwrap()); + } + Numerics::Natural(test_str.parse::<usize>().unwrap()) + } + }; + } +} + +impl ToNumeric for usize { + fn to_numeric(&self) -> Numerics { Numerics::Natural(*self) } +} +impl ToNumeric for u32 { + fn to_numeric(&self) -> Numerics { Numerics::Natural(*self as usize) } +} +impl ToNumeric for isize { + fn to_numeric(&self) -> Numerics { + if *self > 0 { return Numerics::Natural(*self as usize); } + 
Numerics::Integer(*self) + } +} +impl ToNumeric for i32 { + fn to_numeric(&self) -> Numerics { + if *self > 0 { return Numerics::Natural(*self as usize); } + Numerics::Integer(*self as isize) + } +} +impl ToNumeric for f64 { + fn to_numeric(&self) -> Numerics { Numerics::Real(*self) } +} + +impl fmt::Display for Numerics { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let printable = match self { + Numerics::Natural(n) => n.to_string(), + Numerics::Integer(n) => n.to_string(), + Numerics::Real(n) => n.to_string(), + }; + write!(f, "{}", printable) + } +} + +/// Node that represents a number. +pub struct NumNode { + /// Holds a the numeric value. + pub value : Numerics +} + + +/// Node for holding strings. +pub struct StrNode { + /// Contents of the utf-8 string. + pub value : String +} + +/// Symbol Node. +pub struct SymNode { + /// Value/name stored as a string and + /// excludes the colon (:) in front. + pub value : String +} + +/// Call Node has a pointer to the callee node +/// and a list of operand nodes. +pub struct CallNode { + /// Pointer to heap allocated calling node. + pub callee : Box<Nodes>, + /// Pointer to list of operand nodes. + pub operands : Vec<Nodes> +} + +/// Represents a block of code / compound statements +/// in order of when they will be executed. +pub struct BlockNode { + /// Pointer to list of nodes in the code block. + pub statements : Vec<Nodes> +} + +/// All node types. 
+pub enum Nodes { + Ident(IdentNode), + Num(NumNode), + Str(StrNode), + Sym(SymNode), + Call(CallNode), + Block(BlockNode) +} + + +impl fmt::Display for Nodes { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let printable = match self { + Nodes::Ident(node) => format!("%ident {{ :value \"{}\" }}", node.value), + Nodes::Num(node) => format!("%num {{ :value {} }}", node.value), + Nodes::Str(node) => format!("%str {{ :value \"{}\" }}", node.value), + Nodes::Sym(node) => format!("%sym {{ :value \"{}\" }}", node.value), + Nodes::Call(node) => format!("%call {{ :callee \"{}\" }}", node.callee), + Nodes::Block(node) => format!("%block {{ ... }}"), + }; + write!(f, "{}", printable) + } +} + +macro_rules! unwrap_enum { + ($e:expr, $m:path) => { + match $e { + $m(inner) => Some(&inner), + _ => None + } + }; +} + + +impl Nodes { + pub fn ident(&self) -> Option<&IdentNode> { unwrap_enum!(self, Nodes::Ident) } + pub fn num(&self) -> Option<&NumNode> { unwrap_enum!(self, Nodes::Num) } + pub fn str(&self) -> Option<&StrNode> { unwrap_enum!(self, Nodes::Str) } + pub fn sym(&self) -> Option<&SymNode> { unwrap_enum!(self, Nodes::Sym) } + pub fn call(&self) -> Option<&CallNode> { unwrap_enum!(self, Nodes::Call) } + pub fn block(&self) -> Option<&BlockNode> { unwrap_enum!(self, Nodes::Block) } + + pub fn is_atomic(&self) -> bool { + match self { + Nodes::Ident(_) => true, + Nodes::Num(_) => true, + Nodes::Str(_) => true, + Nodes::Sym(_) => true, + Nodes::Call(_) => false, + Nodes::Block(_) => false, + } + } +} + +impl IdentNode { + pub fn new(value : &str) -> Nodes { Nodes::Ident(IdentNode { value: value.to_string() }) } +} + +impl NumNode { + pub fn new<Num : ToNumeric>(number : Num) -> Nodes { + let value = number.to_numeric(); + Nodes::Num(NumNode { value }) + } +} + +impl StrNode { + pub fn new(value : &str) -> Nodes { Nodes::Str(StrNode { value: value.to_string() }) } +} + +impl SymNode { + pub fn new(value : &str) -> Nodes { Nodes::Sym(SymNode { value: 
value.to_string() }) } +} + + +/// Root branch of the AST. +pub struct Root { + pub branches : Vec<Nodes> +} + +impl Root { + pub fn new() -> Self { + Root { branches: Vec::new() } + } +} +impl fmt::Display for Root { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let str_mapped : Vec<String> = self.branches.iter().map(Nodes::to_string).collect(); + write!(f, "%root{{\n {}\n}}", str_mapped.join(",\n ")) + } +}+ \ No newline at end of file diff --git a/src/syntax/internal_macros.rs b/src/syntax/internal_macros.rs diff --git a/src/syntax/lexer.rs b/src/syntax/lexer.rs @@ -6,13 +6,21 @@ use super::location; use lazy_static::lazy_static; use regex::Regex; +use unicode_width::UnicodeWidthChar; +use unicode_width::UnicodeWidthStr; + macro_rules! re { ($string:expr) => { Regex::new($string).unwrap() }; } -trait RegexExt { fn first_match(&self, string : &str) -> Option<String>; } +/// Extension allows first Regex match to be easily picked out +/// and returns Option<String> containing the string for the capture. +trait RegexExt { + /// Gets first match in string. + fn first_match(&self, string : &str) -> Option<String>; +} impl RegexExt for Regex { fn first_match(&self, string : &str) -> Option<String> { let cap = self.captures(string); @@ -28,12 +36,14 @@ impl RegexExt for Regex { } } -const IDENT_CHARS : &str = r"\p{L}\?\!\'\-\_"; +/// All chars that may constitue an ident. +const IDENT_CHARS : &str = r"\p{L}\?!'\-_"; lazy_static! 
{ static ref OP : Regex = re!(r"\A([\+\.\*\|\\/\&%\$\^\~><=¬@\-]+)"); - static ref IDENT : Regex = re!(&format!(r"\A([{id}][{id}\p{{N}}]+)", id=IDENT_CHARS)); - static ref NUM : Regex = re!(r"\A(\-?(?:(?:[0-9]+(?:\.[0-9]+)?(?:e[+-]?[0-9]+)?)|(?:0x[0-9a-f]+)|(?:0b[01]+)|(?:0o[0-7]+)))"); + static ref IDENT : Regex = re!(&format!(r"\A([{id}][{id}\p{{N}}]*)", id=IDENT_CHARS)); + static ref SYM : Regex = re!(&format!(r"\A(:[{id}\p{{N}}]+)", id=IDENT_CHARS)); + static ref NUM : Regex = re!(r"\A(\-?(?:(?:0[xX][0-9a-f]+)|(?:0[bB][01]+)|(?:0[Oo][0-7]+)|(?:(?:[0-9]+(?:\.[0-9]+)?(?:e[\+\-]?[0-9]+)?))))"); } macro_rules! try_match { @@ -41,11 +51,10 @@ macro_rules! try_match { $reg:expr, $token_type:expr, $current_char:expr, $line:expr, $col:expr) => { if let Some(matched) = $reg.first_match($partial) { - let span = matched.chars().count() as u32; + let span = matched.width() as u32; $stream.push(Token::new( $token_type, &matched, - location::new($line, $col, span) - )); + location::new($line, $col, span))); $current_char += matched.len(); $col += span; continue; @@ -53,18 +62,129 @@ macro_rules! try_match { }; } -pub fn lex(string : String) -> Vec<Token> { +/// Takes a piece of code (as a &str) and returns +/// the generated token-stream (as a Vec<Token>). +pub fn lex(string : &str) -> Vec<Token> { let mut token_stream : Vec<Token> = Vec::new(); let mut current_char = 0; - let string_size = string.len(); + let string_size = string.bytes().count(); let mut partial : &str; let mut line = 1; let mut col = 1; while current_char < string_size { - partial = &string[current_char..]; + if let Some(slice) = &string.get(current_char..) { + partial = slice; + } else { // Not on boundary yet. 
+ current_char += 1; + continue; + } + + let maybe_vec = &partial.get(0..2).unwrap_or(""); + let vec_brack = match maybe_vec { + &"[|" => Some(TokenType::LVec), + &"|]" => Some(TokenType::RVec), + _ => None + }; + if let Some(tt) = vec_brack { + token_stream.push(Token::new( + tt, maybe_vec, + location::new(line, col, 2))); + col += 2; + current_char += 2; + continue; + } + + let first_char = partial.chars().nth(0) + .expect("Empty program was trying to be lexed."); // This should't happen. + + let single_char_token = match first_char { + '(' => Some(TokenType::LParen), + ')' => Some(TokenType::RParen), + '[' => Some(TokenType::LBrack), + ']' => Some(TokenType::RBrack), + '{' => Some(TokenType::LBrace), + '}' => Some(TokenType::RBrace), + '\n' | ';' => Some(TokenType::Term), + _ => None + }; + + if let Some(tt) = single_char_token { + token_stream.push(Token::new( + tt, &first_char.to_string(), + location::new(line, col, 1))); + if first_char == '\n' { + line += 1; + col = 1; + } else { + col += 1; + } + current_char += 1; + continue; + } + + if first_char == '"' { + let mut contents = String::new(); + + let mut eos = false; + let mut i = 1; + let old_col = col; + while !eos { // Spaghet + if let Some(character) = partial.chars().nth(i) { + if character == '"' { + current_char += 1; + col += 1; + eos = true; + } else if character == '\\' { + if let Some(next) = partial.chars().nth(i + 1) { + let escaped : String = match next { + '\\' => String::from("\\"), + 'r' => String::from("\r"), + 'n' => String::from("\n"), + 't' => String::from("\t"), + 'b' => String::from("\x08"), + '0' => String::from("\0"), + 'x' => { + if let Some(code) = partial.get((current_char + 2)..(current_char + 4)) { + i += 2; + col += 2; + current_char += 2; + (u8::from_str_radix(code, 16).expect("Malformed hex.") as char).to_string() + } else { String::new() } + } + c => c.to_string() + }; + i += 1; + col += 1; + current_char += 1; + contents.push_str(&escaped); + continue; + } else { + eos = 
true; + // Error: Unexpected EOS! + } + } else { + contents.push(character); + i += 1; + col += character.width().unwrap_or(2) as u32; + current_char += character.len_utf8(); + continue; + } + } else { + eos = true; + // Error: Unexpected EOS! + } + i += 1; + current_char += 1; + col += 1; + } + token_stream.push(Token::new( + TokenType::Str, &contents, + location::new(line, old_col, col - old_col))); + continue; + } try_match!(token_stream, partial, NUM, TokenType::Num, @@ -74,14 +194,20 @@ pub fn lex(string : String) -> Vec<Token> { OP, TokenType::Op, current_char, line, col); - if partial.chars().nth(0).unwrap() == '\n' { - line += 1; - col = 1; - current_char += 1; - continue; - } + try_match!(token_stream, partial, + IDENT, TokenType::Ident, + current_char, line, col); + + try_match!(token_stream, partial, + SYM, TokenType::Sym, + current_char, line, col); + current_char += 1; if partial.is_char_boundary(0) { col += 1 } } + + token_stream.push(Token::new( + TokenType::EOF, "\0", + location::new(line, col, 1))); token_stream } \ No newline at end of file diff --git a/src/syntax/location.rs b/src/syntax/location.rs @@ -1,14 +1,15 @@ +/// Holds line, column and span of a lexical token. pub struct Loc { + /// Line number. pub line : u32, + /// Column number. pub col : u32, + /// Span/Width (in characters) of token. pub span : u32, } +/// Construct new Loc structure. pub fn new(line : u32, col : u32, span : u32) -> Loc { - Loc { - line: line, - col: col, - span: span - } + Loc { line, col, span } } diff --git a/src/syntax/mod.rs b/src/syntax/mod.rs @@ -1,17 +1,34 @@ +/// location manages line and column location of +/// lexical tokens as well as their span. mod location; + +/// Provides token classes and methods mod token; +/// Abstract Syntax Tree nodes and methods. +mod ast; + +/// Lexer splits code up into a token-stream +/// of relevant lexical tokens, making the +/// parsing step a lot easier. pub mod lexer; + +/// Converts a token-stream into a nested AST. 
pub mod parser; use std::fs; use token::ShowStream; +/// Parses a given file, calling various methods from +/// the `syntax` sub-module. pub fn parse_file(filename : &str) { let code = fs::read_to_string(filename) .expect("Could not open file for reading."); println!("Code:\n{}\n", code); - let stream = lexer::lex(code); + let stream = lexer::lex(&code); println!("Stream:\n{}\n", stream.to_string()); + + let tree = parser::parse(stream); + println!("AST:\n{}\n", tree) } \ No newline at end of file diff --git a/src/syntax/parser.rs b/src/syntax/parser.rs @@ -0,0 +1,77 @@ +use super::token; +use super::ast; + +use token::{Token, TokenType}; +use ast::{Numerics, Nodes}; + +pub fn parse(stream : Vec<Token>) -> ast::Root { + let mut tree = ast::Root::new(); + + for token in stream { + if token.is_atomic() { + tree.branches.push(atom(&token)); + } + } + + tree +} + +fn atom(token : &Token) -> Nodes { + match token.class { + TokenType::Ident => ast::IdentNode::new(&token.string), + TokenType::Op => ast::IdentNode::new(&token.string), + TokenType::Num => ast::NumNode::new(&*token.string), + TokenType::Str => ast::StrNode::new(&token.string), + _ => panic!("Passed non-atomic token to `atom` parser.") + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn numeric_parsing() { + assert_eq!(ast::NumNode::new(2).num().unwrap().value, Numerics::Natural(2usize)); + assert_eq!(ast::NumNode::new(2usize).num().unwrap().value, Numerics::Natural(2usize)); + assert_eq!(ast::NumNode::new(2u32).num().unwrap().value, Numerics::Natural(2usize)); + assert_eq!(ast::NumNode::new(2i32).num().unwrap().value, Numerics::Natural(2usize)); + + assert_eq!(ast::NumNode::new(-2).num().unwrap().value, Numerics::Integer(-2isize)); + assert_eq!(ast::NumNode::new(-2i32).num().unwrap().value, Numerics::Integer(-2isize)); + assert_eq!(ast::NumNode::new(-2isize).num().unwrap().value, Numerics::Integer(-2isize)); + + assert_eq!(ast::NumNode::new(-2.62).num().unwrap().value, 
Numerics::Real(-2.62f64)); + assert_eq!(ast::NumNode::new(2.62).num().unwrap().value, Numerics::Real(2.62f64)); + + assert_eq!(ast::NumNode::new("2").num().unwrap().value, Numerics::Natural(2)); + assert_eq!(ast::NumNode::new("325").num().unwrap().value, Numerics::Natural(325)); + assert_eq!(ast::NumNode::new("0b01010110").num().unwrap().value, Numerics::Natural(0b01010110)); + assert_eq!(ast::NumNode::new("0o721").num().unwrap().value, Numerics::Natural(0o721)); + assert_eq!(ast::NumNode::new("0xfa").num().unwrap().value, Numerics::Natural(0xfa)); + assert_eq!(ast::NumNode::new("0xf").num().unwrap().value, Numerics::Natural(0xf)); + assert_eq!(ast::NumNode::new("2.672").num().unwrap().value, Numerics::Real(2.672)); + assert_eq!(ast::NumNode::new("2.672e3").num().unwrap().value, Numerics::Real(2672.0)); + assert_eq!(ast::NumNode::new("2.672e+16").num().unwrap().value, Numerics::Real(2.672 * 10f64.powf(16f64))); + assert_eq!(ast::NumNode::new("2.672e-10").num().unwrap().value, Numerics::Real(2.672 * 10f64.powf(-10f64))); + assert_eq!(ast::NumNode::new("67e-4").num().unwrap().value, Numerics::Real(0.0067)); + assert_eq!(ast::NumNode::new("67e+10").num().unwrap().value, Numerics::Natural(670000000000)); + assert_eq!(ast::NumNode::new("-2").num().unwrap().value, Numerics::Integer(-2)); + assert_eq!(ast::NumNode::new("-325").num().unwrap().value, Numerics::Integer(-325)); + assert_eq!(ast::NumNode::new("-0b01010110").num().unwrap().value, Numerics::Integer(-0b01010110)); + assert_eq!(ast::NumNode::new("-0o721").num().unwrap().value, Numerics::Integer(-0o721)); + assert_eq!(ast::NumNode::new("-0xfa").num().unwrap().value, Numerics::Integer(-250)); + assert_eq!(ast::NumNode::new("-0xf").num().unwrap().value, Numerics::Integer(-15)); + assert_eq!(ast::NumNode::new("-2.672").num().unwrap().value, Numerics::Real(-2.672)); + assert_eq!(ast::NumNode::new("-2.672e3").num().unwrap().value, Numerics::Real(-2672.0)); + 
assert_eq!(ast::NumNode::new("-2.672e+16").num().unwrap().value, Numerics::Real(-26720000000000000.0)); + assert_eq!(ast::NumNode::new("-2.672e-10").num().unwrap().value, Numerics::Real(-0.0000000002672)); + assert_eq!(ast::NumNode::new("-67e-4").num().unwrap().value, Numerics::Real(-0.0067)); + assert_eq!(ast::NumNode::new("-67e+10").num().unwrap().value, Numerics::Integer(-670000000000)); + + let s : String = String::from("-6e12"); + let num = ast::NumNode::new(&*s); + + assert_eq!(num.num().unwrap().value, Numerics::Integer(-6000000000000)); + } +}+ \ No newline at end of file diff --git a/src/syntax/token.rs b/src/syntax/token.rs @@ -1,49 +1,133 @@ use std::fmt; use super::location; +use snailquote::escape; +use unicode_width::UnicodeWidthStr; + +/// Contains all possible types/classes of +/// lexiacal tokens. +#[derive(PartialEq)] pub enum TokenType { + /// Identifiers, variables, function names etc. Ident, + /// Numerics, anything that directly represents a number. Num, + /// Any operators, simular to idents but are lexed differently. Op, + /// Symbols, they are like elements of enums, they begin with a colon. Sym, + /// Strings, enclosed by double quotes ("..."). Str, + /// Left Parenthesis. + LParen, + /// Rigt Parenthesis. + RParen, + /// Left Square Bracket. + LBrack, + /// Right Square Bracket. + RBrack, + /// Left curly-brace. + LBrace, + /// Right curly-brace. + RBrace, + /// Left vector-list bracket. + LVec, + /// Right vector-list bracket. + RVec, + /// Terminator, something that ends a line. + /// Either a semi-colon (;) or a new-line (\n). + Term, + /// End Of File, last token in the stream. 
+ EOF, } impl fmt::Display for TokenType { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { let printable = match *self { - TokenType::Ident => "Identifier", - TokenType::Num => "Numeric", - TokenType::Op => "Operator", - TokenType::Sym => "Symbol", - TokenType::Str => "String" + TokenType::Ident => "Identifier", + TokenType::Num => "Numeric", + TokenType::Op => "Operator", + TokenType::Sym => "Symbol", + TokenType::Str => "String", + TokenType::LParen => "L-Paren", + TokenType::RParen => "R-Paren", + TokenType::LBrack => "L-Bracket", + TokenType::RBrack => "R-Bracket", + TokenType::LBrace => "L-Brace", + TokenType::RBrace => "R-Brace", + TokenType::LVec => "L-Vector", + TokenType::RVec => "R-Vector", + TokenType::Term => "Terminator", + TokenType::EOF => "End-Of-File", }; write!(f, "{}", printable) } } +/// Token structure, an individual lexiacal token, +/// represented by its type/class, what it was written as +/// in the program, and its location in the code. pub struct Token { + /// What type/class of token it is. pub class : TokenType, + /// What string the token matched with. pub string : String, + /// Where the token is in the code. pub location : location::Loc, } impl Token { + /// Constructs a new Token structure. pub fn new(class : TokenType, string : &str, loc : location::Loc) -> Token { - Token { class: class, string: String::from(string), location: loc } + Token { class, string: String::from(string), location: loc } + } + + /// Checks if the token represents an atomic datum. 
+ pub fn is_atomic(&self) -> bool { + match self.class { + TokenType::Ident => true, + TokenType::Num => true, + TokenType::Op => true, + TokenType::Sym => true, + TokenType::Str => true, + TokenType::LParen => false, + TokenType::RParen => false, + TokenType::LBrack => false, + TokenType::RBrack => false, + TokenType::LBrace => false, + TokenType::RBrace => false, + TokenType::LVec => false, + TokenType::RVec => false, + TokenType::Term => false, + TokenType::EOF => false, + } } + /// String representation of the token. pub fn to_string(&self) -> String { - String::from(format!("[ {class}: \"{rep}\" ({l}, {c}) ]", - class=self.class, rep=self.string, - l=self.location.line, c=self.location.col)) + let mut escaped = escape(&self.string.to_string()).into_owned(); + if !escaped.ends_with('"') { + escaped = format!("\"{}\"", escaped); + } + + format!("[ {class}:{spaces1}{rep}{spaces2}({l}, {c}):{span} ]", + class=self.class, rep=escaped, + spaces1=" ".repeat(12 - self.class.to_string().width()), + spaces2=" ".repeat(50 - escaped.width()), + l=self.location.line, c=self.location.col, + span=self.location.span) } } -pub trait ShowStream { fn to_string(&self) -> String; } +/// Allows for a custom string representation for the +/// token-stream as a whole. +pub trait ShowStream { + /// String representation of token-stream. + fn to_string(&self) -> String; +} impl ShowStream for Vec<Token> { fn to_string(&self) -> String { - let lines : Vec<String> = self.into_iter().map(|t| t.to_string()).collect(); - format!("[ {} ]", lines.join("\n ")) + let lines : Vec<String> = self.iter().map(Token::to_string).collect(); + format!("[ {} ]", lines.join(",\n ")) } } \ No newline at end of file diff --git a/test.vh b/test.vh @@ -1 +1,2 @@ -1 + 2 * 3 + -4 - 5- \ No newline at end of file +漢字 = "hello漢字漢字 world" + 漢字漢字 漢字v+ \ No newline at end of file