commit bec75cc31cee86eb7fece599abca75450f6869df
parent 8067a7ad116baace1dbd9537bbb8cc66023b870a
Author: Fredrik Knutsen <moi@knutsen.co>
Date: Sat, 13 Jul 2019 20:02:53 +0100
Added foundations for the parser; need to implement some form of precedence parsing next.
Diffstat:
10 files changed, 597 insertions(+), 38 deletions(-)
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,9 +1,20 @@
[package]
name = "valhalla"
+description = "Valhalla Language frontend, parser and AST compiler."
+homepage = "https://knutsen.co"
+repository = "https://github.com/Demonstrandum/valhalla"
+documentation = "https://github.com/Demonstrandum/valhalla"
+keywords = ["set-theory", "programming", "language", "parser", "compiler"]
+categories = ["parser-implementations", "parsing", "encoding", "command-line-interface"]
+license = "GPL-3.0"
+license-file = "LICENSE.md"
+readme = "README.md"
version = "0.1.0"
authors = ["Demonstrandum <moi@knutsen.co>"]
edition = "2018"
[dependencies]
lazy_static = "1.3.0"
-regex = "1"-
\ No newline at end of file
+regex = "1"
+snailquote = "0.2.0"
+unicode-width = "0.1.5"
diff --git a/src/main.rs b/src/main.rs
@@ -1,7 +1,15 @@
+//! Crate responsible for parsing and compiling
+//! the generated AST to Brokkr-bytecode for the
+//! Valhalla set theoretic programming language.
+
+/// Syntax submodule, responsible for lexical analysis,
+/// parsing and static analysis.
mod syntax;
+
fn main() {
println!("\nTill Valhalla!\n");
syntax::parse_file("./test.vh");
}
+
diff --git a/src/syntax/ast.rs b/src/syntax/ast.rs
@@ -0,0 +1,233 @@
+use std::convert::TryFrom;
+use std::fmt;
+
+/// Identifier node, representing a name
+/// bound to a stored value.
+pub struct IdentNode {
+ /// The name of the identifier.
+ pub value : String
+}
+
+/// Different possible number types in the language.
+/// Max size is determined by max pointer size.
+#[derive(PartialEq, Debug)]
+pub enum Numerics {
+ /// Naturals are unsigned ints.
+ Natural(usize),
+ /// Integers are signed.
+ Integer(isize),
+ /// Reals are represented as a double.
+ Real(f64)
+}
+
+/// Parses a radix-prefixed string (e.g. "0x1f") with the specified radix into an ast::Numerics.
+fn parse_with_radix(neg : bool, s : &str, radix : u32) -> Numerics {
+ let unsigned = usize::from_str_radix(s.get(2..).unwrap(), radix).unwrap();
+ if neg {
+ return Numerics::Integer(-(unsigned as isize));
+ }
+ return Numerics::Natural(unsigned);
+}
+
+/// Converts primitive types into ast::Numerics.
+pub trait ToNumeric { fn to_numeric(&self) -> Numerics; }
+impl ToNumeric for &str {
+ fn to_numeric(&self) -> Numerics {
+ let mut test_str = self.clone().to_ascii_lowercase();
+
+ let is_neg = self.starts_with('-');
+ if is_neg { test_str = test_str.get(1..).unwrap().to_string(); }
+
+ return match test_str.get(0..2) {
+ Some("0x") => parse_with_radix(is_neg, &test_str, 16),
+ Some("0o") => parse_with_radix(is_neg, &test_str, 8),
+ Some("0b") => parse_with_radix(is_neg, &test_str, 2),
+ Some(_) => {
+ let exp_notation : Vec<&str> = test_str.split('e').collect();
+ let mantissa : &str = exp_notation.get(0).unwrap();
+ let mut exponent : &str = exp_notation.get(1).unwrap_or(&"0");
+ if exponent.is_empty() { exponent = "0"; }
+ let exponent : i32 = exponent.parse().unwrap();
+
+ if mantissa.contains('.') || exponent < 0 {
+ let mut number = mantissa.parse::<f64>().unwrap() * 10f64.powi(exponent);
+ if is_neg { number *= -1f64; }
+ return Numerics::Real(number);
+ }
+
+ let number : usize = mantissa.parse().unwrap();
+ if is_neg {
+ return Numerics::Integer(-(number as isize) * 10isize.pow(exponent as u32));
+ }
+ return Numerics::Natural(number * 10usize.pow(exponent as u32));
+ }
+ None => {
+ if is_neg {
+ return Numerics::Integer(-test_str.parse::<isize>().unwrap());
+ }
+ Numerics::Natural(test_str.parse::<usize>().unwrap())
+ }
+ };
+ }
+}
+
+impl ToNumeric for usize {
+ fn to_numeric(&self) -> Numerics { Numerics::Natural(*self) }
+}
+impl ToNumeric for u32 {
+ fn to_numeric(&self) -> Numerics { Numerics::Natural(*self as usize) }
+}
+impl ToNumeric for isize {
+ fn to_numeric(&self) -> Numerics {
+ if *self > 0 { return Numerics::Natural(*self as usize); }
+ Numerics::Integer(*self)
+ }
+}
+impl ToNumeric for i32 {
+ fn to_numeric(&self) -> Numerics {
+ if *self > 0 { return Numerics::Natural(*self as usize); }
+ Numerics::Integer(*self as isize)
+ }
+}
+impl ToNumeric for f64 {
+ fn to_numeric(&self) -> Numerics { Numerics::Real(*self) }
+}
+
+impl fmt::Display for Numerics {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ let printable = match self {
+ Numerics::Natural(n) => n.to_string(),
+ Numerics::Integer(n) => n.to_string(),
+ Numerics::Real(n) => n.to_string(),
+ };
+ write!(f, "{}", printable)
+ }
+}
+
+/// Node that represents a number.
+pub struct NumNode {
+ /// Holds the numeric value.
+ pub value : Numerics
+}
+
+
+/// Node for holding strings.
+pub struct StrNode {
+ /// Contents of the utf-8 string.
+ pub value : String
+}
+
+/// Symbol Node.
+pub struct SymNode {
+ /// Value/name stored as a string,
+ /// excluding the leading colon (:).
+ pub value : String
+}
+
+/// Call Node has a pointer to the callee node
+/// and a list of operand nodes.
+pub struct CallNode {
+ /// Pointer to heap allocated calling node.
+ pub callee : Box<Nodes>,
+ /// Pointer to list of operand nodes.
+ pub operands : Vec<Nodes>
+}
+
+/// Represents a block of code / compound statements
+/// in order of when they will be executed.
+pub struct BlockNode {
+ /// Pointer to list of nodes in the code block.
+ pub statements : Vec<Nodes>
+}
+
+/// All node types.
+pub enum Nodes {
+ Ident(IdentNode),
+ Num(NumNode),
+ Str(StrNode),
+ Sym(SymNode),
+ Call(CallNode),
+ Block(BlockNode)
+}
+
+
+impl fmt::Display for Nodes {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ let printable = match self {
+ Nodes::Ident(node) => format!("%ident {{ :value \"{}\" }}", node.value),
+ Nodes::Num(node) => format!("%num {{ :value {} }}", node.value),
+ Nodes::Str(node) => format!("%str {{ :value \"{}\" }}", node.value),
+ Nodes::Sym(node) => format!("%sym {{ :value \"{}\" }}", node.value),
+ Nodes::Call(node) => format!("%call {{ :callee \"{}\" }}", node.callee),
+ Nodes::Block(_) => String::from("%block { ... }"),
+ };
+ write!(f, "{}", printable)
+ }
+}
+
+macro_rules! unwrap_enum {
+ ($e:expr, $m:path) => {
+ match $e {
+ $m(inner) => Some(&inner),
+ _ => None
+ }
+ };
+}
+
+
+impl Nodes {
+ pub fn ident(&self) -> Option<&IdentNode> { unwrap_enum!(self, Nodes::Ident) }
+ pub fn num(&self) -> Option<&NumNode> { unwrap_enum!(self, Nodes::Num) }
+ pub fn str(&self) -> Option<&StrNode> { unwrap_enum!(self, Nodes::Str) }
+ pub fn sym(&self) -> Option<&SymNode> { unwrap_enum!(self, Nodes::Sym) }
+ pub fn call(&self) -> Option<&CallNode> { unwrap_enum!(self, Nodes::Call) }
+ pub fn block(&self) -> Option<&BlockNode> { unwrap_enum!(self, Nodes::Block) }
+
+ pub fn is_atomic(&self) -> bool {
+ match self {
+ Nodes::Ident(_) => true,
+ Nodes::Num(_) => true,
+ Nodes::Str(_) => true,
+ Nodes::Sym(_) => true,
+ Nodes::Call(_) => false,
+ Nodes::Block(_) => false,
+ }
+ }
+}
+
+impl IdentNode {
+ pub fn new(value : &str) -> Nodes { Nodes::Ident(IdentNode { value: value.to_string() }) }
+}
+
+impl NumNode {
+ pub fn new<Num : ToNumeric>(number : Num) -> Nodes {
+ let value = number.to_numeric();
+ Nodes::Num(NumNode { value })
+ }
+}
+
+impl StrNode {
+ pub fn new(value : &str) -> Nodes { Nodes::Str(StrNode { value: value.to_string() }) }
+}
+
+impl SymNode {
+ pub fn new(value : &str) -> Nodes { Nodes::Sym(SymNode { value: value.to_string() }) }
+}
+
+
+/// Root branch of the AST.
+pub struct Root {
+ pub branches : Vec<Nodes>
+}
+
+impl Root {
+ pub fn new() -> Self {
+ Root { branches: Vec::new() }
+ }
+}
+impl fmt::Display for Root {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ let str_mapped : Vec<String> = self.branches.iter().map(Nodes::to_string).collect();
+ write!(f, "%root{{\n {}\n}}", str_mapped.join(",\n "))
+ }
+}+
\ No newline at end of file
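A sketch of how the new ast module composes, not part of this commit: atomic nodes are built through the `new` constructors, gathered under a `Root`, and dumped through the Display impls. It assumes placement inside the crate, e.g. as a test at the bottom of src/syntax/ast.rs.

#[cfg(test)]
mod display_smoke_test {
    use super::*;

    #[test]
    fn builds_and_prints_a_tree() {
        let mut root = Root::new();
        root.branches.push(IdentNode::new("print"));
        root.branches.push(NumNode::new("0x1f")); // classified as Numerics::Natural(31)
        root.branches.push(StrNode::new("hi"));
        // Prints roughly:
        //   %root{
        //    %ident { :value "print" },
        //    %num { :value 31 },
        //    %str { :value "hi" }
        //   }
        println!("{}", root);
        assert_eq!(root.branches.len(), 3);
    }
}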
diff --git a/src/syntax/internal_macros.rs b/src/syntax/internal_macros.rs
diff --git a/src/syntax/lexer.rs b/src/syntax/lexer.rs
@@ -6,13 +6,21 @@ use super::location;
use lazy_static::lazy_static;
use regex::Regex;
+use unicode_width::UnicodeWidthChar;
+use unicode_width::UnicodeWidthStr;
+
macro_rules! re {
($string:expr) => {
Regex::new($string).unwrap()
};
}
-trait RegexExt { fn first_match(&self, string : &str) -> Option<String>; }
+/// Extension trait that picks out the first Regex match,
+/// returning an Option<String> with the captured text.
+trait RegexExt {
+ /// Gets first match in string.
+ fn first_match(&self, string : &str) -> Option<String>;
+}
impl RegexExt for Regex {
fn first_match(&self, string : &str) -> Option<String> {
let cap = self.captures(string);
@@ -28,12 +36,14 @@ impl RegexExt for Regex {
}
}
-const IDENT_CHARS : &str = r"\p{L}\?\!\'\-\_";
+/// All chars that may constitute an ident.
+const IDENT_CHARS : &str = r"\p{L}\?!'\-_";
lazy_static! {
static ref OP : Regex = re!(r"\A([\+\.\*\|\\/\&%\$\^\~><=¬@\-]+)");
- static ref IDENT : Regex = re!(&format!(r"\A([{id}][{id}\p{{N}}]+)", id=IDENT_CHARS));
- static ref NUM : Regex = re!(r"\A(\-?(?:(?:[0-9]+(?:\.[0-9]+)?(?:e[+-]?[0-9]+)?)|(?:0x[0-9a-f]+)|(?:0b[01]+)|(?:0o[0-7]+)))");
+ static ref IDENT : Regex = re!(&format!(r"\A([{id}][{id}\p{{N}}]*)", id=IDENT_CHARS));
+ static ref SYM : Regex = re!(&format!(r"\A(:[{id}\p{{N}}]+)", id=IDENT_CHARS));
+ static ref NUM : Regex = re!(r"\A(\-?(?:(?:0[xX][0-9a-f]+)|(?:0[bB][01]+)|(?:0[Oo][0-7]+)|(?:(?:[0-9]+(?:\.[0-9]+)?(?:e[\+\-]?[0-9]+)?))))");
}
macro_rules! try_match {
@@ -41,11 +51,10 @@ macro_rules! try_match {
$reg:expr, $token_type:expr,
$current_char:expr, $line:expr, $col:expr) => {
if let Some(matched) = $reg.first_match($partial) {
- let span = matched.chars().count() as u32;
+ let span = matched.width() as u32;
$stream.push(Token::new(
$token_type, &matched,
- location::new($line, $col, span)
- ));
+ location::new($line, $col, span)));
$current_char += matched.len();
$col += span;
continue;
@@ -53,18 +62,129 @@ macro_rules! try_match {
};
}
-pub fn lex(string : String) -> Vec<Token> {
+/// Takes a piece of code (as a &str) and returns
+/// the generated token-stream (as a Vec<Token>).
+pub fn lex(string : &str) -> Vec<Token> {
let mut token_stream : Vec<Token> = Vec::new();
let mut current_char = 0;
- let string_size = string.len();
+ let string_size = string.bytes().count();
let mut partial : &str;
let mut line = 1;
let mut col = 1;
while current_char < string_size {
- partial = &string[current_char..];
+ if let Some(slice) = &string.get(current_char..) {
+ partial = slice;
+ } else { // Not on boundary yet.
+ current_char += 1;
+ continue;
+ }
+
+ let maybe_vec = &partial.get(0..2).unwrap_or("");
+ let vec_brack = match maybe_vec {
+ &"[|" => Some(TokenType::LVec),
+ &"|]" => Some(TokenType::RVec),
+ _ => None
+ };
+ if let Some(tt) = vec_brack {
+ token_stream.push(Token::new(
+ tt, maybe_vec,
+ location::new(line, col, 2)));
+ col += 2;
+ current_char += 2;
+ continue;
+ }
+
+ let first_char = partial.chars().nth(0)
+ .expect("Empty program was trying to be lexed."); // This should't happen.
+
+ let single_char_token = match first_char {
+ '(' => Some(TokenType::LParen),
+ ')' => Some(TokenType::RParen),
+ '[' => Some(TokenType::LBrack),
+ ']' => Some(TokenType::RBrack),
+ '{' => Some(TokenType::LBrace),
+ '}' => Some(TokenType::RBrace),
+ '\n' | ';' => Some(TokenType::Term),
+ _ => None
+ };
+
+ if let Some(tt) = single_char_token {
+ token_stream.push(Token::new(
+ tt, &first_char.to_string(),
+ location::new(line, col, 1)));
+ if first_char == '\n' {
+ line += 1;
+ col = 1;
+ } else {
+ col += 1;
+ }
+ current_char += 1;
+ continue;
+ }
+
+ if first_char == '"' {
+ let mut contents = String::new();
+
+ let mut eos = false;
+ let mut i = 1;
+ let old_col = col;
+ while !eos { // Spaghet
+ if let Some(character) = partial.chars().nth(i) {
+ if character == '"' {
+ current_char += 1;
+ col += 1;
+ eos = true;
+ } else if character == '\\' {
+ if let Some(next) = partial.chars().nth(i + 1) {
+ let escaped : String = match next {
+ '\\' => String::from("\\"),
+ 'r' => String::from("\r"),
+ 'n' => String::from("\n"),
+ 't' => String::from("\t"),
+ 'b' => String::from("\x08"),
+ '0' => String::from("\0"),
+ 'x' => {
+ if let Some(code) = partial.get((current_char + 2)..(current_char + 4)) {
+ i += 2;
+ col += 2;
+ current_char += 2;
+ (u8::from_str_radix(code, 16).expect("Malformed hex.") as char).to_string()
+ } else { String::new() }
+ }
+ c => c.to_string()
+ };
+ i += 1;
+ col += 1;
+ current_char += 1;
+ contents.push_str(&escaped);
+ continue;
+ } else {
+ eos = true;
+ // Error: Unexpected EOS!
+ }
+ } else {
+ contents.push(character);
+ i += 1;
+ col += character.width().unwrap_or(2) as u32;
+ current_char += character.len_utf8();
+ continue;
+ }
+ } else {
+ eos = true;
+ // Error: Unexpected EOS!
+ }
+ i += 1;
+ current_char += 1;
+ col += 1;
+ }
+ token_stream.push(Token::new(
+ TokenType::Str, &contents,
+ location::new(line, old_col, col - old_col)));
+ continue;
+ }
try_match!(token_stream, partial,
NUM, TokenType::Num,
@@ -74,14 +194,20 @@ pub fn lex(string : String) -> Vec<Token> {
OP, TokenType::Op,
current_char, line, col);
- if partial.chars().nth(0).unwrap() == '\n' {
- line += 1;
- col = 1;
- current_char += 1;
- continue;
- }
+ try_match!(token_stream, partial,
+ IDENT, TokenType::Ident,
+ current_char, line, col);
+
+ try_match!(token_stream, partial,
+ SYM, TokenType::Sym,
+ current_char, line, col);
+
current_char += 1;
if partial.is_char_boundary(0) { col += 1 }
}
+
+ token_stream.push(Token::new(
+ TokenType::EOF, "\0",
+ location::new(line, col, 1)));
token_stream
}
\ No newline at end of file
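A hedged sketch of a unit test that could sit at the bottom of src/syntax/lexer.rs (none ships with this commit), exercising the new bracket, terminator, identifier and symbol handling plus the trailing EOF token.

#[cfg(test)]
mod lexer_smoke_test {
    use super::*;

    #[test]
    fn lexes_brackets_idents_and_symbols() {
        let toks = lex("[|1 2|]\nfoo :bar");
        // Expected classes, in order: LVec, Num, Num, RVec, Term, Ident, Sym, EOF.
        assert!(toks[0].class == TokenType::LVec);
        assert!(toks[4].class == TokenType::Term);
        assert!(toks.iter().any(|t| t.class == TokenType::Sym && t.string == ":bar"));
        assert!(toks.last().unwrap().class == TokenType::EOF);
    }
}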
diff --git a/src/syntax/location.rs b/src/syntax/location.rs
@@ -1,14 +1,15 @@
+/// Holds line, column and span of a lexical token.
pub struct Loc {
+ /// Line number.
pub line : u32,
+ /// Column number.
pub col : u32,
+ /// Span/Width (in characters) of token.
pub span : u32,
}
+/// Construct new Loc structure.
pub fn new(line : u32, col : u32, span : u32) -> Loc {
- Loc {
- line: line,
- col: col,
- span: span
- }
+ Loc { line, col, span }
}
diff --git a/src/syntax/mod.rs b/src/syntax/mod.rs
@@ -1,17 +1,34 @@
+/// location tracks the line and column position
+/// of lexical tokens, as well as their span.
mod location;
+
+/// Provides token classes and methods.
mod token;
+/// Abstract Syntax Tree nodes and methods.
+mod ast;
+
+/// Lexer splits code up into a token-stream
+/// of relevant lexical tokens, making the
+/// parsing step a lot easier.
pub mod lexer;
+
+/// Converts a token-stream into a nested AST.
pub mod parser;
use std::fs;
use token::ShowStream;
+/// Parses a given file, calling various methods from
+/// the `syntax` sub-module.
pub fn parse_file(filename : &str) {
let code = fs::read_to_string(filename)
.expect("Could not open file for reading.");
println!("Code:\n{}\n", code);
- let stream = lexer::lex(code);
+ let stream = lexer::lex(&code);
println!("Stream:\n{}\n", stream.to_string());
+
+ let tree = parser::parse(stream);
+ println!("AST:\n{}\n", tree)
}
\ No newline at end of file
diff --git a/src/syntax/parser.rs b/src/syntax/parser.rs
@@ -0,0 +1,77 @@
+use super::token;
+use super::ast;
+
+use token::{Token, TokenType};
+use ast::{Numerics, Nodes};
+
+pub fn parse(stream : Vec<Token>) -> ast::Root {
+ let mut tree = ast::Root::new();
+
+ for token in stream {
+ if token.is_atomic() {
+ tree.branches.push(atom(&token));
+ }
+ }
+
+ tree
+}
+
+fn atom(token : &Token) -> Nodes {
+ match token.class {
+ TokenType::Ident => ast::IdentNode::new(&token.string),
+ TokenType::Op => ast::IdentNode::new(&token.string),
+ TokenType::Num => ast::NumNode::new(&*token.string),
+ TokenType::Str => ast::StrNode::new(&token.string),
+ _ => panic!("Passed non-atomic token to `atom` parser.")
+ }
+}
+
+#[cfg(test)]
+mod test {
+ use super::*;
+
+ #[test]
+ fn numeric_parsing() {
+ assert_eq!(ast::NumNode::new(2).num().unwrap().value, Numerics::Natural(2usize));
+ assert_eq!(ast::NumNode::new(2usize).num().unwrap().value, Numerics::Natural(2usize));
+ assert_eq!(ast::NumNode::new(2u32).num().unwrap().value, Numerics::Natural(2usize));
+ assert_eq!(ast::NumNode::new(2i32).num().unwrap().value, Numerics::Natural(2usize));
+
+ assert_eq!(ast::NumNode::new(-2).num().unwrap().value, Numerics::Integer(-2isize));
+ assert_eq!(ast::NumNode::new(-2i32).num().unwrap().value, Numerics::Integer(-2isize));
+ assert_eq!(ast::NumNode::new(-2isize).num().unwrap().value, Numerics::Integer(-2isize));
+
+ assert_eq!(ast::NumNode::new(-2.62).num().unwrap().value, Numerics::Real(-2.62f64));
+ assert_eq!(ast::NumNode::new(2.62).num().unwrap().value, Numerics::Real(2.62f64));
+
+ assert_eq!(ast::NumNode::new("2").num().unwrap().value, Numerics::Natural(2));
+ assert_eq!(ast::NumNode::new("325").num().unwrap().value, Numerics::Natural(325));
+ assert_eq!(ast::NumNode::new("0b01010110").num().unwrap().value, Numerics::Natural(0b01010110));
+ assert_eq!(ast::NumNode::new("0o721").num().unwrap().value, Numerics::Natural(0o721));
+ assert_eq!(ast::NumNode::new("0xfa").num().unwrap().value, Numerics::Natural(0xfa));
+ assert_eq!(ast::NumNode::new("0xf").num().unwrap().value, Numerics::Natural(0xf));
+ assert_eq!(ast::NumNode::new("2.672").num().unwrap().value, Numerics::Real(2.672));
+ assert_eq!(ast::NumNode::new("2.672e3").num().unwrap().value, Numerics::Real(2672.0));
+ assert_eq!(ast::NumNode::new("2.672e+16").num().unwrap().value, Numerics::Real(2.672 * 10f64.powf(16f64)));
+ assert_eq!(ast::NumNode::new("2.672e-10").num().unwrap().value, Numerics::Real(2.672 * 10f64.powf(-10f64)));
+ assert_eq!(ast::NumNode::new("67e-4").num().unwrap().value, Numerics::Real(0.0067));
+ assert_eq!(ast::NumNode::new("67e+10").num().unwrap().value, Numerics::Natural(670000000000));
+ assert_eq!(ast::NumNode::new("-2").num().unwrap().value, Numerics::Integer(-2));
+ assert_eq!(ast::NumNode::new("-325").num().unwrap().value, Numerics::Integer(-325));
+ assert_eq!(ast::NumNode::new("-0b01010110").num().unwrap().value, Numerics::Integer(-0b01010110));
+ assert_eq!(ast::NumNode::new("-0o721").num().unwrap().value, Numerics::Integer(-0o721));
+ assert_eq!(ast::NumNode::new("-0xfa").num().unwrap().value, Numerics::Integer(-250));
+ assert_eq!(ast::NumNode::new("-0xf").num().unwrap().value, Numerics::Integer(-15));
+ assert_eq!(ast::NumNode::new("-2.672").num().unwrap().value, Numerics::Real(-2.672));
+ assert_eq!(ast::NumNode::new("-2.672e3").num().unwrap().value, Numerics::Real(-2672.0));
+ assert_eq!(ast::NumNode::new("-2.672e+16").num().unwrap().value, Numerics::Real(-26720000000000000.0));
+ assert_eq!(ast::NumNode::new("-2.672e-10").num().unwrap().value, Numerics::Real(-0.0000000002672));
+ assert_eq!(ast::NumNode::new("-67e-4").num().unwrap().value, Numerics::Real(-0.0067));
+ assert_eq!(ast::NumNode::new("-67e+10").num().unwrap().value, Numerics::Integer(-670000000000));
+
+ let s : String = String::from("-6e12");
+ let num = ast::NumNode::new(&*s);
+
+ assert_eq!(num.num().unwrap().value, Numerics::Integer(-6000000000000));
+ }
+}+
\ No newline at end of file
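One possible shape for the precedence step mentioned in the commit message: a Pratt-style binding-power loop over the flat token stream. Nothing below exists in this commit; the `Parser` cursor and the binding-power table are invented for illustration, and the sketch assumes it would live in src/syntax/parser.rs next to `atom`.

struct Parser { tokens : Vec<Token>, index : usize }

impl Parser {
    fn peek(&self) -> Option<&Token> { self.tokens.get(self.index) }

    fn next_token(&mut self) -> Option<&Token> {
        let i = self.index;
        self.index += 1;
        self.tokens.get(i)
    }

    /// Left binding power; higher binds tighter (table is hypothetical).
    fn lbp(op : &str) -> u8 {
        match op {
            "+" | "-" => 10,
            "*" | "/" => 20,
            _ => 0
        }
    }

    /// Parses an expression, folding infix operators into Call nodes.
    fn expression(&mut self, min_bp : u8) -> Nodes {
        let mut left = atom(self.next_token().expect("Unexpected EOF."));
        loop {
            let op = match self.peek() {
                Some(t) if t.class == TokenType::Op => t.string.clone(),
                _ => break
            };
            let bp = Self::lbp(&op);
            if bp <= min_bp { break; } // weaker operator: let the caller fold it
            self.index += 1;           // consume the operator token
            let right = self.expression(bp);
            left = Nodes::Call(ast::CallNode {
                callee: Box::new(ast::IdentNode::new(&op)),
                operands: vec![left, right]
            });
        }
        left
    }
}

With a table like the one above, calling expression(0) on the tokens for `1 + 2 * 3` would fold into Call nodes equivalent to (+ 1 (* 2 3)), left-associating operators of equal precedence.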
diff --git a/src/syntax/token.rs b/src/syntax/token.rs
@@ -1,49 +1,133 @@
use std::fmt;
use super::location;
+use snailquote::escape;
+use unicode_width::UnicodeWidthStr;
+
+/// Contains all possible types/classes of
+/// lexical tokens.
+#[derive(PartialEq)]
pub enum TokenType {
+ /// Identifiers: variables, function names, etc.
Ident,
+ /// Numerics, anything that directly represents a number.
Num,
+ /// Any operators; similar to idents but lexed differently.
Op,
+ /// Symbols; like elements of enums, they begin with a colon.
Sym,
+ /// Strings, enclosed by double quotes ("...").
Str,
+ /// Left Parenthesis.
+ LParen,
+ /// Right Parenthesis.
+ RParen,
+ /// Left Square Bracket.
+ LBrack,
+ /// Right Square Bracket.
+ RBrack,
+ /// Left curly-brace.
+ LBrace,
+ /// Right curly-brace.
+ RBrace,
+ /// Left vector-list bracket.
+ LVec,
+ /// Right vector-list bracket.
+ RVec,
+ /// Terminator, something that ends a line.
+ /// Either a semi-colon (;) or a new-line (\n).
+ Term,
+ /// End Of File, last token in the stream.
+ EOF,
}
impl fmt::Display for TokenType {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let printable = match *self {
- TokenType::Ident => "Identifier",
- TokenType::Num => "Numeric",
- TokenType::Op => "Operator",
- TokenType::Sym => "Symbol",
- TokenType::Str => "String"
+ TokenType::Ident => "Identifier",
+ TokenType::Num => "Numeric",
+ TokenType::Op => "Operator",
+ TokenType::Sym => "Symbol",
+ TokenType::Str => "String",
+ TokenType::LParen => "L-Paren",
+ TokenType::RParen => "R-Paren",
+ TokenType::LBrack => "L-Bracket",
+ TokenType::RBrack => "R-Bracket",
+ TokenType::LBrace => "L-Brace",
+ TokenType::RBrace => "R-Brace",
+ TokenType::LVec => "L-Vector",
+ TokenType::RVec => "R-Vector",
+ TokenType::Term => "Terminator",
+ TokenType::EOF => "End-Of-File",
};
write!(f, "{}", printable)
}
}
+/// Token structure, an individual lexical token,
+/// represented by its type/class, what it was written as
+/// in the program, and its location in the code.
pub struct Token {
+ /// What type/class of token it is.
pub class : TokenType,
+ /// What string the token matched with.
pub string : String,
+ /// Where the token is in the code.
pub location : location::Loc,
}
impl Token {
+ /// Constructs a new Token structure.
pub fn new(class : TokenType, string : &str, loc : location::Loc) -> Token {
- Token { class: class, string: String::from(string), location: loc }
+ Token { class, string: String::from(string), location: loc }
+ }
+
+ /// Checks if the token represents an atomic datum.
+ pub fn is_atomic(&self) -> bool {
+ match self.class {
+ TokenType::Ident => true,
+ TokenType::Num => true,
+ TokenType::Op => true,
+ TokenType::Sym => true,
+ TokenType::Str => true,
+ TokenType::LParen => false,
+ TokenType::RParen => false,
+ TokenType::LBrack => false,
+ TokenType::RBrack => false,
+ TokenType::LBrace => false,
+ TokenType::RBrace => false,
+ TokenType::LVec => false,
+ TokenType::RVec => false,
+ TokenType::Term => false,
+ TokenType::EOF => false,
+ }
}
+ /// String representation of the token.
pub fn to_string(&self) -> String {
- String::from(format!("[ {class}: \"{rep}\" ({l}, {c}) ]",
- class=self.class, rep=self.string,
- l=self.location.line, c=self.location.col))
+ let mut escaped = escape(&self.string.to_string()).into_owned();
+ if !escaped.ends_with('"') {
+ escaped = format!("\"{}\"", escaped);
+ }
+
+ format!("[ {class}:{spaces1}{rep}{spaces2}({l}, {c}):{span} ]",
+ class=self.class, rep=escaped,
+ spaces1=" ".repeat(12 - self.class.to_string().width()),
+ spaces2=" ".repeat(50 - escaped.width()),
+ l=self.location.line, c=self.location.col,
+ span=self.location.span)
}
}
-pub trait ShowStream { fn to_string(&self) -> String; }
+/// Allows for a custom string representation for the
+/// token-stream as a whole.
+pub trait ShowStream {
+ /// String representation of token-stream.
+ fn to_string(&self) -> String;
+}
impl ShowStream for Vec<Token> {
fn to_string(&self) -> String {
- let lines : Vec<String> = self.into_iter().map(|t| t.to_string()).collect();
- format!("[ {} ]", lines.join("\n "))
+ let lines : Vec<String> = self.iter().map(Token::to_string).collect();
+ format!("[ {} ]", lines.join(",\n "))
}
}
\ No newline at end of file
diff --git a/test.vh b/test.vh
@@ -1 +1,2 @@
-1 + 2 * 3 + -4 - 5
\ No newline at end of file
+漢字 = "hello漢字漢字 world"
+ 漢字漢字 漢字v
\ No newline at end of file