commit 3566ca3f0bdfc2c0684253c013b5d0953c898a36
parent 6782df906b1cb2c6dbc29fba56e5e2b84ae65145
Author: Demonstrandum <moi@knutsen.co>
Date:   Mon, 22 Jun 2020 01:55:18 +0100
Make lexer and parser more whitespace aware.
Diffstat:
8 files changed, 117 insertions(+), 45 deletions(-)
diff --git a/src/assemble/html.rs b/src/assemble/html.rs
@@ -16,7 +16,7 @@ impl HTMLFormatter {
 }
 
 pub const DEFAULT : &str =
-    "<!DOCTYPE>\n\
+    "<!DOCTYPE html>\n\
     <html>\n\
         <head></head>\n\
         <body></body>\n\
@@ -43,7 +43,7 @@ impl Documentise for HTMLFormatter {
         if has_declaration {
             current_node = &self.tree[1];
         } else {
-            doc += "<!DOCTYPE html>"
+            doc += "<!DOCTYPE html>\n"
         }
         // Check if <html></html> root object exists.
         // Check if head exits, if not, make an empty one.
@@ -69,11 +69,21 @@ impl Documentise for HTMLFormatter {
 /// Converting the tree to an HTML string.
 impl Display for HTMLFormatter {
     fn fmt(&self, f : &mut fmt::Formatter<'_>) -> fmt::Result {
-        for node in &self.tree {
+        let mut tree_iter = self.tree.iter().peekable();
+        while let Some(node) = tree_iter.next() {
             match node {
-                ParseNode::Symbol(node) => write!(f, " {}", node.value)?,
-                ParseNode::Number(node) => write!(f, " {}", node.value)?,
-                ParseNode::String(node) => write!(f, " {}", node.value)?,
+                ParseNode::Symbol(node)
+                | ParseNode::Number(node) => {
+                    // If symbol ahead is so-called "symbolic", we can
+                    // infer there was a space between them.
+                    write!(f, "{}", node.value)?;
+                    if let Some(peek) = tree_iter.peek() {
+                        if peek.symbolic().is_some() {
+                            write!(f, " ")?
+                        }
+                    }
+                },
+                ParseNode::String(node) => write!(f, "{}", node.value)?,
                 ParseNode::List(list) => {
                     let head = list.first();
                     let mut tag = "";
@@ -92,9 +102,12 @@ impl Display for HTMLFormatter {
 
                     // Declarations behave differently.
                     if tag.as_bytes()[0] == '!' as u8 {
-                        // TODO: Following can only be symbols.
                         while !rest.is_empty() {
-                            write!(f, " {}", rest[0])?;
+                            if let Some(node) = rest[0].symbolic() {
+                                write!(f, " {}", node.value)?;
+                            } else {
+                                // TODO: Make and send error (can only be symbolic).
+                            }
                             rest = &rest[1..];
                         }
                         write!(f, ">")?;
@@ -109,13 +122,13 @@ impl Display for HTMLFormatter {
                             // Error! Cannot be non atomic.
                         }
                     }
-                    writeln!(f, ">")?;
+                    write!(f, ">")?;
 
                     let html_fmt = HTMLFormatter::new(rest.to_owned());
-                    writeln!(f, "{}", html_fmt)?;
+                    write!(f, "{}", html_fmt)?;
                     write!(f, "</{}>", tag)?;
                 },
-                _ => write!(f, "hi")?,
+                _ => panic!("Uh {:?}", node),
             }
         }
         write!(f, "")
diff --git a/src/bin.rs b/src/bin.rs
@@ -50,9 +50,10 @@ fn main() -> Result<(), Box<dyn Error>> {
 
     for file in files {
         let tree = seam::parse_file(&file)?;
-        /*eprintln!("{}", &tree
+        #[cfg(feature="debug")]
+        eprintln!("{}", &tree
             .iter().fold(String::new(),
-            |acc, s| acc + "\n" + &s.to_string()));*/
+            |acc, s| acc + "\n" + &s.to_string()));
         if target == "html" {
             let fmt = seam::assemble::html::HTMLFormatter::new(tree);
             let result = fmt.document();
diff --git a/src/lib.rs b/src/lib.rs
@@ -11,6 +11,8 @@ pub const VERSION : (u8, u8, u8) = (0, 1, 0);
 pub fn parse<P: AsRef<Path>>(string : String, source : Option<P>)
     -> Result<parser::ParseTree, Box<dyn Error>> {
     let tokens = lexer::lex(string, source)?;
+    #[cfg(feature="debug")]
+    eprintln!("{:#?}", &tokens);
     let tree = parser::parse_stream(tokens)?;
     Ok(tree)
 }
diff --git a/src/parse/lexer.rs b/src/parse/lexer.rs
@@ -15,6 +15,10 @@ impl fmt::Display for LexError {
 
 impl Error for LexError { }
 
+fn is_whitespace(character : char) -> bool {
+    ['\n', '\r', '\t', ' '].contains(&character)
+}
+
 fn character_kind(character : char, prev : Option<tokens::Kind>)
     -> Option<tokens::Kind> {
     let kind = match character {
@@ -47,7 +51,7 @@ pub fn lex<P: AsRef<Path>>(string : String, _source : Option<P>)
     let mut line_bytes : usize = 0;
 
     let mut accumulator : Vec<u8> = Vec::new();
-    let mut tokens = Vec::new();
+    let mut tokens : TokenStream = Vec::new();
 
     let mut token_start : usize = 0;
     let mut current_kind = None;
@@ -65,33 +69,74 @@ pub fn lex<P: AsRef<Path>>(string : String, _source : Option<P>)
 
         let character = current_byte as char;
 
+        if character == ';' {  // EOL comment
+            let mut i = 0;
+            while string.as_bytes()[bytes + i] != '\n' as u8 {
+                i += 1;
+            }
+            bytes += i;
+            continue;
+        }
+
         let mut prev_kind = current_kind;
         current_kind = character_kind(character, current_kind);
 
         let string_start = character == '"'
-        && prev_kind != Some(tokens::Kind::String);
+            && prev_kind != Some(tokens::Kind::String);
         if string_start {
             current_kind = None;
         }
 
-        let mut peek_kind = if bytes == eof - 1 {
+        let peek_char = if bytes == eof - 1 {
             None
         } else {
             let peek_char = string.as_bytes()[bytes + 1] as char;
-            character_kind(peek_char, current_kind)
+            Some(peek_char)
         };
+        let mut peek_kind = if let Some(peeked) = peek_char {
+            character_kind(peeked, current_kind)
+        } else { None };
+
+        let some_lparen = Some(tokens::Kind::LParen);
+        let some_rparen = Some(tokens::Kind::RParen);
+
+        let was_lparen = current_kind == some_lparen;
+        let was_rparen = current_kind == some_rparen;
+
+        let peek_string = peek_char == Some('"');
+        let peek_lparen = peek_kind == some_lparen;
+        let peek_rparen = peek_kind == some_rparen;
 
-        let was_lparen = current_kind == Some(tokens::Kind::LParen);
-        let was_rparen = current_kind == Some(tokens::Kind::RParen);
-        let peek_rparen = peek_kind == Some(tokens::Kind::RParen);
         if was_lparen || was_rparen {
             peek_kind = None;
             prev_kind = None;
-        }
-        if peek_rparen {
+        } else if peek_rparen || peek_lparen {
+            peek_kind = None;
+        } else if peek_string {
             peek_kind = None;
         }
 
+        // If we're on a whitespace, and there's a bracket (or quote) ahead,
+        // we need to explicitly say there's whitespace between the
+        // last token and the next bracket/quotation.
+        // (Ignore the whitespace, if it is consecutive to another whitespace)
+        match tokens.last() {
+            Some(token) if token.kind != tokens::Kind::Whitespace
+                        && token.kind != tokens::Kind::Keyword
+                        && is_whitespace(character)
+                        && (peek_rparen
+                         || peek_lparen
+                         || peek_char == Some('"')
+                         || token.kind == tokens::Kind::String
+                         || token.kind == tokens::Kind::RParen) => {
+                let kind = tokens::Kind::Whitespace;
+                let site = tokens::Site::from_line(lines, line_bytes, 1);
+                let value = character.to_string();
+                tokens.push(Token::new(kind, value, site));
+            },
+            Some(_) | None => (),
+        }
+
         if let Some(kind_current) = current_kind {
             if prev_kind.is_none() {
                 old_kind = current_kind;
diff --git a/src/parse/parser.rs b/src/parse/parser.rs
@@ -32,6 +32,13 @@ pub enum ParseNode {
 }
 
 impl ParseNode {
+    pub fn symbolic(&self) -> Option<Node> {
+        match self {
+            Self::Symbol(node)
+            | Self::Number(node) => Some(node.to_owned()),
+            _ => None
+        }
+    }
     pub fn atomic(&self) -> Option<Node> {
         match self {
             Self::Symbol(node)
@@ -62,6 +69,7 @@ fn parse_atomic(token : &Token) -> Result<ParseNode, ParseError> {
         Kind::Symbol => Ok(ParseNode::Symbol(node)),
         Kind::String => Ok(ParseNode::String(node)),
         Kind::Number => Ok(ParseNode::Number(node)),
+        Kind::Whitespace => Ok(ParseNode::String(node)),
         _ => Err(ParseError(
             String::from("Atomic token not found here."),
             token.site.clone()))
@@ -96,11 +104,17 @@ pub fn parse(tokens : &[Token])
         },
         Kind::Keyword => {
             // Parse second token, make attribute.
-            let (node, slice) = parse(&tokens[1..])?;
+            let (node, mut slice) = parse(&tokens[1..])?;
             let attribute = AttributeNode {
                 keyword: token.value[1..].to_owned(),
                 node: Box::new(node)
             };
+            // Whitespace after attributes doesn't count.
+            if let Some(next) = slice.first() {
+                if next.kind == Kind::Whitespace {
+                    slice = &slice[1..];
+                }
+            }
             Ok((ParseNode::Attribute(attribute), slice))
         },
         Kind::RParen => {
@@ -127,12 +141,19 @@ pub fn parse_stream(tokens: tokens::TokenStream)
 }
 
 /// Pretty printing for parse nodes.
+#[cfg(feature="debug")]
 impl fmt::Display for ParseNode {
     fn fmt(&self, f : &mut fmt::Formatter<'_>) -> fmt::Result {
         match self {
             ParseNode::Symbol(node)
             | ParseNode::Number(node)  => write!(f, "{}", &node.value),
-            ParseNode::String(node)    => write!(f, "\"{}\"", &node.value),
+            ParseNode::String(node)    => {
+                if node.value.trim().is_empty() {
+                    write!(f, "")
+                } else {
+                    write!(f, "\"{}\"", &node.value)
+                }
+            },
             ParseNode::Attribute(attr) => write!(f, ":{} {}",
                 &attr.keyword, &*attr.node),
             ParseNode::List(list) => write!(f, "({}{})", &list[0],
diff --git a/src/parse/tokens.rs b/src/parse/tokens.rs
@@ -36,6 +36,7 @@ pub enum Kind {
     String,
     Number,
     Keyword,
+    Whitespace,
 }
 
 #[derive(Debug, Clone)]
diff --git a/test.html b/test.html
@@ -1,21 +1,9 @@
-<!DOCTYPE html><html>
-<head>
-<title>
- Example HTML Document
-</title>
-</head><body>
-<p id="hello">
- Hello, World!
-</p><p>
- something something text...
-</p><h1>
- A (big) Header!
-</h1><p>
- Yet some more<span style="color: red">
- text
-</span> <3
-</p><img alt="Cute Cat" src="https://static.insider.com/image/5d24d6b921a861093e71fef3.jpg" width="300">
+<!DOCTYPE html>
+<html> <head> <title>Example HTML Document</title></head>
+<body> <p id="hello">Hello, World!</p>
+<p>something something text...</p>
+<h1> A (big) Header!</h1>
+<p>Yet some more <span style="color: red">text</span> <3</p>
+<p>Hello<span style="color: green">World</span>!</p>
+<img alt="Cat" src="https://static.insider.com/image/5d24d6b921a861093e71fef3.jpg" width="300"></img></body></html>
 
-</img>
-</body>
-</html>
diff --git a/test.sex b/test.sex
@@ -8,8 +8,9 @@
     (h1 "A (big) Header!")
     (p Yet some more
        (span :style "color: red" text) <3)
+    (p Hello(span :style "color: green" World)!)
     (img
-      :alt "Cute Cat"
+      :alt Cat
       :src "https://static.insider.com/image/5d24d6b921a861093e71fef3.jpg"
       :width 300)))