unmarshal.rs (7326B)
/*!
* # Parse/unmarshal compiled bytecode blobs.
*
* Files are parsed directly into stack frames, reusing the data
* structures already defined elsewhere in this project, so the result
* is in the same well-understood form used throughout the program.
*
* The immediate next step from here is to pass the frames to the VM
* and have it follow the instructions byte-for-byte.
*
*/
use std::collections::HashSet;
use super::address::Address;
use super::opcodes;
use super::frame;
use super::frame::Instruction;
use num_traits::FromPrimitive;
/// Size of a `usize` in bytes on the target platform.
const POINTER_BYTES : usize = std::mem::size_of::<usize>();
/// Owned, growable byte buffer.
type Bytes = Vec<u8>;
/// Unsized run of bytes; always used behind a reference (`&ByteSlice`).
type ByteSlice = [u8];
/// Views the first `N` elements of `slice` as a reference to a fixed-size
/// array.
///
/// Slices longer than `N` are accepted; only their first `N` elements are
/// exposed.
///
/// # Panics
///
/// Panics if `slice` holds fewer than `N` elements.
#[inline]
fn fix_slice_size<T, const N : usize>(slice : &[T]) -> &[T; N] {
    // A raw-pointer cast would perform no length check, so a short slice
    // would yield a reference reaching past the end of the data. Slicing
    // first makes the length requirement a checked panic instead.
    let head = &slice[..N];
    // Fully-qualified path so this works without `TryFrom` in the prelude.
    <&[T; N] as std::convert::TryFrom<&[T]>>::try_from(head)
        .expect("a slice of exactly N elements always converts to [T; N]")
}
/// Functions that consume byte by byte, to reconstruct a code block.
mod eat {
use super::*;
/// Consume a null terminated string.
pub fn null_string(mut i : usize, bytes : &ByteSlice) -> (usize, String) {
let mut string : Bytes = vec![];
while bytes[i] != 0x00 {
string.push(bytes[i]);
i += 1;
} // Trust these are valid bytes.
let string = std::str::from_utf8(&string)
.expect("Invalid utf8 bytes in null-terminated string. Bad bytecode.")
.to_owned();
return (i + 1, string);
}
/// Consume a length-prefixed field and left-pad it to `POINTER_BYTES`.
///
/// The byte at `bytes[i]` gives the payload length `size`. The `size`
/// bytes that follow are copied, right-aligned, into a zero-filled buffer
/// of `POINTER_BYTES` bytes; callers in this module decode that buffer
/// with `usize::from_be_bytes`.
///
/// Returns the index just past the payload, together with the padded
/// buffer.
///
/// # Panics
///
/// Panics if `size` exceeds `POINTER_BYTES`, or if the prefix or payload
/// runs past the end of `bytes`.
fn consume_sized(i : usize, bytes : &ByteSlice) -> (usize, Bytes) {
    let size = usize::from(bytes[i]);
    // Without this check `POINTER_BYTES - size` underflows for oversized
    // fields; fail with a message that names the actual problem.
    assert!(size <= POINTER_BYTES,
        "Sized value of {} bytes exceeds the {}-byte machine word. Bad bytecode.",
        size, POINTER_BYTES);
    let start = i + 1;
    let end = start + size;
    let mut padded = vec![0_u8; POINTER_BYTES];
    // Right-align the payload so the leading bytes stay zero.
    padded[POINTER_BYTES - size..].copy_from_slice(&bytes[start..end]);
    (end, padded)
}
/// Plain record pairing an `f64` with a `String`.
///
/// NOTE(review): nothing in the visible code constructs or reads `Egg`;
/// it looks like leftover scratch code — confirm before removing.
#[derive(Debug)]
pub struct Egg {
a : f64,
b : String
}
/// Consume a single constant entry starting at `bytes[i]`.
///
/// The first byte is a type specifier that selects the encoding:
///
/// * `0x01..=0x03` — number types: one sized field, whose big-endian
///   value is wrapped directly in an `Address`.
/// * `0x04` — string: one sized field giving the byte length, followed by
///   that many UTF-8 bytes, which are handed to `Address::new`.
///
/// Returns the index just past the entry, together with the resulting
/// `Address`.
///
/// # Panics
///
/// Panics on an unrecognised type specifier, on invalid UTF-8 in a string
/// constant, or if the entry runs past the end of `bytes`.
fn constant(i : usize, bytes : &ByteSlice) -> (usize, Address) {
    let const_type = bytes[i];
    let i = i + 1;
    match const_type {
        // Parse number-types
        0x01..=0x03 => {
            let (i, padded) = consume_sized(i, bytes);
            let word = fix_slice_size::<u8, POINTER_BYTES>(&padded[..POINTER_BYTES]);
            (i, Address(usize::from_be_bytes(*word)))
        }
        // Parse Strings
        0x04 => {
            let (i, padded) = consume_sized(i, bytes);
            let word = fix_slice_size::<u8, POINTER_BYTES>(&padded[..POINTER_BYTES]);
            let str_len = usize::from_be_bytes(*word);
            let text = std::str::from_utf8(&bytes[i..i + str_len])
                .expect("Invalid utf8 bytes in string. Bad bytecode.");
            // Store string on heap, `Address` holds a raw pointer to it.
            (i + str_len, Address::new(text))
        }
        // `panic!` takes the format string directly; wrapping it in
        // `format!` is rejected by the 2021 edition.
        _ => panic!(
            "Type-specifier-prefix ({:x}) is not recognised.",
            const_type),
    }
}
/// Consume a whole constants block starting at `bytes[i]`.
///
/// Entries are read one after another with `constant` until a `0x00`
/// byte is found where the next entry would begin.
///
/// Returns the index just past that closing `0x00`, together with the
/// constants in the order they were read.
pub fn constants(mut i : usize, bytes : &ByteSlice) -> (usize, Vec<Address>) {
    // The block must open with the `0x11` marker (verified in debug builds only).
    debug_assert_eq!(bytes[i], 0x11);
    i += 1;
    let mut pool : Vec<Address> = Vec::new();
    loop {
        if bytes[i] == 0x00 {
            break;
        }
        let (next, entry) = constant(i, bytes);
        pool.push(entry);
        i = next;
    }
    (i + 1, pool)
}
/// Parse the local-variable-name block starting at `bytes[i]`.
///
/// The block holds a run of null terminated strings and is closed by a
/// bare `0x00` byte. Names are gathered into a set, so repeated names
/// collapse into one entry.
///
/// Returns the index just past the closing `0x00`, together with the set
/// of names.
pub fn locals(mut i : usize, bytes : &ByteSlice) -> (usize, HashSet<String>) {
    // The block must open with the `0x12` marker (verified in debug builds only).
    debug_assert_eq!(bytes[i], 0x12);
    i += 1;
    let mut names = HashSet::new();
    loop {
        // A zero byte where a name would start marks the end of the block.
        if bytes[i] == 0x00 {
            break (i + 1, names);
        }
        let (next, name) = eat::null_string(i, bytes);
        names.insert(name);
        i = next;
    }
}
pub fn instructions(mut i : usize, bytes : &ByteSlice) -> (usize, Vec<Instruction>) {
let mut instrs : Vec<Instruction> = vec![];
#[cfg(debug_assertions)]
assert_eq!(bytes[i], 0x13);
i += 1;
while bytes[i] != 0x00 {
instrs.push(Instruction::from(bytes[i]));
let maybe_instr : Option<opcodes::Operators> =
FromPrimitive::from_usize(bytes[i] as usize);
if let Some(instr) = maybe_instr {
// If the opcode takes an operand (u16), consume this too.
if instr.takes_operand() {
i += 2;
let operand = (u16::from(bytes[i - 1]) << 8)
+ u16::from(bytes[i]);
instrs.push(Instruction::from(operand));
}
}
i += 1;
}
instrs.push(Instruction(0));
(i, instrs)
}
/// Parse a whole marshalled code-block starting at `bytes[i]`.
///
/// Fields are read in order: source filename, module name, a two-byte
/// big-endian stack depth, then the constants, locals and instructions
/// blocks. The pieces are assembled with `frame::Frame::new`.
///
/// Returns the index reported by `eat::instructions`, together with the
/// constructed frame.
pub fn block(i : usize, bytes : &ByteSlice) -> (usize, frame::Frame) {
    // Header strings: source filename first, module name second.
    let (cursor, filename) = eat::null_string(i, bytes);
    let (cursor, module) = eat::null_string(cursor, bytes);
    // Max evaluation-stack depth, stored high byte first.
    let stack_depth = u16::from_be_bytes([bytes[cursor], bytes[cursor + 1]]);
    let cursor = cursor + 2;
    // The three sub-blocks follow back to back.
    let (cursor, constants) = eat::constants(cursor, bytes);
    let (cursor, locals) = eat::locals(cursor, bytes);
    let (end, instructions) = eat::instructions(cursor, bytes);
    // Construct call-frame.
    (end, frame::Frame::new(
        filename, module, constants,
        locals, instructions, stack_depth))
}
}
/// Parse a complete bytecode blob into its root `frame::Frame`.
///
/// A blob is a three-byte compiler version followed by one marshalled
/// code block. The version is read but not otherwise used yet.
///
/// # Panics
///
/// Panics if `bytes` is shorter than the version header, or if the code
/// block that follows is malformed (see the `eat` functions).
#[must_use]
pub fn parse_blob(bytes : &ByteSlice) -> frame::Frame {
    // The version header is three bytes wide (`VERSION [u8; 3]` in the
    // blob format notes). Slice all three, not just the first two.
    const VERSION_BYTES : usize = 3;
    // Parse compiler version number.
    let _version = &bytes[..VERSION_BYTES];
    // Parse primary/root code block.
    let (_, stack_frame) = eat::block(VERSION_BYTES, bytes);
    #[cfg(feature="debug")]
    println!("{:#?}", stack_frame);
    // If `stack_frame.constants[2]` is a pointer to a string, then, to use
    // it in Rust, all you have to do is:
    // ```
    // let string : &str = unsafe {
    //     *(stack_frame.constants[2].0 as *const &str)
    // };
    // println!("str: {}", string);
    // ```
    // Or even better:
    // ```
    // let string : &str = unsafe { stack_frame.constants[2].deref() };
    // println!("str: {}", string);
    // ```
    stack_frame
}
/* === ROOT BLOB FORMAT ===:
* | VERSION [u8; 3]
* | === MARSHALLED CODE BLOCK FORMAT ===:
* | | source-filename [u8; x] (abs path, null terminated, utf8)
* | | module-name [u8; x] (null terminated, utf8)
* | | stack-depth [u8; 2]
* | |
* | | CONSTANTS [u8; x] (block begin: 0x11 byte)
* | | (can contain other marshalled code blocks)
* | | (block end: 0x00)
* | | LOCAL NAMES [u8; x] (block begin: 0x12)
* | | (contains null terminated strings)
* | | (block end: 0x00)
* | | INSTRUCTION CODES [u8; x] (block begin: 0x13)
* | | (contains stream of operators and operands)
* | | (block end: 0x00 (EOI))
*/
|