From 6a27f8fdf8369c8dc69b7663b81b15f73caded50 Mon Sep 17 00:00:00 2001 From: nganhkhoa Date: Mon, 19 Feb 2024 02:54:02 +0700 Subject: [PATCH] update parser --- src/albireo.pest | 164 +++++++++++++++----- src/albireo.rs | 361 +++++++++++++++++++++++++++++++++++++++++-- src/main.rs | 7 +- test/simple/main.air | 45 +++++- 4 files changed, 516 insertions(+), 61 deletions(-) diff --git a/src/albireo.pest b/src/albireo.pest index f810423..e030679 100644 --- a/src/albireo.pest +++ b/src/albireo.pest @@ -17,54 +17,133 @@ keywords = { | "import" | "module" | "export" -} -identifier = @{ !keywords ~ ASCII_ALPHA ~ ASCII_ALPHANUMERIC* } - -function_type_param = { type } -function_type_return = { type } -function_type = { - "fn" ~ function_type_param* ~ "=>" ~ function_type_return + | "record" + | "tuple" + | "type" + | "enum" + | "auto" } -type = { +// if an identifier starts with something that looks like a keyword, it must be followed by something else +// in any case, the identifier cannot begin with a digit +identifier = @{ keywords?
~ ("_" | ASCII_ALPHANUMERIC)+ + | !keywords ~ ASCII_ALPHA ~ ("_" | ASCII_ALPHANUMERIC)* } + +function_type_param = { type* } +function_type_return = _{ type } +function_type = { + "fn" ~ function_type_param ~ "=>" ~ function_type_return +} + +// record name { prop: type, prop: type, prop: type } +record_type_field = { identifier ~ ":" ~ type } +record_type_fields = _{ record_type_field ~ "," ~ record_type_fields | record_type_field } +record_type = { + "record" ~ "{" ~ record_type_fields ~ "}" +} + +// tuple (type, type) +tuple_type_item = _{ type } +tuple_type_item_list = _{ tuple_type_item ~ "," ~ tuple_type_item_list | tuple_type_item } +tuple_type = { + "tuple" ~ "(" ~ tuple_type_item_list ~ ")" +} + +// enum name { name: type, name: type } +enum_field = { identifier ~ ":" ~ type } +enum_fields = _{ enum_field ~ "," ~ enum_fields | enum_field } +enum_type = { + "enum" ~ "{" ~ enum_fields ~ "}" +} + +// allow for type alias, aka create new type +type_name = { identifier } +type_declaration = { "type" ~ type_name ~ "=" ~ type } + +type_number = { "number" } +type_string = { "string" } +type_bool = { "bool" } + +// for gradual typing +type_auto = { "?" 
| "auto" } + +// higher-kinded types will be added later +// this will be very complex to parse, +// because allowing types to receive an argument as type +type = _{ function_type - | "number" - | "bool" - | "string" + | record_type + | tuple_type + | enum_type + | type_name + | type_number + | type_bool + | type_string } -number = @{ ASCII_DIGIT+ } -string = @{ "\"" ~ !("\"") ~ ASCII_ALPHANUMERIC* ~ "\"" } -bool = @{ "true" | "false" } -function = { "function" ~ function_param* ~ "is" ~ function_body } -function_param = { identifier } -function_body = { - binding* ~ expr +number = @{ ASCII_DIGIT+ } +string = @{ "\"" ~ !("\"") ~ ASCII_ALPHANUMERIC* ~ "\"" } +bool = @{ "true" | "false" } +tuple = { "tuple" ~ "(" ~ tuple_item_list ~ ")" } +tuple_item_list = _{ expr ~ "," ~ tuple_item_list | expr } +record = { "record" ~ "{" ~ record_item_list ~ "}" } +record_item = { identifier ~ ":" ~ expr } +record_item_list = _{ record_item ~ "," ~ record_item_list | record_item } +function = { "function" ~ function_params ~ "is" ~ function_body } +function_params = { function_param* } +function_param = _{ identifier } +function_body = _{ + bindings ~ expr } -binding = { +bindings = { binding* } +binding = { "let" ~ identifier ~ "=" ~ expr ~ "in" } -// will be expr in later version of the language -if_cond = { identifier } -if_branch = { identifier } -else_branch = { identifier } -if_expr = { - // "if" ~ expr ~ "then" ~ expr ~ "else" ~ expr +if_cond = _{ expr } +if_branch = _{ expr } +else_branch = _{ expr } +if_expr = { "if" ~ if_cond ~ "then" ~ if_branch ~ "else" ~ else_branch } -call_function = { identifier } -call_param = { identifier | identifier ~ call_param } -call_expr = { - call_function ~ "(" ~ call_param? ~ ")" +call_function = _{ identifier } +call_param_list = _{ expr ~ ("," ~ expr)* } +call_param = { call_param_list? 
} +call_expr = { + call_function ~ "(" ~ call_param ~ ")" } -expr = { +binop = _{ + add + | sub + | div + | mul + | mod + | and + | or + | xor +} +add = { "+" } +sub = { "-" } +div = { "/" } +mul = { "*" } +mod = { "%" } +and = { "&" } +or = { "|" } +xor = { "^" } +// TODO: disambiguate between arithmetic and logical operators +// TODO: add comparison operators + +expr = { + single_expr ~ (binop ~ single_expr)* +} +single_expr = _{ if_expr | call_expr | identifier | value + | "(" ~ expr ~ ")" } // function is a value only if we allow to have thunks @@ -74,17 +153,22 @@ value = _{ number | string | bool + | tuple + | record | function } -declaration = { +variable_declaration = { identifier ~ ":" ~ type ~ "=" ~ expr } +declaration = _{ + type_declaration + | variable_declaration +} -module_name = @{ identifier } +module_name = _{ identifier } export_list = _{ - identifier - | identifier ~ "," ~ export_list + identifier ~ ("," ~ identifier)* } module_export = { "export" ~ export_list @@ -94,24 +178,24 @@ module_export = { // import module only name, name // import module as change_name import_list = _{ - identifier ~ "," ~ import_list - | identifier + identifier ~ ("," ~ identifier)* } import_selective = { "only" ~ import_list } import_change_name = { "as" ~ module_name } -import_types = { +import_types = _{ import_selective | import_change_name } module_import = { "import" ~ module_name ~ import_types?
} +module_export_import = { (module_export | module_import)* } module_declare = { - "module" ~ module_name ~ module_export* ~ module_import* + "module" ~ module_name ~ module_export_import } program = { - module_declare ~ declaration* + SOI ~ module_declare ~ declaration* } WHITESPACE = _{ " " | NEWLINE } -COMMENT = _{ ";.*" ~ NEWLINE } +COMMENT = _{ ";" ~ (!NEWLINE ~ ANY)* ~ NEWLINE } diff --git a/src/albireo.rs b/src/albireo.rs index 26cb3cc..5057668 100644 --- a/src/albireo.rs +++ b/src/albireo.rs @@ -4,30 +4,365 @@ pub struct AlbireoParser; use pest::iterators::Pair; -struct Module { +use std::vec::Vec; +use std::collections::HashMap; + +#[derive(Clone, Debug, Eq, PartialEq, Hash)] +pub struct Identifier { name: String, - declaration: HashMap } -struct Declaration { - type: Type, - definition: Expression, +#[derive(Clone, Debug)] +pub struct Module { + module: ModuleInformation, + declaration: Vec } -enum Type { +#[derive(Clone, Debug)] +pub struct ModuleInformation { + name: Identifier, + export: Export, + import: Vec, +} + +#[derive(Clone, Debug)] +pub enum Export { + All, + Selective(Vec), +} + +#[derive(Clone, Debug)] +pub struct Import { + from: Identifier, + rename: Identifier, // change name, perhaps + imported: Vec, +} + +#[derive(Clone, Debug)] +pub enum Declaration { + Type(Identifier, Type), + Variable(Identifier, Type, Box), +} + +#[derive(Clone, Debug)] +pub enum Type { + Identifier(Identifier), Number, String, Bool, - Function(/*inputs*/Vec, /*output*/Type) + Function(/*inputs*/Vec, /*output*/Box), + Tuple(Vec), } -// limitation as of now, everything must be passed in as identifier -enum Expression { - IfClause(String, String, String), - Application(/*function name*/String, /*params*/Vec), // function call - Identifier(String), +#[derive(Clone, Debug)] +pub enum BinaryOp { + Add, Sub, Div, Mul, Mod, And, Or, Xor +} + +impl Into for &str { + fn into(self) -> BinaryOp { + match self { + "+" => BinaryOp::Add, + "-" => BinaryOp::Sub, + "*" => 
BinaryOp::Mul, + "/" => BinaryOp::Div, + "%" => BinaryOp::Mod, + // "<" => BinaryOp::Lt, + // ">" => BinaryOp::Gt, + // "<=" => BinaryOp::Le, + // ">=" => BinaryOp::Ge, + // "==" => BinaryOp::Eq, + // "!=" => BinaryOp::Neq, + // "|" => BinaryOp::ArithOr, + // "&" => BinaryOp::ArithAnd, + // "||" => BinaryOp::LogicalOr, + // "&&" => BinaryOp::LogicalAnd, + // "<<" => BinaryOp::ShiftLeft, + // ">>" => BinaryOp::ShiftRight, + // "**" => BinaryOp::Power, + _ => { + unreachable!() + } + } + } +} + +#[derive(Clone, Debug)] +pub enum Expression { + IfClause(Box, Box, Box), + Application(/*function name*/Identifier, /*params*/Vec>), // function call + Identifier(Identifier), Number(u64), String(String), Bool(bool), - Function(String, Vec, Expression), + Function(/*params*/Vec, Vec, Box), + Binary(Box, BinaryOp, Box), + Tuple(Vec>), +} + +#[derive(Clone, Debug)] +pub struct Binding { + lhs: Identifier, + rhs: Box, +} + +fn parse_definition(parsed: Pair) -> Box { + parse_expr(parsed) +} + +fn parse_expr(parsed: Pair) -> Box { + use pest::pratt_parser::PrattParser; + use pest::pratt_parser::{Assoc, Op}; + let pratt = + PrattParser::new() + .op(Op::infix(Rule::r#mod, Assoc::Left)) + .op(Op::infix(Rule::add, Assoc::Left) | Op::infix(Rule::sub, Assoc::Left)) + .op(Op::infix(Rule::mul, Assoc::Left) | Op::infix(Rule::div, Assoc::Left)) + ; + + let expr = pratt + .map_primary(|p| { + parse_single_expr(p) + }) + .map_infix(|lhs, op, rhs| { + Box::new(Expression::Binary(lhs.clone(), op.as_str().into(), rhs.clone())) + }) + .parse(parsed.into_inner()); + + expr +} + +fn parse_bindings(parsed: Pair) -> Binding { + let mut p = parsed.into_inner(); + Binding { + lhs: parse_identifier(p.next().unwrap()), + rhs: parse_expr(p.next().unwrap()), + } +} + +fn parse_function_expr(parsed: Pair) -> Option> { + let mut p = parsed.into_inner(); + let parsed_params = p.next()?; + let parsed_bindings = p.next()?; + let parsed_body = p.next()?; + + let params = 
parsed_params.into_inner().map(parse_identifier).collect(); + let bindings = parsed_bindings.into_inner().map(parse_bindings).collect(); + let body = parse_expr(parsed_body); + + Some(Box::new(Expression::Function(params, bindings, body))) +} + +fn parse_if_expr(parsed: Pair) -> Option> { + let mut p = parsed.into_inner(); + let condition = parse_expr(p.next()?); + let ifbranch = parse_expr(p.next()?); + let elsebranch = parse_expr(p.next()?); + Some(Box::new(Expression::IfClause(condition, ifbranch, elsebranch))) +} + +fn parse_call_expr(parsed: Pair) -> Option> { + let mut p = parsed.into_inner(); + let identifier = parse_identifier(p.next()?); + let parsed_params = p.next()?; + let params = parsed_params.into_inner().map(parse_expr).collect(); + Some(Box::new(Expression::Application(identifier, params))) +} + +fn parse_single_expr(parsed: Pair) -> Box { + match parsed.as_rule() { + Rule::if_expr => { + parse_if_expr(parsed).unwrap() + }, + Rule::call_expr => { + parse_call_expr(parsed).unwrap() + }, + Rule::identifier => { + Box::new(Expression::Identifier(parse_identifier(parsed))) + }, + Rule::number => { + let num = parsed.as_str(); + Box::new(Expression::Number(num.parse().unwrap())) + }, + // Rule::string => {}, + Rule::bool => { + if parsed.as_str() == "false" { + Box::new(Expression::Bool(false)) + } else { + Box::new(Expression::Bool(true)) + } + }, + Rule::tuple => { + let mut tuple = Vec::new(); + let mut p = parsed.into_inner(); + while let Some(e) = p.next() { + tuple.push(parse_expr(e)); + } + Box::new(Expression::Tuple(tuple)) + } + Rule::function => { + parse_function_expr(parsed).unwrap() + } + Rule::expr => { + let e = parsed.into_inner().next().unwrap(); + parse_expr(e) + }, + _ => { + println!("please implement parse expr for {:?}", parsed.as_rule()); + unreachable!(); + } + } +} + +fn parse_identifier(parsed: Pair) -> Identifier { + Identifier { name: parsed.as_str().into() } +} + +fn parse_function_type(parsed: Pair) -> Option { + let mut p 
= parsed.into_inner(); + let parsed_params_type = p.next()?; + let parsed_return_type = p.next()?; + + let params_type = parsed_params_type.into_inner().map(parse_type).collect(); + let return_type = Box::new(parse_type(parsed_return_type)); + Some(Type::Function(params_type, return_type)) +} + +fn parse_type(parsed: Pair) -> Type { + match parsed.as_rule() { + Rule::type_bool => Type::Bool, + Rule::type_string => Type::String, + Rule::type_number => Type::Number, + Rule::tuple_type => { + let mut tuple = Vec::new(); + let mut p = parsed.into_inner(); + while let Some(t) = p.next() { + tuple.push(parse_type(t)); + } + Type::Tuple(tuple) + } + Rule::type_name => { + Type::Identifier(parse_identifier(parsed)) + } + Rule::function_type => { + parse_function_type(parsed).unwrap() + } + _ => { + println!("implement parse type for {:?}", parsed.as_rule()); + unreachable!(); + } + } +} + +fn parse_declaration(parsed: Pair) -> Declaration { + match parsed.as_rule() { + Rule::type_declaration => { + let mut p = parsed.into_inner(); + let identifier = p.next().unwrap(); + let typ = p.next().unwrap(); + + let id = parse_identifier(identifier); + let t = parse_type(typ); + + Declaration::Type(id, t) + }, + Rule::variable_declaration => { + let mut p = parsed.into_inner(); + let identifier = p.next().unwrap(); + let typ = p.next().unwrap(); + let definition = p.next().unwrap(); + + let id = parse_identifier(identifier); + let t = parse_type(typ); + let d = parse_definition(definition); + + Declaration::Variable(id, t, d) + } + _ => { + unreachable!(); + } + } +} + +fn parse_module_declaration(parsed: Pair) -> Option { + let mut p = parsed.into_inner(); + let name = parse_identifier(p.next()?); + + let export_import_list = p.next()?; + + let mut export_list = Vec::new(); + let mut import_list: HashMap = HashMap::new(); + + export_import_list.into_inner().for_each(|item| { + match item.as_rule() { + Rule::module_export => { + item.into_inner().for_each(|export| { + 
export_list.push(parse_identifier(export)) + }); + } + Rule::module_import => { + let mut p = item.into_inner(); + let name = parse_identifier(p.next().unwrap()); + let import = { + match import_list.get_mut(&name) { + Some(import) => import, + None => { + import_list.insert(name.clone(), Import { + from: name.clone(), + rename: name.clone(), + imported: Vec::new(), + }); + import_list.get_mut(&name).unwrap() + } + } + }; + if let Some(type_import) = p.next() { + match type_import.as_rule() { + Rule::import_selective => { + let imported = type_import.into_inner().map(parse_identifier).collect(); + import.imported = imported; + }, + Rule::import_change_name => { + import.rename = parse_identifier(type_import.into_inner().next().unwrap()); + }, + _ => unreachable!(), + } + } + } + _ => { + unreachable!(); + } + } + }); + + let export = { + if export_list.is_empty() { + Export::All + } else { + Export::Selective(export_list) + } + }; + Some(ModuleInformation { + name, + export, + import: import_list.into_values().collect(), + }) +} + +pub fn parse_module(parsed: Pair) -> Option { + let mut p = parsed.into_inner(); + + let module = p.next()?; + let module_info = parse_module_declaration(module)?; + + let mut declaration_list = Vec::new(); + while let Some(declaration) = p.next() { + let declare = parse_declaration(declaration); + declaration_list.push(declare); + } + + Some(Module { + module: module_info, + declaration: declaration_list, + }) } diff --git a/src/main.rs b/src/main.rs index 168dd65..708cb57 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,6 +1,6 @@ mod albireo; -use crate::albireo::{AlbireoParser, Rule}; +use crate::albireo::{AlbireoParser, Rule, parse_module}; use std::fs; use pest::Parser; @@ -8,10 +8,11 @@ use pest::Parser; fn main() { let input = fs::read_to_string("test/simple/main.air").expect("cannot read file"); - let program = AlbireoParser::parse(Rule::program, &input) + let parsed = AlbireoParser::parse(Rule::program, &input) .expect("file 
format is wrong or the parser is wrong") .next() .expect("cannot parse input file as a Albireo program"); - println!("{:?}", program); + let module = parse_module(parsed); + println!("{:?}", module); } diff --git a/test/simple/main.air b/test/simple/main.air index ef7b4ab..3587ea0 100644 --- a/test/simple/main.air +++ b/test/simple/main.air @@ -1,9 +1,44 @@ +; declare a module module main -c : fn => bool = function is - let a = b in - c(a) +; explicit export list (with no export statement, everything in this module is exported) +export one, two -b : bool = false +; import forms: whole module, selected names only, module under a changed name +import a +import a only name, name +import a as change_name -a : number = 1 +; declare a variable +one : number = 1 + +; declare a variable that is computed +; places a burden on the runtime +two : number = one + 1 + +; tt : bool = true + +; types can be named (aliased) + +; using tuple or record or enum +type coordinates_tuple = tuple (number, number) +; type coordinates_record = record { x : number , y : number } + +coordinates_as_tuple : coordinates_tuple = tuple (1, 2) +; coordinates_as_record : coordinates_record = record { x : 1 , y : 2 } + +; function definition is structured the same +void : fn => number = function is + let x = 1 in + 1 + +return_false : fn => bool = function is + false + +increment : fn number => number = function x is + x + 1 + +is_even : fn number => bool = function x is + if x % 2 then true else false + +bruh : number = increment(one)