Last active
May 18, 2019 15:56
-
-
Save vayn/676e11526a4d85c8498062c8542fca57 to your computer and use it in GitHub Desktop.
Parser combinator in Rust (https://git.io/xml-parser-combinator)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #![allow(dead_code)] | |
| #[derive(Clone, Debug, PartialEq, Eq)] | |
| struct Element { | |
| name: String, | |
| attributes: Vec<(String, String)>, | |
| children: Vec<Element>, | |
| } | |
| type ParserResult<'a, Output> = Result<(&'a str, Output), &'a str>; | |
| trait Parser<'a, Output> { | |
| fn parse(&self, input: &'a str) -> ParserResult<'a, Output>; | |
| fn map<F, NewOutput>(self, map_fn: F) -> BoxedParser<'a, NewOutput> | |
| where | |
| Self: Sized + 'a, | |
| Output: 'a, | |
| NewOutput: 'a, | |
| F: Fn(Output) -> NewOutput + 'a, | |
| { | |
| BoxedParser::new(map(self, map_fn)) | |
| } | |
| fn pred<F>(self, pred_fn: F) -> BoxedParser<'a, Output> | |
| where | |
| Self: Sized + 'a, | |
| Output: 'a, | |
| F: Fn(&Output) -> bool + 'a, | |
| { | |
| BoxedParser::new(pred(self, pred_fn)) | |
| } | |
| fn and_then<F, NextParser, NewOutput>(self, f: F) -> BoxedParser<'a, NewOutput> | |
| where | |
| Self: Sized + 'a, | |
| Output: 'a, | |
| NewOutput: 'a, | |
| NextParser: Parser<'a, NewOutput> + 'a, | |
| F: Fn(Output) -> NextParser + 'a, | |
| { | |
| BoxedParser::new(and_then(self, f)) | |
| } | |
| } | |
| impl<'a, F, Output> Parser<'a, Output> for F | |
| where | |
| F: Fn(&'a str) -> ParserResult<Output>, | |
| { | |
| fn parse(&self, input: &'a str) -> ParserResult<'a, Output> { | |
| self(input) | |
| } | |
| } | |
| struct BoxedParser<'a, Output> { | |
| parser: Box<dyn Parser<'a, Output> + 'a>, | |
| } | |
| impl<'a, Output> BoxedParser<'a, Output> { | |
| fn new<P>(parser: P) -> Self | |
| where | |
| P: Parser<'a, Output> + 'a, | |
| { | |
| BoxedParser { | |
| parser: Box::new(parser), | |
| } | |
| } | |
| } | |
| impl<'a, Output> Parser<'a, Output> for BoxedParser<'a, Output> { | |
| fn parse(&self, input: &'a str) -> ParserResult<'a, Output> { | |
| self.parser.parse(input) | |
| } | |
| } | |
| /* | |
| fn match_literal(expected: &'static str) | |
| -> impl Fn(&str) -> Result<(&str, ()), &str> | |
| { | |
| move |input| match input.get(0..expected.len()) { | |
| Some(next) if next == expected => { | |
| Ok((&input[expected.len()..], ())) | |
| } | |
| _ => Err(input), | |
| } | |
| } | |
| */ | |
| fn match_literal<'a>(expected: &'static str) -> impl Parser<'a, ()> { | |
| move |input: &'a str| match input.get(0..expected.len()) { | |
| Some(next) if next == expected => Ok((&input[expected.len()..], ())), | |
| _ => Err(input), | |
| } | |
| } | |
| #[test] | |
| fn literal_parser() { | |
| let parse_joe = match_literal("Hello Joe!"); | |
| assert_eq!( | |
| Ok(("", ())), | |
| parse_joe.parse("Hello Joe!") | |
| ); | |
| assert_eq!( | |
| Ok((" Hello Robert!", ())), | |
| parse_joe.parse("Hello Joe! Hello Robert!") | |
| ); | |
| assert_eq!( | |
| Err("Hello Mike!"), | |
| parse_joe.parse("Hello Mike!") | |
| ) | |
| } | |
| // fn identifier(input: &str) -> Result<(&str, String), &str> { | |
| fn identifier(input: &str) -> ParserResult<String> { | |
| let mut matched = String::new(); | |
| let mut chars = input.chars(); | |
| match chars.next() { | |
| Some(next) if next.is_alphabetic() => matched.push(next), | |
| _ => return Err(input), | |
| } | |
| while let Some(next) = chars.next() { | |
| if next.is_alphanumeric() || next == '-' { | |
| matched.push(next); | |
| } else { | |
| break; | |
| } | |
| } | |
| let next_index = matched.len(); | |
| Ok((&input[next_index..], matched)) | |
| } | |
| #[test] | |
| fn identifier_parser() { | |
| assert_eq!( | |
| Ok(("", "i-am-an-identifier".to_string())), | |
| identifier("i-am-an-identifier") | |
| ); | |
| assert_eq!( | |
| Ok((" entirely an identifier", "not".to_string())), | |
| identifier("not entirely an identifier") | |
| ); | |
| assert_eq!( | |
| Err("!not at all an identifier"), | |
| identifier("!not at all an identifier") | |
| ); | |
| } | |
| /* | |
| fn pair<P1, P2, R1, R2>(parser1: P1, parser2: P2) | |
| -> impl Fn(&str) -> Result<(&str, (R1, R2)), &str> | |
| where | |
| P1: Fn(&str) -> Result<(&str, R1), &str>, | |
| P2: Fn(&str) -> Result<(&str, R2), &str>, | |
| { | |
| move |input| match parser1(input) { | |
| Ok((next_input, result1)) => match parser2(next_input) { | |
| Ok((final_input, result2)) => Ok((final_input, (result1, result2))), | |
| Err(err) => Err(err), | |
| }, | |
| Err(err) => Err(err), | |
| } | |
| } | |
| fn pair<'a, P1, P2, R1, R2>(parser1: P1, parser2: P2) | |
| -> impl Parser<'a, (R1, R2)> | |
| where | |
| P1: Parser<'a, R1>, | |
| P2: Parser<'a, R2>, | |
| { | |
| move |input| match parser1.parse(input) { | |
| Ok((next_input, result1)) => match parser2.parse(next_input) { | |
| Ok((final_input, result2)) => Ok((final_input, (result1, result2))), | |
| Err(err) => Err(err), | |
| } | |
| Err(err) => Err(err), | |
| } | |
| } | |
| */ | |
| fn pair<'a, P1, P2, R1, R2>(parser1: P1, parser2: P2) -> impl Parser<'a, (R1, R2)> | |
| where | |
| P1: Parser<'a, R1>, | |
| P2: Parser<'a, R2>, | |
| { | |
| move |input| { | |
| parser1.parse(input).and_then(|(next_input, result1)| { | |
| parser2 | |
| .parse(next_input) | |
| .map(|(final_input, result2)| (final_input, (result1, result2))) | |
| }) | |
| } | |
| } | |
| //* It looks very neat, but there's a problem: parser2.map() consumes | |
| //* parser2 to create the wrapped parser, and the function is a Fn, | |
| //* not a FnOnce, so it's not allowed to consume parser2, just take a | |
| //* reference to it. Rust Problems, in other words. | |
| //* | |
| //* fn pair<'a, P1, P2, R1, R2>(parser1: P1, parser2: P2) | |
| //* -> impl Parser<'a, (R1, R2)> | |
| //* where | |
| //* P1: Parser<'a, R1> + 'a, | |
| //* P2: Parser<'a, R2> + 'a, | |
| //* R1: 'a + Clone, | |
| //* R2: 'a, | |
| //* { | |
| //* parser1.and_then(move |result1| parser2.map(move |result2| { | |
| //* (result1.clone(), result2) | |
| //* })) | |
| //* } | |
| #[test] | |
| fn pair_combinator() { | |
| let tag_opener = pair(match_literal("<"), identifier); | |
| assert_eq!( | |
| Ok(("/>", ((), "my-first-element".to_string()))), | |
| tag_opener.parse("<my-first-element/>") | |
| ); | |
| assert_eq!(Err("oops"), tag_opener.parse("oops")); | |
| assert_eq!(Err("!oops"), tag_opener.parse("<!oops")); | |
| } | |
| /* | |
| fn map<P, F, A, B>(parser: P, map_fn: F) | |
| -> impl Fn(&str) -> Result<(&str, B), &str> | |
| where | |
| P: Fn(&str) -> Result<(&str, A), &str>, | |
| F: Fn(A) -> B, | |
| { | |
| move |input| match parser(input) { | |
| Ok((next_input, result)) => Ok((next_input, map_fn(result))), | |
| Err(err) => Err(err), | |
| } | |
| } | |
| fn map<P, F, A, B>(parser: P, map_fn: F) | |
| -> impl Fn(&str) -> Result<(&str, B), &str> | |
| where | |
| P: Fn(&str) -> Result<(&str, A), &str>, | |
| F: Fn(A) -> B, | |
| { | |
| move |input| | |
| parser(input) | |
| .map(|(next_input, result)| (next_input, map_fn(result))) | |
| } | |
| */ | |
| fn map<'a, P, F, A, B>(parser: P, map_fn: F) -> impl Parser<'a, B> | |
| where | |
| P: Parser<'a, A>, | |
| F: Fn(A) -> B, | |
| { | |
| move |input| { | |
| parser | |
| .parse(input) | |
| .map(|(next_input, result)| (next_input, map_fn(result))) | |
| } | |
| } | |
| fn left<'a, P1, P2, R1, R2>(parser1: P1, parser2: P2) -> impl Parser<'a, R1> | |
| where | |
| P1: Parser<'a, R1> + 'a, | |
| P2: Parser<'a, R2> + 'a, | |
| R1: 'a, | |
| R2: 'a, | |
| { | |
| // map(pair(parser1, parser2), |(left, _right)| left) | |
| pair(parser1, parser2).map(|(left, _right)| left) | |
| } | |
| fn right<'a, P1, P2, R1, R2>(parser1: P1, parser2: P2) -> impl Parser<'a, R2> | |
| where | |
| P1: Parser<'a, R1> + 'a, | |
| P2: Parser<'a, R2> + 'a, | |
| R1: 'a, | |
| R2: 'a, | |
| { | |
| // map(pair(parser1, parser2), |(_left, right)| right) | |
| pair(parser1, parser2).map(|(_left, right)| right) | |
| } | |
| #[test] | |
| fn right_combinator() { | |
| let tag_opener = right(match_literal("<"), identifier); | |
| assert_eq!( | |
| Ok(("/>", "my-first-element".to_string())), | |
| tag_opener.parse("<my-first-element/>") | |
| ); | |
| assert_eq!(Err("oops"), tag_opener.parse("oops")); | |
| assert_eq!(Err("!oops"), tag_opener.parse("<!oops")); | |
| } | |
| fn zero_or_more<'a, P, A>(parser: P) -> impl Parser<'a, Vec<A>> | |
| where | |
| P: Parser<'a, A>, | |
| { | |
| move |mut input| { | |
| let mut result = Vec::new(); | |
| while let Ok((next_input, next_item)) = parser.parse(input) { | |
| input = next_input; | |
| result.push(next_item); | |
| } | |
| Ok((input, result)) | |
| } | |
| } | |
| #[test] | |
| fn zero_or_more_combinator() { | |
| let parser = zero_or_more(match_literal("ha")); | |
| assert_eq!(Ok(("", vec![(), (), ()])), parser.parse("hahaha")); | |
| assert_eq!(Ok(("ahah", vec![])), parser.parse("ahah")); | |
| assert_eq!(Ok(("", vec![])), parser.parse("")); | |
| } | |
| fn one_or_more<'a, P, A>(parser: P) -> impl Parser<'a, Vec<A>> | |
| where | |
| P: Parser<'a, A>, | |
| { | |
| move |mut input| { | |
| let mut result = Vec::new(); | |
| if let Ok((next_input, first_item)) = parser.parse(input) { | |
| input = next_input; | |
| result.push(first_item); | |
| } else { | |
| return Err(input); | |
| } | |
| while let Ok((next_input, next_item)) = parser.parse(input) { | |
| input = next_input; | |
| result.push(next_item); | |
| } | |
| Ok((input, result)) | |
| } | |
| } | |
| /* | |
| // Ownership problem | |
| // https://bodil.lol/parser-combinators/#one-or-more | |
| // Here, we run into Rust Problems, and I don't even mean the problem of not having | |
| // a cons method for Vec, but I know every Lisp programmer reading that bit of code | |
| // was thinking it. No, it's worse than that: it's ownership. | |
| // | |
| // We own that parser, so we can't go passing it as an argument twice, the compiler | |
| // will start shouting at you that you're trying to move an already moved value. | |
| // So can we make our combinators take references instead? No, it turns out, | |
| // not without running into another whole set of borrow checker troubles - and we're | |
| // not going to even go there right now. And because these parsers are functions, | |
| // they don't do anything so straightforward as to implement Clone, which would have | |
| // saved the day very tidily, so we're stuck with a constraint that we can't repeat our | |
| // parsers easily in combinators. | |
| fn one_or_more<'a, P, A>(parser: P) -> impl Parser<'a, Vec<A>> | |
| where | |
| P: Parser<'a, A>, | |
| { | |
| map(pair(parser, zero_or_more(parser)), |(head, mut tail)| { | |
| tail.insert(0, head); | |
| tail | |
| }) | |
| } | |
| */ | |
| #[test] | |
| fn one_or_more_combinator() { | |
| let parser = one_or_more(match_literal("ha")); | |
| assert_eq!(Ok(("", vec![(), (), ()])), parser.parse("hahaha")); | |
| assert_eq!(Err("ahah"), parser.parse("ahah")); | |
| assert_eq!(Err(""), parser.parse("")); | |
| } | |
| fn any_char(input: &str) -> ParserResult<char> { | |
| match input.chars().next() { | |
| Some(next) => Ok((&input[next.len_utf8()..], next)), | |
| _ => Err(input), | |
| } | |
| } | |
| fn pred<'a, P, A, F>(parser: P, predicate: F) -> impl Parser<'a, A> | |
| where | |
| P: Parser<'a, A>, | |
| F: Fn(&A) -> bool, | |
| { | |
| move |input| { | |
| if let Ok((next_input, value)) = parser.parse(input) { | |
| if predicate(&value) { | |
| return Ok((next_input, value)); | |
| } | |
| } | |
| Err(input) | |
| } | |
| } | |
| #[test] | |
| fn predicate_combinator() { | |
| let parser = pred(any_char, |c| *c == 'o'); | |
| assert_eq!(Ok(("mg", 'o')), parser.parse("omg")); | |
| assert_eq!(Err("lol"), parser.parse("lol")); | |
| } | |
| fn whitespace_char<'a>() -> impl Parser<'a, char> { | |
| // pred(any_char, |c| c.is_whitespace()) | |
| any_char.pred(|c| c.is_whitespace()) | |
| } | |
| fn space1<'a>() -> impl Parser<'a, Vec<char>> { | |
| one_or_more(whitespace_char()) | |
| } | |
| fn space0<'a>() -> impl Parser<'a, Vec<char>> { | |
| zero_or_more(whitespace_char()) | |
| } | |
| fn quoted_string<'a>() -> impl Parser<'a, String> { | |
| right( | |
| match_literal("\""), | |
| left( | |
| zero_or_more(any_char.pred(|c| *c != '"')), | |
| match_literal("\""), | |
| ), | |
| ) | |
| .map(|chars| chars.into_iter().collect()) | |
| } | |
| #[test] | |
| fn quoted_string_parser() { | |
| assert_eq!( | |
| Ok(("", "Hello Joe!".to_string())), | |
| quoted_string().parse("\"Hello Joe!\"") | |
| ); | |
| } | |
| fn attribute_pair<'a>() -> impl Parser<'a, (String, String)> { | |
| pair(identifier, right(match_literal("="), quoted_string())) | |
| } | |
| fn attributes<'a>() -> impl Parser<'a, Vec<(String, String)>> { | |
| zero_or_more(right(space1(), attribute_pair())) | |
| } | |
| #[test] | |
| fn attribute_parser() { | |
| assert_eq!( | |
| Ok(( | |
| "", | |
| vec![ | |
| ("one".to_string(), "1".to_string()), | |
| ("two".to_string(), "2".to_string()) | |
| ] | |
| )), | |
| attributes().parse(" one=\"1\" two=\"2\"") | |
| ); | |
| } | |
| fn element_start<'a>() -> impl Parser<'a, (String, Vec<(String, String)>)> { | |
| right(match_literal("<"), pair(identifier, attributes())) | |
| } | |
| fn single_element<'a>() -> impl Parser<'a, Element> { | |
| // map( | |
| // left(element_start(), match_literal("/>")), | |
| // |(name, attributes)| Element { | |
| // name, | |
| // attributes, | |
| // children: vec![], | |
| // }, | |
| // ) | |
| left(element_start(), match_literal("/>")).map(|(name, attributes)| Element { | |
| name, | |
| attributes, | |
| children: vec![], | |
| }) | |
| } | |
| #[test] | |
| fn single_element_parser() { | |
| assert_eq!( | |
| Ok(( | |
| "", | |
| Element { | |
| name: "div".to_string(), | |
| attributes: vec![("class".to_string(), "float".to_string())], | |
| children: vec![] | |
| } | |
| )), | |
| single_element().parse("<div class=\"float\"/>") | |
| ); | |
| } | |
| fn open_element<'a>() -> impl Parser<'a, Element> { | |
| left(element_start(), match_literal(">")).map(|(name, attributes)| Element { | |
| name, | |
| attributes, | |
| children: vec![], | |
| }) | |
| } | |
| fn either<'a, P1, P2, A>(parser1: P1, parser2: P2) -> impl Parser<'a, A> | |
| where | |
| P1: Parser<'a, A>, | |
| P2: Parser<'a, A>, | |
| { | |
| move |input| match parser1.parse(input) { | |
| ok @ Ok(_) => ok, | |
| Err(_) => parser2.parse(input), | |
| } | |
| } | |
| fn element<'a>() -> impl Parser<'a, Element> { | |
| whitespace_wrap(either(single_element(), parent_element())) | |
| } | |
| fn close_element<'a>(expected_name: String) -> impl Parser<'a, String> { | |
| right(match_literal("</"), left(identifier, match_literal(">"))) | |
| .pred(move |name| name == &expected_name) | |
| } | |
| fn and_then<'a, P, F, A, B, NextP>(parser: P, f: F) -> impl Parser<'a, B> | |
| where | |
| P: Parser<'a, A>, | |
| NextP: Parser<'a, B>, | |
| F: Fn(A) -> NextP, | |
| { | |
| move |input| match parser.parse(input) { | |
| Ok((next_input, result)) => f(result).parse(next_input), | |
| Err(err) => Err(err), | |
| } | |
| } | |
| fn parent_element<'a>() -> impl Parser<'a, Element> { | |
| open_element().and_then(|el| { | |
| left(zero_or_more(element()), close_element(el.name.clone())).map(move |children| { | |
| let mut el = el.clone(); | |
| el.children = children; | |
| el | |
| }) | |
| }) | |
| } | |
| fn whitespace_wrap<'a, P, A>(parser: P) -> impl Parser<'a, A> | |
| where | |
| P: Parser<'a, A> + 'a, | |
| A: 'a, | |
| { | |
| right(space0(), left(parser, space0())) | |
| } | |
| #[test] | |
| fn xml_parser() { | |
| let doc = r#" | |
| <top label="Top"> | |
| <semi-bottom label="Bottom"/> | |
| <middle> | |
| <bottom label="Another bottom"/> | |
| </middle> | |
| </top>"#; | |
| let parsed_doc = Element { | |
| name: "top".to_string(), | |
| attributes: vec![("label".to_string(), "Top".to_string())], | |
| children: vec![ | |
| Element { | |
| name: "semi-bottom".to_string(), | |
| attributes: vec![("label".to_string(), "Bottom".to_string())], | |
| children: vec![], | |
| }, | |
| Element { | |
| name: "middle".to_string(), | |
| attributes: vec![], | |
| children: vec![Element { | |
| name: "bottom".to_string(), | |
| attributes: vec![("label".to_string(), "Another bottom".to_string())], | |
| children: vec![], | |
| }], | |
| }, | |
| ], | |
| }; | |
| assert_eq!(Ok(("", parsed_doc)), element().parse(doc)); | |
| } | |
| #[test] | |
| fn mismatched_closing_tag() { | |
| let doc = r#" | |
| <top> | |
| <bottom/> | |
| </middle>"#; | |
| assert_eq!(Err("</middle>"), element().parse(doc)); | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Uh oh!
There was an error while loading. Please reload this page.