Last active
May 18, 2019 15:56
-
-
Save vayn/676e11526a4d85c8498062c8542fca57 to your computer and use it in GitHub Desktop.
Parser combinator in Rust (https://git.io/xml-parser-combinator)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #![allow(dead_code)] | |
| #[derive(Clone, Debug, PartialEq, Eq)] | |
| struct Element { | |
| name: String, | |
| attributes: Vec<(String, String)>, | |
| children: Vec<Element>, | |
| } | |
| type ParserResult<'a, Output> = Result<(&'a str, Output), &'a str>; | |
| trait Parser<'a, Output> { | |
| fn parse(&self, input: &'a str) -> ParserResult<'a, Output>; | |
| fn map<F, NewOutput>(self, map_fn: F) -> BoxedParser<'a, NewOutput> | |
| where | |
| Self: Sized + 'a, | |
| Output: 'a, | |
| NewOutput: 'a, | |
| F: Fn(Output) -> NewOutput + 'a, | |
| { | |
| BoxedParser::new(map(self, map_fn)) | |
| } | |
| fn pred<F>(self, pred_fn: F) -> BoxedParser<'a, Output> | |
| where | |
| Self: Sized + 'a, | |
| Output: 'a, | |
| F: Fn(&Output) -> bool + 'a, | |
| { | |
| BoxedParser::new(pred(self, pred_fn)) | |
| } | |
| fn and_then<F, NextParser, NewOutput>(self, f: F) -> BoxedParser<'a, NewOutput> | |
| where | |
| Self: Sized + 'a, | |
| Output: 'a, | |
| NewOutput: 'a, | |
| NextParser: Parser<'a, NewOutput> + 'a, | |
| F: Fn(Output) -> NextParser + 'a, | |
| { | |
| BoxedParser::new(and_then(self, f)) | |
| } | |
| } | |
| impl<'a, F, Output> Parser<'a, Output> for F | |
| where | |
| F: Fn(&'a str) -> ParserResult<Output>, | |
| { | |
| fn parse(&self, input: &'a str) -> ParserResult<'a, Output> { | |
| self(input) | |
| } | |
| } | |
| /// enum List<A> { | |
| /// Cons(A, List<A>), | |
| /// Nil, | |
| /// } | |
| /// | |
| /// https://bodil.lol/parser-combinators/#to-infinity-and-beyond | |
| /// | |
| /// To which rustc will, very sensibly, object that your recursive type List<A> | |
| /// has an infinite size, because inside every List::<A>::Cons is another | |
| /// List<A>, and that means it's List<A>s all the way down into infinity. As far | |
| /// as rustc is concerned, we're asking for an infinite list, and we're asking it | |
| /// to be able to allocate an infinite list. | |
| /// | |
| /// In many languages, an infinite list isn't a problem in principle for the type | |
| /// system, and it's actually not for Rust either. The problem is that in Rust, | |
| /// as mentioned, we need to be able to allocate it, or, rather, we need to be | |
| /// able to determine the size of a type up front when we construct it, and when | |
| /// the type is infinite, that means the size must be infinite too. | |
| /// | |
| /// The solution is to employ a bit of indirection. Instead of our List::Cons | |
| /// being an element of A and another list of A, instead we make it an element of | |
| /// A and a pointer to a list of A. We know the size of a pointer, and it's the | |
| /// same no matter what it points to, and so our List::Cons now has a fixed and | |
| /// predictable size no matter the size of the list. And the way to turn an owned | |
| /// thing into a pointer to an owned thing on the heap, in Rust, is to Box it. | |
| /// | |
| /// enum List<A> { | |
| /// Cons(A, Box<List<A>>), | |
| /// Nil, | |
| /// } | |
| /// | |
| /// Another interesting feature of Box is that the type inside it can be | |
| /// abstract. This means that instead of our by now incredibly complicated | |
| /// parser function types, we can let the type checker deal with a very succinct | |
| /// Box<dyn Parser<'a, A>> instead. | |
| /// | |
| /// That sounds great. What's the downside? Well, we might be losing a cycle or | |
| /// two to having to follow that pointer, and it could be that the compiler | |
| /// loses some opportunities to optimise our parser. But recall Knuth's | |
| /// admonition about premature optimisation: it's going to be fine. You can | |
| /// afford those cycles. | |
| /// | |
| /// So let's proceed to implement Parser for a boxed parser function in addition | |
| /// to the bare functions we've been using so far. | |
| struct BoxedParser<'a, Output> { | |
| parser: Box<dyn Parser<'a, Output> + 'a>, | |
| } | |
| impl<'a, Output> BoxedParser<'a, Output> { | |
| fn new<P>(parser: P) -> Self | |
| where | |
| P: Parser<'a, Output> + 'a, | |
| { | |
| BoxedParser { | |
| parser: Box::new(parser), | |
| } | |
| } | |
| } | |
| impl<'a, Output> Parser<'a, Output> for BoxedParser<'a, Output> { | |
| fn parse(&self, input: &'a str) -> ParserResult<'a, Output> { | |
| self.parser.parse(input) | |
| } | |
| } | |
| /* | |
| fn match_literal(expected: &'static str) | |
| -> impl Fn(&str) -> Result<(&str, ()), &str> | |
| { | |
| move |input| match input.get(0..expected.len()) { | |
| Some(next) if next == expected => { | |
| Ok((&input[expected.len()..], ())) | |
| } | |
| _ => Err(input), | |
| } | |
| } | |
| */ | |
| fn match_literal<'a>(expected: &'static str) -> impl Parser<'a, ()> { | |
| move |input: &'a str| match input.get(0..expected.len()) { | |
| Some(next) if next == expected => Ok((&input[expected.len()..], ())), | |
| _ => Err(input), | |
| } | |
| } | |
| #[test] | |
| fn literal_parser() { | |
| let parse_joe = match_literal("Hello Joe!"); | |
| assert_eq!(Ok(("", ())), parse_joe.parse("Hello Joe!")); | |
| assert_eq!( | |
| Ok((" Hello Robert!", ())), | |
| parse_joe.parse("Hello Joe! Hello Robert!") | |
| ); | |
| assert_eq!(Err("Hello Mike!"), parse_joe.parse("Hello Mike!")) | |
| } | |
| // fn identifier(input: &str) -> Result<(&str, String), &str> { | |
| fn identifier(input: &str) -> ParserResult<String> { | |
| let mut matched = String::new(); | |
| let mut chars = input.chars(); | |
| match chars.next() { | |
| Some(next) if next.is_alphabetic() => matched.push(next), | |
| _ => return Err(input), | |
| } | |
| while let Some(next) = chars.next() { | |
| if next.is_alphanumeric() || next == '-' { | |
| matched.push(next); | |
| } else { | |
| break; | |
| } | |
| } | |
| let next_index = matched.len(); | |
| Ok((&input[next_index..], matched)) | |
| } | |
| #[test] | |
| fn identifier_parser() { | |
| assert_eq!( | |
| Ok(("", "i-am-an-identifier".to_string())), | |
| identifier("i-am-an-identifier") | |
| ); | |
| assert_eq!( | |
| Ok((" entirely an identifier", "not".to_string())), | |
| identifier("not entirely an identifier") | |
| ); | |
| assert_eq!( | |
| Err("!not at all an identifier"), | |
| identifier("!not at all an identifier") | |
| ); | |
| } | |
| /* | |
| fn pair<P1, P2, R1, R2>(parser1: P1, parser2: P2) | |
| -> impl Fn(&str) -> Result<(&str, (R1, R2)), &str> | |
| where | |
| P1: Fn(&str) -> Result<(&str, R1), &str>, | |
| P2: Fn(&str) -> Result<(&str, R2), &str>, | |
| { | |
| move |input| match parser1(input) { | |
| Ok((next_input, result1)) => match parser2(next_input) { | |
| Ok((final_input, result2)) => Ok((final_input, (result1, result2))), | |
| Err(err) => Err(err), | |
| }, | |
| Err(err) => Err(err), | |
| } | |
| } | |
| fn pair<'a, P1, P2, R1, R2>(parser1: P1, parser2: P2) | |
| -> impl Parser<'a, (R1, R2)> | |
| where | |
| P1: Parser<'a, R1>, | |
| P2: Parser<'a, R2>, | |
| { | |
| move |input| match parser1.parse(input) { | |
| Ok((next_input, result1)) => match parser2.parse(next_input) { | |
| Ok((final_input, result2)) => Ok((final_input, (result1, result2))), | |
| Err(err) => Err(err), | |
| } | |
| Err(err) => Err(err), | |
| } | |
| } | |
| */ | |
| fn pair<'a, P1, P2, R1, R2>(parser1: P1, parser2: P2) -> impl Parser<'a, (R1, R2)> | |
| where | |
| P1: Parser<'a, R1>, | |
| P2: Parser<'a, R2>, | |
| { | |
| move |input| { | |
| parser1.parse(input).and_then(|(next_input, result1)| { | |
| parser2 | |
| .parse(next_input) | |
| .map(|(final_input, result2)| (final_input, (result1, result2))) | |
| }) | |
| } | |
| } | |
| /// It looks very neat, but there's a problem: parser2.map() consumes parser2 to | |
| /// create the wrapped parser, and the function is a Fn, not a FnOnce, so it's | |
| /// not allowed to consume parser2, just take a reference to it. Rust Problems, | |
| /// in other words. | |
| /// | |
| /// fn pair<'a, P1, P2, R1, R2>(parser1: P1, parser2: P2) | |
| /// -> impl Parser<'a, (R1, R2)> | |
| /// where | |
| /// P1: Parser<'a, R1> + 'a, | |
| /// P2: Parser<'a, R2> + 'a, | |
| /// R1: 'a + Clone, | |
| /// R2: 'a, | |
| /// { | |
| /// parser1.and_then(move |result1| parser2.map(move |result2| { | |
| /// (result1.clone(), result2) | |
| /// })) | |
| /// } | |
| #[test] | |
| fn pair_combinator() { | |
| let tag_opener = pair(match_literal("<"), identifier); | |
| assert_eq!( | |
| Ok(("/>", ((), "my-first-element".to_string()))), | |
| tag_opener.parse("<my-first-element/>") | |
| ); | |
| assert_eq!(Err("oops"), tag_opener.parse("oops")); | |
| assert_eq!(Err("!oops"), tag_opener.parse("<!oops")); | |
| } | |
| /* | |
| fn map<P, F, A, B>(parser: P, map_fn: F) | |
| -> impl Fn(&str) -> Result<(&str, B), &str> | |
| where | |
| P: Fn(&str) -> Result<(&str, A), &str>, | |
| F: Fn(A) -> B, | |
| { | |
| move |input| match parser(input) { | |
| Ok((next_input, result)) => Ok((next_input, map_fn(result))), | |
| Err(err) => Err(err), | |
| } | |
| } | |
| fn map<P, F, A, B>(parser: P, map_fn: F) | |
| -> impl Fn(&str) -> Result<(&str, B), &str> | |
| where | |
| P: Fn(&str) -> Result<(&str, A), &str>, | |
| F: Fn(A) -> B, | |
| { | |
| move |input| | |
| parser(input) | |
| .map(|(next_input, result)| (next_input, map_fn(result))) | |
| } | |
| */ | |
| fn map<'a, P, F, A, B>(parser: P, map_fn: F) -> impl Parser<'a, B> | |
| where | |
| P: Parser<'a, A>, | |
| F: Fn(A) -> B, | |
| { | |
| move |input| { | |
| parser | |
| .parse(input) | |
| .map(|(next_input, result)| (next_input, map_fn(result))) | |
| } | |
| } | |
| fn left<'a, P1, P2, R1, R2>(parser1: P1, parser2: P2) -> impl Parser<'a, R1> | |
| where | |
| P1: Parser<'a, R1> + 'a, | |
| P2: Parser<'a, R2> + 'a, | |
| R1: 'a, | |
| R2: 'a, | |
| { | |
| // map(pair(parser1, parser2), |(left, _right)| left) | |
| pair(parser1, parser2).map(|(left, _right)| left) | |
| } | |
| fn right<'a, P1, P2, R1, R2>(parser1: P1, parser2: P2) -> impl Parser<'a, R2> | |
| where | |
| P1: Parser<'a, R1> + 'a, | |
| P2: Parser<'a, R2> + 'a, | |
| R1: 'a, | |
| R2: 'a, | |
| { | |
| // map(pair(parser1, parser2), |(_left, right)| right) | |
| pair(parser1, parser2).map(|(_left, right)| right) | |
| } | |
| #[test] | |
| fn right_combinator() { | |
| let tag_opener = right(match_literal("<"), identifier); | |
| assert_eq!( | |
| Ok(("/>", "my-first-element".to_string())), | |
| tag_opener.parse("<my-first-element/>") | |
| ); | |
| assert_eq!(Err("oops"), tag_opener.parse("oops")); | |
| assert_eq!(Err("!oops"), tag_opener.parse("<!oops")); | |
| } | |
| fn zero_or_more<'a, P, A>(parser: P) -> impl Parser<'a, Vec<A>> | |
| where | |
| P: Parser<'a, A>, | |
| { | |
| move |mut input| { | |
| let mut result = Vec::new(); | |
| while let Ok((next_input, next_item)) = parser.parse(input) { | |
| input = next_input; | |
| result.push(next_item); | |
| } | |
| Ok((input, result)) | |
| } | |
| } | |
| #[test] | |
| fn zero_or_more_combinator() { | |
| let parser = zero_or_more(match_literal("ha")); | |
| assert_eq!(Ok(("", vec![(), (), ()])), parser.parse("hahaha")); | |
| assert_eq!(Ok(("ahah", vec![])), parser.parse("ahah")); | |
| assert_eq!(Ok(("", vec![])), parser.parse("")); | |
| } | |
| fn one_or_more<'a, P, A>(parser: P) -> impl Parser<'a, Vec<A>> | |
| where | |
| P: Parser<'a, A>, | |
| { | |
| move |mut input| { | |
| let mut result = Vec::new(); | |
| if let Ok((next_input, first_item)) = parser.parse(input) { | |
| input = next_input; | |
| result.push(first_item); | |
| } else { | |
| return Err(input); | |
| } | |
| while let Ok((next_input, next_item)) = parser.parse(input) { | |
| input = next_input; | |
| result.push(next_item); | |
| } | |
| Ok((input, result)) | |
| } | |
| } | |
| /// Ownership problem https://bodil.lol/parser-combinators/#one-or-more Here, we | |
| /// run into Rust Problems, and I don't even mean the problem of not having a | |
| /// cons method for Vec, but I know every Lisp programmer reading that bit of | |
| /// code was thinking it. No, it's worse than that: it's ownership. | |
| /// | |
| /// We own that parser, so we can't go passing it as an argument twice, the | |
| /// compiler will start shouting at you that you're trying to move an already | |
| /// moved value. So can we make our combinators take references instead? No, it | |
| /// turns out, not without running into another whole set of borrow checker | |
| /// troubles - and we're not going to even go there right now. And because these | |
| /// parsers are functions, they don't do anything so straightforward as to | |
| /// implement Clone, which would have saved the day very tidily, so we're stuck | |
| /// with a constraint that we can't repeat our parsers easily in combinators. | |
| /// | |
| /// fn one_or_more<'a, P, A>(parser: P) -> impl Parser<'a, Vec<A>> | |
| /// where | |
| /// P: Parser<'a, A>, | |
| /// { | |
| /// map(pair(parser, zero_or_more(parser)), |(head, mut tail)| { | |
| /// tail.insert(0, head); | |
| /// tail | |
| /// }) | |
| /// } | |
| #[test] | |
| fn one_or_more_combinator() { | |
| let parser = one_or_more(match_literal("ha")); | |
| assert_eq!(Ok(("", vec![(), (), ()])), parser.parse("hahaha")); | |
| assert_eq!(Err("ahah"), parser.parse("ahah")); | |
| assert_eq!(Err(""), parser.parse("")); | |
| } | |
| fn any_char(input: &str) -> ParserResult<char> { | |
| match input.chars().next() { | |
| Some(next) => Ok((&input[next.len_utf8()..], next)), | |
| _ => Err(input), | |
| } | |
| } | |
| fn pred<'a, P, A, F>(parser: P, predicate: F) -> impl Parser<'a, A> | |
| where | |
| P: Parser<'a, A>, | |
| F: Fn(&A) -> bool, | |
| { | |
| move |input| { | |
| if let Ok((next_input, value)) = parser.parse(input) { | |
| if predicate(&value) { | |
| return Ok((next_input, value)); | |
| } | |
| } | |
| Err(input) | |
| } | |
| } | |
| #[test] | |
| fn predicate_combinator() { | |
| let parser = pred(any_char, |c| *c == 'o'); | |
| assert_eq!(Ok(("mg", 'o')), parser.parse("omg")); | |
| assert_eq!(Err("lol"), parser.parse("lol")); | |
| } | |
| fn whitespace_char<'a>() -> impl Parser<'a, char> { | |
| // pred(any_char, |c| c.is_whitespace()) | |
| any_char.pred(|c| c.is_whitespace()) | |
| } | |
| fn space1<'a>() -> impl Parser<'a, Vec<char>> { | |
| one_or_more(whitespace_char()) | |
| } | |
| fn space0<'a>() -> impl Parser<'a, Vec<char>> { | |
| zero_or_more(whitespace_char()) | |
| } | |
| fn quoted_string<'a>() -> impl Parser<'a, String> { | |
| right( | |
| match_literal("\""), | |
| left( | |
| zero_or_more(any_char.pred(|c| *c != '"')), | |
| match_literal("\""), | |
| ), | |
| ) | |
| .map(|chars| chars.into_iter().collect()) | |
| } | |
| #[test] | |
| fn quoted_string_parser() { | |
| assert_eq!( | |
| Ok(("", "Hello Joe!".to_string())), | |
| quoted_string().parse("\"Hello Joe!\"") | |
| ); | |
| } | |
| fn attribute_pair<'a>() -> impl Parser<'a, (String, String)> { | |
| pair(identifier, right(match_literal("="), quoted_string())) | |
| } | |
| fn attributes<'a>() -> impl Parser<'a, Vec<(String, String)>> { | |
| zero_or_more(right(space1(), attribute_pair())) | |
| } | |
| #[test] | |
| fn attribute_parser() { | |
| assert_eq!( | |
| Ok(( | |
| "", | |
| vec![ | |
| ("one".to_string(), "1".to_string()), | |
| ("two".to_string(), "2".to_string()) | |
| ] | |
| )), | |
| attributes().parse(" one=\"1\" two=\"2\"") | |
| ); | |
| } | |
| fn element_start<'a>() -> impl Parser<'a, (String, Vec<(String, String)>)> { | |
| right(match_literal("<"), pair(identifier, attributes())) | |
| } | |
| fn single_element<'a>() -> impl Parser<'a, Element> { | |
| // map( | |
| // left(element_start(), match_literal("/>")), | |
| // |(name, attributes)| Element { | |
| // name, | |
| // attributes, | |
| // children: vec![], | |
| // }, | |
| // ) | |
| left(element_start(), match_literal("/>")).map(|(name, attributes)| Element { | |
| name, | |
| attributes, | |
| children: vec![], | |
| }) | |
| } | |
| #[test] | |
| fn single_element_parser() { | |
| assert_eq!( | |
| Ok(( | |
| "", | |
| Element { | |
| name: "div".to_string(), | |
| attributes: vec![("class".to_string(), "float".to_string())], | |
| children: vec![] | |
| } | |
| )), | |
| single_element().parse("<div class=\"float\"/>") | |
| ); | |
| } | |
| fn open_element<'a>() -> impl Parser<'a, Element> { | |
| left(element_start(), match_literal(">")).map(|(name, attributes)| Element { | |
| name, | |
| attributes, | |
| children: vec![], | |
| }) | |
| } | |
| fn either<'a, P1, P2, A>(parser1: P1, parser2: P2) -> impl Parser<'a, A> | |
| where | |
| P1: Parser<'a, A>, | |
| P2: Parser<'a, A>, | |
| { | |
| move |input| match parser1.parse(input) { | |
| ok @ Ok(_) => ok, | |
| Err(_) => parser2.parse(input), | |
| } | |
| } | |
| fn element<'a>() -> impl Parser<'a, Element> { | |
| whitespace_wrap(either(single_element(), parent_element())) | |
| } | |
| fn close_element<'a>(expected_name: String) -> impl Parser<'a, String> { | |
| right(match_literal("</"), left(identifier, match_literal(">"))) | |
| .pred(move |name| name == &expected_name) | |
| } | |
| fn and_then<'a, P, F, A, B, NextP>(parser: P, f: F) -> impl Parser<'a, B> | |
| where | |
| P: Parser<'a, A>, | |
| NextP: Parser<'a, B>, | |
| F: Fn(A) -> NextP, | |
| { | |
| move |input| match parser.parse(input) { | |
| Ok((next_input, result)) => f(result).parse(next_input), | |
| Err(err) => Err(err), | |
| } | |
| } | |
| fn parent_element<'a>() -> impl Parser<'a, Element> { | |
| open_element().and_then(|el| { | |
| left(zero_or_more(element()), close_element(el.name.clone())).map(move |children| { | |
| let mut el = el.clone(); | |
| el.children = children; | |
| el | |
| }) | |
| }) | |
| } | |
| fn whitespace_wrap<'a, P, A>(parser: P) -> impl Parser<'a, A> | |
| where | |
| P: Parser<'a, A> + 'a, | |
| A: 'a, | |
| { | |
| right(space0(), left(parser, space0())) | |
| } | |
| #[test] | |
| fn xml_parser() { | |
| let doc = r#" | |
| <top label="Top"> | |
| <semi-bottom label="Bottom"/> | |
| <middle> | |
| <bottom label="Another bottom"/> | |
| </middle> | |
| </top>"#; | |
| let parsed_doc = Element { | |
| name: "top".to_string(), | |
| attributes: vec![("label".to_string(), "Top".to_string())], | |
| children: vec![ | |
| Element { | |
| name: "semi-bottom".to_string(), | |
| attributes: vec![("label".to_string(), "Bottom".to_string())], | |
| children: vec![], | |
| }, | |
| Element { | |
| name: "middle".to_string(), | |
| attributes: vec![], | |
| children: vec![Element { | |
| name: "bottom".to_string(), | |
| attributes: vec![("label".to_string(), "Another bottom".to_string())], | |
| children: vec![], | |
| }], | |
| }, | |
| ], | |
| }; | |
| assert_eq!(Ok(("", parsed_doc)), element().parse(doc)); | |
| } | |
| #[test] | |
| fn mismatched_closing_tag() { | |
| let doc = r#" | |
| <top> | |
| <bottom/> | |
| </middle>"#; | |
| assert_eq!(Err("</middle>"), element().parse(doc)); | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Uh oh!
There was an error while loading. Please reload this page.