2019-01-24 08:24:20 -07:00
|
|
|
/// A single token paired with the position it was found at.
///
/// The token text is borrowed (lifetime `'t`), so a `Symbol` never owns or
/// copies the text it refers to.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Symbol<'t> {
    /// The token text itself.
    pub token: &'t str,
    /// Offset associated with `token`; `Symbol::parse` fills this with the
    /// byte index at which the match starts in its input.
    pub offset: usize,
}
|
|
|
|
|
2019-04-16 20:22:01 -06:00
|
|
|
impl<'t> Symbol<'t> {
|
2019-06-14 15:57:41 -06:00
|
|
|
pub fn new(token: &'t str, offset: usize) -> Self {
|
2019-06-14 06:43:21 -06:00
|
|
|
Self { token, offset }
|
2019-01-24 08:24:20 -07:00
|
|
|
}
|
|
|
|
|
2019-06-14 06:51:22 -06:00
|
|
|
pub fn parse(content: &[u8]) -> impl Iterator<Item = Symbol<'_>> {
|
2019-04-16 20:22:01 -06:00
|
|
|
lazy_static::lazy_static! {
|
2019-06-14 15:14:42 -06:00
|
|
|
// Getting false positives for this lint
|
|
|
|
#[allow(clippy::invalid_regex)]
|
2019-04-16 20:22:01 -06:00
|
|
|
static ref SPLIT: regex::bytes::Regex = regex::bytes::Regex::new(r#"\b(\p{Alphabetic}|\d|_)+\b"#).unwrap();
|
|
|
|
}
|
2019-06-14 15:57:41 -06:00
|
|
|
SPLIT.find_iter(content).filter_map(|m| {
|
|
|
|
let s = std::str::from_utf8(m.as_bytes()).ok();
|
|
|
|
s.map(|s| Symbol::new(s, m.start()))
|
|
|
|
})
|
2019-01-24 08:24:20 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#[cfg(test)]
mod test {
    use super::*;

    /// Runs the tokenizer over `input` and gathers every symbol it yields.
    fn collect_symbols(input: &[u8]) -> Vec<Symbol<'_>> {
        Symbol::parse(input).collect()
    }

    // Empty input produces no symbols at all.
    #[test]
    fn tokenize_empty_is_empty() {
        let actual = collect_symbols(b"");
        let expected = Vec::<Symbol>::new();
        assert_eq!(expected, actual);
    }

    // A lone word comes back as one symbol at offset 0.
    #[test]
    fn tokenize_word_is_word() {
        let actual = collect_symbols(b"word");
        let expected = vec![Symbol::new("word", 0)];
        assert_eq!(expected, actual);
    }

    // Whitespace separates symbols; offsets are byte positions in the input.
    #[test]
    fn tokenize_space_separated_words() {
        let actual = collect_symbols(b"A B");
        let expected = vec![Symbol::new("A", 0), Symbol::new("B", 2)];
        assert_eq!(expected, actual);
    }

    // A dot separates symbols just like whitespace does.
    #[test]
    fn tokenize_dot_separated_words() {
        let actual = collect_symbols(b"A.B");
        let expected = vec![Symbol::new("A", 0), Symbol::new("B", 2)];
        assert_eq!(expected, actual);
    }

    // A two-byte `::` separator pushes the second symbol to offset 3.
    #[test]
    fn tokenize_namespace_separated_words() {
        let actual = collect_symbols(b"A::B");
        let expected = vec![Symbol::new("A", 0), Symbol::new("B", 3)];
        assert_eq!(expected, actual);
    }

    // Underscores are part of a symbol rather than a separator.
    #[test]
    fn tokenize_underscore_doesnt_separate() {
        let actual = collect_symbols(b"A_B");
        let expected = vec![Symbol::new("A_B", 0)];
        assert_eq!(expected, actual);
    }
}
|