// typos/src/tokens.rs

/// An identifier-like token together with its position in the scanned bytes.
///
/// The token borrows from the buffer it was found in, so a `Symbol` cannot
/// outlive that buffer.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Symbol<'t> {
    /// Raw bytes of the token, borrowed from the source buffer.
    pub token: &'t [u8],
    /// Byte offset associated with the token (the match start when produced
    /// by `Symbol::parse`).
    pub offset: usize,
}
2019-04-16 20:22:01 -06:00
impl<'t> Symbol<'t> {
2019-01-24 08:24:20 -07:00
pub fn new(token: &'t [u8], offset: usize) -> Self {
Self {
token,
offset,
}
}
2019-04-16 20:22:01 -06:00
pub fn parse<'s>(content: &'s [u8]) -> impl Iterator<Item=Symbol<'s>> {
lazy_static::lazy_static! {
static ref SPLIT: regex::bytes::Regex = regex::bytes::Regex::new(r#"\b(\p{Alphabetic}|\d|_)+\b"#).unwrap();
}
SPLIT.find_iter(content).map(|m| Symbol::new(m.as_bytes(), m.start()))
2019-01-24 08:24:20 -07:00
}
}
/// Unit tests pinning how `Symbol::parse` splits input into tokens.
#[cfg(test)]
mod test {
    use super::*;

    // Empty input produces no symbols.
    #[test]
    fn tokenize_empty_is_empty() {
        let input = b"";
        let expected: Vec<Symbol> = vec![];
        let actual: Vec<_> = Symbol::parse(input).collect();
        assert_eq!(expected, actual);
    }

    // A lone word is returned whole, at offset 0.
    #[test]
    fn tokenize_word_is_word() {
        let input = b"word";
        let expected: Vec<Symbol> = vec![Symbol::new(b"word", 0)];
        let actual: Vec<_> = Symbol::parse(input).collect();
        assert_eq!(expected, actual);
    }

    // Whitespace separates tokens; offsets are byte positions in the input.
    #[test]
    fn tokenize_space_separated_words() {
        let input = b"A B";
        let expected: Vec<Symbol> = vec![Symbol::new(b"A", 0), Symbol::new(b"B", 2)];
        let actual: Vec<_> = Symbol::parse(input).collect();
        assert_eq!(expected, actual);
    }

    // A dot separates tokens.
    #[test]
    fn tokenize_dot_separated_words() {
        let input = b"A.B";
        let expected: Vec<Symbol> = vec![Symbol::new(b"A", 0), Symbol::new(b"B", 2)];
        let actual: Vec<_> = Symbol::parse(input).collect();
        assert_eq!(expected, actual);
    }

    // A `::` path separator splits tokens; the second token starts after both colons.
    #[test]
    fn tokenize_namespace_separated_words() {
        let input = b"A::B";
        let expected: Vec<Symbol> = vec![Symbol::new(b"A", 0), Symbol::new(b"B", 3)];
        let actual: Vec<_> = Symbol::parse(input).collect();
        assert_eq!(expected, actual);
    }

    // Underscores are part of a token rather than a separator.
    #[test]
    fn tokenize_underscore_doesnt_separate() {
        let input = b"A_B";
        let expected: Vec<Symbol> = vec![Symbol::new(b"A_B", 0)];
        let actual: Vec<_> = Symbol::parse(input).collect();
        assert_eq!(expected, actual);
    }
}