From 32f5e6c682d0668981fa8ab3c07caad559daf0c3 Mon Sep 17 00:00:00 2001 From: Ed Page Date: Tue, 29 Jun 2021 10:40:58 -0500 Subject: [PATCH 1/9] refactor(typos)!: Bake ignores into parser This is prep for other items to be ignored BREAKING CHANGE: `TokenizerBuilder` no longer takes config for ignoring tokens. Related, we now ignore token-ignore config flags. --- crates/typos/src/tokens.rs | 311 +++++++++++++++++-------------------- docs/reference.md | 2 - src/policy.rs | 9 +- 3 files changed, 150 insertions(+), 172 deletions(-) diff --git a/crates/typos/src/tokens.rs b/crates/typos/src/tokens.rs index 0d8f7a2..941f95d 100644 --- a/crates/typos/src/tokens.rs +++ b/crates/typos/src/tokens.rs @@ -4,8 +4,6 @@ use bstr::ByteSlice; #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct TokenizerBuilder { unicode: bool, - ignore_hex: bool, - leading_digits: bool, } impl TokenizerBuilder { @@ -19,39 +17,15 @@ impl TokenizerBuilder { self } - /// Specify that hexadecimal numbers should be ignored. - pub fn ignore_hex(&mut self, yes: bool) -> &mut Self { - self.ignore_hex = yes; - self - } - - /// Specify that leading digits are allowed for Identifiers. 
- pub fn leading_digits(&mut self, yes: bool) -> &mut Self { - self.leading_digits = yes; - self - } - pub fn build(&self) -> Tokenizer { - let TokenizerBuilder { - unicode, - leading_digits, - ignore_hex, - } = self.clone(); - Tokenizer { - unicode, - leading_digits, - ignore_hex, - } + let TokenizerBuilder { unicode } = self.clone(); + Tokenizer { unicode } } } impl Default for TokenizerBuilder { fn default() -> Self { - Self { - unicode: true, - leading_digits: false, - ignore_hex: true, - } + Self { unicode: true } } } @@ -59,8 +33,6 @@ impl Default for TokenizerBuilder { #[derive(Debug, Clone)] pub struct Tokenizer { unicode: bool, - leading_digits: bool, - ignore_hex: bool, } impl Tokenizer { @@ -70,9 +42,9 @@ impl Tokenizer { pub fn parse_str<'c>(&'c self, content: &'c str) -> impl Iterator> { let iter = if self.unicode && !ByteSlice::is_ascii(content.as_bytes()) { - itertools::Either::Left(unicode_parser::iter_literals(content)) + itertools::Either::Left(unicode_parser::iter_identifiers(content)) } else { - itertools::Either::Right(ascii_parser::iter_literals(content.as_bytes())) + itertools::Either::Right(ascii_parser::iter_identifiers(content.as_bytes())) }; iter.filter_map(move |identifier| { let offset = offset(content.as_bytes(), identifier.as_bytes()); @@ -82,10 +54,11 @@ impl Tokenizer { pub fn parse_bytes<'c>(&'c self, content: &'c [u8]) -> impl Iterator> { let iter = if self.unicode && !ByteSlice::is_ascii(content) { - let iter = Utf8Chunks::new(content).flat_map(move |c| unicode_parser::iter_literals(c)); + let iter = + Utf8Chunks::new(content).flat_map(move |c| unicode_parser::iter_identifiers(c)); itertools::Either::Left(iter) } else { - itertools::Either::Right(ascii_parser::iter_literals(content)) + itertools::Either::Right(ascii_parser::iter_identifiers(content)) }; iter.filter_map(move |identifier| { let offset = offset(content, identifier.as_bytes()); @@ -95,17 +68,6 @@ impl Tokenizer { fn transform<'i>(&self, identifier: &'i str, offset: 
usize) -> Option> { debug_assert!(!identifier.is_empty()); - if self.leading_digits { - if is_number(identifier.as_bytes()) { - return None; - } - - if self.ignore_hex && is_hex(identifier.as_bytes()) { - return None; - } - } else if is_digit(identifier.as_bytes()[0]) { - return None; - } let case = Case::None; Some(Identifier::new_unchecked(identifier, case, offset)) @@ -164,98 +126,155 @@ impl<'s> Iterator for Utf8Chunks<'s> { } } -fn is_number(ident: &[u8]) -> bool { - ident.iter().all(|b| is_digit(*b) || is_digit_sep(*b)) -} - -fn is_hex(ident: &[u8]) -> bool { - if ident.len() < 3 { - false - } else { - ident[0] == b'0' - && ident[1] == b'x' - && ident[2..] - .iter() - .all(|b| is_hex_digit(*b) || is_digit_sep(*b)) - } -} - -#[inline] -fn is_digit(chr: u8) -> bool { - chr.is_ascii_digit() -} - -#[inline] -fn is_digit_sep(chr: u8) -> bool { - // `_`: number literal separator in Rust and other languages - // `'`: number literal separator in C++ - chr == b'_' || chr == b'\'' -} - -#[inline] -fn is_hex_digit(chr: u8) -> bool { - chr.is_ascii_hexdigit() -} - mod parser { + use nom::branch::*; use nom::bytes::complete::*; + use nom::character::complete::*; use nom::sequence::*; - use nom::IResult; + use nom::{AsChar, IResult}; - pub(crate) trait AsChar: nom::AsChar { - #[allow(clippy::wrong_self_convention)] - fn is_xid_continue(self) -> bool; - } - - impl AsChar for u8 { - fn is_xid_continue(self) -> bool { - (b'a'..=b'z').contains(&self) - || (b'A'..=b'Z').contains(&self) - || (b'0'..=b'9').contains(&self) - || self == b'_' - } - } - - impl AsChar for char { - fn is_xid_continue(self) -> bool { - unicode_xid::UnicodeXID::is_xid_continue(self) - } - } - - pub(crate) fn next_literal(input: T) -> IResult + pub(crate) fn next_identifier(input: T) -> IResult where - T: nom::InputTakeAtPosition, - ::Item: AsChar, + T: nom::InputTakeAtPosition + + nom::InputTake + + nom::InputIter + + nom::InputLength + + nom::Slice> + + nom::Offset + + Clone + + PartialEq + + 
std::fmt::Debug, + ::Item: AsChar + Copy, + ::Item: AsChar + Copy, { - preceded(literal_sep, identifier)(input) - } - - fn literal_sep(input: T) -> IResult - where - T: nom::InputTakeAtPosition, - ::Item: AsChar, - { - take_till(AsChar::is_xid_continue)(input) + preceded(ignore, identifier)(input) } fn identifier(input: T) -> IResult where T: nom::InputTakeAtPosition, - ::Item: AsChar, + ::Item: AsChar + Copy, { // Generally a language would be `{XID_Start}{XID_Continue}*` but going with only // `{XID_Continue}+` because XID_Continue is a superset of XID_Start and rather catch odd // or unexpected cases than strip off start characters to a word since we aren't doing a // proper word boundary parse - take_while1(AsChar::is_xid_continue)(input) + take_while1(is_xid_continue)(input) + } + + fn ignore(input: T) -> IResult + where + T: nom::InputTakeAtPosition + + nom::InputTake + + nom::InputIter + + nom::InputLength + + nom::Slice> + + nom::Offset + + Clone + + PartialEq + + std::fmt::Debug, + ::Item: AsChar + Copy, + ::Item: AsChar + Copy, + { + take_many0(alt(( + sep1, + terminated(hex_literal, sep1), + terminated(dec_literal, sep1), + )))(input) + } + + fn sep1(input: T) -> IResult + where + T: nom::InputTakeAtPosition, + ::Item: AsChar + Copy, + { + take_till1(is_xid_continue)(input) + } + + fn dec_literal(input: T) -> IResult + where + T: nom::InputTakeAtPosition, + ::Item: AsChar + Copy, + { + take_while1(is_dec_digit)(input) + } + + fn hex_literal(input: T) -> IResult + where + T: nom::InputTakeAtPosition + + nom::InputTake + + nom::InputIter + + nom::InputLength + + nom::Slice> + + Clone, + ::Item: AsChar + Copy, + ::Item: AsChar + Copy, + { + preceded( + pair(char('0'), alt((char('x'), char('X')))), + take_while1(is_hex_digit), + )(input) + } + + fn take_many0(mut f: F) -> impl FnMut(I) -> IResult + where + I: nom::Offset + nom::InputTake + Clone + PartialEq + std::fmt::Debug, + F: nom::Parser, + E: nom::error::ParseError, + { + move |i: I| { + let mut 
current = i.clone(); + loop { + match f.parse(current.clone()) { + Err(nom::Err::Error(_)) => { + let offset = i.offset(¤t); + let (after, before) = i.take_split(offset); + return Ok((after, before)); + } + Err(e) => { + return Err(e); + } + Ok((next, _)) => { + if next == current { + return Err(nom::Err::Error(E::from_error_kind( + i, + nom::error::ErrorKind::Many0, + ))); + } + + current = next; + } + } + } + } + } + + fn is_dec_digit(i: impl AsChar + Copy) -> bool { + i.is_dec_digit() || is_digit_sep(i.as_char()) + } + + fn is_hex_digit(i: impl AsChar + Copy) -> bool { + i.is_hex_digit() || is_digit_sep(i.as_char()) + } + + fn is_xid_continue(i: impl AsChar + Copy) -> bool { + let c = i.as_char(); + unicode_xid::UnicodeXID::is_xid_continue(c) + } + + #[inline] + fn is_digit_sep(chr: char) -> bool { + // `_`: number literal separator in Rust and other languages + // `'`: number literal separator in C++ + chr == '_' || chr == '\'' } } mod unicode_parser { - use super::parser::next_literal; + use super::parser::next_identifier; - pub(crate) fn iter_literals(mut input: &str) -> impl Iterator { - std::iter::from_fn(move || match next_literal(input) { + pub(crate) fn iter_identifiers(mut input: &str) -> impl Iterator { + std::iter::from_fn(move || match next_identifier(input) { Ok((i, o)) => { input = i; debug_assert_ne!(o, ""); @@ -267,10 +286,10 @@ mod unicode_parser { } mod ascii_parser { - use super::parser::next_literal; + use super::parser::next_identifier; - pub(crate) fn iter_literals(mut input: &[u8]) -> impl Iterator { - std::iter::from_fn(move || match next_literal(input) { + pub(crate) fn iter_identifiers(mut input: &[u8]) -> impl Iterator { + std::iter::from_fn(move || match next_identifier(input) { Ok((i, o)) => { input = i; debug_assert_ne!(o, b""); @@ -613,11 +632,8 @@ mod test { } #[test] - fn tokenize_ignore_hex_enabled() { - let parser = TokenizerBuilder::new() - .ignore_hex(true) - .leading_digits(true) - .build(); + fn tokenize_ignore_hex() { + 
let parser = TokenizerBuilder::new().build(); let input = "Hello 0xDEADBEEF World"; let expected: Vec = vec![ @@ -631,54 +647,13 @@ mod test { } #[test] - fn tokenize_ignore_hex_disabled() { - let parser = TokenizerBuilder::new() - .ignore_hex(false) - .leading_digits(true) - .build(); - - let input = "Hello 0xDEADBEEF World"; - let expected: Vec = vec![ - Identifier::new_unchecked("Hello", Case::None, 0), - Identifier::new_unchecked("0xDEADBEEF", Case::None, 6), - Identifier::new_unchecked("World", Case::None, 17), - ]; - let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect(); - assert_eq!(expected, actual); - let actual: Vec<_> = parser.parse_str(input).collect(); - assert_eq!(expected, actual); - } - - #[test] - fn tokenize_leading_digits_enabled() { - let parser = TokenizerBuilder::new() - .ignore_hex(false) - .leading_digits(true) - .build(); + fn tokenize_leading_digits() { + let parser = TokenizerBuilder::new().build(); let input = "Hello 0Hello 124 0xDEADBEEF World"; let expected: Vec = vec![ Identifier::new_unchecked("Hello", Case::None, 0), Identifier::new_unchecked("0Hello", Case::None, 6), - Identifier::new_unchecked("0xDEADBEEF", Case::None, 17), - Identifier::new_unchecked("World", Case::None, 28), - ]; - let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect(); - assert_eq!(expected, actual); - let actual: Vec<_> = parser.parse_str(input).collect(); - assert_eq!(expected, actual); - } - - #[test] - fn tokenize_leading_digits_disabled() { - let parser = TokenizerBuilder::new() - .ignore_hex(false) - .leading_digits(false) - .build(); - - let input = "Hello 0Hello 124 0xDEADBEEF World"; - let expected: Vec = vec![ - Identifier::new_unchecked("Hello", Case::None, 0), Identifier::new_unchecked("World", Case::None, 28), ]; let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect(); diff --git a/docs/reference.md b/docs/reference.md index 13e1625..67247bb 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -26,8 +26,6 
@@ Configuration is read from the following (in precedence order) | default.check-filename | \- | bool | Verifying spelling in file names. | | default.check-file | \- | bool | Verifying spelling in files. | | default.unicode | --unicode | bool | Allow unicode characters in identifiers (and not just ASCII) | -| default.ignore-hex | \- | bool | Do not check identifiers that appear to be hexadecimal values. | -| default.identifier-leading-digits | \- | bool | Allow identifiers to start with digits, in addition to letters. | | default.locale | --locale | en, en-us, en-gb, en-ca, en-au | English dialect to correct to. | | default.extend-identifiers | \- | table of strings | Corrections for identifiers. When the correction is blank, the word is never valid. When the correction is the key, the word is always valid. | | default.extend-words | \- | table of strings | Corrections for identifiers. When the correction is blank, the word is never valid. When the correction is the key, the word is always valid. 
| diff --git a/src/policy.rs b/src/policy.rs index 86233d7..ede2fa6 100644 --- a/src/policy.rs +++ b/src/policy.rs @@ -224,10 +224,15 @@ impl<'s> ConfigEngine<'s> { tokenizer.unwrap_or_else(crate::config::TokenizerConfig::from_defaults); let dict_config = dict.unwrap_or_else(crate::config::DictConfig::from_defaults); + if !tokenizer_config.ignore_hex() { + log::warn!("`ignore-hex` is deprecated"); + if !tokenizer_config.identifier_leading_digits() { + log::warn!("`identifier-leading-digits` is deprecated"); + } + } + let tokenizer = typos::tokens::TokenizerBuilder::new() .unicode(tokenizer_config.unicode()) - .ignore_hex(tokenizer_config.ignore_hex()) - .leading_digits(tokenizer_config.identifier_leading_digits()) .build(); let dict = crate::dict::BuiltIn::new(dict_config.locale()); From 85082cdbb1fbcdc176bb88db4321ed2eac10a715 Mon Sep 17 00:00:00 2001 From: Ed Page Date: Tue, 29 Jun 2021 12:00:58 -0500 Subject: [PATCH 2/9] feat(parser): Ignore UUIDs We might be able to make this bail out earlier and not accidentally detect the wrong thing by checking if the hex values are lowercase. RFC 4122 says that UUIDs must be generated lowercase, while input accepts any case. The main issues are risk on the "input" part and the extra annoyance of writing a custom `is_hex_digit` function. 
--- crates/typos/src/tokens.rs | 53 +++++++++++++++++++++++++++++++++++--- 1 file changed, 49 insertions(+), 4 deletions(-) diff --git a/crates/typos/src/tokens.rs b/crates/typos/src/tokens.rs index 941f95d..16943f9 100644 --- a/crates/typos/src/tokens.rs +++ b/crates/typos/src/tokens.rs @@ -130,6 +130,7 @@ mod parser { use nom::branch::*; use nom::bytes::complete::*; use nom::character::complete::*; + use nom::combinator::*; use nom::sequence::*; use nom::{AsChar, IResult}; @@ -140,6 +141,7 @@ mod parser { + nom::InputIter + nom::InputLength + nom::Slice> + + nom::Slice> + nom::Offset + Clone + PartialEq @@ -169,6 +171,7 @@ mod parser { + nom::InputIter + nom::InputLength + nom::Slice> + + nom::Slice> + nom::Offset + Clone + PartialEq @@ -178,6 +181,7 @@ mod parser { { take_many0(alt(( sep1, + terminated(uuid_literal, sep1), terminated(hex_literal, sep1), terminated(dec_literal, sep1), )))(input) @@ -196,7 +200,7 @@ mod parser { T: nom::InputTakeAtPosition, ::Item: AsChar + Copy, { - take_while1(is_dec_digit)(input) + take_while1(is_dec_digit_with_sep)(input) } fn hex_literal(input: T) -> IResult @@ -212,10 +216,36 @@ mod parser { { preceded( pair(char('0'), alt((char('x'), char('X')))), - take_while1(is_hex_digit), + take_while1(is_hex_digit_with_sep), )(input) } + fn uuid_literal(input: T) -> IResult + where + T: nom::InputTakeAtPosition + + nom::InputTake + + nom::InputIter + + nom::InputLength + + nom::Offset + + nom::Slice> + + nom::Slice> + + Clone, + ::Item: AsChar + Copy, + ::Item: AsChar + Copy, + { + recognize(tuple(( + take_while_m_n(8, 8, AsChar::is_hex_digit), + char('-'), + take_while_m_n(4, 4, AsChar::is_hex_digit), + char('-'), + take_while_m_n(4, 4, AsChar::is_hex_digit), + char('-'), + take_while_m_n(4, 4, AsChar::is_hex_digit), + char('-'), + take_while_m_n(12, 12, AsChar::is_hex_digit), + )))(input) + } + fn take_many0(mut f: F) -> impl FnMut(I) -> IResult where I: nom::Offset + nom::InputTake + Clone + PartialEq + std::fmt::Debug, @@ -249,11 
+279,11 @@ mod parser { } } - fn is_dec_digit(i: impl AsChar + Copy) -> bool { + fn is_dec_digit_with_sep(i: impl AsChar + Copy) -> bool { i.is_dec_digit() || is_digit_sep(i.as_char()) } - fn is_hex_digit(i: impl AsChar + Copy) -> bool { + fn is_hex_digit_with_sep(i: impl AsChar + Copy) -> bool { i.is_hex_digit() || is_digit_sep(i.as_char()) } @@ -646,6 +676,21 @@ mod test { assert_eq!(expected, actual); } + #[test] + fn tokenize_ignore_uuid() { + let parser = TokenizerBuilder::new().build(); + + let input = "Hello 123e4567-e89b-12d3-a456-426652340000 World"; + let expected: Vec = vec![ + Identifier::new_unchecked("Hello", Case::None, 0), + Identifier::new_unchecked("World", Case::None, 43), + ]; + let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect(); + assert_eq!(expected, actual); + let actual: Vec<_> = parser.parse_str(input).collect(); + assert_eq!(expected, actual); + } + #[test] fn tokenize_leading_digits() { let parser = TokenizerBuilder::new().build(); From 8566b31f7b21b7256b5eeb7807a9c03702be6027 Mon Sep 17 00:00:00 2001 From: Ed Page Date: Tue, 29 Jun 2021 12:13:18 -0500 Subject: [PATCH 3/9] fix(parser): Go ahead and do lower UUIDs I need this for hash support anyways --- crates/typos/src/tokens.rs | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/crates/typos/src/tokens.rs b/crates/typos/src/tokens.rs index 16943f9..43b1ace 100644 --- a/crates/typos/src/tokens.rs +++ b/crates/typos/src/tokens.rs @@ -234,15 +234,15 @@ mod parser { ::Item: AsChar + Copy, { recognize(tuple(( - take_while_m_n(8, 8, AsChar::is_hex_digit), + take_while_m_n(8, 8, is_lower_hex_digit), char('-'), - take_while_m_n(4, 4, AsChar::is_hex_digit), + take_while_m_n(4, 4, is_lower_hex_digit), char('-'), - take_while_m_n(4, 4, AsChar::is_hex_digit), + take_while_m_n(4, 4, is_lower_hex_digit), char('-'), - take_while_m_n(4, 4, AsChar::is_hex_digit), + take_while_m_n(4, 4, is_lower_hex_digit), char('-'), - take_while_m_n(12, 12, 
AsChar::is_hex_digit), + take_while_m_n(12, 12, is_lower_hex_digit), )))(input) } @@ -287,6 +287,11 @@ mod parser { i.is_hex_digit() || is_digit_sep(i.as_char()) } + fn is_lower_hex_digit(i: impl AsChar + Copy) -> bool { + let c = i.as_char(); + ('a'..='f').contains(&c) || ('0'..='9').contains(&c) + } + fn is_xid_continue(i: impl AsChar + Copy) -> bool { let c = i.as_char(); unicode_xid::UnicodeXID::is_xid_continue(c) From 23b6ad5796d39f96d5e5505164694cc9290ee91c Mon Sep 17 00:00:00 2001 From: Ed Page Date: Tue, 29 Jun 2021 12:17:34 -0500 Subject: [PATCH 4/9] feat(parser): Ignore SHA-1+ Fixes #270 --- crates/typos/src/tokens.rs | 39 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/crates/typos/src/tokens.rs b/crates/typos/src/tokens.rs index 43b1ace..1e8798b 100644 --- a/crates/typos/src/tokens.rs +++ b/crates/typos/src/tokens.rs @@ -182,6 +182,7 @@ mod parser { take_many0(alt(( sep1, terminated(uuid_literal, sep1), + terminated(hash_literal, sep1), terminated(hex_literal, sep1), terminated(dec_literal, sep1), )))(input) @@ -246,6 +247,29 @@ mod parser { )))(input) } + fn hash_literal(input: T) -> IResult + where + T: nom::InputTakeAtPosition + + nom::InputTake + + nom::InputIter + + nom::InputLength + + nom::Offset + + nom::Slice> + + nom::Slice> + + Clone, + ::Item: AsChar + Copy, + ::Item: AsChar + Copy, + { + // Size considerations: + // - sha-1 is git's original hash + // - sha-256 is git's new hash + // - Git hashes can be abbreviated but we need a good abbreviation that won't be mistaken + // for a variable name + const SHA_1_MAX: usize = 40; + const SHA_256_MAX: usize = 64; + take_while_m_n(SHA_1_MAX, SHA_256_MAX, is_lower_hex_digit)(input) + } + fn take_many0(mut f: F) -> impl FnMut(I) -> IResult where I: nom::Offset + nom::InputTake + Clone + PartialEq + std::fmt::Debug, @@ -696,6 +720,21 @@ mod test { assert_eq!(expected, actual); } + #[test] + fn tokenize_ignore_hash() { + let parser = TokenizerBuilder::new().build(); 
+ + let input = "Hello 485865fd0412e40d041e861506bb3ac11a3a91e3 World"; + let expected: Vec = vec![ + Identifier::new_unchecked("Hello", Case::None, 0), + Identifier::new_unchecked("World", Case::None, 47), + ]; + let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect(); + assert_eq!(expected, actual); + let actual: Vec<_> = parser.parse_str(input).collect(); + assert_eq!(expected, actual); + } + #[test] fn tokenize_leading_digits() { let parser = TokenizerBuilder::new().build(); From 2a1e6ca0f6d45173788adc0078e93aabae036d0f Mon Sep 17 00:00:00 2001 From: Ed Page Date: Tue, 29 Jun 2021 13:25:07 -0500 Subject: [PATCH 5/9] feat(parser): Ignore base64 For now, we hardcoded a min length of 90 bytes to ensure to avoid ambiguity with math operations on variables (generally people use whitespace anyways). Fixes #287 --- crates/typos/src/tokens.rs | 64 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/crates/typos/src/tokens.rs b/crates/typos/src/tokens.rs index 1e8798b..5795e19 100644 --- a/crates/typos/src/tokens.rs +++ b/crates/typos/src/tokens.rs @@ -185,6 +185,7 @@ mod parser { terminated(hash_literal, sep1), terminated(hex_literal, sep1), terminated(dec_literal, sep1), + terminated(base64_literal, sep1), )))(input) } @@ -270,6 +271,40 @@ mod parser { take_while_m_n(SHA_1_MAX, SHA_256_MAX, is_lower_hex_digit)(input) } + fn base64_literal(input: T) -> IResult + where + T: nom::InputTakeAtPosition + + nom::InputTake + + nom::InputIter + + nom::InputLength + + nom::Offset + + nom::Slice> + + nom::Slice> + + std::fmt::Debug + + Clone, + ::Item: AsChar + Copy, + ::Item: AsChar + Copy, + { + let (padding, captured) = take_while1(is_base64_digit)(input.clone())?; + if captured.input_len() < 90 { + return Err(nom::Err::Error(nom::error::Error::new( + input, + nom::error::ErrorKind::LengthValue, + ))); + } + + const CHUNK: usize = 4; + let padding_offset = input.offset(&padding); + let mut padding_len = CHUNK - padding_offset % 
CHUNK; + if padding_len == CHUNK { + padding_len = 0; + } + + let (after, _) = take_while_m_n(padding_len, padding_len, is_base64_padding)(padding)?; + let after_offset = input.offset(&after); + Ok(input.take_split(after_offset)) + } + fn take_many0(mut f: F) -> impl FnMut(I) -> IResult where I: nom::Offset + nom::InputTake + Clone + PartialEq + std::fmt::Debug, @@ -316,6 +351,20 @@ mod parser { ('a'..='f').contains(&c) || ('0'..='9').contains(&c) } + fn is_base64_digit(i: impl AsChar + Copy) -> bool { + let c = i.as_char(); + ('a'..='z').contains(&c) + || ('A'..='Z').contains(&c) + || ('0'..='9').contains(&c) + || c == '+' + || c == '/' + } + + fn is_base64_padding(i: impl AsChar + Copy) -> bool { + let c = i.as_char(); + c == '=' + } + fn is_xid_continue(i: impl AsChar + Copy) -> bool { let c = i.as_char(); unicode_xid::UnicodeXID::is_xid_continue(c) @@ -735,6 +784,21 @@ mod test { assert_eq!(expected, actual); } + #[test] + fn tokenize_ignore_base64() { + let parser = TokenizerBuilder::new().build(); + + let input = "Good Iy9+btvut+d92V+v84444ziIqJKHK879KJH59//X1Iy9+btvut+d92V+v84444ziIqJKHK879KJH59//X122Iy9+btvut+d92V+v84444ziIqJKHK879KJH59//X12== Bye"; + let expected: Vec = vec![ + Identifier::new_unchecked("Good", Case::None, 0), + Identifier::new_unchecked("Bye", Case::None, 134), + ]; + let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect(); + assert_eq!(expected, actual); + let actual: Vec<_> = parser.parse_str(input).collect(); + assert_eq!(expected, actual); + } + #[test] fn tokenize_leading_digits() { let parser = TokenizerBuilder::new().build(); From 6915d85c0b43ada285ce55d70b2e4446d1679e6e Mon Sep 17 00:00:00 2001 From: Ed Page Date: Tue, 29 Jun 2021 13:42:25 -0500 Subject: [PATCH 6/9] feat(parser): Ignore emails This skips a lot of validation for being "good enough" (comment open/closes matching, etc). 
This has a chance of incorrectly matching in languages with `@` as an operator, like Python, but Python encourages spaces arround operators, so hopefully this won't be a problem. --- crates/typos/src/tokens.rs | 61 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/crates/typos/src/tokens.rs b/crates/typos/src/tokens.rs index 5795e19..02c5429 100644 --- a/crates/typos/src/tokens.rs +++ b/crates/typos/src/tokens.rs @@ -186,6 +186,7 @@ mod parser { terminated(hex_literal, sep1), terminated(dec_literal, sep1), terminated(base64_literal, sep1), + terminated(email_literal, sep1), )))(input) } @@ -305,6 +306,27 @@ mod parser { Ok(input.take_split(after_offset)) } + fn email_literal(input: T) -> IResult + where + T: nom::InputTakeAtPosition + + nom::InputTake + + nom::InputIter + + nom::InputLength + + nom::Offset + + nom::Slice> + + nom::Slice> + + std::fmt::Debug + + Clone, + ::Item: AsChar + Copy, + ::Item: AsChar + Copy, + { + recognize(tuple(( + take_while1(is_email_localport_char), + char('@'), + take_while1(is_email_domain_char), + )))(input) + } + fn take_many0(mut f: F) -> impl FnMut(I) -> IResult where I: nom::Offset + nom::InputTake + Clone + PartialEq + std::fmt::Debug, @@ -338,19 +360,23 @@ mod parser { } } + #[inline] fn is_dec_digit_with_sep(i: impl AsChar + Copy) -> bool { i.is_dec_digit() || is_digit_sep(i.as_char()) } + #[inline] fn is_hex_digit_with_sep(i: impl AsChar + Copy) -> bool { i.is_hex_digit() || is_digit_sep(i.as_char()) } + #[inline] fn is_lower_hex_digit(i: impl AsChar + Copy) -> bool { let c = i.as_char(); ('a'..='f').contains(&c) || ('0'..='9').contains(&c) } + #[inline] fn is_base64_digit(i: impl AsChar + Copy) -> bool { let c = i.as_char(); ('a'..='z').contains(&c) @@ -360,11 +386,31 @@ mod parser { || c == '/' } + #[inline] fn is_base64_padding(i: impl AsChar + Copy) -> bool { let c = i.as_char(); c == '=' } + #[inline] + fn is_email_localport_char(i: impl AsChar + Copy) -> bool { + let c = i.as_char(); 
+ ('a'..='z').contains(&c) + || ('A'..='Z').contains(&c) + || ('0'..='9').contains(&c) + || "!#$%&'*+-/=?^_`{|}~().".find(c).is_some() + } + + #[inline] + fn is_email_domain_char(i: impl AsChar + Copy) -> bool { + let c = i.as_char(); + ('a'..='z').contains(&c) + || ('A'..='Z').contains(&c) + || ('0'..='9').contains(&c) + || "-().".find(c).is_some() + } + + #[inline] fn is_xid_continue(i: impl AsChar + Copy) -> bool { let c = i.as_char(); unicode_xid::UnicodeXID::is_xid_continue(c) @@ -799,6 +845,21 @@ mod test { assert_eq!(expected, actual); } + #[test] + fn tokenize_ignore_email() { + let parser = TokenizerBuilder::new().build(); + + let input = "Good example@example.com Bye"; + let expected: Vec = vec![ + Identifier::new_unchecked("Good", Case::None, 0), + Identifier::new_unchecked("Bye", Case::None, 25), + ]; + let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect(); + assert_eq!(expected, actual); + let actual: Vec<_> = parser.parse_str(input).collect(); + assert_eq!(expected, actual); + } + #[test] fn tokenize_leading_digits() { let parser = TokenizerBuilder::new().build(); From b673b81146720c37a1dfeca9b7018e82544ce242 Mon Sep 17 00:00:00 2001 From: Ed Page Date: Tue, 29 Jun 2021 13:55:46 -0500 Subject: [PATCH 7/9] fix(parser): Ensure we get full base64 We greedily matched separators, including ones that might be part of base64. This impacts the length calculation, so we want as much as possible. 
--- crates/typos/src/tokens.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/typos/src/tokens.rs b/crates/typos/src/tokens.rs index 02c5429..85d0c31 100644 --- a/crates/typos/src/tokens.rs +++ b/crates/typos/src/tokens.rs @@ -180,13 +180,13 @@ mod parser { ::Item: AsChar + Copy, { take_many0(alt(( - sep1, terminated(uuid_literal, sep1), terminated(hash_literal, sep1), terminated(hex_literal, sep1), terminated(dec_literal, sep1), terminated(base64_literal, sep1), terminated(email_literal, sep1), + sep1, )))(input) } From c83f6551091ab40298eb58615b5e8750c6276f1d Mon Sep 17 00:00:00 2001 From: Ed Page Date: Tue, 29 Jun 2021 14:14:58 -0500 Subject: [PATCH 8/9] feat(parser): Ignore URLs Fixes #288 --- crates/typos/src/tokens.rs | 77 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 73 insertions(+), 4 deletions(-) diff --git a/crates/typos/src/tokens.rs b/crates/typos/src/tokens.rs index 85d0c31..cdbaea5 100644 --- a/crates/typos/src/tokens.rs +++ b/crates/typos/src/tokens.rs @@ -186,6 +186,7 @@ mod parser { terminated(dec_literal, sep1), terminated(base64_literal, sep1), terminated(email_literal, sep1), + terminated(url_literal, sep1), sep1, )))(input) } @@ -321,9 +322,41 @@ mod parser { ::Item: AsChar + Copy, { recognize(tuple(( - take_while1(is_email_localport_char), + take_while1(is_localport_char), char('@'), - take_while1(is_email_domain_char), + take_while1(is_domain_char), + )))(input) + } + + fn url_literal(input: T) -> IResult + where + T: nom::InputTakeAtPosition + + nom::InputTake + + nom::InputIter + + nom::InputLength + + nom::Offset + + nom::Slice> + + nom::Slice> + + std::fmt::Debug + + Clone, + ::Item: AsChar + Copy, + ::Item: AsChar + Copy, + { + recognize(tuple(( + opt(terminated( + take_while1(is_scheme_char), + // HACK: Technically you can skip `//` if you don't have a domain but that would + // get messy to support. 
+ tuple((char(':'), char('/'), char('/'))), + )), + tuple(( + opt(terminated(take_while1(is_localport_char), char('@'))), + take_while1(is_domain_char), + opt(preceded(char(':'), take_while1(AsChar::is_dec_digit))), + )), + char('/'), + // HACK: Too lazy to enumerate + take_while(is_localport_char), )))(input) } @@ -393,7 +426,7 @@ mod parser { } #[inline] - fn is_email_localport_char(i: impl AsChar + Copy) -> bool { + fn is_localport_char(i: impl AsChar + Copy) -> bool { let c = i.as_char(); ('a'..='z').contains(&c) || ('A'..='Z').contains(&c) @@ -402,7 +435,7 @@ mod parser { } #[inline] - fn is_email_domain_char(i: impl AsChar + Copy) -> bool { + fn is_domain_char(i: impl AsChar + Copy) -> bool { let c = i.as_char(); ('a'..='z').contains(&c) || ('A'..='Z').contains(&c) @@ -410,6 +443,12 @@ mod parser { || "-().".find(c).is_some() } + #[inline] + fn is_scheme_char(i: impl AsChar + Copy) -> bool { + let c = i.as_char(); + ('a'..='z').contains(&c) || ('0'..='9').contains(&c) || "+.-".find(c).is_some() + } + #[inline] fn is_xid_continue(i: impl AsChar + Copy) -> bool { let c = i.as_char(); @@ -860,6 +899,36 @@ mod test { assert_eq!(expected, actual); } + #[test] + fn tokenize_ignore_min_url() { + let parser = TokenizerBuilder::new().build(); + + let input = "Good example.com/hello Bye"; + let expected: Vec = vec![ + Identifier::new_unchecked("Good", Case::None, 0), + Identifier::new_unchecked("Bye", Case::None, 23), + ]; + let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect(); + assert_eq!(expected, actual); + let actual: Vec<_> = parser.parse_str(input).collect(); + assert_eq!(expected, actual); + } + + #[test] + fn tokenize_ignore_max_url() { + let parser = TokenizerBuilder::new().build(); + + let input = "Good http://user@example.com:3142/hello?query=value&extra=two#fragment Bye"; + let expected: Vec = vec![ + Identifier::new_unchecked("Good", Case::None, 0), + Identifier::new_unchecked("Bye", Case::None, 71), + ]; + let actual: Vec<_> = 
parser.parse_bytes(input.as_bytes()).collect(); + assert_eq!(expected, actual); + let actual: Vec<_> = parser.parse_str(input).collect(); + assert_eq!(expected, actual); + } + #[test] fn tokenize_leading_digits() { let parser = TokenizerBuilder::new().build(); From 9a0d7548627ce3bdf17972396f965b754af61411 Mon Sep 17 00:00:00 2001 From: Ed Page Date: Tue, 29 Jun 2021 14:43:05 -0500 Subject: [PATCH 9/9] docs(parser): Note new features --- CHANGELOG.md | 20 ++++++++++++++++++++ docs/comparison.md | 5 +++++ 2 files changed, 25 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a74cb95..af102e3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,26 @@ and this project adheres to [Semantic Versioning](http://semver.org/). ## [Unreleased] - ReleaseDate +#### Change of Behavior + +- `ignore-hex` and `identifier-leading-digit` are deprecated and `typos` acts as + if `ignore-hex=true` and `identifier-leading-digit=false`. + +#### Features + +- Automatically ignore + - UUIDs + - SHAs + - base64 encoded data (must be at least 90 bytes) + - emails + - URLs + +#### Performance + +- Due to new literal detection, finding identifiers is takes 10x longer. + Combined with word splitting, its only takes 3x longer. The majority of the + time is spent in dictionary lookups, so we don't expect this to have too much impact in the end. + ## [1.0.10] - 2021-06-28 #### Bug Fixes diff --git a/docs/comparison.md b/docs/comparison.md index 952864f..21898f1 100644 --- a/docs/comparison.md +++ b/docs/comparison.md @@ -8,7 +8,12 @@ | Per-Lang Dict | Yes | ? | No | Yes | | CamelCase | Yes | ? | No | Yes | | snake_case | Yes | ? | No | Yes | +| Ignore email | Yes | yes | No | No | +| Ignore url | Yes | yes | No | No | | Ignore Hex | Yes | ? | No | Yes | +| Ignore UUID | Yes | ? | No | No | +| Ignore base64 | Yes | ? | No | No | +| Ignore SHAs | Yes | ? | No | No | | C-Escapes | No ([#20][def-3]) | ? | No | Yes | | Encodings | UTF-8 / UTF-16 | ? 
| Auto | Auto | | Whole-project | Yes | Yes | Yes | No |