diff --git a/Cargo.lock b/Cargo.lock index 6630c4e..e624421 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,9 +2,9 @@ # It is not intended for manual editing. [[package]] name = "addr2line" -version = "0.14.0" +version = "0.14.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c0929d69e78dd9bf5408269919fcbcaeb2e35e5d43e5815517cdc6a8e11a423" +checksum = "a55f82cfe485775d02112886f4169bde0c5894d75e79ead7eafe7e40a25e45f7" dependencies = [ "gimli", ] @@ -17,9 +17,9 @@ checksum = "ee2a4ec343196209d6594e19543ae87a39f96d5534d7174822a3ad825dd6ed7e" [[package]] name = "ahash" -version = "0.6.1" +version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "865f8b0b3fced577b7df82e9b0eb7609595d7209c0b39e78d0646672e244b1b1" +checksum = "a75b7e6a93ecd6dbd2c225154d0fa7f86205574ecaa6c87429fb5f66ee677c44" dependencies = [ "getrandom 0.2.0", "lazy_static", @@ -46,9 +46,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.34" +version = "1.0.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf8dcb5b4bbaa28653b647d8c77bd4ed40183b48882e130c1f1ffb73de069fd7" +checksum = "ee67c11feeac938fae061b232e38e0b6d94f97a9df10e6271319325ac4c56a86" [[package]] name = "arrayvec" @@ -207,12 +207,6 @@ dependencies = [ "unicase", ] -[[package]] -name = "const_fn" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c478836e029dcef17fb47c89023448c64f781a046e0300e257ad8225ae59afab" - [[package]] name = "content_inspector" version = "0.2.4" @@ -224,13 +218,12 @@ dependencies = [ [[package]] name = "crossbeam-utils" -version = "0.8.0" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec91540d98355f690a86367e566ecad2e9e579f230230eb7c21398372be73ea5" +checksum = "02d96d1e189ef58269ebe5b97953da3274d83a93af647c2ddd6f9dab28cedb8d" dependencies = [ "autocfg", "cfg-if 1.0.0", - "const_fn", "lazy_static", ] @@ -275,9 +268,9 @@ dependencies = [ "fnv", "ident_case", "proc-macro2 1.0.24", - "quote 1.0.7", + "quote 1.0.8", "strsim 0.9.3", - "syn 1.0.50", + "syn 1.0.57", ] [[package]] @@ -287,8 +280,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9b5a2f4ac4969822c62224815d069952656cadc7084fdca9751e6d959189b72" dependencies = [ "darling_core", - "quote 1.0.7", - "syn 1.0.50", + "quote 1.0.8", + "syn 1.0.57", ] [[package]] @@ -312,8 +305,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "41cb0e6161ad61ed084a36ba71fbba9e3ac5aee3606fb607fe08da6acbcf3d8c" dependencies = [ "proc-macro2 1.0.24", - "quote 1.0.7", - "syn 1.0.50", + "quote 1.0.8", + "syn 1.0.57", ] [[package]] @@ -324,8 +317,8 @@ checksum = "6604612c19dd3bb353650b715b61f09bcb089dd17bdca1a9a42637079bf5e428" dependencies = [ "darling", "proc-macro2 1.0.24", - "quote 1.0.7", - "syn 1.0.50", + "quote 1.0.8", + "syn 1.0.57", ] [[package]] @@ -374,8 +367,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "946ee94e3dbf58fdd324f9ce245c7b238d46a66f00e86a020b71996349e46cce" dependencies = [ "proc-macro2 1.0.24", - "quote 1.0.7", - "syn 1.0.50", + "quote 1.0.8", + "syn 1.0.57", ] [[package]] @@ -421,17 +414,17 @@ checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" [[package]] name = "funty" -version = "1.0.1" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ba62103ce691c2fd80fbae2213dfdda9ce60804973ac6b6e97de818ea7f52c8" +checksum = 
"fed34cd105917e91daa4da6b3728c47b068749d6a62c59811f06ed2ac71d9da7" [[package]] name = "getrandom" -version = "0.1.15" +version = "0.1.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc587bc0ec293155d5bfa6b9891ec18a1e330c234f896ea47fbada4cadbe47e6" +checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce" dependencies = [ - "cfg-if 0.1.10", + "cfg-if 1.0.0", "libc", "wasi", ] @@ -478,9 +471,9 @@ dependencies = [ [[package]] name = "heck" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20564e78d53d2bb135c343b3f47714a56af2061f1c928fdb541dc7b9fdd94205" +checksum = "87cbf45460356b7deeb5e3415b5563308c0a9b057c85e12b06ad551f98d0a6ac" dependencies = [ "unicode-segmentation", ] @@ -559,9 +552,9 @@ dependencies = [ [[package]] name = "itoa" -version = "0.4.6" +version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc6f3ad7b9d11a0c00842ff8de1b60ee58661048eb8049ed33c73594f359d7e6" +checksum = "dd25036021b0de88a0aff6b850051563c6516d0bf53f8638938edbb9de732736" [[package]] name = "lazy_static" @@ -584,9 +577,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.80" +version = "0.2.81" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4d58d1b70b004888f764dfbf6a26a3b0342a1632d33968e4a179d8011c760614" +checksum = "1482821306169ec4d07f6aca392a4681f66c75c9918aa49641a2595db64053cb" [[package]] name = "log" @@ -730,9 +723,9 @@ checksum = "ac74c624d6b2d21f425f752262f42188365d7b8ff1aff74c82e45136510a4857" [[package]] name = "predicates" -version = "1.0.5" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96bfead12e90dccead362d62bb2c90a5f6fc4584963645bc7f71a735e0b0735a" +checksum = "73dd9b7b200044694dfede9edf907c1ca19630908443e9447e624993700c6932" dependencies = [ "difference", "float-cmp", @@ -743,15 +736,15 @@ dependencies = [ [[package]] name = "predicates-core" -version = "1.0.0" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06075c3a3e92559ff8929e7a280684489ea27fe44805174c3ebd9328dcb37178" +checksum = "fb3dbeaaf793584e29c58c7e3a82bbb3c7c06b63cea68d13b0e3cddc124104dc" [[package]] name = "predicates-tree" -version = "1.0.0" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e63c4859013b38a76eca2414c64911fba30def9e3202ac461a2d22831220124" +checksum = "aee95d988ee893cb35c06b148c80ed2cd52c8eea927f50ba7a0be1a786aeab73" dependencies = [ "predicates-core", "treeline", @@ -771,8 +764,8 @@ checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" dependencies = [ "proc-macro-error-attr", "proc-macro2 1.0.24", - "quote 1.0.7", - "syn 1.0.50", + "quote 1.0.8", + "syn 1.0.57", "version_check", ] @@ -783,7 +776,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" dependencies = [ "proc-macro2 1.0.24", - "quote 1.0.7", + "quote 1.0.8", "version_check", ] @@ -822,9 +815,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.7" +version = "1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa563d17ecb180e500da1cfd2b028310ac758de548efdd203e18f283af693f37" +checksum = "991431c3519a3f36861882da93630ce66b52918dcf1b8e2fd66b397fc96f28df" dependencies = [ "proc-macro2 1.0.24", ] @@ -841,7 +834,7 @@ version = "0.7.3" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" dependencies = [ - "getrandom 0.1.15", + "getrandom 0.1.16", "libc", "rand_chacha", "rand_core", @@ -865,7 +858,7 @@ version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" dependencies = [ - "getrandom 0.1.15", + "getrandom 0.1.16", ] [[package]] @@ -975,29 +968,29 @@ checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" [[package]] name = "serde" -version = "1.0.117" +version = "1.0.118" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b88fa983de7720629c9387e9f517353ed404164b1e482c970a90c1a4aaf7dc1a" +checksum = "06c64263859d87aa2eb554587e2d23183398d617427327cf2b3d0ed8c69e4800" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.117" +version = "1.0.118" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cbd1ae72adb44aab48f325a02444a5fc079349a8d804c1fc922aed3f7454c74e" +checksum = "c84d3526699cd55261af4b941e4e725444df67aa4f9e6a3564f18030d12672df" dependencies = [ "proc-macro2 1.0.24", - "quote 1.0.7", - "syn 1.0.50", + "quote 1.0.8", + "syn 1.0.57", ] [[package]] name = "serde_json" -version = "1.0.59" +version = "1.0.61" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dcac07dbffa1c65e7f816ab9eba78eb142c6d44410f4eeba1e26e4f5dfa56b95" +checksum = "4fceb2595057b6891a4ee808f70054bd2d12f0e97f1cbb78689b59f676df325a" dependencies = [ "itoa", "ryu", @@ -1048,8 +1041,8 @@ dependencies = [ "heck", "proc-macro-error", "proc-macro2 1.0.24", - "quote 1.0.7", - "syn 1.0.50", + "quote 1.0.8", + "syn 1.0.57", ] [[package]] @@ -1065,12 +1058,12 @@ dependencies = [ [[package]] name = "syn" -version = "1.0.50" +version = "1.0.57" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "443b4178719c5a851e1bde36ce12da21d74a0e60b4d982ec3385a933c812f0f6" +checksum = "4211ce9909eb971f111059df92c45640aad50a619cf55cd76476be803c4c68e6" dependencies = [ "proc-macro2 1.0.24", - "quote 1.0.7", + "quote 1.0.8", "unicode-xid 0.2.1", ] @@ -1114,22 +1107,22 @@ dependencies = [ [[package]] name = "thiserror" -version = "1.0.22" +version = "1.0.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e9ae34b84616eedaaf1e9dd6026dbe00dcafa92aa0c8077cb69df1fcfe5e53e" +checksum = "76cc616c6abf8c8928e2fdcc0dbfab37175edd8fb49a4641066ad1364fdab146" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.22" +version = "1.0.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ba20f23e85b10754cd195504aebf6a27e2e6cbe28c17778a0c930724628dd56" +checksum = "9be73a2caec27583d0046ef3796c3794f868a5bc813db689eed00c7631275cd1" dependencies = [ "proc-macro2 1.0.24", - "quote 1.0.7", - "syn 1.0.50", + "quote 1.0.8", + "syn 1.0.57", ] [[package]] @@ -1143,9 +1136,9 @@ dependencies = [ [[package]] name = "toml" -version = "0.5.7" +version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75cf45bb0bef80604d001caaec0d09da99611b3c0fd39d3080468875cdb65645" +checksum = "a31142970826733df8241ef35dc040ef98c679ab14d7c3e54d827099b3acecaa" dependencies = [ "serde", ] @@ -1161,16 +1154,11 @@ name = "typos" version = "0.3.0" dependencies = [ "anyhow", - "bstr", - "content_inspector", - "derive_more 0.99.11", - "derive_setters", 
"itertools", "log", "once_cell", "regex", "serde", - "serde_json", "thiserror", "unicode-segmentation", ] @@ -1185,21 +1173,27 @@ dependencies = [ "bstr", "clap", "clap-verbosity-flag", + "content_inspector", + "derive_more 0.99.11", + "derive_setters", "difflib", "env_logger 0.8.2", "human-panic", "ignore", + "itertools", "log", "phf", "predicates", "proc-exit", "serde", + "serde_json", "structopt", "toml", "typos", "typos-dict", "typos-vars", "unicase", + "unicode-segmentation", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index bd427a7..32a6f22 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -50,6 +50,12 @@ ahash = "0.6.1" difflib = "0.4" proc-exit = "1.0" human-panic = "1.0.3" +content_inspector = "0.2.4" +unicode-segmentation = "1.6.0" +derive_more = "0.99.11" +derive_setters = "0.1" +itertools = "0.9" +serde_json = "1.0" [dev-dependencies] assert_fs = "1.0" diff --git a/benches/checks.rs b/benches/checks.rs index 8f8695e..fbf1f42 100644 --- a/benches/checks.rs +++ b/benches/checks.rs @@ -5,131 +5,180 @@ extern crate test; mod data; use assert_fs::prelude::*; -use typos::checks::Check; +use typos_cli::checks::FileChecker; + +fn bench_files(data: &str, b: &mut test::Bencher) { + let temp = assert_fs::TempDir::new().unwrap(); + let sample_path = temp.child("sample"); + sample_path.write_str(data).unwrap(); -fn bench_parse_ident_str(data: &str, b: &mut test::Bencher) { let corrections = typos_cli::dict::BuiltIn::new(Default::default()); - let parser = typos::tokens::Parser::new(); - let checks = typos::checks::TyposSettings::new().build_identifier_parser(); - b.iter(|| checks.check_str(data, &parser, &corrections, &typos::report::PrintSilent)); -} - -#[bench] -fn parse_idents_empty_str(b: &mut test::Bencher) { - bench_parse_ident_str(data::EMPTY, b); -} - -#[bench] -fn parse_idents_no_tokens_str(b: &mut test::Bencher) { - bench_parse_ident_str(data::NO_TOKENS, b); -} - -#[bench] -fn parse_idents_single_token_str(b: &mut test::Bencher) { - bench_parse_ident_str(data::SINGLE_TOKEN, b); -} - -#[bench] -fn parse_idents_sherlock_str(b: &mut test::Bencher) { - bench_parse_ident_str(data::SHERLOCK, b); -} - -#[bench] -fn parse_idents_code_str(b: &mut test::Bencher) { - bench_parse_ident_str(data::CODE, b); -} - -#[bench] -fn parse_idents_corpus_str(b: &mut test::Bencher) { - bench_parse_ident_str(data::CORPUS, b); -} - -fn bench_parse_ident_bytes(data: &str, b: &mut test::Bencher) { - let corrections = typos_cli::dict::BuiltIn::new(Default::default()); - let parser = typos::tokens::Parser::new(); - let checks = typos::checks::TyposSettings::new().build_identifier_parser(); + let parser = typos::tokens::Tokenizer::new(); + let checks = typos_cli::checks::TyposSettings::new().build_files(); b.iter(|| { - checks.check_bytes( - data.as_bytes(), + checks.check_file( + sample_path.path(), + true, &parser, &corrections, - &typos::report::PrintSilent, + &typos_cli::report::PrintSilent, ) }); + + temp.close().unwrap(); } #[bench] -fn parse_idents_empty_bytes(b: &mut test::Bencher) { - bench_parse_ident_bytes(data::EMPTY, b); +fn files_empty(b: &mut test::Bencher) { + bench_files(data::EMPTY, b); } #[bench] -fn parse_idents_no_tokens_bytes(b: &mut test::Bencher) { - bench_parse_ident_bytes(data::NO_TOKENS, b); +fn files_no_tokens(b: &mut test::Bencher) { + bench_files(data::NO_TOKENS, b); } #[bench] -fn parse_idents_single_token_bytes(b: &mut test::Bencher) { - bench_parse_ident_bytes(data::SINGLE_TOKEN, b); +fn files_single_token(b: &mut test::Bencher) { + bench_files(data::SINGLE_TOKEN, b); } 
#[bench] -fn parse_idents_sherlock_bytes(b: &mut test::Bencher) { - bench_parse_ident_bytes(data::SHERLOCK, b); +fn files_sherlock(b: &mut test::Bencher) { + bench_files(data::SHERLOCK, b); } #[bench] -fn parse_idents_code_bytes(b: &mut test::Bencher) { - bench_parse_ident_bytes(data::CODE, b); +fn files_code(b: &mut test::Bencher) { + bench_files(data::CODE, b); } #[bench] -fn parse_idents_corpus_bytes(b: &mut test::Bencher) { - bench_parse_ident_bytes(data::CORPUS, b); +fn files_corpus(b: &mut test::Bencher) { + bench_files(data::CORPUS, b); } -fn bench_parse_word_str(data: &str, b: &mut test::Bencher) { +fn bench_identifiers(data: &str, b: &mut test::Bencher) { + let temp = assert_fs::TempDir::new().unwrap(); + let sample_path = temp.child("sample"); + sample_path.write_str(data).unwrap(); + let corrections = typos_cli::dict::BuiltIn::new(Default::default()); - let parser = typos::tokens::Parser::new(); - let checks = typos::checks::TyposSettings::new().build_word_parser(); - b.iter(|| checks.check_str(data, &parser, &corrections, &typos::report::PrintSilent)); + let parser = typos::tokens::Tokenizer::new(); + let checks = typos_cli::checks::TyposSettings::new().build_identifier_parser(); + b.iter(|| { + checks.check_file( + sample_path.path(), + true, + &parser, + &corrections, + &typos_cli::report::PrintSilent, + ) + }); + + temp.close().unwrap(); } #[bench] -fn parse_words_empty(b: &mut test::Bencher) { - bench_parse_word_str(data::EMPTY, b); +fn identifiers_empty(b: &mut test::Bencher) { + bench_identifiers(data::EMPTY, b); } #[bench] -fn parse_words_no_tokens(b: &mut test::Bencher) { - bench_parse_word_str(data::NO_TOKENS, b); +fn identifiers_no_tokens(b: &mut test::Bencher) { + bench_identifiers(data::NO_TOKENS, b); } #[bench] -fn parse_words_single_token(b: &mut test::Bencher) { - bench_parse_word_str(data::SINGLE_TOKEN, b); +fn identifiers_single_token(b: &mut test::Bencher) { + bench_identifiers(data::SINGLE_TOKEN, b); } #[bench] -fn parse_words_sherlock(b: &mut test::Bencher) { - bench_parse_word_str(data::SHERLOCK, b); +fn identifiers_sherlock(b: &mut test::Bencher) { + bench_identifiers(data::SHERLOCK, b); } #[bench] -fn parse_words_code(b: &mut test::Bencher) { - bench_parse_word_str(data::CODE, b); +fn identifiers_code(b: &mut test::Bencher) { + bench_identifiers(data::CODE, b); } #[bench] -fn parse_words_corpus(b: &mut test::Bencher) { - bench_parse_word_str(data::CORPUS, b); +fn identifiers_corpus(b: &mut test::Bencher) { + bench_identifiers(data::CORPUS, b); +} + +fn bench_words(data: &str, b: &mut test::Bencher) { + let temp = assert_fs::TempDir::new().unwrap(); + let sample_path = temp.child("sample"); + sample_path.write_str(data).unwrap(); + + let corrections = typos_cli::dict::BuiltIn::new(Default::default()); + let parser = typos::tokens::Tokenizer::new(); + let checks = typos_cli::checks::TyposSettings::new().build_word_parser(); + b.iter(|| { + checks.check_file( + sample_path.path(), + true, + &parser, + &corrections, + &typos_cli::report::PrintSilent, + ) + }); + + temp.close().unwrap(); +} + +#[bench] +fn words_empty(b: &mut test::Bencher) { + bench_words(data::EMPTY, b); +} + +#[bench] +fn words_no_tokens(b: &mut test::Bencher) { + bench_words(data::NO_TOKENS, b); +} + +#[bench] +fn words_single_token(b: &mut test::Bencher) { + bench_words(data::SINGLE_TOKEN, b); +} + +#[bench] +fn words_sherlock(b: &mut test::Bencher) { + bench_words(data::SHERLOCK, b); +} + +#[bench] +fn words_code(b: &mut test::Bencher) { + bench_words(data::CODE, b); +} + +#[bench] +fn 
words_corpus(b: &mut test::Bencher) { + bench_words(data::CORPUS, b); } fn bench_typos(data: &str, b: &mut test::Bencher) { + let temp = assert_fs::TempDir::new().unwrap(); + let sample_path = temp.child("sample"); + sample_path.write_str(data).unwrap(); + let corrections = typos_cli::dict::BuiltIn::new(Default::default()); - let parser = typos::tokens::Parser::new(); - let checks = typos::checks::TyposSettings::new().build_typos(); - b.iter(|| checks.check_str(data, &parser, &corrections, &typos::report::PrintSilent)); + let parser = typos::tokens::Tokenizer::new(); + let checks = typos_cli::checks::TyposSettings::new().build_typos(); + b.iter(|| { + checks.check_file( + sample_path.path(), + true, + &parser, + &corrections, + &typos_cli::report::PrintSilent, + ) + }); + + temp.close().unwrap(); } #[bench] @@ -161,54 +210,3 @@ fn typos_code(b: &mut test::Bencher) { fn typos_corpus(b: &mut test::Bencher) { bench_typos(data::CORPUS, b); } - -fn bench_check_file(data: &str, b: &mut test::Bencher) { - let temp = assert_fs::TempDir::new().unwrap(); - let sample_path = temp.child("sample"); - sample_path.write_str(data).unwrap(); - - let corrections = typos_cli::dict::BuiltIn::new(Default::default()); - let parser = typos::tokens::Parser::new(); - let checks = typos::checks::TyposSettings::new().build_typos(); - b.iter(|| { - checks.check_file( - sample_path.path(), - true, - &parser, - &corrections, - &typos::report::PrintSilent, - ) - }); - - temp.close().unwrap(); -} - -#[bench] -fn check_file_empty(b: &mut test::Bencher) { - bench_check_file(data::EMPTY, b); -} - -#[bench] -fn check_file_no_tokens(b: &mut test::Bencher) { - bench_check_file(data::NO_TOKENS, b); -} - -#[bench] -fn check_file_single_token(b: &mut test::Bencher) { - bench_check_file(data::SINGLE_TOKEN, b); -} - -#[bench] -fn check_file_sherlock(b: &mut test::Bencher) { - bench_check_file(data::SHERLOCK, b); -} - -#[bench] -fn check_file_code(b: &mut test::Bencher) { - bench_check_file(data::CODE, b); -} - -#[bench] -fn check_file_corpus(b: &mut test::Bencher) { - bench_check_file(data::CORPUS, b); -} diff --git a/benches/tokenize.rs b/benches/tokenize.rs index 32e6a74..efcce0b 100644 --- a/benches/tokenize.rs +++ b/benches/tokenize.rs @@ -6,19 +6,19 @@ mod data; #[bench] fn ident_parse_empty(b: &mut test::Bencher) { - let parser = typos::tokens::Parser::new(); + let parser = typos::tokens::Tokenizer::new(); b.iter(|| parser.parse_bytes(data::EMPTY.as_bytes()).last()); } #[bench] fn ident_parse_no_tokens(b: &mut test::Bencher) { - let parser = typos::tokens::Parser::new(); + let parser = typos::tokens::Tokenizer::new(); b.iter(|| parser.parse_bytes(data::NO_TOKENS.as_bytes()).last()); } #[bench] fn ident_parse_single_token(b: &mut test::Bencher) { - let parser = typos::tokens::Parser::new(); + let parser = typos::tokens::Tokenizer::new(); b.iter(|| { parser.parse_bytes(data::SINGLE_TOKEN.as_bytes()).last(); }); @@ -26,19 +26,19 @@ fn ident_parse_single_token(b: &mut test::Bencher) { #[bench] fn ident_parse_sherlock(b: &mut test::Bencher) { - let parser = typos::tokens::Parser::new(); + let parser = typos::tokens::Tokenizer::new(); b.iter(|| parser.parse_bytes(data::SHERLOCK.as_bytes()).last()); } #[bench] fn ident_parse_code(b: &mut test::Bencher) { - let parser = typos::tokens::Parser::new(); + let parser = typos::tokens::Tokenizer::new(); b.iter(|| parser.parse_bytes(data::CODE.as_bytes()).last()); } #[bench] fn ident_parse_corpus(b: &mut test::Bencher) { - let parser = typos::tokens::Parser::new(); + let parser = 
typos::tokens::Tokenizer::new(); b.iter(|| parser.parse_bytes(data::CORPUS.as_bytes()).last()); } diff --git a/crates/typos/Cargo.toml b/crates/typos/Cargo.toml index 15a317e..a128a7c 100644 --- a/crates/typos/Cargo.toml +++ b/crates/typos/Cargo.toml @@ -20,11 +20,6 @@ thiserror = "1.0" regex = "1.3" once_cell = "1.2.0" serde = { version = "1.0", features = ["derive"] } -serde_json = "1.0" itertools = "0.9" -bstr = "0.2" log = "0.4" unicode-segmentation = "1.7.1" -derive_more = "0.99.11" -derive_setters = "0.1" -content_inspector = "0.2.4" diff --git a/crates/typos/src/checks.rs b/crates/typos/src/checks.rs deleted file mode 100644 index 6040440..0000000 --- a/crates/typos/src/checks.rs +++ /dev/null @@ -1,489 +0,0 @@ -use bstr::ByteSlice; - -use crate::report; -use crate::tokens; -use crate::Dictionary; -use crate::Status; - -pub trait Check: Send + Sync { - fn check_str( - &self, - buffer: &str, - parser: &tokens::Parser, - dictionary: &dyn Dictionary, - reporter: &dyn report::Report, - ) -> Result<(), std::io::Error>; - - fn check_bytes( - &self, - buffer: &[u8], - parser: &tokens::Parser, - dictionary: &dyn Dictionary, - reporter: &dyn report::Report, - ) -> Result<(), std::io::Error>; - - fn check_filenames(&self) -> bool; - - fn check_files(&self) -> bool; - - fn binary(&self) -> bool; - - fn check_filename( - &self, - path: &std::path::Path, - parser: &tokens::Parser, - dictionary: &dyn Dictionary, - reporter: &dyn report::Report, - ) -> Result<(), std::io::Error> { - if !self.check_filenames() { - return Ok(()); - } - - if let Some(file_name) = path.file_name().and_then(|s| s.to_str()) { - let context_reporter = ReportContext { - reporter, - context: report::PathContext { path }.into(), - }; - self.check_str(file_name, parser, dictionary, &context_reporter)?; - } - - Ok(()) - } - - fn check_file( - &self, - path: &std::path::Path, - explicit: bool, - parser: &tokens::Parser, - dictionary: &dyn Dictionary, - reporter: &dyn report::Report, - ) -> Result<(), std::io::Error> { - if !self.check_files() { - return Ok(()); - } - - let buffer = read_file(path, reporter)?; - let (buffer, content_type) = massage_data(buffer)?; - if !explicit && !self.binary() && content_type.is_binary() { - let msg = report::BinaryFile { path }; - reporter.report(msg.into())?; - return Ok(()); - } - - for (line_idx, line) in buffer.lines().enumerate() { - let line_num = line_idx + 1; - let context_reporter = ReportContext { - reporter, - context: report::FileContext { path, line_num }.into(), - }; - self.check_bytes(line, parser, dictionary, &context_reporter)?; - } - - Ok(()) - } -} - -struct ReportContext<'m, 'r> { - reporter: &'r dyn report::Report, - context: report::Context<'m>, -} - -impl<'m, 'r> report::Report for ReportContext<'m, 'r> { - fn report(&self, msg: report::Message) -> Result<(), std::io::Error> { - let msg = msg.context(Some(self.context.clone())); - self.reporter.report(msg) - } -} - -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct TyposSettings { - check_filenames: bool, - check_files: bool, - binary: bool, -} - -impl TyposSettings { - pub fn new() -> Self { - Default::default() - } - - pub fn check_filenames(&mut self, yes: bool) -> &mut Self { - self.check_filenames = yes; - self - } - - pub fn check_files(&mut self, yes: bool) -> &mut Self { - self.check_files = yes; - self - } - - pub fn binary(&mut self, yes: bool) -> &mut Self { - self.binary = yes; - self - } - - pub fn build_typos(&self) -> Typos { - Typos { - check_filenames: self.check_filenames, - check_files: 
self.check_files, - binary: self.binary, - } - } - - pub fn build_identifier_parser(&self) -> ParseIdentifiers { - ParseIdentifiers { - check_filenames: self.check_filenames, - check_files: self.check_files, - binary: self.binary, - } - } - - pub fn build_word_parser(&self) -> ParseWords { - ParseWords { - check_filenames: self.check_filenames, - check_files: self.check_files, - binary: self.binary, - } - } - - pub fn build_files(&self) -> Files { - Files {} - } -} - -impl Default for TyposSettings { - fn default() -> Self { - Self { - check_filenames: true, - check_files: true, - binary: false, - } - } -} - -#[derive(Debug, Clone)] -pub struct Typos { - check_filenames: bool, - check_files: bool, - binary: bool, -} - -impl Check for Typos { - fn check_str( - &self, - buffer: &str, - parser: &tokens::Parser, - dictionary: &dyn Dictionary, - reporter: &dyn report::Report, - ) -> Result<(), std::io::Error> { - for ident in parser.parse_str(buffer) { - match dictionary.correct_ident(ident) { - Some(Status::Valid) => {} - Some(corrections) => { - let byte_offset = ident.offset(); - let msg = report::Typo { - context: None, - buffer: std::borrow::Cow::Borrowed(buffer.as_bytes()), - byte_offset, - typo: ident.token(), - corrections, - }; - reporter.report(msg.into())?; - } - None => { - for word in ident.split() { - match dictionary.correct_word(word) { - Some(Status::Valid) => {} - Some(corrections) => { - let byte_offset = word.offset(); - let msg = report::Typo { - context: None, - buffer: std::borrow::Cow::Borrowed(buffer.as_bytes()), - byte_offset, - typo: word.token(), - corrections, - }; - reporter.report(msg.into())?; - } - None => {} - } - } - } - } - } - Ok(()) - } - - fn check_bytes( - &self, - buffer: &[u8], - parser: &tokens::Parser, - dictionary: &dyn Dictionary, - reporter: &dyn report::Report, - ) -> Result<(), std::io::Error> { - for ident in parser.parse_bytes(buffer) { - match dictionary.correct_ident(ident) { - Some(Status::Valid) => {} - Some(corrections) => { - let byte_offset = ident.offset(); - let msg = report::Typo { - context: None, - buffer: std::borrow::Cow::Borrowed(buffer), - byte_offset, - typo: ident.token(), - corrections, - }; - reporter.report(msg.into())?; - } - None => { - for word in ident.split() { - match dictionary.correct_word(word) { - Some(Status::Valid) => {} - Some(corrections) => { - let byte_offset = word.offset(); - let msg = report::Typo { - context: None, - buffer: std::borrow::Cow::Borrowed(buffer), - byte_offset, - typo: word.token(), - corrections, - }; - reporter.report(msg.into())?; - } - None => {} - } - } - } - } - } - - Ok(()) - } - - fn check_filenames(&self) -> bool { - self.check_filenames - } - - fn check_files(&self) -> bool { - self.check_files - } - - fn binary(&self) -> bool { - self.binary - } -} - -#[derive(Debug, Clone)] -pub struct ParseIdentifiers { - check_filenames: bool, - check_files: bool, - binary: bool, -} - -impl Check for ParseIdentifiers { - fn check_str( - &self, - buffer: &str, - parser: &tokens::Parser, - _dictionary: &dyn Dictionary, - reporter: &dyn report::Report, - ) -> Result<(), std::io::Error> { - let msg = report::Parse { - context: None, - kind: report::ParseKind::Identifier, - data: parser.parse_str(buffer).map(|i| i.token()).collect(), - }; - if !msg.data.is_empty() { - reporter.report(msg.into())?; - } - - Ok(()) - } - - fn check_bytes( - &self, - buffer: &[u8], - parser: &tokens::Parser, - _dictionary: &dyn Dictionary, - reporter: &dyn report::Report, - ) -> Result<(), std::io::Error> { - let msg = 
report::Parse {
-            context: None,
-            kind: report::ParseKind::Identifier,
-            data: parser.parse_bytes(buffer).map(|i| i.token()).collect(),
-        };
-        if !msg.data.is_empty() {
-            reporter.report(msg.into())?;
-        }
-
-        Ok(())
-    }
-
-    fn check_filenames(&self) -> bool {
-        self.check_filenames
-    }
-
-    fn check_files(&self) -> bool {
-        self.check_files
-    }
-
-    fn binary(&self) -> bool {
-        self.binary
-    }
-}
-
-#[derive(Debug, Clone)]
-pub struct ParseWords {
-    check_filenames: bool,
-    check_files: bool,
-    binary: bool,
-}
-
-impl Check for ParseWords {
-    fn check_str(
-        &self,
-        buffer: &str,
-        parser: &tokens::Parser,
-        _dictionary: &dyn Dictionary,
-        reporter: &dyn report::Report,
-    ) -> Result<(), std::io::Error> {
-        let msg = report::Parse {
-            context: None,
-            kind: report::ParseKind::Word,
-            data: parser
-                .parse_str(buffer)
-                .flat_map(|ident| ident.split().map(|i| i.token()))
-                .collect(),
-        };
-        if !msg.data.is_empty() {
-            reporter.report(msg.into())?;
-        }
-
-        Ok(())
-    }
-
-    fn check_bytes(
-        &self,
-        buffer: &[u8],
-        parser: &tokens::Parser,
-        _dictionary: &dyn Dictionary,
-        reporter: &dyn report::Report,
-    ) -> Result<(), std::io::Error> {
-        let msg = report::Parse {
-            context: None,
-            kind: report::ParseKind::Word,
-            data: parser
-                .parse_bytes(buffer)
-                .flat_map(|ident| ident.split().map(|i| i.token()))
-                .collect(),
-        };
-        if !msg.data.is_empty() {
-            reporter.report(msg.into())?;
-        }
-
-        Ok(())
-    }
-
-    fn check_filenames(&self) -> bool {
-        self.check_filenames
-    }
-
-    fn check_files(&self) -> bool {
-        self.check_files
-    }
-
-    fn binary(&self) -> bool {
-        self.binary
-    }
-}
-
-#[derive(Debug, Clone)]
-pub struct Files {}
-
-impl Check for Files {
-    fn check_str(
-        &self,
-        _buffer: &str,
-        _parser: &tokens::Parser,
-        _dictionary: &dyn Dictionary,
-        _reporter: &dyn report::Report,
-    ) -> Result<(), std::io::Error> {
-        Ok(())
-    }
-
-    fn check_bytes(
-        &self,
-        _buffer: &[u8],
-        _parser: &tokens::Parser,
-        _dictionary: &dyn Dictionary,
-        _reporter: &dyn report::Report,
-    ) -> Result<(), std::io::Error> {
-        Ok(())
-    }
-
-    fn check_filenames(&self) -> bool {
-        true
-    }
-
-    fn check_files(&self) -> bool {
-        true
-    }
-
-    fn binary(&self) -> bool {
-        true
-    }
-
-    fn check_filename(
-        &self,
-        _path: &std::path::Path,
-        _parser: &tokens::Parser,
-        _dictionary: &dyn Dictionary,
-        _reporter: &dyn report::Report,
-    ) -> Result<(), std::io::Error> {
-        Ok(())
-    }
-
-    fn check_file(
-        &self,
-        path: &std::path::Path,
-        _explicit: bool,
-        _parser: &tokens::Parser,
-        _dictionary: &dyn Dictionary,
-        reporter: &dyn report::Report,
-    ) -> Result<(), std::io::Error> {
-        let msg = report::File::new(path);
-        reporter.report(msg.into())?;
-
-        Ok(())
-    }
-}
-
-fn read_file(
-    path: &std::path::Path,
-    reporter: &dyn report::Report,
-) -> Result<Vec<u8>, std::io::Error> {
-    let buffer = match std::fs::read(path) {
-        Ok(buffer) => buffer,
-        Err(err) => {
-            let msg = report::Error::new(err.to_string());
-            reporter.report(msg.into())?;
-            Vec::new()
-        }
-    };
-    Ok(buffer)
-}
-
-fn massage_data(
-    buffer: Vec<u8>,
-) -> Result<(Vec<u8>, content_inspector::ContentType), std::io::Error> {
-    let mut content_type = content_inspector::inspect(&buffer);
-
-    // HACK: We only support UTF-8 at the moment
-    if content_type != content_inspector::ContentType::UTF_8_BOM
-        && content_type != content_inspector::ContentType::UTF_8
-    {
-        content_type = content_inspector::ContentType::BINARY;
-    }
-
-    Ok((buffer, content_type))
-}
diff --git a/crates/typos/src/dict.rs b/crates/typos/src/dict.rs
index 2fded93..971ca86 100644
--- a/crates/typos/src/dict.rs
+++ b/crates/typos/src/dict.rs
@@ -1,6 +1,35 @@
 use std::borrow::Cow;
 
-#[derive(Clone, PartialEq, Eq, Debug, serde::Serialize, derive_more::From)]
+/// Look up the validity of a term.
+pub trait Dictionary: Send + Sync {
+    /// Look up the validity of an Identifier.
+    ///
+    /// `None` if the status is unknown.
+    fn correct_ident<'s, 'w>(&'s self, ident: crate::tokens::Identifier<'w>) -> Option<Status<'s>>;
+
+    /// Look up the validity of a Word.
+    ///
+    /// `None` if the status is unknown.
+    fn correct_word<'s, 'w>(&'s self, word: crate::tokens::Word<'w>) -> Option<Status<'s>>;
+}
+
+pub(crate) struct NullDictionary;
+
+impl Dictionary for NullDictionary {
+    fn correct_ident<'s, 'w>(
+        &'s self,
+        _ident: crate::tokens::Identifier<'w>,
+    ) -> Option<Status<'s>> {
+        None
+    }
+
+    fn correct_word<'s, 'w>(&'s self, _word: crate::tokens::Word<'w>) -> Option<Status<'s>> {
+        None
+    }
+}
+
+/// Validity of a term in a Dictionary.
+#[derive(Clone, PartialEq, Eq, Debug, serde::Serialize)]
 #[serde(rename_all = "snake_case")]
 #[serde(untagged)]
 pub enum Status<'c> {
@@ -27,6 +56,20 @@ impl<'c> Status<'c> {
         }
     }
 
+    pub fn into_owned(self) -> Status<'static> {
+        match self {
+            Status::Valid => Status::Valid,
+            Status::Invalid => Status::Invalid,
+            Status::Corrections(corrections) => {
+                let corrections = corrections
+                    .into_iter()
+                    .map(|c| Cow::Owned(c.into_owned()))
+                    .collect();
+                Status::Corrections(corrections)
+            }
+        }
+    }
+
     pub fn borrow(&self) -> Status<'_> {
         match self {
             Status::Corrections(corrections) => {
@@ -40,10 +83,3 @@ impl<'c> Status<'c> {
         }
     }
 }
-
-pub trait Dictionary: Send + Sync {
-    fn correct_ident<'s, 'w>(&'s self, _ident: crate::tokens::Identifier<'w>)
-        -> Option<Status<'s>>;
-
-    fn correct_word<'s, 'w>(&'s self, word: crate::tokens::Word<'w>) -> Option<Status<'s>>;
-}
diff --git a/crates/typos/src/lib.rs b/crates/typos/src/lib.rs
index 1cb77c9..93ba77d 100644
--- a/crates/typos/src/lib.rs
+++ b/crates/typos/src/lib.rs
@@ -1,7 +1,7 @@
 mod dict;
+mod parser;
 
-pub mod checks;
-pub mod report;
 pub mod tokens;
 
-pub use crate::dict::*;
+pub use dict::*;
+pub use parser::*;
diff --git a/crates/typos/src/parser.rs b/crates/typos/src/parser.rs
new file mode 100644
index 0000000..883a730
--- /dev/null
+++ b/crates/typos/src/parser.rs
@@ -0,0 +1,147 @@
+use crate::tokens;
+use crate::Dictionary;
+use std::borrow::Cow;
+
+/// Extract typos from the buffer.
+#[derive(Clone)]
+pub struct ParserBuilder<'p, 'd> {
+    tokenizer: Option<&'p tokens::Tokenizer>,
+    dictionary: &'d dyn Dictionary,
+}
+
+impl<'p> ParserBuilder<'p, 'static> {
+    pub fn new() -> Self {
+        Default::default()
+    }
+}
+
+impl<'p, 'd> ParserBuilder<'p, 'd> {
+    /// Set the Tokenizer used when parsing.
+    pub fn tokenizer(mut self, tokenizer: &'p tokens::Tokenizer) -> Self {
+        self.tokenizer = Some(tokenizer);
+        self
+    }
+
+    /// Set the dictionary used when parsing.
+    pub fn dictionary<'d1>(self, dictionary: &'d1 dyn Dictionary) -> ParserBuilder<'p, 'd1> {
+        ParserBuilder {
+            tokenizer: self.tokenizer,
+            dictionary,
+        }
+    }
+
+    /// Extract typos from the buffer.
+    pub fn build(&self) -> TyposParser<'p, 'd> {
+        TyposParser {
+            tokenizer: self.tokenizer.unwrap_or(&DEFAULT_TOKENIZER),
+            dictionary: self.dictionary,
+        }
+    }
+}
+
+impl<'p> Default for ParserBuilder<'p, 'static> {
+    fn default() -> Self {
+        Self {
+            tokenizer: None,
+            dictionary: &crate::NullDictionary,
+        }
+    }
+}
+
+static DEFAULT_TOKENIZER: once_cell::sync::Lazy<tokens::Tokenizer> =
+    once_cell::sync::Lazy::new(tokens::Tokenizer::new);
+
+/// Extract typos from the buffer.
+#[derive(Clone)]
+pub struct TyposParser<'p, 'd> {
+    tokenizer: &'p tokens::Tokenizer,
+    dictionary: &'d dyn Dictionary,
+}
+
+impl<'p, 'd> TyposParser<'p, 'd> {
+    pub fn parse_str<'b, 's: 'b>(&'s self, buffer: &'b str) -> impl Iterator<Item = Typo<'b>> {
+        self.tokenizer
+            .parse_str(buffer)
+            .flat_map(move |ident| self.process_ident(ident))
+    }
+
+    pub fn parse_bytes<'b, 's: 'b>(&'s self, buffer: &'b [u8]) -> impl Iterator<Item = Typo<'b>> {
+        self.tokenizer
+            .parse_bytes(buffer)
+            .flat_map(move |ident| self.process_ident(ident))
+    }
+
+    fn process_ident<'i, 's: 'i>(
+        &'s self,
+        ident: tokens::Identifier<'i>,
+    ) -> impl Iterator<Item = Typo<'i>> {
+        match self.dictionary.correct_ident(ident) {
+            Some(crate::Status::Valid) => itertools::Either::Left(None.into_iter()),
+            Some(corrections) => {
+                let typo = Typo {
+                    byte_offset: ident.offset(),
+                    typo: ident.token().into(),
+                    corrections,
+                };
+                itertools::Either::Left(Some(typo).into_iter())
+            }
+            None => itertools::Either::Right(
+                ident
+                    .split()
+                    .filter_map(move |word| self.process_word(word)),
+            ),
+        }
+    }
+
+    fn process_word<'w, 's: 'w>(&'s self, word: tokens::Word<'w>) -> Option<Typo<'w>> {
+        match self.dictionary.correct_word(word) {
+            Some(crate::Status::Valid) => None,
+            Some(corrections) => {
+                let typo = Typo {
+                    byte_offset: word.offset(),
+                    typo: word.token().into(),
+                    corrections,
+                };
+                Some(typo)
+            }
+            None => None,
+        }
+    }
+}
+
+/// An invalid term found in the buffer.
+#[derive(Clone, Debug)]
+#[non_exhaustive]
+pub struct Typo<'m> {
+    pub byte_offset: usize,
+    pub typo: Cow<'m, str>,
+    pub corrections: crate::Status<'m>,
+}
+
+impl<'m> Typo<'m> {
+    pub fn into_owned(self) -> Typo<'static> {
+        Typo {
+            byte_offset: self.byte_offset,
+            typo: Cow::Owned(self.typo.into_owned()),
+            corrections: self.corrections.into_owned(),
+        }
+    }
+
+    pub fn borrow(&self) -> Typo<'_> {
+        Typo {
+            byte_offset: self.byte_offset,
+            typo: Cow::Borrowed(self.typo.as_ref()),
+            corrections: self.corrections.borrow(),
+        }
+    }
+}
+
+impl<'m> Default for Typo<'m> {
+    fn default() -> Self {
+        Self {
+            byte_offset: 0,
+            typo: "".into(),
+            corrections: crate::Status::Invalid,
+        }
+    }
+}
diff --git a/crates/typos/src/tokens.rs b/crates/typos/src/tokens.rs
index 9f2728c..3f5aefc 100644
--- a/crates/typos/src/tokens.rs
+++ b/crates/typos/src/tokens.rs
@@ -1,13 +1,6 @@
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-pub enum Case {
-    Title,
-    Lower,
-    Scream,
-    None,
-}
-
+/// Define rules for tokenizing a buffer.
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
-pub struct ParserBuilder {
+pub struct TokenizerBuilder {
     ignore_hex: bool,
     leading_digits: bool,
     leading_chars: String,
@@ -15,37 +8,42 @@ pub struct ParserBuilder {
     include_chars: String,
 }
 
-impl ParserBuilder {
+impl TokenizerBuilder {
     pub fn new() -> Self {
         Default::default()
     }
 
+    /// Specify that hexadecimal numbers should be ignored.
     pub fn ignore_hex(&mut self, yes: bool) -> &mut Self {
         self.ignore_hex = yes;
         self
     }
 
+    /// Specify that leading digits are allowed for Identifiers.
    pub fn leading_digits(&mut self, yes: bool) -> &mut Self {
         self.leading_digits = yes;
         self
     }
 
+    /// Extend accepted leading characters for Identifiers.
     pub fn leading_chars(&mut self, chars: String) -> &mut Self {
         self.leading_chars = chars;
         self
     }
 
+    /// Specify that digits can be included in Identifiers.
     pub fn include_digits(&mut self, yes: bool) -> &mut Self {
         self.include_digits = yes;
         self
     }
 
+    /// Extend accepted characters for Identifiers.
     pub fn include_chars(&mut self, chars: String) -> &mut Self {
         self.include_chars = chars;
         self
     }
 
-    pub fn build(&self) -> Parser {
+    pub fn build(&self) -> Tokenizer {
         let mut pattern = r#"\b("#.to_owned();
         Self::push_pattern(&mut pattern, self.leading_digits, &self.leading_chars);
         Self::push_pattern(&mut pattern, self.include_digits, &self.include_chars);
@@ -54,7 +52,7 @@ impl ParserBuilder {
         let words_str = regex::Regex::new(&pattern).unwrap();
         let words_bytes = regex::bytes::Regex::new(&pattern).unwrap();
 
-        Parser {
+        Tokenizer {
             words_str,
             words_bytes,
             // `leading_digits` let's us bypass the regexes since you can't have a decimal or
@@ -77,7 +75,7 @@
     }
 }
 
-impl Default for ParserBuilder {
+impl Default for TokenizerBuilder {
     fn default() -> Self {
         Self {
             ignore_hex: true,
@@ -89,17 +87,18 @@
     }
 }
 
+/// Extract Identifiers from a buffer.
 #[derive(Debug, Clone)]
-pub struct Parser {
+pub struct Tokenizer {
     words_str: regex::Regex,
     words_bytes: regex::bytes::Regex,
     ignore_numbers: bool,
     ignore_hex: bool,
 }
 
-impl Parser {
+impl Tokenizer {
     pub fn new() -> Self {
-        ParserBuilder::default().build()
+        TokenizerBuilder::default().build()
     }
 
     pub fn parse_str<'c>(&'c self, content: &'c str) -> impl Iterator<Item = Identifier<'c>> {
@@ -132,7 +131,7 @@
     }
 }
 
-impl Default for Parser {
+impl Default for Tokenizer {
     fn default() -> Self {
         Self::new()
     }
@@ -156,6 +155,7 @@ fn is_hex(ident: &[u8]) -> bool {
     HEX.is_match(ident)
 }
 
+/// A term composed of Words.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub struct Identifier<'t> {
     token: &'t str,
@@ -179,11 +179,13 @@ impl<'t> Identifier<'t> {
         self.offset
     }
 
+    /// Split into individual Words.
     pub fn split(&self) -> impl Iterator<Item = Word<'t>> {
         split_ident(self.token, self.offset)
     }
 }
 
+/// An indivisible term.
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub struct Word<'t> {
     token: &'t str,
@@ -237,52 +239,8 @@ impl<'t> Word<'t> {
     }
 }
 
-/// Tracks the current 'mode' of the transformation algorithm as it scans the input string.
-///
-/// The mode is a tri-state which tracks the case of the last cased character of the current
-/// word. If there is no cased character (either lowercase or uppercase) since the previous
-/// word boundary, than the mode is `Boundary`. If the last cased character is lowercase, then
-/// the mode is `Lowercase`. Otherrwise, the mode is `Uppercase`.
-#[derive(Clone, Copy, PartialEq, Debug)]
-enum WordMode {
-    /// There have been no lowercase or uppercase characters in the current word.
-    Boundary,
-    /// The previous cased character in the current word is lowercase.
-    Lowercase,
-    /// The previous cased character in the current word is uppercase.
-    Uppercase,
-    Number,
-}
-
-impl WordMode {
-    fn classify(c: char) -> Self {
-        if c.is_lowercase() {
-            WordMode::Lowercase
-        } else if c.is_uppercase() {
-            WordMode::Uppercase
-        } else if c.is_ascii_digit() {
-            WordMode::Number
-        } else {
-            // This assumes all characters are either lower or upper case.
-            WordMode::Boundary
-        }
-    }
-
-    fn case(self, last: WordMode) -> Case {
-        match (self, last) {
-            (WordMode::Uppercase, WordMode::Uppercase) => Case::Scream,
-            (WordMode::Uppercase, WordMode::Lowercase) => Case::Title,
-            (WordMode::Lowercase, WordMode::Lowercase) => Case::Lower,
-            (WordMode::Number, WordMode::Number) => Case::None,
-            (WordMode::Number, _)
-            | (_, WordMode::Number)
-            | (WordMode::Boundary, _)
-            | (_, WordMode::Boundary)
-            | (WordMode::Lowercase, WordMode::Uppercase) => {
-                unreachable!("Invalid case combination: ({:?}, {:?})", self, last)
-            }
-        }
-    }
+fn split_ident(ident: &str, offset: usize) -> impl Iterator<Item = Word<'_>> {
+    SplitIdent::new(ident, offset)
 }
 
 struct SplitIdent<'s> {
@@ -377,8 +335,61 @@ impl<'s> Iterator for SplitIdent<'s> {
     }
 }
 
-fn split_ident(ident: &str, offset: usize) -> impl Iterator<Item = Word<'_>> {
-    SplitIdent::new(ident, offset)
+/// Format of the term.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum Case {
+    Title,
+    Lower,
+    Scream,
+    None,
+}
+
+/// Tracks the current 'mode' of the transformation algorithm as it scans the input string.
+///
+/// The mode is a tri-state which tracks the case of the last cased character of the current
+/// word. If there is no cased character (either lowercase or uppercase) since the previous
+/// word boundary, then the mode is `Boundary`. If the last cased character is lowercase, then
+/// the mode is `Lowercase`. Otherwise, the mode is `Uppercase`.
+#[derive(Clone, Copy, PartialEq, Debug)]
+enum WordMode {
+    /// There have been no lowercase or uppercase characters in the current word.
+    Boundary,
+    /// The previous cased character in the current word is lowercase.
+    Lowercase,
+    /// The previous cased character in the current word is uppercase.
+    Uppercase,
+    Number,
+}
+
+impl WordMode {
+    fn classify(c: char) -> Self {
+        if c.is_lowercase() {
+            WordMode::Lowercase
+        } else if c.is_uppercase() {
+            WordMode::Uppercase
+        } else if c.is_ascii_digit() {
+            WordMode::Number
+        } else {
+            // This assumes all characters are either lower or upper case.
+            WordMode::Boundary
+        }
+    }
+
+    fn case(self, last: WordMode) -> Case {
+        match (self, last) {
+            (WordMode::Uppercase, WordMode::Uppercase) => Case::Scream,
+            (WordMode::Uppercase, WordMode::Lowercase) => Case::Title,
+            (WordMode::Lowercase, WordMode::Lowercase) => Case::Lower,
+            (WordMode::Number, WordMode::Number) => Case::None,
+            (WordMode::Number, _)
+            | (_, WordMode::Number)
+            | (WordMode::Boundary, _)
+            | (_, WordMode::Boundary)
+            | (WordMode::Lowercase, WordMode::Uppercase) => {
+                unreachable!("Invalid case combination: ({:?}, {:?})", self, last)
+            }
+        }
+    }
 }
 
 #[cfg(test)]
@@ -387,7 +398,7 @@ mod test {
 
     #[test]
     fn tokenize_empty_is_empty() {
-        let parser = Parser::new();
+        let parser = Tokenizer::new();
 
         let input = "";
         let expected: Vec<Identifier> = vec![];
@@ -399,7 +410,7 @@
 
     #[test]
     fn tokenize_word_is_word() {
-        let parser = Parser::new();
+        let parser = Tokenizer::new();
 
         let input = "word";
         let expected: Vec<Identifier> = vec![Identifier::new_unchecked("word", 0)];
@@ -411,7 +422,7 @@
 
     #[test]
    fn tokenize_space_separated_words() {
-        let parser = Parser::new();
+        let parser = Tokenizer::new();
 
         let input = "A B";
         let expected: Vec<Identifier> = vec![
@@ -426,7 +437,7 @@
 
     #[test]
     fn tokenize_dot_separated_words() {
-        let parser = Parser::new();
+        let parser = Tokenizer::new();
 
         let input = "A.B";
         let expected: Vec<Identifier> = vec![
@@ -441,7 +452,7 @@
 
     #[test]
     fn tokenize_namespace_separated_words() {
-        let parser = Parser::new();
+        let parser = Tokenizer::new();
 
         let input = "A::B";
         let expected: Vec<Identifier> = vec![
@@ -456,7 +467,7 @@
 
     #[test]
     fn tokenize_underscore_doesnt_separate() {
-        let parser = Parser::new();
+        let parser = Tokenizer::new();
 
         let input = "A_B";
         let expected: Vec<Identifier> = vec![Identifier::new_unchecked("A_B", 0)];
@@ -468,7 +479,7 @@
 
     #[test]
     fn tokenize_ignore_hex_enabled() {
-        let parser = ParserBuilder::new().ignore_hex(true).build();
+        let parser = TokenizerBuilder::new().ignore_hex(true).build();
 
         let input = "Hello 0xDEADBEEF World";
         let expected: Vec<Identifier> = vec![
@@ -483,7 +494,7 @@
 
     #[test]
     fn tokenize_ignore_hex_disabled() {
-        let parser = ParserBuilder::new()
+        let parser = TokenizerBuilder::new()
             .ignore_hex(false)
             .leading_digits(true)
             .build();
@@ -523,11 +534,11 @@
             &[("A", Case::Scream, 0), ("String", Case::Title, 1)],
         ),
         (
-            "SimpleXMLParser",
+            "SimpleXMLTokenizer",
             &[
                 ("Simple", Case::Title, 0),
                 ("XML", Case::Scream, 6),
-                ("Parser", Case::Title, 9),
+                ("Tokenizer", Case::Title, 9),
             ],
         ),
         (
diff --git a/src/args.rs b/src/args.rs
index 1b1d153..4a3398a 100644
--- a/src/args.rs
+++ b/src/args.rs
@@ -12,13 +12,13 @@ arg_enum!
{ } } -pub const PRINT_SILENT: typos::report::PrintSilent = typos::report::PrintSilent; -pub const PRINT_BRIEF: typos::report::PrintBrief = typos::report::PrintBrief; -pub const PRINT_LONG: typos::report::PrintLong = typos::report::PrintLong; -pub const PRINT_JSON: typos::report::PrintJson = typos::report::PrintJson; +pub const PRINT_SILENT: typos_cli::report::PrintSilent = typos_cli::report::PrintSilent; +pub const PRINT_BRIEF: typos_cli::report::PrintBrief = typos_cli::report::PrintBrief; +pub const PRINT_LONG: typos_cli::report::PrintLong = typos_cli::report::PrintLong; +pub const PRINT_JSON: typos_cli::report::PrintJson = typos_cli::report::PrintJson; impl Format { - pub(crate) fn reporter(self) -> &'static dyn typos::report::Report { + pub(crate) fn reporter(self) -> &'static dyn typos_cli::report::Report { match self { Format::Silent => &PRINT_SILENT, Format::Brief => &PRINT_BRIEF, diff --git a/src/checks.rs b/src/checks.rs index 35d51c6..3bfb25b 100644 --- a/src/checks.rs +++ b/src/checks.rs @@ -1,27 +1,637 @@ -pub(crate) fn check_path( +use bstr::ByteSlice; + +use crate::report; +use typos::tokens; +use typos::Dictionary; + +pub trait FileChecker: Send + Sync { + fn check_file( + &self, + path: &std::path::Path, + explicit: bool, + parser: &tokens::Tokenizer, + dictionary: &dyn Dictionary, + reporter: &dyn report::Report, + ) -> Result<(), std::io::Error>; +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct TyposSettings { + check_filenames: bool, + check_files: bool, + binary: bool, +} + +impl TyposSettings { + pub fn new() -> Self { + Default::default() + } + + pub fn check_filenames(&mut self, yes: bool) -> &mut Self { + self.check_filenames = yes; + self + } + + pub fn check_files(&mut self, yes: bool) -> &mut Self { + self.check_files = yes; + self + } + + pub fn binary(&mut self, yes: bool) -> &mut Self { + self.binary = yes; + self + } + + pub fn build_typos(&self) -> Typos { + Typos { + check_filenames: self.check_filenames, + check_files: self.check_files, + binary: self.binary, + } + } + + pub fn build_fix_typos(&self) -> FixTypos { + FixTypos { + check_filenames: self.check_filenames, + check_files: self.check_files, + binary: self.binary, + } + } + + pub fn build_diff_typos(&self) -> DiffTypos { + DiffTypos { + check_filenames: self.check_filenames, + check_files: self.check_files, + binary: self.binary, + } + } + + pub fn build_identifier_parser(&self) -> Identifiers { + Identifiers { + check_filenames: self.check_filenames, + check_files: self.check_files, + binary: self.binary, + } + } + + pub fn build_word_parser(&self) -> Words { + Words { + check_filenames: self.check_filenames, + check_files: self.check_files, + binary: self.binary, + } + } + + pub fn build_files(&self) -> FoundFiles { + FoundFiles { + binary: self.binary, + } + } +} + +impl Default for TyposSettings { + fn default() -> Self { + Self { + check_filenames: true, + check_files: true, + binary: false, + } + } +} + +#[derive(Debug, Clone)] +pub struct Typos { + check_filenames: bool, + check_files: bool, + binary: bool, +} + +impl FileChecker for Typos { + fn check_file( + &self, + path: &std::path::Path, + explicit: bool, + tokenizer: &tokens::Tokenizer, + dictionary: &dyn Dictionary, + reporter: &dyn report::Report, + ) -> Result<(), std::io::Error> { + let parser = typos::ParserBuilder::new() + .tokenizer(tokenizer) + .dictionary(dictionary) + .build(); + + if self.check_filenames { + if let Some(file_name) = path.file_name().and_then(|s| s.to_str()) { + for typo in parser.parse_str(file_name) 
{ + let msg = report::Typo { + context: Some(report::PathContext { path }.into()), + buffer: std::borrow::Cow::Borrowed(file_name.as_bytes()), + byte_offset: typo.byte_offset, + typo: typo.typo.as_ref(), + corrections: typo.corrections, + }; + reporter.report(msg.into())?; + } + } + } + + if self.check_files { + let (buffer, content_type) = read_file(path, reporter)?; + if !explicit && !self.binary && content_type.is_binary() { + let msg = report::BinaryFile { path }; + reporter.report(msg.into())?; + } else { + let mut accum_line_num = AccumulateLineNum::new(); + for typo in parser.parse_bytes(&buffer) { + let line_num = accum_line_num.line_num(&buffer, typo.byte_offset); + let (line, line_offset) = extract_line(&buffer, typo.byte_offset); + let msg = report::Typo { + context: Some(report::FileContext { path, line_num }.into()), + buffer: std::borrow::Cow::Borrowed(line), + byte_offset: line_offset, + typo: typo.typo.as_ref(), + corrections: typo.corrections, + }; + reporter.report(msg.into())?; + } + } + } + + Ok(()) + } +} + +#[derive(Debug, Clone)] +pub struct FixTypos { + check_filenames: bool, + check_files: bool, + binary: bool, +} + +impl FileChecker for FixTypos { + fn check_file( + &self, + path: &std::path::Path, + explicit: bool, + tokenizer: &tokens::Tokenizer, + dictionary: &dyn Dictionary, + reporter: &dyn report::Report, + ) -> Result<(), std::io::Error> { + let parser = typos::ParserBuilder::new() + .tokenizer(tokenizer) + .dictionary(dictionary) + .build(); + + if self.check_files { + let (buffer, content_type) = read_file(path, reporter)?; + if !explicit && !self.binary && content_type.is_binary() { + let msg = report::BinaryFile { path }; + reporter.report(msg.into())?; + } else { + let mut fixes = Vec::new(); + let mut accum_line_num = AccumulateLineNum::new(); + for typo in parser.parse_bytes(&buffer) { + if is_fixable(&typo) { + fixes.push(typo.into_owned()); + } else { + let line_num = accum_line_num.line_num(&buffer, typo.byte_offset); + let (line, line_offset) = extract_line(&buffer, typo.byte_offset); + let msg = report::Typo { + context: Some(report::FileContext { path, line_num }.into()), + buffer: std::borrow::Cow::Borrowed(line), + byte_offset: line_offset, + typo: typo.typo.as_ref(), + corrections: typo.corrections, + }; + reporter.report(msg.into())?; + } + } + if !fixes.is_empty() { + let buffer = fix_buffer(buffer, fixes.into_iter()); + write_file(path, content_type, &buffer, reporter)?; + } + } + } + + // Ensure the above write can happen before renaming the file. 
+ if self.check_filenames { + if let Some(file_name) = path.file_name().and_then(|s| s.to_str()) { + let mut fixes = Vec::new(); + for typo in parser.parse_str(file_name) { + if is_fixable(&typo) { + fixes.push(typo.into_owned()); + } else { + let msg = report::Typo { + context: Some(report::PathContext { path }.into()), + buffer: std::borrow::Cow::Borrowed(file_name.as_bytes()), + byte_offset: typo.byte_offset, + typo: typo.typo.as_ref(), + corrections: typo.corrections, + }; + reporter.report(msg.into())?; + } + } + if !fixes.is_empty() { + let file_name = file_name.to_owned().into_bytes(); + let new_name = fix_buffer(file_name, fixes.into_iter()); + let new_name = + String::from_utf8(new_name).expect("corrections are valid utf-8"); + let new_path = path.with_file_name(new_name); + std::fs::rename(path, new_path)?; + } + } + } + + Ok(()) + } +} + +#[derive(Debug, Clone)] +pub struct DiffTypos { + check_filenames: bool, + check_files: bool, + binary: bool, +} + +impl FileChecker for DiffTypos { + fn check_file( + &self, + path: &std::path::Path, + explicit: bool, + tokenizer: &tokens::Tokenizer, + dictionary: &dyn Dictionary, + reporter: &dyn report::Report, + ) -> Result<(), std::io::Error> { + let parser = typos::ParserBuilder::new() + .tokenizer(tokenizer) + .dictionary(dictionary) + .build(); + + let mut content = Vec::new(); + let mut new_content = Vec::new(); + if self.check_files { + let (buffer, content_type) = read_file(path, reporter)?; + if !explicit && !self.binary && content_type.is_binary() { + let msg = report::BinaryFile { path }; + reporter.report(msg.into())?; + } else { + let mut fixes = Vec::new(); + let mut accum_line_num = AccumulateLineNum::new(); + for typo in parser.parse_bytes(&buffer) { + if is_fixable(&typo) { + fixes.push(typo.into_owned()); + } else { + let line_num = accum_line_num.line_num(&buffer, typo.byte_offset); + let (line, line_offset) = extract_line(&buffer, typo.byte_offset); + let msg = report::Typo { + context: Some(report::FileContext { path, line_num }.into()), + buffer: std::borrow::Cow::Borrowed(line), + byte_offset: line_offset, + typo: typo.typo.as_ref(), + corrections: typo.corrections, + }; + reporter.report(msg.into())?; + } + } + if !fixes.is_empty() { + new_content = fix_buffer(buffer.clone(), fixes.into_iter()); + content = buffer + } + } + } + + // Match FixTypos ordering for easy diffing. 
+ let mut new_path = None; + if self.check_filenames { + if let Some(file_name) = path.file_name().and_then(|s| s.to_str()) { + let mut fixes = Vec::new(); + for typo in parser.parse_str(file_name) { + if is_fixable(&typo) { + fixes.push(typo.into_owned()); + } else { + let msg = report::Typo { + context: Some(report::PathContext { path }.into()), + buffer: std::borrow::Cow::Borrowed(file_name.as_bytes()), + byte_offset: typo.byte_offset, + typo: typo.typo.as_ref(), + corrections: typo.corrections, + }; + reporter.report(msg.into())?; + } + } + if !fixes.is_empty() { + let file_name = file_name.to_owned().into_bytes(); + let new_name = fix_buffer(file_name, fixes.into_iter()); + let new_name = + String::from_utf8(new_name).expect("corrections are valid utf-8"); + new_path = Some(path.with_file_name(new_name)); + } + } + } + + if new_path.is_some() || !content.is_empty() { + let original_path = path.display().to_string(); + let fixed_path = new_path.as_deref().unwrap_or(path).display().to_string(); + let original_content: Vec<_> = content + .lines_with_terminator() + .map(|s| String::from_utf8_lossy(s).into_owned()) + .collect(); + let fixed_content: Vec<_> = new_content + .lines_with_terminator() + .map(|s| String::from_utf8_lossy(s).into_owned()) + .collect(); + let diff = difflib::unified_diff( + &original_content, + &fixed_content, + original_path.as_str(), + fixed_path.as_str(), + "original", + "fixed", + 0, + ); + for line in diff { + print!("{}", line); + } + } + + Ok(()) + } +} + +#[derive(Debug, Clone)] +pub struct Identifiers { + check_filenames: bool, + check_files: bool, + binary: bool, +} + +impl FileChecker for Identifiers { + fn check_file( + &self, + path: &std::path::Path, + explicit: bool, + tokenizer: &tokens::Tokenizer, + _dictionary: &dyn Dictionary, + reporter: &dyn report::Report, + ) -> Result<(), std::io::Error> { + if self.check_filenames { + if let Some(file_name) = path.file_name().and_then(|s| s.to_str()) { + for word in tokenizer.parse_str(file_name) { + let msg = report::Parse { + context: Some(report::PathContext { path }.into()), + kind: report::ParseKind::Identifier, + data: word.token(), + }; + reporter.report(msg.into())?; + } + } + } + + if self.check_files { + let (buffer, content_type) = read_file(path, reporter)?; + if !explicit && !self.binary && content_type.is_binary() { + let msg = report::BinaryFile { path }; + reporter.report(msg.into())?; + } else { + for word in tokenizer.parse_bytes(&buffer) { + // HACK: Don't look up the line_num per entry to better match the performance + // of Typos for comparison purposes. We don't really get much out of it + // anyway. 
+#[derive(Debug, Clone)]
+pub struct Identifiers {
+    check_filenames: bool,
+    check_files: bool,
+    binary: bool,
+}
+
+impl FileChecker for Identifiers {
+    fn check_file(
+        &self,
+        path: &std::path::Path,
+        explicit: bool,
+        tokenizer: &tokens::Tokenizer,
+        _dictionary: &dyn Dictionary,
+        reporter: &dyn report::Report,
+    ) -> Result<(), std::io::Error> {
+        if self.check_filenames {
+            if let Some(file_name) = path.file_name().and_then(|s| s.to_str()) {
+                for word in tokenizer.parse_str(file_name) {
+                    let msg = report::Parse {
+                        context: Some(report::PathContext { path }.into()),
+                        kind: report::ParseKind::Identifier,
+                        data: word.token(),
+                    };
+                    reporter.report(msg.into())?;
+                }
+            }
+        }
+
+        if self.check_files {
+            let (buffer, content_type) = read_file(path, reporter)?;
+            if !explicit && !self.binary && content_type.is_binary() {
+                let msg = report::BinaryFile { path };
+                reporter.report(msg.into())?;
+            } else {
+                for word in tokenizer.parse_bytes(&buffer) {
+                    // HACK: Don't look up the line_num per entry to better match the performance
+                    // of Typos for comparison purposes. We don't really get much out of it
+                    // anyway.
+                    let line_num = 0;
+                    let msg = report::Parse {
+                        context: Some(report::FileContext { path, line_num }.into()),
+                        kind: report::ParseKind::Identifier,
+                        data: word.token(),
+                    };
+                    reporter.report(msg.into())?;
+                }
+            }
+        }
+
+        Ok(())
+    }
+}
+
+#[derive(Debug, Clone)]
+pub struct Words {
+    check_filenames: bool,
+    check_files: bool,
+    binary: bool,
+}
+
+impl FileChecker for Words {
+    fn check_file(
+        &self,
+        path: &std::path::Path,
+        explicit: bool,
+        tokenizer: &tokens::Tokenizer,
+        _dictionary: &dyn Dictionary,
+        reporter: &dyn report::Report,
+    ) -> Result<(), std::io::Error> {
+        if self.check_filenames {
+            if let Some(file_name) = path.file_name().and_then(|s| s.to_str()) {
+                for word in tokenizer.parse_str(file_name).flat_map(|i| i.split()) {
+                    let msg = report::Parse {
+                        context: Some(report::PathContext { path }.into()),
+                        kind: report::ParseKind::Word,
+                        data: word.token(),
+                    };
+                    reporter.report(msg.into())?;
+                }
+            }
+        }
+
+        if self.check_files {
+            let (buffer, content_type) = read_file(path, reporter)?;
+            if !explicit && !self.binary && content_type.is_binary() {
+                let msg = report::BinaryFile { path };
+                reporter.report(msg.into())?;
+            } else {
+                for word in tokenizer.parse_bytes(&buffer).flat_map(|i| i.split()) {
+                    // HACK: Don't look up the line_num per entry to better match the performance
+                    // of Typos for comparison purposes. We don't really get much out of it
+                    // anyway.
+                    let line_num = 0;
+                    let msg = report::Parse {
+                        context: Some(report::FileContext { path, line_num }.into()),
+                        kind: report::ParseKind::Word,
+                        data: word.token(),
+                    };
+                    reporter.report(msg.into())?;
+                }
+            }
+        }
+
+        Ok(())
+    }
+}
+
+#[derive(Debug, Clone)]
+pub struct FoundFiles {
+    binary: bool,
+}
+
+impl FileChecker for FoundFiles {
+    fn check_file(
+        &self,
+        path: &std::path::Path,
+        explicit: bool,
+        _parser: &tokens::Tokenizer,
+        _dictionary: &dyn Dictionary,
+        reporter: &dyn report::Report,
+    ) -> Result<(), std::io::Error> {
+        // Check `self.binary` first so we can easily check performance of walking vs reading
+        if self.binary {
+            let msg = report::File::new(path);
+            reporter.report(msg.into())?;
+        } else {
+            let (_buffer, content_type) = read_file(path, reporter)?;
+            if !explicit && content_type.is_binary() {
+                let msg = report::BinaryFile { path };
+                reporter.report(msg.into())?;
+            } else {
+                let msg = report::File::new(path);
+                reporter.report(msg.into())?;
+            }
+        }
+
+        Ok(())
+    }
+}
+
+pub fn read_file(
+    path: &std::path::Path,
+    reporter: &dyn report::Report,
+) -> Result<(Vec<u8>, content_inspector::ContentType), std::io::Error> {
+    let buffer = match std::fs::read(path) {
+        Ok(buffer) => buffer,
+        Err(err) => {
+            let msg = report::Error::new(err.to_string());
+            reporter.report(msg.into())?;
+            Vec::new()
+        }
+    };
+
+    let mut content_type = content_inspector::inspect(&buffer);
+    // HACK: We only support UTF-8 at the moment
+    if content_type != content_inspector::ContentType::UTF_8_BOM
+        && content_type != content_inspector::ContentType::UTF_8
+    {
+        content_type = content_inspector::ContentType::BINARY;
+    }
+
+    Ok((buffer, content_type))
+}
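+
+// NOTE: on failure, `read_file` reports the error and yields an empty buffer,
+// so a single unreadable file does not abort the rest of the walk.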
+
+pub fn write_file(
+    path: &std::path::Path,
+    content_type: content_inspector::ContentType,
+    buffer: &[u8],
+    reporter: &dyn report::Report,
+) -> Result<(), std::io::Error> {
+    assert!(
+        content_type == content_inspector::ContentType::UTF_8_BOM
+            || content_type == content_inspector::ContentType::UTF_8
+            || content_type == content_inspector::ContentType::BINARY
+    );
+    match std::fs::write(path, buffer) {
+        Ok(()) => (),
+        Err(err) => {
+            let msg = report::Error::new(err.to_string());
+            reporter.report(msg.into())?;
+        }
+    };
+    Ok(())
+}
+
+struct AccumulateLineNum {
+    line_num: usize,
+    last_offset: usize,
+}
+
+impl AccumulateLineNum {
+    fn new() -> Self {
+        Self {
+            // 1-indexed
+            line_num: 1,
+            last_offset: 0,
+        }
+    }
+
+    fn line_num(&mut self, buffer: &[u8], byte_offset: usize) -> usize {
+        assert!(self.last_offset <= byte_offset);
+        let slice = &buffer[self.last_offset..byte_offset];
+        let newlines = slice.lines().count();
+        let line_num = self.line_num + newlines;
+        self.line_num = line_num;
+        self.last_offset = byte_offset;
+        line_num
+    }
+}
+
+fn extract_line(buffer: &[u8], byte_offset: usize) -> (&[u8], usize) {
+    let line_start = buffer[0..byte_offset]
+        .rfind_byte(b'\n')
+        // Skip the newline
+        .map(|s| s + 1)
+        .unwrap_or(0);
+    let line = buffer[line_start..]
+        .lines()
+        .next()
+        .expect("should always be at least a line");
+    let line_offset = byte_offset - line_start;
+    (line, line_offset)
+}
+
+fn extract_fix<'t>(typo: &'t typos::Typo<'t>) -> Option<&'t str> {
+    match &typo.corrections {
+        typos::Status::Corrections(c) if c.len() == 1 => Some(c[0].as_ref()),
+        _ => None,
+    }
+}
+
+fn is_fixable(typo: &typos::Typo<'_>) -> bool {
+    extract_fix(typo).is_some()
+}
+
+fn fix_buffer(mut buffer: Vec<u8>, typos: impl Iterator<Item = typos::Typo<'static>>) -> Vec<u8> {
+    // Running delta between original and fixed offsets as corrections change lengths.
+    let mut offset = 0isize;
+    for typo in typos {
+        let fix = extract_fix(&typo).expect("Caller only provides fixable typos");
+        let start = ((typo.byte_offset as isize) + offset) as usize;
+        let end = start + typo.typo.len();
+
+        buffer.splice(start..end, fix.as_bytes().iter().copied());
+
+        offset += (fix.len() as isize) - (typo.typo.len() as isize);
+    }
+    buffer
+}
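+
+// The walk functions below feed each discovered file to a `FileChecker`;
+// the parallel variant funnels the first error out through a mutex.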
+pub fn walk_path(
     walk: ignore::Walk,
-    checks: &dyn typos::checks::Check,
-    parser: &typos::tokens::Parser,
+    checks: &dyn FileChecker,
+    parser: &typos::tokens::Tokenizer,
     dictionary: &dyn typos::Dictionary,
-    reporter: &dyn typos::report::Report,
+    reporter: &dyn report::Report,
 ) -> Result<(), ignore::Error> {
     for entry in walk {
-        check_entry(entry, checks, parser, dictionary, reporter)?;
+        walk_entry(entry, checks, parser, dictionary, reporter)?;
     }
     Ok(())
 }
 
-pub(crate) fn check_path_parallel(
+pub fn walk_path_parallel(
     walk: ignore::WalkParallel,
-    checks: &dyn typos::checks::Check,
-    parser: &typos::tokens::Parser,
+    checks: &dyn FileChecker,
+    parser: &typos::tokens::Tokenizer,
     dictionary: &dyn typos::Dictionary,
-    reporter: &dyn typos::report::Report,
+    reporter: &dyn report::Report,
 ) -> Result<(), ignore::Error> {
     let error: std::sync::Mutex<Result<(), ignore::Error>> = std::sync::Mutex::new(Ok(()));
     walk.run(|| {
         Box::new(|entry: Result<ignore::DirEntry, ignore::Error>| {
-            match check_entry(entry, checks, parser, dictionary, reporter) {
+            match walk_entry(entry, checks, parser, dictionary, reporter) {
                 Ok(()) => ignore::WalkState::Continue,
                 Err(err) => {
                     *error.lock().unwrap() = Err(err);
@@ -34,17 +644,16 @@ pub(crate) fn check_path_parallel(
     error.into_inner().unwrap()
 }
 
-fn check_entry(
+fn walk_entry(
     entry: Result<ignore::DirEntry, ignore::Error>,
-    checks: &dyn typos::checks::Check,
-    parser: &typos::tokens::Parser,
+    checks: &dyn FileChecker,
+    parser: &typos::tokens::Tokenizer,
     dictionary: &dyn typos::Dictionary,
-    reporter: &dyn typos::report::Report,
+    reporter: &dyn report::Report,
 ) -> Result<(), ignore::Error> {
     let entry = entry?;
     if entry.file_type().map(|t| t.is_file()).unwrap_or(true) {
         let explicit = entry.depth() == 0;
-        checks.check_filename(entry.path(), parser, dictionary, reporter)?;
         checks.check_file(entry.path(), explicit, parser, dictionary, reporter)?;
     }
 
diff --git a/src/diff.rs b/src/diff.rs
deleted file mode 100644
index c99ac4e..0000000
--- a/src/diff.rs
+++ /dev/null
@@ -1,93 +0,0 @@
-use std::collections::BTreeMap;
-use std::sync;
-
-use bstr::ByteSlice;
-
-pub struct Diff<'r> {
-    reporter: &'r dyn typos::report::Report,
-    deferred: sync::Mutex<crate::replace::Deferred>,
-}
-
-impl<'r> Diff<'r> {
-    pub(crate) fn new(reporter: &'r dyn typos::report::Report) -> Self {
-        Self {
-            reporter,
-            deferred: sync::Mutex::new(crate::replace::Deferred::default()),
-        }
-    }
-
-    pub fn show(&self) -> Result<(), std::io::Error> {
-        let deferred = self.deferred.lock().unwrap();
-
-        for (path, corrections) in deferred.content.iter() {
-            let buffer = std::fs::read(path)?;
-
-            let mut original = Vec::new();
-            let mut corrected = Vec::new();
-            for (line_idx, line) in buffer.lines_with_terminator().enumerate() {
-                original.push(String::from_utf8_lossy(line).into_owned());
-
-                let line_num = line_idx + 1;
-                let line = if let Some(corrections) = corrections.get(&line_num) {
-                    let line = line.to_vec();
-                    crate::replace::correct(line, &corrections)
-                } else {
-                    line.to_owned()
-                };
-                corrected.push(String::from_utf8_lossy(&line).into_owned())
-            }
-
-            let display_path = path.display().to_string();
-            let diff = difflib::unified_diff(
-                &original,
-                &corrected,
-                display_path.as_str(),
-                display_path.as_str(),
-                "original",
-                "corrected",
-                0,
-            );
-            for line in diff {
-                print!("{}", line);
-            }
-        }
-
-        Ok(())
-    }
-}
-
-impl<'r> typos::report::Report for Diff<'r> {
-    fn report(&self, msg: typos::report::Message<'_>) -> Result<(), std::io::Error> {
-        let typo = match &msg {
-            typos::report::Message::Typo(typo) => typo,
-            _ => return self.reporter.report(msg),
-        };
-
-        let corrections = match &typo.corrections {
-            typos::Status::Corrections(corrections) if corrections.len() == 1 => corrections,
-            _ => return self.reporter.report(msg),
-        };
-
-        match &typo.context {
-            Some(typos::report::Context::File(file)) => {
-                let path = file.path.to_owned();
-                let line_num = file.line_num;
-                let correction = crate::replace::Correction::new(
-                    typo.byte_offset,
-                    typo.typo,
-                    corrections[0].as_ref(),
-                );
-                let mut deferred = self.deferred.lock().unwrap();
-                let content = deferred
-                    .content
-                    .entry(path)
-                    .or_insert_with(BTreeMap::new)
-                    .entry(line_num)
-                    .or_insert_with(Vec::new);
-                content.push(correction);
-                Ok(())
-            }
-            _ => self.reporter.report(msg),
-        }
-    }
-}
diff --git a/src/lib.rs b/src/lib.rs
index db703d1..4d0e01e 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,2 +1,4 @@
+pub mod checks;
 pub mod config;
 pub mod dict;
+pub mod report;
diff --git a/src/main.rs b/src/main.rs
index 80d0b99..f5e206d 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -7,11 +7,10 @@ use std::io::Write;
 use structopt::StructOpt;
 
 mod args;
-mod checks;
-mod config;
-mod dict;
-mod diff;
-mod replace;
+use typos_cli::checks;
+use typos_cli::config;
+use typos_cli::dict;
+use typos_cli::report;
 
 use proc_exit::WithCodeResultExt;
 
@@ -61,7 +60,7 @@ fn run() -> proc_exit::ExitResult {
     config.default.update(&args.overrides);
     let config = config;
 
-    let parser = typos::tokens::ParserBuilder::new()
+    let parser = typos::tokens::TokenizerBuilder::new()
         .ignore_hex(config.default.ignore_hex())
         .leading_digits(config.default.identifier_leading_digits())
         .leading_chars(config.default.identifier_leading_chars().to_owned())
@@ -74,7 +73,7 @@ fn run() -> proc_exit::ExitResult {
     dictionary.identifiers(config.default.extend_identifiers());
     dictionary.words(config.default.extend_words());
 
-    let mut settings = typos::checks::TyposSettings::new();
+    let mut settings = checks::TyposSettings::new();
     settings
         .check_filenames(config.default.check_filename())
         .check_files(config.default.check_file())
@@ -98,18 +97,11 @@ fn run() -> proc_exit::ExitResult {
     } else {
         args.format.reporter()
     };
-    let status_reporter = typos::report::MessageStatus::new(output_reporter);
-    let mut reporter: &dyn typos::report::Report = &status_reporter;
-    let replace_reporter = replace::Replace::new(reporter);
-    let diff_reporter = diff::Diff::new(reporter);
-    if args.diff {
-        reporter = &diff_reporter;
-    } else if args.write_changes {
-        reporter = &replace_reporter;
-    }
+    let status_reporter = report::MessageStatus::new(output_reporter);
+    let reporter: &dyn report::Report = &status_reporter;
 
-    let (files, identifier_parser, word_parser, checks);
-    let selected_checks: &dyn typos::checks::Check = if args.files {
+    let (files, identifier_parser, word_parser, checks, fixer, differ);
+    let selected_checks: &dyn checks::FileChecker = if args.files {
         files = settings.build_files();
         &files
     } else if args.identifiers {
@@ -118,13 +110,19 @@ fn run() -> proc_exit::ExitResult {
     } else if args.words {
         word_parser = settings.build_word_parser();
         &word_parser
+    } else if args.write_changes {
+        fixer = settings.build_fix_typos();
+        &fixer
+    } else if args.diff {
+        differ = settings.build_diff_typos();
+        &differ
     } else {
         checks = settings.build_typos();
        &checks
     };
 
     if single_threaded {
-        checks::check_path(
+        checks::walk_path(
             walk.build(),
             selected_checks,
             &parser,
@@ -132,7 +130,7 @@ fn run() -> proc_exit::ExitResult {
             reporter,
         )
     } else {
-        checks::check_path_parallel(
+        checks::walk_path_parallel(
             walk.build_parallel(),
             selected_checks,
             &parser,
@@ -152,14 +150,6 @@ fn run() -> proc_exit::ExitResult {
         if status_reporter.errors_found() {
             errors_found = true;
         }
-
-        if args.diff {
-            diff_reporter.show().with_code(proc_exit::Code::FAILURE)?;
-        } else if args.write_changes {
-            replace_reporter
-                .write()
-                .with_code(proc_exit::Code::FAILURE)?;
-        }
     }
 
     if errors_found {
diff --git a/src/replace.rs b/src/replace.rs
deleted file mode 100644
index 1ac129a..0000000
--- a/src/replace.rs
+++ /dev/null
@@ -1,263 +0,0 @@
-use std::collections::BTreeMap;
-use std::io::Write;
-use std::path;
-use std::sync;
-
-use bstr::ByteSlice;
-
-pub struct Replace<'r> {
-    reporter: &'r dyn typos::report::Report,
-    deferred: sync::Mutex<Deferred>,
-}
-
-impl<'r> Replace<'r> {
-    pub(crate) fn new(reporter: &'r dyn typos::report::Report) -> Self {
-        Self {
-            reporter,
-            deferred: sync::Mutex::new(Deferred::default()),
-        }
-    }
-
-    pub fn write(&self) -> Result<(), std::io::Error> {
-        let deferred = self.deferred.lock().unwrap();
-
-        for (path, corrections) in deferred.content.iter() {
-            let buffer = std::fs::read(path)?;
-
-            let mut file = std::fs::File::create(path)?;
-            for (line_idx, line) in buffer.lines_with_terminator().enumerate() {
-                let line_num = line_idx + 1;
-                if let Some(corrections) = corrections.get(&line_num) {
-                    let line = line.to_vec();
-                    let line = correct(line, &corrections);
-                    file.write_all(&line)?;
-                } else {
-                    file.write_all(&line)?;
-                }
-            }
-        }
-
-        for (path, corrections) in deferred.paths.iter() {
-            let orig_name = path
-                .file_name()
-                .and_then(|s| s.to_str())
-                .expect("generating a correction requires the filename to be valid.")
-                .to_owned()
-                .into_bytes();
-            let new_name = correct(orig_name, &corrections);
-            let new_name = String::from_utf8(new_name).expect("corrections are valid utf-8");
-            let new_path = path.with_file_name(new_name);
-            std::fs::rename(path, new_path)?;
-        }
-
-        Ok(())
-    }
-}
-
-impl<'r> typos::report::Report for Replace<'r> {
-    fn report(&self, msg: typos::report::Message<'_>) -> Result<(), std::io::Error> {
-        let typo = match &msg {
-            typos::report::Message::Typo(typo) => typo,
-            _ => return self.reporter.report(msg),
-        };
-
-        let corrections = match &typo.corrections {
-            typos::Status::Corrections(corrections) if corrections.len() == 1 => corrections,
-            _ => return self.reporter.report(msg),
-        };
-
-        match &typo.context {
-            Some(typos::report::Context::File(file)) => {
-                let path = file.path.to_owned();
-                let line_num = file.line_num;
-                let correction =
-                    Correction::new(typo.byte_offset, typo.typo, corrections[0].as_ref());
-                let mut deferred = self.deferred.lock().unwrap();
-                let content = deferred
-                    .content
-                    .entry(path)
-                    .or_insert_with(BTreeMap::new)
-                    .entry(line_num)
-                    .or_insert_with(Vec::new);
-                content.push(correction);
-                Ok(())
-            }
-            Some(typos::report::Context::Path(path)) => {
-                let path = path.path.to_owned();
-                let correction =
-                    Correction::new(typo.byte_offset, typo.typo, corrections[0].as_ref());
-                let mut deferred = self.deferred.lock().unwrap();
-                let content = deferred.paths.entry(path).or_insert_with(Vec::new);
-                content.push(correction);
-                Ok(())
-            }
-            _ => self.reporter.report(msg),
-        }
-    }
-}
-
-#[derive(Clone, Debug, Default)]
-pub(crate) struct Deferred {
-    pub(crate) content: BTreeMap<path::PathBuf, BTreeMap<usize, Vec<Correction>>>,
-    pub(crate) paths: BTreeMap<path::PathBuf, Vec<Correction>>,
-}
-
-#[derive(Clone, Debug, PartialOrd, Ord, PartialEq, Eq)]
-pub(crate) struct Correction {
-    pub byte_offset: usize,
-    pub typo: Vec<u8>,
-    pub correction: Vec<u8>,
-}
-
-impl Correction {
-    pub(crate) fn new(byte_offset: usize, typo: &str, correction: &str) -> Self {
-        Self {
-            byte_offset,
-            typo: typo.as_bytes().to_vec(),
-            correction: correction.as_bytes().to_vec(),
-        }
-    }
-}
-
-pub(crate) fn correct(mut line: Vec<u8>, corrections: &[Correction]) -> Vec<u8> {
-    let mut corrections: Vec<_> = corrections.iter().collect();
-    corrections.sort_unstable();
-    corrections.reverse();
-
-    for correction in corrections {
-        let start = correction.byte_offset;
-        let end = start + correction.typo.len();
-        line.splice(start..end, correction.correction.iter().copied());
-    }
-
-    line
-}
-
-#[cfg(test)]
-mod test {
-    use super::*;
-
-    use assert_fs::prelude::*;
-    use typos::report::Report;
-
-    fn simple_correct(line: &str, corrections: Vec<(usize, &str, &str)>) -> String {
-        let line = line.as_bytes().to_vec();
-        let corrections: Vec<_> = corrections
-            .into_iter()
-            .map(|(byte_offset, typo, correction)| Correction {
-                byte_offset,
-                typo: typo.as_bytes().to_vec(),
-                correction: correction.as_bytes().to_vec(),
-            })
-            .collect();
-        let actual = correct(line, &corrections);
-        String::from_utf8(actual).unwrap()
-    }
-
-    #[test]
-    fn test_correct_single() {
-        let actual = simple_correct("foo foo foo", vec![(4, "foo", "bar")]);
-        assert_eq!(actual, "foo bar foo");
-    }
-
-    #[test]
-    fn test_correct_single_grow() {
-        let actual = simple_correct("foo foo foo", vec![(4, "foo", "happy")]);
-        assert_eq!(actual, "foo happy foo");
-    }
-
-    #[test]
-    fn test_correct_single_shrink() {
-        let actual = simple_correct("foo foo foo", vec![(4, "foo", "if")]);
-        assert_eq!(actual, "foo if foo");
-    }
-
-    #[test]
-    fn test_correct_start() {
-        let actual = simple_correct("foo foo foo", vec![(0, "foo", "bar")]);
-        assert_eq!(actual, "bar foo foo");
-    }
-
-    #[test]
-    fn test_correct_end() {
-        let actual = simple_correct("foo foo foo", vec![(8, "foo", "bar")]);
-        assert_eq!(actual, "foo foo bar");
-    }
-
simple_correct("foo foo foo", vec![(8, "foo", "happy")]); - assert_eq!(actual, "foo foo happy"); - } - - #[test] - fn test_correct_multiple() { - let actual = simple_correct( - "foo foo foo", - vec![(4, "foo", "happy"), (8, "foo", "world")], - ); - assert_eq!(actual, "foo happy world"); - } - - #[test] - fn test_replace_content() { - let temp = assert_fs::TempDir::new().unwrap(); - let input_file = temp.child("foo.txt"); - input_file.write_str("1 foo 2\n3 4 5").unwrap(); - - let primary = typos::report::PrintSilent; - let replace = Replace::new(&primary); - replace - .report( - typos::report::Typo::default() - .context(Some( - typos::report::FileContext::default() - .path(input_file.path()) - .line_num(1) - .into(), - )) - .buffer(std::borrow::Cow::Borrowed(b"1 foo 2\n3 4 5")) - .byte_offset(2) - .typo("foo") - .corrections(typos::Status::Corrections(vec![ - std::borrow::Cow::Borrowed("bar"), - ])) - .into(), - ) - .unwrap(); - replace.write().unwrap(); - - input_file.assert("1 bar 2\n3 4 5"); - } - - #[test] - fn test_replace_path() { - let temp = assert_fs::TempDir::new().unwrap(); - let input_file = temp.child("foo.txt"); - input_file.write_str("foo foo foo").unwrap(); - - let primary = typos::report::PrintSilent; - let replace = Replace::new(&primary); - replace - .report( - typos::report::Typo::default() - .context(Some( - typos::report::PathContext::default() - .path(input_file.path()) - .into(), - )) - .buffer(std::borrow::Cow::Borrowed(b"foo.txt")) - .byte_offset(0) - .typo("foo") - .corrections(typos::Status::Corrections(vec![ - std::borrow::Cow::Borrowed("bar"), - ])) - .into(), - ) - .unwrap(); - replace.write().unwrap(); - - input_file.assert(predicates::path::missing()); - temp.child("bar.txt").assert("foo foo foo"); - } -} diff --git a/crates/typos/src/report.rs b/src/report.rs similarity index 91% rename from crates/typos/src/report.rs rename to src/report.rs index d2d7ce9..08b2f28 100644 --- a/crates/typos/src/report.rs +++ b/src/report.rs @@ -72,7 +72,7 @@ pub struct Typo<'m> { pub buffer: Cow<'m, [u8]>, pub byte_offset: usize, pub typo: &'m str, - pub corrections: crate::Status<'m>, + pub corrections: typos::Status<'m>, } impl<'m> Default for Typo<'m> { @@ -82,7 +82,7 @@ impl<'m> Default for Typo<'m> { buffer: Cow::Borrowed(&[]), byte_offset: 0, typo: "", - corrections: crate::Status::Invalid, + corrections: typos::Status::Invalid, } } } @@ -168,7 +168,7 @@ pub struct Parse<'m> { #[serde(flatten)] pub context: Option>, pub kind: ParseKind, - pub data: Vec<&'m str>, + pub data: &'m str, } impl<'m> Default for Parse<'m> { @@ -176,7 +176,7 @@ impl<'m> Default for Parse<'m> { Self { context: None, kind: ParseKind::Identifier, - data: vec![], + data: "", } } } @@ -234,10 +234,21 @@ impl<'r> MessageStatus<'r> { impl<'r> Report for MessageStatus<'r> { fn report(&self, msg: Message) -> Result<(), std::io::Error> { - self.typos_found - .compare_and_swap(false, msg.is_correction(), atomic::Ordering::Relaxed); - self.errors_found - .compare_and_swap(false, msg.is_error(), atomic::Ordering::Relaxed); + let _ = self.typos_found.compare_exchange( + false, + msg.is_correction(), + atomic::Ordering::Relaxed, + atomic::Ordering::Relaxed, + ); + let _ = self + .errors_found + .compare_exchange( + false, + msg.is_error(), + atomic::Ordering::Relaxed, + atomic::Ordering::Relaxed, + ) + .unwrap(); self.reporter.report(msg) } } @@ -265,7 +276,7 @@ impl Report for PrintBrief { writeln!(io::stdout(), "{}", msg.path.display())?; } Message::Parse(msg) => { - writeln!(io::stdout(), "{}", 
-            writeln!(io::stdout(), "{}", itertools::join(msg.data.iter(), " "))?;
+            writeln!(io::stdout(), "{}", msg.data)?;
         }
         Message::Error(msg) => {
             log::error!("{}: {}", context_display(&msg.context), msg.msg);
@@ -289,7 +300,7 @@ impl Report for PrintLong {
             writeln!(io::stdout(), "{}", msg.path.display())?;
         }
         Message::Parse(msg) => {
-            writeln!(io::stdout(), "{}", itertools::join(msg.data.iter(), " "))?;
+            writeln!(io::stdout(), "{}", msg.data)?;
         }
         Message::Error(msg) => {
             log::error!("{}: {}", context_display(&msg.context), msg.msg);
@@ -308,8 +319,8 @@ fn print_brief_correction(msg: &Typo) -> Result<(), std::io::Error> {
     )
     .count();
     match &msg.corrections {
-        crate::Status::Valid => {}
-        crate::Status::Invalid => {
+        typos::Status::Valid => {}
+        typos::Status::Invalid => {
             writeln!(
                 io::stdout(),
                 "{}:{}: `{}` is disallowed",
@@ -318,7 +329,7 @@ fn print_brief_correction(msg: &Typo) -> Result<(), std::io::Error> {
                 msg.typo,
             )?;
         }
-        crate::Status::Corrections(corrections) => {
+        typos::Status::Corrections(corrections) => {
             writeln!(
                 io::stdout(),
                 "{}:{}: `{}` -> {}",
@@ -345,11 +356,11 @@ fn print_long_correction(msg: &Typo) -> Result<(), std::io::Error> {
     )
     .count();
     match &msg.corrections {
-        crate::Status::Valid => {}
-        crate::Status::Invalid => {
+        typos::Status::Valid => {}
+        typos::Status::Invalid => {
             writeln!(handle, "error: `{}` is disallowed`", msg.typo,)?;
         }
-        crate::Status::Corrections(corrections) => {
+        typos::Status::Corrections(corrections) => {
             writeln!(
                 handle,
                 "error: `{}` should be {}",