diff --git a/Cargo.lock b/Cargo.lock index 4a91983..dd1c91b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -46,9 +46,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.38" +version = "1.0.40" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "afddf7f520a80dbf76e6f50a35bca42a2331ef227a28b3b6dc5c2e2338d114b1" +checksum = "28b2cd92db5cbd74e8e5028f7e27dd7aa3090e89e4f2a197cc7c8dfb69c7063b" [[package]] name = "arrayvec" @@ -58,9 +58,9 @@ checksum = "23b62fc65de8e4e7f52534fb52b0f3ed04746ae267519eef2a83941e8085068b" [[package]] name = "assert_fs" -version = "1.0.1" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3203d5bb9979ac7210f01a150578ebafef6f08b55e79f6db32673c0977b94340" +checksum = "73c485ca248200dfb850a64468a926321865cae0c450eaa7cdbe9ccf4ec49028" dependencies = [ "doc-comment", "globwalk", @@ -89,11 +89,12 @@ checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a" [[package]] name = "backtrace" -version = "0.3.56" +version = "0.3.58" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d117600f438b1707d4e4ae15d3595657288f8235a0eb593e80ecc98ab34e1bc" +checksum = "88fb5a785d6b44fd9d6700935608639af1b8356de1e55d5f7c2740f4faa15d82" dependencies = [ "addr2line", + "cc", "cfg-if", "libc", "miniz_oxide", @@ -109,9 +110,9 @@ checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693" [[package]] name = "bitvec" -version = "0.19.4" +version = "0.19.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7ba35e9565969edb811639dbebfe34edc0368e472c5018474c8eb2543397f81" +checksum = "8942c8d352ae1838c9dda0b0ca2ab657696ef2232a20147cf1b30ae1a9cb4321" dependencies = [ "funty", "radium", @@ -139,19 +140,25 @@ checksum = "63396b8a4b9de3f4fdfb320ab6080762242f66a8ef174c49d8e19b674db4cdbe" [[package]] name = "byteorder" -version = "1.4.2" +version = "1.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae44d1a3d5a19df61dd0c8beb138458ac2a53a7ac09eba97d55592540004306b" +checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" [[package]] name = "cast" -version = "0.2.3" +version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b9434b9a5aa1450faa3f9cb14ea0e8c53bb5d2b3c1bfd1ab4fc03e9f33fbfb0" +checksum = "cc38c385bfd7e444464011bb24820f40dd1c76bcdfa1b78611cb7c2e5cafab75" dependencies = [ "rustc_version", ] +[[package]] +name = "cc" +version = "1.0.67" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3c69b077ad434294d3ce9f1f6143a2a4b89a8a2d54ef813d85003a4fd1137fd" + [[package]] name = "cfg-if" version = "1.0.0" @@ -225,6 +232,12 @@ dependencies = [ "memchr", ] +[[package]] +name = "convert_case" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e" + [[package]] name = "criterion" version = "0.3.4" @@ -263,9 +276,9 @@ dependencies = [ [[package]] name = "crossbeam-channel" -version = "0.5.0" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dca26ee1f8d361640700bde38b2c37d8c22b3ce2d360e1fc1c74ea4b0aa7d775" +checksum = "06ed27e177f16d65f0f0c22a213e17c696ace5dd64b14258b52f9417ccb52db4" dependencies = [ "cfg-if", "crossbeam-utils", @@ -284,9 +297,9 @@ dependencies = [ [[package]] name = "crossbeam-epoch" -version = "0.9.3" +version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2584f639eb95fea8c798496315b297cf81b9b58b6d30ab066a75455333cf4b12" +checksum = "52fb27eab85b17fbb9f6fd667089e07d6a2eb8743d02639ee7f6a7a7729c9c94" dependencies = [ "cfg-if", "crossbeam-utils", @@ -297,9 +310,9 @@ dependencies = [ [[package]] name = "crossbeam-utils" -version = "0.8.3" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7e9d99fa91428effe99c5c6d4634cdeba32b8cf784fc428a2a687f61a952c49" +checksum = "4feb231f0d4d6af81aed15928e58ecf5816aa62a2393e2c82f46973e92a9a278" dependencies = [ "autocfg", "cfg-if", @@ -308,9 +321,9 @@ dependencies = [ [[package]] name = "csv" -version = "1.1.5" +version = "1.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9d58633299b24b515ac72a3f869f8b91306a3cec616a602843a383acd6f9e97" +checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1" dependencies = [ "bstr", "csv-core", @@ -346,10 +359,10 @@ checksum = "f0c960ae2da4de88a91b2d920c2a7233b400bc33cb28453a2987822d8392519b" dependencies = [ "fnv", "ident_case", - "proc-macro2 1.0.24", + "proc-macro2 1.0.26", "quote 1.0.9", "strsim 0.9.3", - "syn 1.0.60", + "syn 1.0.71", ] [[package]] @@ -360,7 +373,7 @@ checksum = "d9b5a2f4ac4969822c62224815d069952656cadc7084fdca9751e6d959189b72" dependencies = [ "darling_core", "quote 1.0.9", - "syn 1.0.60", + "syn 1.0.71", ] [[package]] @@ -379,13 +392,14 @@ dependencies = [ [[package]] name = "derive_more" -version = "0.99.11" +version = "0.99.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41cb0e6161ad61ed084a36ba71fbba9e3ac5aee3606fb607fe08da6acbcf3d8c" +checksum = "f82b1b72f1263f214c0f823371768776c4f5841b942c9883aa8e5ec584fd0ba6" dependencies = [ - "proc-macro2 1.0.24", + "convert_case", + "proc-macro2 1.0.26", "quote 1.0.9", - "syn 1.0.60", + "syn 1.0.71", ] [[package]] @@ -395,9 +409,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f1cf41b4580a37cca5ef2ada2cc43cf5d6be3983f4522e83010d67ab6925e84b" dependencies = [ "darling", - "proc-macro2 1.0.24", + "proc-macro2 1.0.26", "quote 1.0.9", - "syn 1.0.60", + "syn 1.0.71", ] [[package]] @@ -509,9 +523,9 @@ version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "946ee94e3dbf58fdd324f9ce245c7b238d46a66f00e86a020b71996349e46cce" dependencies = [ - "proc-macro2 1.0.24", + "proc-macro2 1.0.26", "quote 1.0.9", - "syn 1.0.60", + "syn 1.0.71", ] [[package]] @@ -717,9 +731,9 @@ checksum = "dd25036021b0de88a0aff6b850051563c6516d0bf53f8638938edbb9de732736" [[package]] name = "js-sys" -version = "0.3.48" +version = "0.3.50" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc9f84f9b115ce7843d60706df1422a916680bfdfcbdb0447c5614ff9d7e4d78" +checksum = "2d99f9e3e84b8f67f846ef5b4cbbc3b1c29f6c759fcbce6f01aa0e73d932a24c" dependencies = [ "wasm-bindgen", ] @@ -741,9 +755,9 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] name = "lexical-core" -version = "0.7.5" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21f866863575d0e1d654fbeeabdc927292fdf862873dc3c96c6f753357e13374" +checksum = "6607c62aa161d23d17a9072cc5da0be67cdfc89d3afb1e8d9c842bebc2525ffe" dependencies = [ "arrayvec", "bitflags", @@ -754,9 +768,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.86" +version = "0.2.94" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7282d924be3275cec7f6756ff4121987bc6481325397dde6ba3e7802b1a8b1c" +checksum = "18794a8ad5b29321f790b55d93dfba91e125cb1a9edbd4f8e3150acc771c1a5e" [[package]] name = "log" @@ -781,9 +795,9 @@ checksum = "0ee1c47aaa256ecabcaea351eae4a9b01ef39ed810004e298d2511ed284b1525" [[package]] name = "memoffset" -version = "0.6.1" +version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "157b4208e3059a8f9e78d559edc658e13df41410cb3ae03979c83130067fdd87" +checksum = "f83fb6581e8ed1f85fd45c116db8405483899489e38406156c25eb743554361d" dependencies = [ "autocfg", ] @@ -866,9 +880,9 @@ checksum = "a9a7ab5d64814df0fe4a4b5ead45ed6c5f181ee3ff04ba344313a6c80446c5d4" [[package]] name = "once_cell" -version = "1.7.0" +version = "1.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10acf907b94fc1b1a152d08ef97e7759650268cf986bf127f387e602b02c7e5a" +checksum = "af8b08b04175473088b46763e51ee54da5f9a164bc162f615b91bc179dbf15a3" [[package]] name = "oorandom" @@ -960,9 +974,9 @@ checksum = "ac74c624d6b2d21f425f752262f42188365d7b8ff1aff74c82e45136510a4857" [[package]] name = "predicates" -version = "1.0.7" +version = "1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eeb433456c1a57cc93554dea3ce40b4c19c4057e41c55d4a0f3d84ea71c325aa" +checksum = "f49cfaf7fdaa3bfacc6fa3e7054e65148878354a5cfddcf661df4c851f8021df" dependencies = [ "difference", "float-cmp", @@ -1000,9 +1014,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" dependencies = [ "proc-macro-error-attr", - "proc-macro2 1.0.24", + "proc-macro2 1.0.26", "quote 1.0.9", - "syn 1.0.60", + "syn 1.0.71", "version_check", ] @@ -1012,7 +1026,7 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" dependencies = [ - "proc-macro2 1.0.24", + "proc-macro2 1.0.26", "quote 1.0.9", "version_check", ] @@ -1028,11 +1042,11 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.24" +version = "1.0.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e0704ee1a7e00d7bb417d0770ea303c1bccbabf0ef1667dae92b5967f5f8a71" +checksum = "a152013215dca273577e18d2bf00fa862b89b24169fb78c4c95aeb07992c9cec" dependencies = [ - "unicode-xid 0.2.1", + "unicode-xid 0.2.2", ] [[package]] @@ -1056,7 +1070,7 @@ version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c3d0b9745dc2debf507c8422de05d7226cc1f0644216dfdfead988f9b1ab32a7" dependencies = [ - "proc-macro2 1.0.24", + "proc-macro2 1.0.26", ] [[package]] @@ -1183,23 +1197,22 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.2.5" +version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94341e4e44e24f6b591b59e47a8a027df12e008d73fd5672dbea9cc22f4507d9" +checksum = "85dd92e586f7355c633911e11f77f3d12f04b1b1bd76a198bd34ae3af8341ef2" dependencies = [ "bitflags", ] [[package]] name = "regex" -version = "1.4.3" +version = "1.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9251239e129e16308e70d853559389de218ac275b515068abc96829d05b948a" +checksum = "2a26af418b574bd56588335b3a3659a65725d4e636eb1016c2f9e3b38c7cc759" dependencies = [ "aho-corasick", "memchr", "regex-syntax", - "thread_local", ] [[package]] @@ -1213,9 +1226,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.6.22" +version = "0.6.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5eb417147ba9860a96cfe72a0b93bf88fee1744b5636ec99ab20c1aa9376581" +checksum = "24d5f089152e60f62d28b835fbff2cd2e8dc0baf1ac13343bef92ab7eed84548" [[package]] name = "remove_dir_all" @@ -1279,9 +1292,9 @@ checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" [[package]] name = "serde" -version = "1.0.123" +version = "1.0.125" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92d5161132722baa40d802cc70b15262b98258453e85e5d1d365c757c73869ae" +checksum = "558dc50e1a5a5fa7112ca2ce4effcb321b0300c0d4ccf0776a9f60cd89031171" dependencies = [ "serde_derive", ] @@ -1298,13 +1311,13 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.123" +version = "1.0.125" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9391c295d64fc0abb2c556bad848f33cb8296276b1ad2677d1ae1ace4f258f31" +checksum = "b093b7a2bb58203b5da3056c05b4ec1fed827dcfdb37347a8841695263b3d06d" dependencies = [ - "proc-macro2 1.0.24", + "proc-macro2 1.0.26", "quote 1.0.9", - "syn 1.0.60", + "syn 1.0.71", ] [[package]] @@ -1319,10 +1332,16 @@ dependencies = [ ] [[package]] -name = "siphasher" -version = "0.3.3" +name = "simdutf8" +version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa8f3741c7372e75519bd9346068370c9cdaabcc1f9599cbcf2a2719352286b7" +checksum = "7f4f3d445e9015cf5e72cec4a3b3a84f8d54f34207afee609fd152de1c0212b1" + +[[package]] +name = "siphasher" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cbce6d4507c7e4a3962091436e56e95290cb71fa302d0d270e32130b75fbff27" [[package]] name = "static_assertions" @@ -1361,9 +1380,9 @@ checksum = "5ba9cdfda491b814720b6b06e0cac513d922fc407582032e8706e9f137976f90" dependencies = [ "heck", "proc-macro-error", - "proc-macro2 1.0.24", + "proc-macro2 1.0.26", "quote 1.0.9", - "syn 1.0.60", + "syn 1.0.71", ] [[package]] @@ -1379,13 +1398,13 @@ dependencies = [ [[package]] name = "syn" -version = "1.0.60" +version = "1.0.71" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c700597eca8a5a762beb35753ef6b94df201c81cca676604f547495a0d7f0081" +checksum = "ad184cc9470f9117b2ac6817bfe297307418819ba40552f9b3846f05c33d5373" dependencies = [ - "proc-macro2 1.0.24", + "proc-macro2 1.0.26", "quote 1.0.9", - "unicode-xid 0.2.1", + "unicode-xid 0.2.2", ] [[package]] @@ -1441,9 +1460,9 @@ version = "1.0.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7765189610d8241a44529806d6fd1f2e0a08734313a35d5b3a556f92b381f3c0" dependencies = [ - "proc-macro2 1.0.24", + "proc-macro2 1.0.26", "quote 1.0.9", - "syn 1.0.60", + "syn 1.0.71", ] [[package]] @@ -1457,9 +1476,9 @@ dependencies = [ [[package]] name = "tinytemplate" -version = "1.2.0" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2ada8616fad06a2d0c455adc530de4ef57605a8120cc65da9653e0e9623ca74" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" dependencies = [ "serde", "serde_json", @@ -1493,11 +1512,13 @@ dependencies = [ "anyhow", "itertools 0.10.0", "log", + "nom", "once_cell", - "regex", "serde", + "simdutf8", "thiserror", "unicode-segmentation", + "unicode-xid 0.2.2", ] [[package]] @@ -1512,7 +1533,7 @@ dependencies = [ "clap-verbosity-flag", "content_inspector", "criterion", - "derive_more 0.99.11", + "derive_more 0.99.13", "derive_setters", "difflib", "encoding", @@ -1631,9 +1652,9 @@ checksum = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc" [[package]] name = "unicode-xid" -version = "0.2.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564" +checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" [[package]] name = "uuid" @@ -1676,15 +1697,15 @@ checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" [[package]] name = "version_check" -version = "0.9.2" +version = "0.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5a972e5669d67ba988ce3dc826706fb0a8b01471c088cb0b6110b805cc36aed" +checksum = "5fecdca9a5291cc2b8dcf7dc02453fee791a280f3743cb0905f8822ae463b3fe" [[package]] name = "walkdir" -version = "2.3.1" +version = "2.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "777182bc735b6424e1a57516d35ed72cb8019d85c8c9bf536dccb3445c1a2f7d" +checksum = "808cf2735cd4b6866113f648b791c6adc5714537bc222d9347bb203386ffda56" dependencies = [ "same-file", "winapi", @@ -1705,9 +1726,9 @@ checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6" [[package]] name = "wasm-bindgen" -version = "0.2.71" +version = "0.2.73" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ee1280240b7c461d6a0071313e08f34a60b0365f14260362e5a2b17d1d31aa7" +checksum = "83240549659d187488f91f33c0f8547cbfef0b2088bc470c116d1d260ef623d9" dependencies = [ "cfg-if", "wasm-bindgen-macro", @@ -1715,24 +1736,24 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.71" +version = "0.2.73" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b7d8b6942b8bb3a9b0e73fc79b98095a27de6fa247615e59d096754a3bc2aa8" +checksum = "ae70622411ca953215ca6d06d3ebeb1e915f0f6613e3b495122878d7ebec7dae" dependencies = [ "bumpalo", "lazy_static", "log", - "proc-macro2 1.0.24", + "proc-macro2 1.0.26", "quote 1.0.9", - "syn 1.0.60", + "syn 1.0.71", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-macro" -version = "0.2.71" +version = "0.2.73" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5ac38da8ef716661f0f36c0d8320b89028efe10c7c0afde65baffb496ce0d3b" +checksum = "3e734d91443f177bfdb41969de821e15c516931c3c3db3d318fa1b68975d0f6f" dependencies = [ "quote 1.0.9", "wasm-bindgen-macro-support", @@ -1740,28 +1761,28 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.71" +version = "0.2.73" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc053ec74d454df287b9374ee8abb36ffd5acb95ba87da3ba5b7d3fe20eb401e" +checksum = "d53739ff08c8a68b0fdbcd54c372b8ab800b1449ab3c9d706503bc7dd1621b2c" dependencies = [ - "proc-macro2 1.0.24", + "proc-macro2 1.0.26", "quote 1.0.9", - "syn 1.0.60", + "syn 1.0.71", "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.71" +version = "0.2.73" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d6f8ec44822dd71f5f221a5847fb34acd9060535c1211b70a05844c0f6383b1" +checksum = "d9a543ae66aa233d14bb765ed9af4a33e81b8b58d1584cf1b47ff8cd0b9e4489" [[package]] name = "web-sys" -version = "0.3.48" +version = "0.3.50" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec600b26223b2948cedfde2a0aa6756dcf1fef616f43d7b3097aaf53a6c4d92b" +checksum = "a905d57e488fec8861446d3393670fb50d27a262344013181c2cdf9fff5481be" dependencies = [ "js-sys", "wasm-bindgen", diff --git a/benches/tokenize.rs b/benches/tokenize.rs index 9cd0ce9..9c7399d 100644 --- a/benches/tokenize.rs +++ b/benches/tokenize.rs @@ -1,39 +1,93 @@ mod data; -use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion}; +use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; -fn bench_tokenize(c: &mut Criterion) { - let mut group = c.benchmark_group("tokenize"); +fn bench_parse_str(c: &mut Criterion) { + let mut group = c.benchmark_group("parse_str"); for (name, sample) in data::DATA { let len = sample.len(); - group.bench_with_input(BenchmarkId::new("ident(bytes)", name), &len, |b, _| { - let parser = typos::tokens::Tokenizer::new(); - b.iter(|| parser.parse_bytes(sample.as_bytes()).last()); - }); - group.bench_with_input(BenchmarkId::new("ident(str)", name), &len, |b, _| { - let parser = typos::tokens::Tokenizer::new(); + group.throughput(Throughput::Bytes(len as u64)); + group.bench_with_input(BenchmarkId::new("unicode", name), &len, |b, _| { + let parser = typos::tokens::TokenizerBuilder::new().unicode(true).build(); b.iter(|| parser.parse_str(sample).last()); }); - group.bench_with_input(BenchmarkId::new("words", name), &len, |b, _| { - let symbol = typos::tokens::Identifier::new_unchecked(sample, 0); - b.iter(|| symbol.split().last()); + group.bench_with_input(BenchmarkId::new("ascii", name), &len, |b, _| { + let parser = typos::tokens::TokenizerBuilder::new() + .unicode(false) + .build(); + b.iter(|| parser.parse_str(sample).last()); }); - group.bench_with_input( - BenchmarkId::new("ident(bytes)+words", name), - &len, - |b, _| { - let parser = typos::tokens::Tokenizer::new(); - b.iter(|| { - parser - .parse_bytes(sample.as_bytes()) - .flat_map(|i| i.split()) - .last() - }); - }, - ); } group.finish(); } -criterion_group!(benches, bench_tokenize); +fn bench_parse_bytes(c: &mut Criterion) { + let mut group = c.benchmark_group("parse_bytes"); + for (name, sample) in data::DATA { + let len = sample.len(); + group.throughput(Throughput::Bytes(len as u64)); + group.bench_with_input(BenchmarkId::new("unicode", name), &len, |b, _| { + let parser = typos::tokens::TokenizerBuilder::new().unicode(true).build(); + b.iter(|| parser.parse_bytes(sample.as_bytes()).last()); + }); + group.bench_with_input(BenchmarkId::new("ascii", name), &len, |b, _| { + let parser = typos::tokens::TokenizerBuilder::new() + .unicode(false) + .build(); + b.iter(|| parser.parse_bytes(sample.as_bytes()).last()); + }); + } + group.finish(); +} + +fn bench_split(c: &mut Criterion) { + let mut group = c.benchmark_group("split"); + for (name, sample) in data::DATA { + let len = sample.len(); + group.throughput(Throughput::Bytes(len as u64)); + group.bench_with_input(BenchmarkId::new("words", name), &len, |b, _| { + let symbol = + typos::tokens::Identifier::new_unchecked(sample, typos::tokens::Case::None, 0); + b.iter(|| symbol.split().last()); + }); + } + group.finish(); +} + +fn bench_parse_split(c: &mut Criterion) { + let mut group = c.benchmark_group("parse_bytes+split"); + for (name, sample) in data::DATA { + let len = sample.len(); + group.throughput(Throughput::Bytes(len as u64)); + group.bench_with_input(BenchmarkId::new("unicode", name), &len, |b, _| { + let parser = typos::tokens::TokenizerBuilder::new().unicode(true).build(); + b.iter(|| { + parser + .parse_bytes(sample.as_bytes()) + .flat_map(|i| i.split()) + .last() + }); + }); + group.bench_with_input(BenchmarkId::new("ascii", name), &len, |b, _| { + let parser = typos::tokens::TokenizerBuilder::new() + .unicode(false) + .build(); + b.iter(|| { + parser + .parse_bytes(sample.as_bytes()) + .flat_map(|i| i.split()) + .last() + }); + }); + } + group.finish(); +} + +criterion_group!( + benches, + bench_parse_str, + bench_parse_bytes, + bench_split, + bench_parse_split +); criterion_main!(benches); diff --git a/crates/typos/Cargo.toml b/crates/typos/Cargo.toml index 0611d78..d44573f 100644 --- a/crates/typos/Cargo.toml +++ b/crates/typos/Cargo.toml @@ -17,9 +17,11 @@ codecov = { repository = "crate-ci/typos" } [dependencies] anyhow = "1.0" thiserror = "1.0" -regex = "1.3" +nom = "6.0" +unicode-xid = "0.2.2" once_cell = "1.2.0" serde = { version = "1.0", features = ["derive"] } +simdutf8 = "0.1.1" itertools = "0.10" log = "0.4" unicode-segmentation = "1.7.1" diff --git a/crates/typos/src/tokens.rs b/crates/typos/src/tokens.rs index 3d0947f..39c568d 100644 --- a/crates/typos/src/tokens.rs +++ b/crates/typos/src/tokens.rs @@ -1,11 +1,9 @@ /// Define rules for tokenizaing a buffer. #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct TokenizerBuilder { + unicode: bool, ignore_hex: bool, leading_digits: bool, - leading_chars: String, - include_digits: bool, - include_chars: String, } impl TokenizerBuilder { @@ -13,6 +11,12 @@ impl TokenizerBuilder { Default::default() } + /// Specify that unicode Identifiers are allowed. + pub fn unicode(&mut self, yes: bool) -> &mut Self { + self.unicode = yes; + self + } + /// Specify that hexadecimal numbers should be ignored. pub fn ignore_hex(&mut self, yes: bool) -> &mut Self { self.ignore_hex = yes; @@ -25,64 +29,26 @@ impl TokenizerBuilder { self } - /// Extend accepted leading characters for Identifiers. - pub fn leading_chars(&mut self, chars: String) -> &mut Self { - self.leading_chars = chars; - self - } - - /// Specify that digits can be included in Identifiers. - pub fn include_digits(&mut self, yes: bool) -> &mut Self { - self.include_digits = yes; - self - } - - /// Extend accepted characters for Identifiers. - pub fn include_chars(&mut self, chars: String) -> &mut Self { - self.include_chars = chars; - self - } - pub fn build(&self) -> Tokenizer { - let mut pattern = r#"\b("#.to_owned(); - Self::push_pattern(&mut pattern, self.leading_digits, &self.leading_chars); - Self::push_pattern(&mut pattern, self.include_digits, &self.include_chars); - pattern.push_str(r#"*)\b"#); - - let words_str = regex::Regex::new(&pattern).unwrap(); - let words_bytes = regex::bytes::Regex::new(&pattern).unwrap(); - + let TokenizerBuilder { + unicode, + leading_digits, + ignore_hex, + } = self.clone(); Tokenizer { - words_str, - words_bytes, - // `leading_digits` let's us bypass the regexes since you can't have a decimal or - // hexadecimal number without a leading digit. - ignore_numbers: self.leading_digits, - ignore_hex: self.ignore_hex && self.leading_digits, + unicode, + leading_digits, + ignore_hex, } } - - fn push_pattern(pattern: &mut String, digits: bool, chars: &str) { - pattern.push_str(r#"(\p{Alphabetic}"#); - if digits { - pattern.push_str(r#"|\d"#); - } - for grapheme in unicode_segmentation::UnicodeSegmentation::graphemes(chars, true) { - let escaped = regex::escape(&grapheme); - pattern.push_str(&format!("|{}", escaped)); - } - pattern.push(')'); - } } impl Default for TokenizerBuilder { fn default() -> Self { Self { - ignore_hex: true, + unicode: true, leading_digits: false, - leading_chars: "_".to_owned(), - include_digits: true, - include_chars: "_'".to_owned(), + ignore_hex: true, } } } @@ -90,9 +56,8 @@ impl Default for TokenizerBuilder { /// Extract Identifiers from a buffer. #[derive(Debug, Clone)] pub struct Tokenizer { - words_str: regex::Regex, - words_bytes: regex::bytes::Regex, - ignore_numbers: bool, + unicode: bool, + leading_digits: bool, ignore_hex: bool, } @@ -102,32 +67,46 @@ impl Tokenizer { } pub fn parse_str<'c>(&'c self, content: &'c str) -> impl Iterator> { - self.words_str - .find_iter(content) - .filter(move |m| self.accept(m.as_str().as_bytes())) - .map(|m| Identifier::new_unchecked(m.as_str(), m.start())) + let iter = if self.unicode { + itertools::Either::Left(unicode_parser::iter_literals(content)) + } else { + itertools::Either::Right(ascii_parser::iter_literals(content.as_bytes())) + }; + iter.filter_map(move |identifier| { + let offset = offset(content.as_bytes(), identifier.as_bytes()); + self.transform(identifier, offset) + }) } pub fn parse_bytes<'c>(&'c self, content: &'c [u8]) -> impl Iterator> { - self.words_bytes - .find_iter(content) - .filter(move |m| self.accept(m.as_bytes())) - .filter_map(|m| { - let s = std::str::from_utf8(m.as_bytes()).ok(); - s.map(|s| Identifier::new_unchecked(s, m.start())) - }) + let iter = if self.unicode { + let iter = Utf8Chunks::new(content).flat_map(move |c| unicode_parser::iter_literals(c)); + itertools::Either::Left(iter) + } else { + itertools::Either::Right(ascii_parser::iter_literals(content)) + }; + iter.filter_map(move |identifier| { + let offset = offset(content, identifier.as_bytes()); + self.transform(identifier, offset) + }) } - fn accept(&self, contents: &[u8]) -> bool { - if self.ignore_numbers && is_number(contents) { - return false; + fn transform<'i>(&self, identifier: &'i str, offset: usize) -> Option> { + debug_assert!(!identifier.is_empty()); + if self.leading_digits { + if is_number(identifier.as_bytes()) { + return None; + } + + if self.ignore_hex && is_hex(identifier.as_bytes()) { + return None; + } + } else if is_digit(identifier.as_bytes()[0]) { + return None; } - if self.ignore_hex && is_hex(contents) { - return false; - } - - true + let case = Case::None; + Some(Identifier::new_unchecked(identifier, case, offset)) } } @@ -137,34 +116,176 @@ impl Default for Tokenizer { } } -// `_`: number literal separator in Rust and other languages -// `'`: number literal separator in C++ -static DIGITS: once_cell::sync::Lazy = - once_cell::sync::Lazy::new(|| regex::bytes::Regex::new(r#"^[0-9_']+$"#).unwrap()); - -fn is_number(ident: &[u8]) -> bool { - DIGITS.is_match(ident) +fn offset(base: &[u8], needle: &[u8]) -> usize { + let base = base.as_ptr() as usize; + let needle = needle.as_ptr() as usize; + debug_assert!(base <= needle); + needle - base } -// `_`: number literal separator in Rust and other languages -// `'`: number literal separator in C++ -static HEX: once_cell::sync::Lazy = - once_cell::sync::Lazy::new(|| regex::bytes::Regex::new(r#"^0[xX][0-9a-fA-F_']+$"#).unwrap()); +struct Utf8Chunks<'s> { + source: &'s [u8], +} + +impl<'s> Utf8Chunks<'s> { + fn new(source: &'s [u8]) -> Self { + Self { source } + } +} + +impl<'s> Iterator for Utf8Chunks<'s> { + type Item = &'s str; + + fn next(&mut self) -> Option<&'s str> { + if self.source.is_empty() { + return None; + } + + match simdutf8::compat::from_utf8(self.source) { + Ok(valid) => { + self.source = b""; + Some(valid) + } + Err(error) => { + let (valid, after_valid) = self.source.split_at(error.valid_up_to()); + + if let Some(invalid_sequence_length) = error.error_len() { + self.source = &after_valid[invalid_sequence_length..]; + } else { + self.source = b""; + } + + let valid = unsafe { std::str::from_utf8_unchecked(valid) }; + Some(valid) + } + } + } +} + +fn is_number(ident: &[u8]) -> bool { + ident.iter().all(|b| is_digit(*b) || is_digit_sep(*b)) +} fn is_hex(ident: &[u8]) -> bool { - HEX.is_match(ident) + if ident.len() < 3 { + false + } else { + ident[0] == b'0' + && ident[1] == b'x' + && ident[2..] + .iter() + .all(|b| is_hex_digit(*b) || is_digit_sep(*b)) + } +} + +#[inline] +fn is_digit(chr: u8) -> bool { + chr.is_ascii_digit() +} + +#[inline] +fn is_digit_sep(chr: u8) -> bool { + // `_`: number literal separator in Rust and other languages + // `'`: number literal separator in C++ + chr == b'_' || chr == b'\'' +} + +#[inline] +fn is_hex_digit(chr: u8) -> bool { + chr.is_ascii_hexdigit() +} + +mod unicode_parser { + use nom::bytes::complete::*; + use nom::sequence::*; + use nom::IResult; + + pub(crate) fn iter_literals(mut input: &str) -> impl Iterator { + std::iter::from_fn(move || match next_literal(input) { + Ok((i, o)) => { + input = i; + debug_assert_ne!(o, ""); + Some(o) + } + _ => None, + }) + } + + fn next_literal(input: &str) -> IResult<&str, &str> { + preceded(literal_sep, identifier)(input) + } + + fn literal_sep(input: &str) -> IResult<&str, &str> { + take_till(unicode_xid::UnicodeXID::is_xid_continue)(input) + } + + fn identifier(input: &str) -> IResult<&str, &str> { + // Generally a language would be `{XID_Start}{XID_Continue}*` but going with only + // `{XID_Continue}+` because XID_Continue is a superset of XID_Start and rather catch odd + // or unexpected cases than strip off start characters to a word since we aren't doing a + // proper word boundary parse + take_while1(unicode_xid::UnicodeXID::is_xid_continue)(input) + } +} + +mod ascii_parser { + use nom::bytes::complete::*; + use nom::sequence::*; + use nom::IResult; + + pub(crate) fn iter_literals(mut input: &[u8]) -> impl Iterator { + std::iter::from_fn(move || match next_literal(input) { + Ok((i, o)) => { + input = i; + debug_assert_ne!(o, b""); + // This is safe because we've checked that the strings are a subset of ASCII + // characters. + let o = unsafe { std::str::from_utf8_unchecked(o) }; + Some(o) + } + _ => None, + }) + } + + fn next_literal(input: &[u8]) -> IResult<&[u8], &[u8]> { + preceded(literal_sep, identifier)(input) + } + + fn literal_sep(input: &[u8]) -> IResult<&[u8], &[u8]> { + take_till(is_continue)(input) + } + + fn identifier(input: &[u8]) -> IResult<&[u8], &[u8]> { + // Generally a language would be `{XID_Start}{XID_Continue}*` but going with only + // `{XID_Continue}+` because XID_Continue is a superset of XID_Start and rather catch odd + // or unexpected cases than strip off start characters to a word since we aren't doing a + // proper word boundary parse + take_while1(is_continue)(input) + } + + fn is_continue(c: u8) -> bool { + (b'a'..=b'z').contains(&c) + || (b'A'..=b'Z').contains(&c) + || (b'0'..=b'9').contains(&c) + || c == b'_' + } } /// A term composed of Words. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct Identifier<'t> { token: &'t str, + case: Case, offset: usize, } impl<'t> Identifier<'t> { - pub fn new_unchecked(token: &'t str, offset: usize) -> Self { - Self { token, offset } + pub fn new_unchecked(token: &'t str, case: Case, offset: usize) -> Self { + Self { + token, + case, + offset, + } } pub fn token(&self) -> &'t str { @@ -172,7 +293,7 @@ impl<'t> Identifier<'t> { } pub fn case(&self) -> Case { - Case::None + self.case } pub fn offset(&self) -> usize { @@ -181,7 +302,12 @@ impl<'t> Identifier<'t> { /// Split into individual Words. pub fn split(&self) -> impl Iterator> { - SplitIdent::new(self.token, self.offset) + match self.case { + Case::None => itertools::Either::Left(SplitIdent::new(self.token, self.offset)), + _ => itertools::Either::Right( + Some(Word::new_unchecked(self.token, self.case, self.offset)).into_iter(), + ), + } } } @@ -269,7 +395,7 @@ impl<'s> Iterator for SplitIdent<'s> { while let Some((i, c)) = self.char_indices.next() { let cur_mode = WordMode::classify(c); if cur_mode == WordMode::Boundary { - assert!(self.start_mode == WordMode::Boundary); + debug_assert!(self.start_mode == WordMode::Boundary); continue; } if self.start_mode == WordMode::Boundary { @@ -409,7 +535,7 @@ mod test { let parser = Tokenizer::new(); let input = "word"; - let expected: Vec = vec![Identifier::new_unchecked("word", 0)]; + let expected: Vec = vec![Identifier::new_unchecked("word", Case::None, 0)]; let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect(); assert_eq!(expected, actual); let actual: Vec<_> = parser.parse_str(input).collect(); @@ -422,8 +548,8 @@ mod test { let input = "A B"; let expected: Vec = vec![ - Identifier::new_unchecked("A", 0), - Identifier::new_unchecked("B", 2), + Identifier::new_unchecked("A", Case::None, 0), + Identifier::new_unchecked("B", Case::None, 2), ]; let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect(); assert_eq!(expected, actual); @@ -437,8 +563,8 @@ mod test { let input = "A.B"; let expected: Vec = vec![ - Identifier::new_unchecked("A", 0), - Identifier::new_unchecked("B", 2), + Identifier::new_unchecked("A", Case::None, 0), + Identifier::new_unchecked("B", Case::None, 2), ]; let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect(); assert_eq!(expected, actual); @@ -452,8 +578,8 @@ mod test { let input = "A::B"; let expected: Vec = vec![ - Identifier::new_unchecked("A", 0), - Identifier::new_unchecked("B", 3), + Identifier::new_unchecked("A", Case::None, 0), + Identifier::new_unchecked("B", Case::None, 3), ]; let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect(); assert_eq!(expected, actual); @@ -466,7 +592,7 @@ mod test { let parser = Tokenizer::new(); let input = "A_B"; - let expected: Vec = vec![Identifier::new_unchecked("A_B", 0)]; + let expected: Vec = vec![Identifier::new_unchecked("A_B", Case::None, 0)]; let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect(); assert_eq!(expected, actual); let actual: Vec<_> = parser.parse_str(input).collect(); @@ -475,12 +601,15 @@ mod test { #[test] fn tokenize_ignore_hex_enabled() { - let parser = TokenizerBuilder::new().ignore_hex(true).build(); + let parser = TokenizerBuilder::new() + .ignore_hex(true) + .leading_digits(true) + .build(); let input = "Hello 0xDEADBEEF World"; let expected: Vec = vec![ - Identifier::new_unchecked("Hello", 0), - Identifier::new_unchecked("World", 17), + Identifier::new_unchecked("Hello", Case::None, 0), + Identifier::new_unchecked("World", Case::None, 17), ]; let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect(); assert_eq!(expected, actual); @@ -497,9 +626,47 @@ mod test { let input = "Hello 0xDEADBEEF World"; let expected: Vec = vec![ - Identifier::new_unchecked("Hello", 0), - Identifier::new_unchecked("0xDEADBEEF", 6), - Identifier::new_unchecked("World", 17), + Identifier::new_unchecked("Hello", Case::None, 0), + Identifier::new_unchecked("0xDEADBEEF", Case::None, 6), + Identifier::new_unchecked("World", Case::None, 17), + ]; + let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect(); + assert_eq!(expected, actual); + let actual: Vec<_> = parser.parse_str(input).collect(); + assert_eq!(expected, actual); + } + + #[test] + fn tokenize_leading_digits_enabled() { + let parser = TokenizerBuilder::new() + .ignore_hex(false) + .leading_digits(true) + .build(); + + let input = "Hello 0Hello 124 0xDEADBEEF World"; + let expected: Vec = vec![ + Identifier::new_unchecked("Hello", Case::None, 0), + Identifier::new_unchecked("0Hello", Case::None, 6), + Identifier::new_unchecked("0xDEADBEEF", Case::None, 17), + Identifier::new_unchecked("World", Case::None, 28), + ]; + let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect(); + assert_eq!(expected, actual); + let actual: Vec<_> = parser.parse_str(input).collect(); + assert_eq!(expected, actual); + } + + #[test] + fn tokenize_leading_digits_disabled() { + let parser = TokenizerBuilder::new() + .ignore_hex(false) + .leading_digits(false) + .build(); + + let input = "Hello 0Hello 124 0xDEADBEEF World"; + let expected: Vec = vec![ + Identifier::new_unchecked("Hello", Case::None, 0), + Identifier::new_unchecked("World", Case::None, 28), ]; let actual: Vec<_> = parser.parse_bytes(input.as_bytes()).collect(); assert_eq!(expected, actual); @@ -564,7 +731,7 @@ mod test { ), ]; for (input, expected) in cases.iter() { - let ident = Identifier::new_unchecked(input, 0); + let ident = Identifier::new_unchecked(input, Case::None, 0); let result: Vec<_> = ident.split().map(|w| (w.token, w.case, w.offset)).collect(); assert_eq!(&result, expected); } diff --git a/docs/reference.md b/docs/reference.md index 1feda8a..a9fed8f 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -23,11 +23,9 @@ Configuration is read from the following (in precedence order) | default.binary | --binary | bool | Check binary files as text | | default.check-filename | \- | bool | Verifying spelling in file names. | | default.check-file | \- | bool | Verifying spelling in files. | +| default.unicode | --unicode | bool | Allow unicode characters in identifiers (and not just ASCII) | | default.ignore-hex | \- | bool | Do not check identifiers that appear to be hexadecimal values. | | default.identifier-leading-digits | \- | bool | Allow identifiers to start with digits, in addition to letters. | -| default.identifier-include-digits | \- | bool | Allow identifiers to include digits, in addition to letters. | -| default.identifier-leading-chars | \- | string | Allow identifiers to start with one of these characters. | -| default.identifier-include-chars | \- | string | Allow identifiers to include these characters. | | default.locale | --locale | en, en-us, en-gb, en-ca, en-au | English dialect to correct to. | | default.extend-identifiers | \- | table of strings | Corrections for identifiers. When the correction is blank, the word is never valid. When the correction is the key, the word is always valid. | | default.extend-words | \- | table of strings | Corrections for identifiers. When the correction is blank, the word is never valid. When the correction is the key, the word is always valid. | diff --git a/src/args.rs b/src/args.rs index aa8bf00..53f96ac 100644 --- a/src/args.rs +++ b/src/args.rs @@ -123,6 +123,12 @@ pub(crate) struct FileArgs { #[structopt(long, overrides_with("no-check-files"), hidden(true))] check_files: bool, + #[structopt(long, overrides_with("no-unicode"), hidden(true))] + unicode: bool, + #[structopt(long, overrides_with("unicode"))] + /// Only allow ASCII characters in identifiers + no_unicode: bool, + #[structopt( long, possible_values(&config::Locale::variants()), @@ -136,7 +142,10 @@ impl FileArgs { binary: self.binary(), check_filename: self.check_filename(), check_file: self.check_file(), - tokenizer: None, + tokenizer: Some(config::TokenizerConfig { + unicode: self.unicode(), + ..Default::default() + }), dict: Some(config::DictConfig { locale: self.locale, ..Default::default() @@ -145,30 +154,19 @@ impl FileArgs { } fn binary(&self) -> Option { - match (self.binary, self.no_binary) { - (true, false) => Some(true), - (false, true) => Some(false), - (false, false) => None, - (_, _) => unreachable!("StructOpt should make this impossible"), - } + resolve_bool_arg(self.binary, self.no_binary) } fn check_filename(&self) -> Option { - match (self.check_filenames, self.no_check_filenames) { - (true, false) => Some(true), - (false, true) => Some(false), - (false, false) => None, - (_, _) => unreachable!("StructOpt should make this impossible"), - } + resolve_bool_arg(self.check_filenames, self.no_check_filenames) + } + + fn unicode(&self) -> Option { + resolve_bool_arg(self.unicode, self.no_unicode) } fn check_file(&self) -> Option { - match (self.check_files, self.no_check_files) { - (true, false) => Some(true), - (false, true) => Some(false), - (false, false) => None, - (_, _) => unreachable!("StructOpt should make this impossible"), - } + resolve_bool_arg(self.check_files, self.no_check_files) } } @@ -244,56 +242,35 @@ impl WalkArgs { } fn ignore_hidden(&self) -> Option { - match (self.hidden, self.no_hidden) { - (true, false) => Some(false), - (false, true) => Some(true), - (false, false) => None, - (_, _) => unreachable!("StructOpt should make this impossible"), - } + resolve_bool_arg(self.no_hidden, self.hidden) } fn ignore_files(&self) -> Option { - match (self.no_ignore, self.ignore) { - (true, false) => Some(false), - (false, true) => Some(true), - (false, false) => None, - (_, _) => unreachable!("StructOpt should make this impossible"), - } + resolve_bool_arg(self.ignore, self.no_ignore) } fn ignore_dot(&self) -> Option { - match (self.no_ignore_dot, self.ignore_dot) { - (true, false) => Some(false), - (false, true) => Some(true), - (false, false) => None, - (_, _) => unreachable!("StructOpt should make this impossible"), - } + resolve_bool_arg(self.ignore_dot, self.no_ignore_dot) } fn ignore_vcs(&self) -> Option { - match (self.no_ignore_vcs, self.ignore_vcs) { - (true, false) => Some(false), - (false, true) => Some(true), - (false, false) => None, - (_, _) => unreachable!("StructOpt should make this impossible"), - } + resolve_bool_arg(self.ignore_vcs, self.no_ignore_vcs) } fn ignore_global(&self) -> Option { - match (self.no_ignore_global, self.ignore_global) { - (true, false) => Some(false), - (false, true) => Some(true), - (false, false) => None, - (_, _) => unreachable!("StructOpt should make this impossible"), - } + resolve_bool_arg(self.ignore_global, self.no_ignore_global) } fn ignore_parent(&self) -> Option { - match (self.no_ignore_parent, self.ignore_parent) { - (true, false) => Some(false), - (false, true) => Some(true), - (false, false) => None, - (_, _) => unreachable!("StructOpt should make this impossible"), - } + resolve_bool_arg(self.ignore_parent, self.no_ignore_parent) + } +} + +fn resolve_bool_arg(yes: bool, no: bool) -> Option { + match (yes, no) { + (true, false) => Some(true), + (false, true) => Some(false), + (false, false) => None, + (_, _) => unreachable!("StructOpt should make this impossible"), } } diff --git a/src/config.rs b/src/config.rs index 285009e..4e3ef1e 100644 --- a/src/config.rs +++ b/src/config.rs @@ -230,50 +230,38 @@ impl EngineConfig { #[serde(deny_unknown_fields, default)] #[serde(rename_all = "kebab-case")] pub struct TokenizerConfig { + /// Allow unicode characters in identifiers (and not just ASCII) + pub unicode: Option, /// Do not check identifiers that appear to be hexadecimal values. pub ignore_hex: Option, /// Allow identifiers to start with digits, in addition to letters. pub identifier_leading_digits: Option, - /// Allow identifiers to start with one of these characters. - pub identifier_leading_chars: Option, - /// Allow identifiers to include digits, in addition to letters. - pub identifier_include_digits: Option, - /// Allow identifiers to include these characters. - pub identifier_include_chars: Option, } impl TokenizerConfig { pub fn from_defaults() -> Self { let empty = Self::default(); Self { + unicode: Some(empty.unicode()), ignore_hex: Some(empty.ignore_hex()), identifier_leading_digits: Some(empty.identifier_leading_digits()), - identifier_leading_chars: Some(kstring::KString::from_ref( - empty.identifier_leading_chars(), - )), - identifier_include_digits: Some(empty.identifier_include_digits()), - identifier_include_chars: Some(kstring::KString::from_ref( - empty.identifier_include_chars(), - )), } } pub fn update(&mut self, source: &TokenizerConfig) { + if let Some(source) = source.unicode { + self.unicode = Some(source); + } if let Some(source) = source.ignore_hex { self.ignore_hex = Some(source); } if let Some(source) = source.identifier_leading_digits { self.identifier_leading_digits = Some(source); } - if let Some(source) = source.identifier_leading_chars.as_ref() { - self.identifier_leading_chars = Some(source.clone()); - } - if let Some(source) = source.identifier_include_digits { - self.identifier_include_digits = Some(source); - } - if let Some(source) = source.identifier_include_chars.as_ref() { - self.identifier_include_chars = Some(source.clone()); - } + } + + pub fn unicode(&self) -> bool { + self.unicode.unwrap_or(true) } pub fn ignore_hex(&self) -> bool { @@ -283,18 +271,6 @@ impl TokenizerConfig { pub fn identifier_leading_digits(&self) -> bool { self.identifier_leading_digits.unwrap_or(false) } - - pub fn identifier_leading_chars(&self) -> &str { - self.identifier_leading_chars.as_deref().unwrap_or("_") - } - - pub fn identifier_include_digits(&self) -> bool { - self.identifier_include_digits.unwrap_or(true) - } - - pub fn identifier_include_chars(&self) -> &str { - self.identifier_include_chars.as_deref().unwrap_or("_'") - } } #[derive(Debug, Clone, Default, PartialEq, Eq, serde::Serialize, serde::Deserialize)] diff --git a/src/policy.rs b/src/policy.rs index 7393be0..020eeb0 100644 --- a/src/policy.rs +++ b/src/policy.rs @@ -217,11 +217,9 @@ impl<'s> ConfigEngine<'s> { let dict_config = dict.unwrap_or_else(crate::config::DictConfig::from_defaults); let tokenizer = typos::tokens::TokenizerBuilder::new() + .unicode(tokenizer_config.unicode()) .ignore_hex(tokenizer_config.ignore_hex()) .leading_digits(tokenizer_config.identifier_leading_digits()) - .leading_chars(tokenizer_config.identifier_leading_chars().to_owned()) - .include_digits(tokenizer_config.identifier_include_digits()) - .include_chars(tokenizer_config.identifier_include_chars().to_owned()) .build(); let dict = crate::dict::BuiltIn::new(dict_config.locale());