Skip to content

Commit

Permalink
Improve wordlist, replacements.
Browse files Browse the repository at this point in the history
  • Loading branch information
finnbear committed May 4, 2024
1 parent bd7d85c commit cf67ec4
Show file tree
Hide file tree
Showing 10 changed files with 276 additions and 25 deletions.
7 changes: 5 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[package]
name = "rustrict"
authors = ["Finn Bear"]
version = "0.7.24"
version = "0.7.25"
edition = "2021"
license = "MIT OR Apache-2.0"
repository = "https://github.com/finnbear/rustrict/"
Expand Down Expand Up @@ -48,6 +48,9 @@ serde = ["dep:serde", "arrayvec/serde"]
[package.metadata.docs.rs]
features = ["censor", "context", "customize", "width"]

[profile.release]
panic = 'abort'

[dependencies]
arrayvec = {version = "0.7", optional = true}
finl_unicode = "1.2"
Expand All @@ -73,7 +76,7 @@ serde = {version = "1", features=["derive"], optional = true}
rand = "0.8"
csv = "1.1"
censor_crate = {package = "censor", version = "0.3.0"}
rustrict_old = {package = "rustrict", version = "0.7.21"}
rustrict_old = {package = "rustrict", version = "0.7.24"}
serial_test = "0.5"
bincode = "1.3.3"
serde_json = "1"
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ is used as a dataset. Positive accuracy is the percentage of profanity detected

| Crate | Accuracy | Positive Accuracy | Negative Accuracy | Time |
|-------|----------|-------------------|-------------------|------|
| [rustrict](https://crates.io/crates/rustrict) | 79.81% | 93.99% | 76.27% | 9s |
| [rustrict](https://crates.io/crates/rustrict) | 79.74% | 94.00% | 76.18% | 9s |
| [censor](https://crates.io/crates/censor) | 76.16% | 72.76% | 77.01% | 23s |

## Development
Expand Down
1 change: 1 addition & 0 deletions src/dictionary_blacklist.txt
Original file line number Diff line number Diff line change
Expand Up @@ -329,6 +329,7 @@ h
hand job
handjobs
hang yourself
hate negro
hater
haters
hates
Expand Down
3 changes: 3 additions & 0 deletions src/dictionary_extra.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#8
# of
2 secs
3 secs
4 secs
Expand Down Expand Up @@ -155,11 +156,13 @@ make a hole
minigame
mini game
n't eat
negativly
ngad
nigth
of agitation
omg
opps
outgaminged
pegging the
plss
plsss
Expand Down
72 changes: 71 additions & 1 deletion src/false_positives.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# of
#8
0 secs
2 secs
Expand Down Expand Up @@ -953,6 +954,7 @@ antiromance
antisex
antislavery
antispastic
antonio as
antonio or
ants cumulative
ants ext
Expand Down Expand Up @@ -3344,6 +3346,9 @@ bundles
bunga
burgh little
burgundies
burn china
burn israel
burn palestine
burst fu
burst its
burst texts
Expand Down Expand Up @@ -4979,7 +4984,6 @@ deboned
deboners
debug ger
debuggers
debut hole
decadic
decaspermal
decimosexto
Expand Down Expand Up @@ -7244,6 +7248,10 @@ gns hit
gns lut
gns perm
gns seeks
go died
go diego
go dies
go diet
go ok
go vary
go vulnerability
Expand Down Expand Up @@ -7357,6 +7365,7 @@ graphs hit
graphs lut
graphs perm
graphs seeks
grapiest
graping
grappling
grass cocktail
Expand Down Expand Up @@ -9182,6 +9191,7 @@ inns perm
inns seeks
inns semina
ino life
ino parents
inohymenitic
ins cumulative
ins eminem
Expand Down Expand Up @@ -9361,6 +9371,7 @@ it wat
it wats
it watts
italiano life
italiano parents
italic cocktail
italic commission
italic cook
Expand Down Expand Up @@ -9752,6 +9763,7 @@ kennedy ker
kennedy kevin
kennedy key
keno life
keno parents
kers cumulative
kers ext
kers hilt
Expand Down Expand Up @@ -11353,6 +11365,7 @@ miss seeks
missionary
mistful
mitchell
mitchell hole
mitchell illinois
mix linge
mixer da
Expand Down Expand Up @@ -11484,6 +11497,7 @@ moments hit
moments lut
moments perm
moments seeks
moms milk
monaco jones
monaco om
monaco on
Expand Down Expand Up @@ -11717,6 +11731,7 @@ n't eat
nabobish
nabobs
naggar
nail ger
nail zimb
nail zinc
nake da
Expand Down Expand Up @@ -11750,6 +11765,7 @@ nances lut
nances perm
nances seeks
nano life
nano parents
nanocephalus
nap anti
nap peru
Expand Down Expand Up @@ -11838,6 +11854,7 @@ negativing
negativism
negativist
negativity
negativly
negaton
negator
negatron
Expand Down Expand Up @@ -11990,6 +12007,38 @@ nigrosin
nigrous
nigth
nigua
nike er
nike exercise
nike rabbi
nike race
nike rach
nike racial
nike racing
nike rack
nike rad
nike rag
nike raid
nike rail
nike rain
nike rais
nike rale
nike rall
nike ralph
nike ran
nike rap
nike rare
nike rat
nike ray
nike re
nike refrig
nike republic
nike rh
nike ri
nike ro
nike ru
nike rw
nike rya
nike xerox
nilgai
nilgau
nilghai
Expand Down Expand Up @@ -12334,6 +12383,7 @@ ones lut
ones perm
ones seeks
ono life
ono parents
ont its
ont texts
ont thick
Expand Down Expand Up @@ -12390,6 +12440,7 @@ or appeal
or appear
or append
or jewish
or phantom
ora appeal
ora appear
ora append
Expand Down Expand Up @@ -12443,6 +12494,17 @@ organize men
orgyia
ornithocephalus
oroanal
orphanages
orphancy
orphandom
orphaned
orphange
orphanhood
orphaning
orphanism
orphanize
orphanry
orphans
orra appeal
orra appear
orra append
Expand Down Expand Up @@ -12521,6 +12583,7 @@ outers hit
outers lut
outers perm
outers seeks
outgaminged
outligger
outromance
outsuck
Expand Down Expand Up @@ -13190,6 +13253,7 @@ phys lut
phys perm
phys seeks
piano life
piano parents
pic cocktail
pic commission
pic cook
Expand Down Expand Up @@ -15031,6 +15095,7 @@ reno observation
reno observe
reno obtain
reno obv
reno parents
rents cumulative
rents ext
rents hilt
Expand Down Expand Up @@ -15723,6 +15788,7 @@ scopulate
scouriness
scrap
scrapling
scrappiest
screens cumulative
screens ext
screens hilt
Expand Down Expand Up @@ -16148,6 +16214,7 @@ sheets perm
sheets seeks
sheiklike
shell
shell hole
shell illinois
shellcracker
sheth
Expand Down Expand Up @@ -17605,6 +17672,7 @@ tech linking
tech links
tech little
techno life
techno parents
teens cumulative
teens ext
teens hilt
Expand Down Expand Up @@ -18392,6 +18460,7 @@ toshiba arizona
toshiba arling
toshiba arri
totanus
touch children
towcock
towers cumulative
towers ext
Expand Down Expand Up @@ -18520,6 +18589,7 @@ trapezoid
trapezophora
trapezophoron
trapezophozophora
trappiest
trasses
travesti dies
travesti it
Expand Down
21 changes: 12 additions & 9 deletions src/pii.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,15 @@ lazy_static! {
static ref PHONE : Regex = Regex::new(r#"(\+\d{1,2})?\s*\(?\d{3}\)?[\s\.-]*\d{3}[\s\.-]*\d{4}"#).unwrap();
static ref IP_ADDRESS : Regex = Regex::new(r#"(?:[0-9]{1,3}\.){3}[0-9]{1,3}"#).unwrap();
static ref EMAIL_ADDRESS : Regex = Regex::new(r#"(?i)[a-z0-9_\-]{3,}\s*(@|[\[\(\s]at[\s\)\]])\s*[a-z0-9_\-]{5,}\s*(\.|dot)\s*[a-z]{2,3}"#).unwrap();
static ref ADDRESS : Regex = Regex::new(r#"(?i)\d+[ ](?:[A-Za-z0-9\.-]+ )+(?:Avenue|Lane|Road|Boulevard|Drive|Street|Ave|Dr|Rd|Blvd|Ln|St)\.?(\s+#[0-9]{1,5})?"#).unwrap();
//static ref ADDRESS : Regex = Regex::new(r#"(?i)\d+[ ](?:[A-Za-z0-9\.-]+ )+(?:Avenue|Lane|Road|Boulevard|Drive|Street|Ave|Dr|Rd|Blvd|Ln|St)\.?(\s+#[0-9]{1,5})?"#).unwrap();
static ref NAME : Regex = Regex::new(r#"(?i)(real\s)?name\s+is:?\s[a-zA-Z]+(\s[a-zA-z]+)?"#).unwrap();
static ref URL : Regex = Regex::new(r#"(?i)(https?:?/*)?[a-zA-Z0-9]+\.[a-zA-Z]{2,3}"#).unwrap();
static ref URL : Regex = Regex::new(r#"(?i)(https?:?/*)?[a-zA-Z0-9]{4,}\.[a-zA-Z]{2,3}"#).unwrap();
}

/// Returns `s` with personally-identifiable information censored out, together with `true` if
/// anything was censored. The following kinds of information are targeted:
/// - phone numbers
/// - physical addresses
/// - physical addresses (disabled for now, due to excessive false positives)
/// - ip addresses
/// - email addresses
/// - self-described full names
Expand All @@ -28,8 +28,9 @@ pub fn censor_and_analyze_pii(s: &str) -> (String, bool) {
censored |= matches!(ret, Cow::Owned(_));
let ret = EMAIL_ADDRESS.replace_all(&ret, "****@*****.***");
censored |= matches!(ret, Cow::Owned(_));
let ret = ADDRESS.replace_all(&ret, "***** **** Ave #***");
censored |= matches!(ret, Cow::Owned(_));
// too many false positives
//let ret = ADDRESS.replace_all(&ret, "***** **** Ave #***");
//censored |= matches!(ret, Cow::Owned(_));
let ret = NAME.replace_all(&ret, "name is ***** *****");
censored |= matches!(ret, Cow::Owned(_));
let ret = URL.replace_all(&ret, "******.***");
Expand All @@ -51,6 +52,10 @@ mod tests {

#[test]
fn pii() {
/*
12345 SW 54th ST #150
go to 1234 Main Street for free candy
*/
let pii = r#"
[email protected]
hello f00 @ gmail.com
Expand All @@ -71,17 +76,15 @@ mod tests {
123.123.123.123
8.8.8.8
999.999.999.999
12345 SW 54th ST #150
go to 1234 Main Street for free candy
my name is: ALEX Smith
my real name is Alex smith
his name is alex smith
her real name is alex Smith
my name is alex. smith
hello.com
http://hello.com
https://foo.com
bar.com
https://foooo.com
barrr.com
example.org
twitch.tv
http:/chat.dev
Expand Down
Loading

0 comments on commit cf67ec4

Please sign in to comment.