-
-
Notifications
You must be signed in to change notification settings - Fork 10
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
10 changed files
with
276 additions
and
25 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -329,6 +329,7 @@ h | |
hand job | ||
handjobs | ||
hang yourself | ||
hate negro | ||
hater | ||
haters | ||
hates | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,15 +6,15 @@ lazy_static! { | |
static ref PHONE : Regex = Regex::new(r#"(\+\d{1,2})?\s*\(?\d{3}\)?[\s\.-]*\d{3}[\s\.-]*\d{4}"#).unwrap(); | ||
static ref IP_ADDRESS : Regex = Regex::new(r#"(?:[0-9]{1,3}\.){3}[0-9]{1,3}"#).unwrap(); | ||
static ref EMAIL_ADDRESS : Regex = Regex::new(r#"(?i)[a-z0-9_\-]{3,}\s*(@|[\[\(\s]at[\s\)\]])\s*[a-z0-9_\-]{5,}\s*(\.|dot)\s*[a-z]{2,3}"#).unwrap(); | ||
static ref ADDRESS : Regex = Regex::new(r#"(?i)\d+[ ](?:[A-Za-z0-9\.-]+ )+(?:Avenue|Lane|Road|Boulevard|Drive|Street|Ave|Dr|Rd|Blvd|Ln|St)\.?(\s+#[0-9]{1,5})?"#).unwrap(); | ||
//static ref ADDRESS : Regex = Regex::new(r#"(?i)\d+[ ](?:[A-Za-z0-9\.-]+ )+(?:Avenue|Lane|Road|Boulevard|Drive|Street|Ave|Dr|Rd|Blvd|Ln|St)\.?(\s+#[0-9]{1,5})?"#).unwrap(); | ||
static ref NAME : Regex = Regex::new(r#"(?i)(real\s)?name\s+is:?\s[a-zA-Z]+(\s[a-zA-z]+)?"#).unwrap(); | ||
static ref URL : Regex = Regex::new(r#"(?i)(https?:?/*)?[a-zA-Z0-9]+\.[a-zA-Z]{2,3}"#).unwrap(); | ||
static ref URL : Regex = Regex::new(r#"(?i)(https?:?/*)?[a-zA-Z0-9]{4,}\.[a-zA-Z]{2,3}"#).unwrap(); | ||
} | ||
|
||
/// Returns `s` with personally identifiable information censored out, and `true` if | ||
/// anything was censored. | ||
/// - phone numbers | ||
/// - physical addresses | ||
/// - physical addresses (disabled for now, due to excessive false positives) | ||
/// - ip addresses | ||
/// - email addresses | ||
/// - self-described full names | ||
|
@@ -28,8 +28,9 @@ pub fn censor_and_analyze_pii(s: &str) -> (String, bool) { | |
censored |= matches!(ret, Cow::Owned(_)); | ||
let ret = EMAIL_ADDRESS.replace_all(&ret, "****@*****.***"); | ||
censored |= matches!(ret, Cow::Owned(_)); | ||
let ret = ADDRESS.replace_all(&ret, "***** **** Ave #***"); | ||
censored |= matches!(ret, Cow::Owned(_)); | ||
// too many false positives | ||
//let ret = ADDRESS.replace_all(&ret, "***** **** Ave #***"); | ||
//censored |= matches!(ret, Cow::Owned(_)); | ||
let ret = NAME.replace_all(&ret, "name is ***** *****"); | ||
censored |= matches!(ret, Cow::Owned(_)); | ||
let ret = URL.replace_all(&ret, "******.***"); | ||
|
@@ -51,6 +52,10 @@ mod tests { | |
|
||
#[test] | ||
fn pii() { | ||
/* | ||
12345 SW 54th ST #150 | ||
go to 1234 Main Street for free candy | ||
*/ | ||
let pii = r#" | ||
[email protected] | ||
hello f00 @ gmail.com | ||
|
@@ -71,17 +76,15 @@ mod tests { | |
123.123.123.123 | ||
8.8.8.8 | ||
999.999.999.999 | ||
12345 SW 54th ST #150 | ||
go to 1234 Main Street for free candy | ||
my name is: ALEX Smith | ||
my real name is Alex smith | ||
his name is alex smith | ||
her real name is alex Smith | ||
my name is alex. smith | ||
hello.com | ||
http://hello.com | ||
https://foo.com | ||
bar.com | ||
https://foooo.com | ||
barrr.com | ||
example.org | ||
twitch.tv | ||
http:/chat.dev | ||
|
Oops, something went wrong.