Skip to content

Commit

Permalink
Improve wordlists and algorithm (0.7.18)
Browse files Browse the repository at this point in the history
* Improve wordlists and algorithm (0.7.18)

* Cleanup.

* More improvements.
  • Loading branch information
finnbear authored Dec 9, 2023
1 parent 9502430 commit c37227a
Show file tree
Hide file tree
Showing 13 changed files with 256 additions and 264 deletions.
5 changes: 3 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[package]
name = "rustrict"
authors = ["Finn Bear"]
version = "0.7.17"
version = "0.7.18"
edition = "2021"
license = "MIT OR Apache-2.0"
repository = "https://github.com/finnbear/rustrict/"
Expand Down Expand Up @@ -41,6 +41,7 @@ width = ["lazy_static"]
find_false_positives = ["censor", "regex", "indicatif", "rayon"]
find_replacements = ["csv"]
trace = ["censor"]
trace_full = ["trace"]
serde = ["dep:serde", "arrayvec/serde"]

[package.metadata.docs.rs]
Expand Down Expand Up @@ -71,7 +72,7 @@ serde = {version = "1", features=["derive"], optional = true}
rand = "0.8"
csv = "1.1"
censor_crate = {package = "censor", version = "0.3.0"}
rustrict_old = {package = "rustrict", version = "0.7.15"}
rustrict_old = {package = "rustrict", version = "0.7.17"}
serial_test = "0.5"
bincode = "1.3.3"
serde_json = "1"
3 changes: 3 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ widths:
test:
cargo test --release --features width,serde -- --nocapture

compare:
COMPARE=1 make test

# Skips accuracy analysis so finishes faster.
test_debug:
cargo test
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ is used as a dataset. Positive accuracy is the percentage of profanity detected

| Crate | Accuracy | Positive Accuracy | Negative Accuracy | Time |
|-------|----------|-------------------|-------------------|------|
| [rustrict](https://crates.io/crates/rustrict) | 80.18% | 93.93% | 76.76% | 8s |
| [rustrict](https://crates.io/crates/rustrict) | 79.86% | 93.96% | 76.34% | 8s |
| [censor](https://crates.io/crates/censor) | 76.16% | 72.76% | 77.01% | 23s |

## Development
Expand Down
2 changes: 1 addition & 1 deletion pages/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ version = "0.1.0"
edition = "2021"

[dependencies]
rustrict = { path = "..", features = ["width"] }
rustrict = { path = "..", features = ["trace_full", "width"] }
yew = { version = "0.21", features = ["csr"] }

[dependencies.web-sys]
Expand Down
7 changes: 5 additions & 2 deletions pages/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,12 @@ fn app() -> Html {
analysis_element.set_inner_html("N/A");
censored_element.set_value("");
} else {
let (censored, analysis) = Censor::from_str(&uncensored).censor_and_analyze();
let mut censor = Censor::from_str(&uncensored);
let (censored, analysis) = censor.censor_and_analyze();
let count = censor.total_matches();
let detections = censor.detections();
let width = rustrict::width_str(&uncensored);
let result = format!("{analysis:?} (width = {width})");
let result = format!("{analysis:?} (width = {width}, count = {count}, detections = {detections:?})");
analysis_element.set_inner_html(&result);
censored_element.set_value(&censored);
}
Expand Down
50 changes: 44 additions & 6 deletions src/censor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,8 @@ struct AllocatedState {
matches_tmp: Set<Match>,
/// Where matches are kept after they are complete but may be cancelled due to false positives.
pending_commit: Vec<Match>,
#[cfg(feature = "trace_full")]
detections: crate::Map<String, usize>,
}

impl AllocatedState {
Expand All @@ -124,10 +126,14 @@ impl AllocatedState {
matches,
matches_tmp,
pending_commit,
#[cfg(feature = "trace_full")]
detections,
} = self;
matches.clear();
matches_tmp.clear();
pending_commit.clear();
#[cfg(feature = "trace_full")]
detections.clear();
}
}

Expand Down Expand Up @@ -319,6 +325,11 @@ impl<I: Iterator<Item = char>> Censor<I> {
self.inline.total_match_characters
}

/// Returns the per-detection counts recorded so far.
///
/// Keys are the trace string of each matched node and values are how many
/// times a match on that node was counted (incremented alongside
/// `total_matches`). The map is emptied whenever the allocated state is
/// cleared.
///
/// This accessor only borrows the map; it does not advance the censor, so
/// the counts reflect only the input that has already been processed.
///
/// Only available with the `trace_full` feature.
#[cfg(feature = "trace_full")]
pub fn detections(&self) -> &crate::Map<String, usize> {
&self.allocated.detections
}

fn ensure_done(&mut self) {
if !self.inline.done {
for _ in self {}
Expand Down Expand Up @@ -528,6 +539,15 @@ impl<I: Iterator<Item = char>> Iterator for Censor<I> {
for m in self.allocated.matches_tmp.iter() {
let m = m.clone();

if m.low_confidence_replacements > 5
|| m.skipped > 5
|| (m.node.word && m.repetitions > 20)
{
#[cfg(feature = "trace")]
println!("throwing out low confidence match: \"{}\"", m.node.trace);
//continue;
}

safety_end = safety_end.min(m.start);

#[cfg(feature = "trace")]
Expand All @@ -536,17 +556,19 @@ impl<I: Iterator<Item = char>> Iterator for Censor<I> {
m.node.trace, m.spaces, m.replacements
);

let new_repetition = c == m.last;
if (skippable || new_repetition) && m.start != pos.unwrap_or(0) {
if (skippable || c == m.last || Some(c) == m.node.last)
&& m.start != pos.unwrap_or(0)
{
// Here, '.' is primarily for allowing ellipsis ("...") as a form of
// space.
// ( and ) are for ignoring appositive phrases.
// Checking node.last is to collapse multiple spaces into one
let new_space = matches!(c, ' ' | '.' | ',' | ':' | ';' | '…' | '(' | ')')
&& m.node.last != Some(' ');
let new_skip = !new_space && skippable && !ignore_sep;
let new_repetition: bool = !new_space && c == m.last;
let new_skip = !new_space && skippable && !ignore_sep && !new_repetition;
// dil -> dii
let new_replacement = c == m.last && raw_c != c;
let new_replacement = c == m.last && raw_c != c && !new_repetition;
let new_low_confidence_replacement =
new_replacement && raw_c.is_ascii_digit();

Expand All @@ -558,6 +580,7 @@ impl<I: Iterator<Item = char>> Iterator for Censor<I> {
.low_confidence_replacements
.saturating_add(new_low_confidence_replacement as u8),
repetitions: m.repetitions.saturating_add(new_repetition as u8),
last: c,
..m
};
#[cfg(feature = "trace")]
Expand Down Expand Up @@ -664,8 +687,11 @@ impl<I: Iterator<Item = char>> Iterator for Censor<I> {
let spy = &mut self.buffer;
let options = &self.options;
let inline = &mut self.inline;
let pending_commit = &mut self.allocated.pending_commit;
#[cfg(feature = "trace_full")]
let detections = &mut self.allocated.detections;

self.allocated.pending_commit.retain(|pending| {
pending_commit.retain(|pending| {
#[cfg(feature = "trace")]
println!("Consider whether to cancel pending commit {} with start={} against drain_start={:?}", pending.node.trace, pending.start, drain_start);

Expand All @@ -692,6 +718,10 @@ impl<I: Iterator<Item = char>> Iterator for Censor<I> {
inline.match_ptrs ^= pending.node as *const _ as usize;
inline.total_matches += 1;
inline.total_match_characters += pending.end - pending.start;
#[cfg(feature = "trace_full")]
{
*detections.entry(pending.node.trace.clone()).or_default() += 1;
}
}
}
return false;
Expand Down Expand Up @@ -738,6 +768,14 @@ impl<I: Iterator<Item = char>> Iterator for Censor<I> {
self.inline.match_ptrs ^= pending.node as *const _ as usize;
self.inline.total_matches += 1;
self.inline.total_match_characters += pending.end - pending.start;
#[cfg(feature = "trace_full")]
{
*self
.allocated
.detections
.entry(pending.node.trace.clone())
.or_default() += 1;
}
}
}
}
Expand Down Expand Up @@ -1128,7 +1166,7 @@ mod tests {
"https://crates.io/crates/rustrict",
rustrict,
false, // true,
Some(rustrict_old),
Some(rustrict_old as fn(&str) -> bool).filter(|_| std::env::var("COMPARE").is_ok()),
);
print_accuracy("https://crates.io/crates/censor", censor, false, None);
}
Expand Down
1 change: 1 addition & 0 deletions src/dictionary_extra.txt
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ braig
brain cell
braincell
brauer
brazilian s
btw it
cheese nips
chonkey
Expand Down
21 changes: 20 additions & 1 deletion src/false_positive_finder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,26 @@ fn main() {

progress.finish();

let mut sorted: Vec<_> = false_positives.into_inner().unwrap().into_iter().collect();
let mut false_positives = false_positives.into_inner().unwrap();

let clone = false_positives.clone();
false_positives.retain(|false_positive| {
let baseline = is_ignore_fp(false_positive.chars(), true);
for clone in &clone {
if false_positive.len() != clone.len()
&& (false_positive.starts_with(clone) || false_positive.ends_with(clone))
{
let shorter = is_ignore_fp(clone.chars(), true);
if baseline == shorter {
println!("filter out {false_positive} in favor of {clone}");
return false;
}
}
}
true
});

let mut sorted: Vec<_> = false_positives.into_iter().collect();
sorted.sort();

fs::write("src/false_positives.txt", sorted.join("\n")).unwrap();
Expand Down
Loading

0 comments on commit c37227a

Please sign in to comment.