
Commit 798ad68
Bump tantivy version, and add phrase prefix query support. (#3543)
fmassot authored Jun 12, 2023
1 parent 6adf4bd commit 798ad68
Showing 8 changed files with 76 additions and 39 deletions.
24 changes: 12 additions & 12 deletions quickwit/Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion quickwit/Cargo.toml
@@ -221,7 +221,7 @@ quickwit-serve = { version = "0.6.0", path = "./quickwit-serve" }
quickwit-storage = { version = "0.6.0", path = "./quickwit-storage" }
quickwit-telemetry = { version = "0.6.0", path = "./quickwit-telemetry" }

-tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "7ee78bd", default-features = false, features = [
+tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "924fc70", default-features = false, features = [
"mmap",
"lz4-compression",
"zstd-compression",
2 changes: 1 addition & 1 deletion quickwit/quickwit-doc-mapper/src/doc_mapper.rs
@@ -388,7 +388,7 @@ mod tests {
let (query, _) = doc_mapper.query(schema, &query_ast, true).unwrap();
assert_eq!(
format!("{query:?}"),
-r#"BooleanQuery { subqueries: [(Should, TermQuery(Term(field=0, type=Json, path=toto, type=U64, 5))), (Should, TermQuery(Term(field=0, type=Json, path=toto, type=Str, "5")))] }"#
+r#"BooleanQuery { subqueries: [(Should, TermQuery(Term(field=0, type=Json, path=toto, type=I64, 5))), (Should, TermQuery(Term(field=0, type=Json, path=toto, type=Str, "5")))] }"#
);
}

@@ -67,7 +67,7 @@ impl ConvertableToQueryAst for MatchPhrasePrefix {
let phrase_prefix_query_ast = query_ast::PhrasePrefixQuery {
field: self.field,
phrase: query,
-analyzer,
+params: analyzer,
max_expansions,
};
Ok(phrase_prefix_query_ast.into())
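
The rename above (`analyzer` to `params`) is mechanical but touches every construction site. For reference, a minimal sketch of building the renamed AST node directly, assuming the types are exported from `quickwit_query::query_ast` as this diff suggests:

use quickwit_query::query_ast::{FullTextParams, PhrasePrefixQuery, QueryAst};

// Minimal sketch (module paths assumed): after the rename, the full-text
// analysis settings travel in the `params` field, not `analyzer`.
fn phrase_prefix_ast(params: FullTextParams) -> QueryAst {
    PhrasePrefixQuery {
        field: "title".to_string(),
        phrase: "quick bro".to_string(), // matches "quick brown", "quick broker", ...
        max_expansions: 50,              // cap on how many completions of "bro" are tried
        params,
    }
    .into()
}
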
4 changes: 2 additions & 2 deletions quickwit/quickwit-query/src/query_ast/full_text_query.rs
@@ -68,7 +68,7 @@ impl FullTextParams {
let text_indexing_options = json_options
.get_text_indexing_options()
.with_context(|| format!("Json field text `{}` is not indexed", json_path))?;
-let text_analyzer: TextAnalyzer = self.text_analyzer(text_indexing_options)?;
+let mut text_analyzer: TextAnalyzer = self.text_analyzer(text_indexing_options)?;
let mut token_stream: BoxTokenStream = text_analyzer.token_stream(text);
let mut tokens = Vec::new();
let mut term = Term::with_capacity(100);
@@ -91,7 +91,7 @@
text: &str,
text_field_indexing: &TextFieldIndexing,
) -> anyhow::Result<Vec<(usize, Term)>> {
-let text_analyzer: TextAnalyzer = self.text_analyzer(text_field_indexing)?;
+let mut text_analyzer: TextAnalyzer = self.text_analyzer(text_field_indexing)?;
let mut token_stream: BoxTokenStream = text_analyzer.token_stream(text);
let mut tokens = Vec::new();
token_stream.process(&mut |token| {
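
The two `mut` bindings above are required by the bumped tantivy revision, where `Tokenizer::token_stream` takes `&mut self` (see the tokenizers.rs changes below). A small standalone sketch of the updated calling convention, using only APIs visible in this diff:

use tantivy::tokenizer::{LowerCaser, SimpleTokenizer, TextAnalyzer, TokenStream};

fn main() {
    // `token_stream` now borrows the analyzer mutably, hence `let mut`.
    let mut analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
        .filter(LowerCaser)
        .build();
    let mut stream = analyzer.token_stream("Hello, World!");
    while stream.advance() {
        println!("{}", stream.token().text); // prints "hello" then "world"
    }
}
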
8 changes: 4 additions & 4 deletions quickwit/quickwit-query/src/query_ast/phrase_prefix_query.rs
@@ -36,7 +36,7 @@ pub struct PhrasePrefixQuery {
pub field: String,
pub phrase: String,
pub max_expansions: u32,
-pub analyzer: FullTextParams,
+pub params: FullTextParams,
}

impl PhrasePrefixQuery {
@@ -63,7 +63,7 @@ impl PhrasePrefixQuery {
));
}

-let terms = self.analyzer.tokenize_text_into_terms(
+let terms = self.params.tokenize_text_into_terms(
field,
&self.phrase,
text_field_indexing,
@@ -85,7 +85,7 @@
.to_string(),
));
}
-let terms = self.analyzer.tokenize_text_into_terms_json(
+let terms = self.params.tokenize_text_into_terms_json(
field,
json_path,
&self.phrase,
@@ -116,7 +116,7 @@ impl BuildTantivyAst for PhrasePrefixQuery {
let (_, terms) = self.get_terms(schema)?;

if terms.is_empty() {
-if self.analyzer.zero_terms_query.is_none() {
+if self.params.zero_terms_query.is_none() {
Ok(TantivyQueryAst::match_none())
} else {
Ok(TantivyQueryAst::match_all())
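
When tokenization yields no terms (for example, a phrase consisting only of characters the analyzer drops), the query falls back to match-none unless `zero_terms_query` overrides it. In the non-empty case, the AST compiles down to tantivy's new phrase-prefix query; a hypothetical sketch of that target, with constructor and setter names assumed to mirror tantivy's `PhraseQuery` API at the pinned revision:

use tantivy::query::PhrasePrefixQuery as TantivyPhrasePrefixQuery;
use tantivy::schema::Schema;
use tantivy::Term;

// Sketch under assumed API: positional offsets from tokenization are kept so
// the phrase structure survives; the last term acts as a prefix whose
// expansion is capped by `max_expansions`.
fn build(schema: &Schema) -> TantivyPhrasePrefixQuery {
    let field = schema.get_field("title").unwrap();
    let terms = vec![
        (0, Term::from_field_text(field, "quick")),
        (1, Term::from_field_text(field, "bro")), // the prefix term
    ];
    let mut query = TantivyPhrasePrefixQuery::new_with_offset(terms);
    query.set_max_expansions(50);
    query
}
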
31 changes: 31 additions & 0 deletions quickwit/quickwit-query/src/query_ast/user_input_query.rs
@@ -32,6 +32,8 @@ use crate::query_ast::tantivy_query_ast::TantivyQueryAst;
use crate::query_ast::{self, BuildTantivyAst, FullTextMode, FullTextParams, QueryAst};
use crate::{BooleanOperand, InvalidQuery, JsonLiteral};

+const DEFAULT_PHRASE_QUERY_MAX_EXPANSION: u32 = 50;
+
/// A query expressed in the tantivy query grammar DSL.
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
pub struct UserInputQuery {
@@ -182,6 +184,7 @@ fn convert_user_input_literal(
let UserInputLiteral {
field_name,
phrase,
+prefix,
delimiter,
slop,
} = user_input_literal;
@@ -211,6 +214,15 @@
let mut phrase_queries: Vec<QueryAst> = field_names
.into_iter()
.map(|field_name| {
+if prefix {
+return query_ast::PhrasePrefixQuery {
+field: field_name,
+phrase: phrase.clone(),
+params: full_text_params.clone(),
+max_expansions: DEFAULT_PHRASE_QUERY_MAX_EXPANSION,
+}
+.into();
+}
query_ast::FullTextQuery {
field: field_name,
text: phrase.clone(),
@@ -309,6 +321,25 @@ mod tests {
);
}

+#[test]
+fn test_user_input_query_phrase_with_prefix() {
+let ast = UserInputQuery {
+user_text: "field:\"hello\"*".to_string(),
+default_fields: None,
+default_operator: BooleanOperand::And,
+}
+.parse_user_query(&[])
+.unwrap();
+let QueryAst::PhrasePrefix(phrase_prefix_query) = ast else { panic!() };
+assert_eq!(&phrase_prefix_query.field, "field");
+assert_eq!(&phrase_prefix_query.phrase, "hello");
+assert_eq!(phrase_prefix_query.max_expansions, 50);
+assert_eq!(
+phrase_prefix_query.params.mode,
+FullTextMode::Phrase { slop: 0 }
+);
+}
+
#[test]
fn test_user_input_query_override_default_fields() {
let ast = UserInputQuery {
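
Putting the new syntax together, a usage sketch (crate-level export paths are assumptions; the struct fields match this diff): a trailing `*` after a quoted phrase in the query-string DSL now parses into a `PhrasePrefix` node with the default expansion cap of 50.

use quickwit_query::query_ast::{QueryAst, UserInputQuery};
use quickwit_query::BooleanOperand;

fn main() {
    let ast = UserInputQuery {
        user_text: "title:\"quick bro\"*".to_string(),
        default_fields: None,
        default_operator: BooleanOperand::And,
    }
    .parse_user_query(&[]) // no extra default search fields
    .unwrap();
    // The trailing `*` turns the quoted phrase into a phrase-prefix query.
    assert!(matches!(ast, QueryAst::PhrasePrefix(_)));
}
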
42 changes: 24 additions & 18 deletions quickwit/quickwit-query/src/tokenizers.rs
@@ -26,7 +26,7 @@ use tantivy::tokenizer::{
};

fn create_quickwit_tokenizer_manager() -> TokenizerManager {
-let raw_tokenizer = TextAnalyzer::builder(RawTokenizer)
+let raw_tokenizer = TextAnalyzer::builder(RawTokenizer::default())
.filter(RemoveLongFilter::limit(255))
.build();

@@ -41,14 +41,14 @@ fn create_quickwit_tokenizer_manager() -> TokenizerManager {

tokenizer_manager.register(
"default",
-TextAnalyzer::builder(tantivy::tokenizer::SimpleTokenizer)
+TextAnalyzer::builder(tantivy::tokenizer::SimpleTokenizer::default())
.filter(RemoveLongFilter::limit(255))
.filter(LowerCaser)
.build(),
);
tokenizer_manager.register(
"en_stem",
-TextAnalyzer::builder(tantivy::tokenizer::SimpleTokenizer)
+TextAnalyzer::builder(tantivy::tokenizer::SimpleTokenizer::default())
.filter(RemoveLongFilter::limit(255))
.filter(LowerCaser)
.filter(tantivy::tokenizer::Stemmer::new(
@@ -61,11 +61,11 @@ fn create_quickwit_tokenizer_manager() -> TokenizerManager {
}

fn create_quickwit_fastfield_normalizer_manager() -> TokenizerManager {
-let raw_tokenizer = TextAnalyzer::builder(RawTokenizer)
+let raw_tokenizer = TextAnalyzer::builder(RawTokenizer::default())
.filter(RemoveLongFilter::limit(255))
.build();

-let lower_case_tokenizer = TextAnalyzer::builder(RawTokenizer)
+let lower_case_tokenizer = TextAnalyzer::builder(RawTokenizer::default())
.filter(LowerCaser)
.filter(RemoveLongFilter::limit(255))
.build();
@@ -82,7 +82,7 @@ struct ChineseTokenizer;
impl Tokenizer for ChineseTokenizer {
type TokenStream<'a> = ChineseTokenStream<'a>;

-fn token_stream<'a>(&self, text: &'a str) -> Self::TokenStream<'a> {
+fn token_stream<'a>(&mut self, text: &'a str) -> Self::TokenStream<'a> {
ChineseTokenStream {
text,
last_char: None,
@@ -209,21 +209,27 @@ mod tests {
sand in my face
"#;

-let tokenizer = get_quickwit_tokenizer_manager().get("raw").unwrap();
-let mut haiku_stream = tokenizer.token_stream(my_haiku);
-assert!(haiku_stream.advance());
-assert!(!haiku_stream.advance());
-let my_too_long_text = vec!["a".repeat(255)].join("");
-assert!(!tokenizer.token_stream(&my_too_long_text).advance());
-let my_long_text = vec!["a".repeat(254)].join("");
-assert!(tokenizer.token_stream(&my_long_text).advance());
+let mut tokenizer = get_quickwit_tokenizer_manager().get("raw").unwrap();
+{
+let mut haiku_stream = tokenizer.token_stream(my_haiku);
+assert!(haiku_stream.advance());
+assert!(!haiku_stream.advance());
+}
+{
+let my_too_long_text = vec!["a".repeat(255)].join("");
+assert!(!tokenizer.token_stream(&my_too_long_text).advance());
+}
+{
+let my_long_text = vec!["a".repeat(254)].join("");
+assert!(tokenizer.token_stream(&my_long_text).advance());
+}
}

#[test]
fn test_chinese_tokenizer() {
let text = "Hello world, 你好世界, bonjour monde";

let tokenizer = get_quickwit_tokenizer_manager()
let mut tokenizer = get_quickwit_tokenizer_manager()
.get("chinese_compatible")
.unwrap();
let mut text_stream = tokenizer.token_stream(text);
@@ -300,7 +306,7 @@
fn test_chinese_tokenizer_no_space() {
let text = "Hello你好bonjour";

-let tokenizer = get_quickwit_tokenizer_manager()
+let mut tokenizer = get_quickwit_tokenizer_manager()
.get("chinese_compatible")
.unwrap();
let mut text_stream = tokenizer.token_stream(text);
@@ -347,8 +353,8 @@
proptest::proptest! {
#[test]
fn test_proptest_ascii_default_chinese_equal(text in "[ -~]{0,64}") {
-let cn_tok = get_quickwit_tokenizer_manager().get("chinese_compatible").unwrap();
-let default_tok = get_quickwit_tokenizer_manager().get("default").unwrap();
+let mut cn_tok = get_quickwit_tokenizer_manager().get("chinese_compatible").unwrap();
+let mut default_tok = get_quickwit_tokenizer_manager().get("default").unwrap();

let mut text_stream = cn_tok.token_stream(&text);

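
The inner braces added to the raw-tokenizer test are not cosmetic: each `token_stream` call now takes a mutable borrow of the analyzer, so the previous stream must be dropped before the tokenizer can be borrowed again. A minimal sketch of the pattern (the manager's import path is assumed):

use quickwit_query::tokenizers::get_quickwit_tokenizer_manager; // path assumed
use tantivy::tokenizer::TokenStream;

fn two_streams() {
    let mut tokenizer = get_quickwit_tokenizer_manager().get("raw").unwrap();
    {
        let mut first = tokenizer.token_stream("first borrow");
        assert!(first.advance());
    } // `first` dropped here, releasing the mutable borrow on `tokenizer`
    let mut second = tokenizer.token_stream("second borrow");
    assert!(second.advance());
}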
