Text search experimentation #740

Draft · wants to merge 1 commit into master

11 changes: 10 additions & 1 deletion isic/core/models/collection.py
@@ -1,5 +1,6 @@
from django.contrib.auth.models import User
from django.contrib.postgres.indexes import GinIndex, OpClass
from django.contrib.postgres.search import SearchQuery, SearchRank, SearchVector
from django.core.exceptions import ValidationError
from django.db import models
from django.db.models.constraints import CheckConstraint, UniqueConstraint
@@ -28,6 +29,14 @@ def regular(self):
        # regular means not magic
        return self.filter(cohort=None)

    def text_search(self, value: str, rank_threshold: float = 0.0):
        vector = SearchVector("name", weight="A") + SearchVector("description", weight="B")
        return (
            self.annotate(search_rank=SearchRank(vector, SearchQuery(value)))
            .order_by("-search_rank")
            .filter(search_rank__gt=rank_threshold)
        )


class Collection(TimeStampedModel):
"""
Expand All @@ -51,7 +60,7 @@ class Meta(TimeStampedModel.Meta):
]
indexes = [
# icontains uses Upper(name) for searching
GinIndex(OpClass(Upper("name"), name="gin_trgm_ops"), name="collection_name_gin")
GinIndex(OpClass(Upper("name"), name="gin_trgm_ops"), name="collection_name_gin"),
]

creator = models.ForeignKey(User, on_delete=models.PROTECT)
12 changes: 11 additions & 1 deletion isic/core/models/image.py
@@ -1,6 +1,7 @@
from django.contrib.auth.models import User
from django.contrib.postgres.aggregates.general import ArrayAgg
from django.contrib.postgres.indexes import GinIndex, OpClass
from django.contrib.postgres.search import SearchQuery, SearchRank, SearchVector
from django.db import models
from django.db.models.constraints import CheckConstraint
from django.db.models.expressions import F
@@ -41,12 +42,21 @@ def public(self):
    def private(self):
        return self.filter(public=False)

    def text_search(self, value: str, rank_threshold: float = 0.0):
        vector = SearchVector("isic", "accession__metadata__diagnosis")
        return (
            self.select_related("accession")
            .annotate(search_rank=SearchRank(vector, SearchQuery(value)))
            .order_by("-search_rank")
            .filter(search_rank__gt=rank_threshold)
        )


class Image(CreationSortedTimeStampedModel):
    class Meta(CreationSortedTimeStampedModel.Meta):
        indexes = [
            # icontains uses Upper(name) for searching
            GinIndex(OpClass(Upper("isic"), name="gin_trgm_ops"), name="isic_name_gin")
            GinIndex(OpClass(Upper("isic"), name="gin_trgm_ops"), name="isic_name_gin"),
        ]

    accession = models.OneToOneField(
20 changes: 20 additions & 0 deletions isic/find/find.py
@@ -2,6 +2,7 @@
from typing import Optional

from django.contrib.auth.models import AnonymousUser, User
from django.db.models.query import QuerySet
from django.db.models.query_utils import Q
from jaro import jaro_winkler_metric

@@ -104,3 +105,22 @@ def default_sort(search, v):
        ret.append(serializer.data)

    return ret


def text_search(value: str, category_limit: int = 10) -> dict:
    return {
        "collections": Collection.objects.text_search(value)[:category_limit],
        "images": Image.objects.text_search(value)[:category_limit],
        "studies": Study.objects.text_search(value)[:category_limit],
    }


def smart_text_search(value: str, limit: int = 10) -> QuerySet[Collection | Image | Study]:
    """Try to figure out the most relevant category based on the given search value."""

    def category_relevance(qs: QuerySet):
        # Simplistic example algorithm: sum of top k result ranks.
        # Naturally penalizes categories that return fewer than `limit` results.
        return sum(obj.search_rank for obj in qs)

    return max(text_search(value, category_limit=limit).values(), key=category_relevance)
9 changes: 9 additions & 0 deletions isic/studies/models.py
@@ -4,6 +4,7 @@

from django.contrib.auth.models import User
from django.contrib.postgres.fields import ArrayField
from django.contrib.postgres.search import SearchQuery, SearchRank, SearchVector
from django.core.exceptions import ValidationError
from django.db import models
from django.db.models import Case, CharField, Value, When
@@ -135,6 +136,14 @@ def public(self):
    def private(self):
        return self.filter(public=False)

    def text_search(self, value: str, rank_threshold: float = 0.0):
        vector = SearchVector("name", weight="A") + SearchVector("description", weight="B")
        return (
            self.annotate(search_rank=SearchRank(vector, SearchQuery(value)))
            .order_by("-search_rank")
            .filter(search_rank__gt=rank_threshold)
        )


class Study(TimeStampedModel):
    class Meta(TimeStampedModel.Meta):
89 changes: 89 additions & 0 deletions search.md
@@ -0,0 +1,89 @@
# Notes on full-text search experimentation

## Postgres notes

Because we want different search weights, relevance sorting, and multiple searched columns, a
simplistic GIN index is not sufficient to avoid full table scans, even for the Study and Collection
searches. The `EXPLAIN` output below is produced even when tsvector GIN indexes are present on all
relevant columns.
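
The index definitions themselves are not part of this diff; presumably they were expression GIN
indexes along the lines of the sketch below (the column sets, config, and names are assumptions).
Even with such an index in place, the planner cannot use it for these queries: they filter and sort
on `ts_rank(...)`, which has no indexable operator, rather than matching with `@@`, and the
weighted or cross-relation expressions don't match the indexed expression anyway.

```python
from django.contrib.postgres.indexes import GinIndex
from django.contrib.postgres.search import SearchVector

# e.g. in Collection.Meta.indexes (and analogously for Study):
indexes = [
    # An explicit config is required here: the one-argument form of to_tsvector() is not
    # IMMUTABLE, so Postgres refuses to build an expression index over it.
    GinIndex(
        SearchVector("name", "description", config="english"),
        name="collection_search_vector_gin",  # hypothetical name
    ),
]
```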

```python
In [33]: Image.objects.text_search('melanoma').explain()
EXPLAIN SELECT "core_image"."id",
"core_image"."modified",
"core_image"."created",
"core_image"."accession_id",
"core_image"."isic_id",
"core_image"."creator_id",
"core_image"."public",
ts_rank(to_tsvector(COALESCE("core_image"."isic_id", '') || ' ' || COALESCE((("ingest_accession"."metadata" -> 'diagnosis'))::text, '')), plainto_tsquery('melanoma')) AS "search_rank",
"ingest_accession"."id",
"ingest_accession"."modified",
"ingest_accession"."created",
"ingest_accession"."creator_id",
"ingest_accession"."girder_id",
"ingest_accession"."zip_upload_id",
"ingest_accession"."cohort_id",
"ingest_accession"."original_blob",
"ingest_accession"."original_blob_name",
"ingest_accession"."original_blob_size",
"ingest_accession"."blob",
"ingest_accession"."blob_name",
"ingest_accession"."blob_size",
"ingest_accession"."width",
"ingest_accession"."height",
"ingest_accession"."status",
"ingest_accession"."thumbnail_256",
"ingest_accession"."thumbnail_256_size",
"ingest_accession"."metadata",
"ingest_accession"."unstructured_metadata"
FROM "core_image"
INNER JOIN "ingest_accession"
ON ("core_image"."accession_id" = "ingest_accession"."id")
WHERE ts_rank(to_tsvector(COALESCE("core_image"."isic_id", '') || ' ' || COALESCE((("ingest_accession"."metadata" -> 'diagnosis'))::text, '')), plainto_tsquery('melanoma')) > 0.0
ORDER BY "search_rank" DESC

Execution time: 0.008215s [Database: default]
Out[33]: "Gather Merge (cost=117918.35..123349.11 rows=47224 width=921)\n Workers Planned: 1 -> Sort (cost=116918.34..117036.40 rows=47224 width=921)\n Sort Key: (ts_rank(to_tsvector((((COALESCE(core_image.isic_id, ''::character varying))::text || ' '::text) || COALESCE(((ingest_accession.metadata -> 'diagnosis'::text))::text, ''::text))), plainto_tsquery('melanoma'::text))) DESC\n -> Parallel Hash Join (cost=49319.08..94044.05 rows=47224 width=921)\n Hash Cond: (core_image.accession_id = ingest_accession.id)\n Join Filter: (ts_rank(to_tsvector((((COALESCE(core_image.isic_id, ''::character varying))::text || ' '::text) || COALESCE(((ingest_accession.metadata -> 'diagnosis'::text))::text, ''::text))), plainto_tsquery('melanoma'::text)) > '0'::double precision)\n -> Parallel Seq Scan on core_image (cost=0.00..4153.72 rows=141672 width=50)\n -> Parallel Hash (cost=34709.70..34709.70 rows=119870 width=867)\n -> Parallel Seq Scan on ingest_accession (cost=0.00..34709.70 rows=119870 width=867)\nJIT:\n Functions: 12\n Options: Inlining false, Optimization false, Expressions true, Deforming true"

In [35]: Collection.objects.text_search('melanoma').explain()
EXPLAIN SELECT "core_collection"."id",
"core_collection"."created",
"core_collection"."modified",
"core_collection"."creator_id",
"core_collection"."name",
"core_collection"."description",
"core_collection"."public",
"core_collection"."pinned",
"core_collection"."doi_id",
"core_collection"."locked",
ts_rank((setweight(to_tsvector(COALESCE("core_collection"."name", '')), 'A') || setweight(to_tsvector(COALESCE("core_collection"."description", '')), 'B')), plainto_tsquery('melanoma')) AS "search_rank"
FROM "core_collection"
WHERE ts_rank((setweight(to_tsvector(COALESCE("core_collection"."name", '')), 'A') || setweight(to_tsvector(COALESCE("core_collection"."description", '')), 'B')), plainto_tsquery('melanoma')) > 0.0
ORDER BY "search_rank" DESC

Execution time: 0.003668s [Database: default]
Out[35]: 'Sort (cost=121.71..121.81 rows=38 width=124)\n Sort Key: (ts_rank((setweight(to_tsvector((COALESCE(name, \'\'::character varying))::text), \'A\'::"char") || setweight(to_tsvector(COALESCE(description, \'\'::text)), \'B\'::"char")), plainto_tsquery(\'melanoma\'::text))) DESC\n -> Seq Scan on core_collection (cost=0.00..120.72 rows=38 width=124)\n Filter: (ts_rank((setweight(to_tsvector((COALESCE(name, \'\'::character varying))::text), \'A\'::"char") || setweight(to_tsvector(COALESCE(description, \'\'::text)), \'B\'::"char")), plainto_tsquery(\'melanoma\'::text)) > \'0\'::double precision)'
```

These full table scans are completely fine on the Collection and Study tables because of their low
row counts, but full-text searching the Images/Accessions takes hundreds of milliseconds.

For Collections and Studies, we could use a `GENERATED ALWAYS` derived column (with its own GIN
index) to ensure the text search hits an index. For Images, however, there is no way to take
advantage of an indexed full-text search without application-layer denormalization, since the
searched fields span multiple relations (the ISIC ID lives on `Image`, while the diagnosis lives in
`Accession.metadata`).

**Conclusion**: Given that:

* The cost of full-text search is dominated by Image search,
* We will require application-layer data duplication somewhere,
* We are already using ElasticSearch anyway,

if indexed full-text search is a must-have, we should probably leverage
ElasticSearch. However, if we don't care about a full table scan, this Postgres approach affords us
a great deal of flexibility, power, and simplicity for minimal effort.
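
For reference, a usage sketch of the helpers this branch adds (import paths are inferred from the
file layout above, so treat them as assumptions):

```python
# Hypothetical Django shell session exercising the new helpers.
from isic.core.models.collection import Collection
from isic.find.find import smart_text_search, text_search

# Ranked, thresholded search on a single model's queryset.
Collection.objects.text_search("melanoma", rank_threshold=0.05)[:5]

# Up to 10 results per category, keyed by category name.
results = text_search("melanoma", category_limit=10)
results["collections"], results["images"], results["studies"]

# A single queryset from whichever category looks most relevant overall.
smart_text_search("melanoma")
```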

## ElasticSearch

...to be continued