From d4c21cb47d302a588d56133bae3ff15f7485af81 Mon Sep 17 00:00:00 2001 From: Zach Mullen Date: Fri, 4 Aug 2023 12:56:13 -0400 Subject: [PATCH] Postgres text search experimentation --- isic/core/models/collection.py | 11 ++++- isic/core/models/image.py | 12 ++++- isic/find/find.py | 20 ++++++++ isic/studies/models.py | 9 ++++ search.md | 89 ++++++++++++++++++++++++++++++++++ 5 files changed, 139 insertions(+), 2 deletions(-) create mode 100644 search.md diff --git a/isic/core/models/collection.py b/isic/core/models/collection.py index c3bd818d..df92686f 100644 --- a/isic/core/models/collection.py +++ b/isic/core/models/collection.py @@ -1,5 +1,6 @@ from django.contrib.auth.models import User from django.contrib.postgres.indexes import GinIndex, OpClass +from django.contrib.postgres.search import SearchQuery, SearchRank, SearchVector from django.core.exceptions import ValidationError from django.db import models from django.db.models.constraints import CheckConstraint, UniqueConstraint @@ -28,6 +29,14 @@ def regular(self): # regular means not magic return self.filter(cohort=None) + def text_search(self, value: str, rank_threshold: float = 0.0): + vector = SearchVector("name", weight="A") + SearchVector("description", weight="B") + return ( + self.annotate(search_rank=SearchRank(vector, SearchQuery(value))) + .order_by("-search_rank") + .filter(search_rank__gt=rank_threshold) + ) + class Collection(TimeStampedModel): """ @@ -51,7 +60,7 @@ class Meta(TimeStampedModel.Meta): ] indexes = [ # icontains uses Upper(name) for searching - GinIndex(OpClass(Upper("name"), name="gin_trgm_ops"), name="collection_name_gin") + GinIndex(OpClass(Upper("name"), name="gin_trgm_ops"), name="collection_name_gin"), ] creator = models.ForeignKey(User, on_delete=models.PROTECT) diff --git a/isic/core/models/image.py b/isic/core/models/image.py index 1682a8b6..3270afb1 100644 --- a/isic/core/models/image.py +++ b/isic/core/models/image.py @@ -1,6 +1,7 @@ from django.contrib.auth.models import User from django.contrib.postgres.aggregates.general import ArrayAgg from django.contrib.postgres.indexes import GinIndex, OpClass +from django.contrib.postgres.search import SearchQuery, SearchRank, SearchVector from django.db import models from django.db.models.constraints import CheckConstraint from django.db.models.expressions import F @@ -41,12 +42,21 @@ def public(self): def private(self): return self.filter(public=False) + def text_search(self, value: str, rank_threshold: float = 0.0): + vector = SearchVector("isic", "accession__metadata__diagnosis") + return ( + self.select_related("accession") + .annotate(search_rank=SearchRank(vector, SearchQuery(value))) + .order_by("-search_rank") + .filter(search_rank__gt=rank_threshold) + ) + class Image(CreationSortedTimeStampedModel): class Meta(CreationSortedTimeStampedModel.Meta): indexes = [ # icontains uses Upper(name) for searching - GinIndex(OpClass(Upper("isic"), name="gin_trgm_ops"), name="isic_name_gin") + GinIndex(OpClass(Upper("isic"), name="gin_trgm_ops"), name="isic_name_gin"), ] accession = models.OneToOneField( diff --git a/isic/find/find.py b/isic/find/find.py index 03a31736..3d5e1bed 100644 --- a/isic/find/find.py +++ b/isic/find/find.py @@ -2,6 +2,7 @@ from typing import Optional from django.contrib.auth.models import AnonymousUser, User +from django.db.models.query import QuerySet from django.db.models.query_utils import Q from jaro import jaro_winkler_metric @@ -104,3 +105,22 @@ def default_sort(search, v): ret.append(serializer.data) return ret + + +def text_search(value: str, category_limit: int = 10) -> dict: + return { + "collections": Collection.objects.text_search(value)[:category_limit], + "images": Image.objects.text_search(value)[:category_limit], + "studies": Study.objects.text_search(value)[:category_limit], + } + + +def smart_text_search(value: str, limit: int = 10) -> QuerySet[Collection | Image | Study]: + """Try to figure out the most relevant category based on the given search value.""" + + def category_relevance(qs: QuerySet): + # Simplistic example algorithm: sum of top k result ranks. + # Naturally penalizes categories that return fewer than `limit` results. + return sum(obj.search_rank for obj in qs) + + return max(text_search(value, category_limit=limit).values(), key=category_relevance) diff --git a/isic/studies/models.py b/isic/studies/models.py index a3903cde..e97932ae 100644 --- a/isic/studies/models.py +++ b/isic/studies/models.py @@ -4,6 +4,7 @@ from django.contrib.auth.models import User from django.contrib.postgres.fields import ArrayField +from django.contrib.postgres.search import SearchQuery, SearchRank, SearchVector from django.core.exceptions import ValidationError from django.db import models from django.db.models import Case, CharField, Value, When @@ -135,6 +136,14 @@ def public(self): def private(self): return self.filter(public=False) + def text_search(self, value: str, rank_threshold: float = 0.0): + vector = SearchVector("name", weight="A") + SearchVector("description", weight="B") + return ( + self.annotate(search_rank=SearchRank(vector, SearchQuery(value))) + .order_by("-search_rank") + .filter(search_rank__gt=rank_threshold) + ) + class Study(TimeStampedModel): class Meta(TimeStampedModel.Meta): diff --git a/search.md b/search.md new file mode 100644 index 00000000..ac716deb --- /dev/null +++ b/search.md @@ -0,0 +1,89 @@ +# Notes on full-text search experimentation + +## Postgres notes + +Because we want different search weights, sorting by relevance, and multiple columns searched, a +simplistic GIN index is not sufficient to avoid full table scans, even for the Study and Collection +searches. The following `EXPLAIN` output happens even with the presence of ts-vector GIN indexes on +all relevant columns. + +```python +In [33]: Image.objects.text_search('melanoma').explain() +EXPLAIN SELECT "core_image"."id", + "core_image"."modified", + "core_image"."created", + "core_image"."accession_id", + "core_image"."isic_id", + "core_image"."creator_id", + "core_image"."public", + ts_rank(to_tsvector(COALESCE("core_image"."isic_id", '') || ' ' || COALESCE((("ingest_accession"."metadata" -> 'diagnosis'))::text, '')), plainto_tsquery('melanoma')) AS "search_rank", + "ingest_accession"."id", + "ingest_accession"."modified", + "ingest_accession"."created", + "ingest_accession"."creator_id", + "ingest_accession"."girder_id", + "ingest_accession"."zip_upload_id", + "ingest_accession"."cohort_id", + "ingest_accession"."original_blob", + "ingest_accession"."original_blob_name", + "ingest_accession"."original_blob_size", + "ingest_accession"."blob", + "ingest_accession"."blob_name", + "ingest_accession"."blob_size", + "ingest_accession"."width", + "ingest_accession"."height", + "ingest_accession"."status", + "ingest_accession"."thumbnail_256", + "ingest_accession"."thumbnail_256_size", + "ingest_accession"."metadata", + "ingest_accession"."unstructured_metadata" + FROM "core_image" + INNER JOIN "ingest_accession" + ON ("core_image"."accession_id" = "ingest_accession"."id") + WHERE ts_rank(to_tsvector(COALESCE("core_image"."isic_id", '') || ' ' || COALESCE((("ingest_accession"."metadata" -> 'diagnosis'))::text, '')), plainto_tsquery('melanoma')) > 0.0 + ORDER BY "search_rank" DESC + +Execution time: 0.008215s [Database: default] +Out[33]: "Gather Merge (cost=117918.35..123349.11 rows=47224 width=921)\n Workers Planned: 1 -> Sort (cost=116918.34..117036.40 rows=47224 width=921)\n Sort Key: (ts_rank(to_tsvector((((COALESCE(core_image.isic_id, ''::character varying))::text || ' '::text) || COALESCE(((ingest_accession.metadata -> 'diagnosis'::text))::text, ''::text))), plainto_tsquery('melanoma'::text))) DESC\n -> Parallel Hash Join (cost=49319.08..94044.05 rows=47224 width=921)\n Hash Cond: (core_image.accession_id = ingest_accession.id)\n Join Filter: (ts_rank(to_tsvector((((COALESCE(core_image.isic_id, ''::character varying))::text || ' '::text) || COALESCE(((ingest_accession.metadata -> 'diagnosis'::text))::text, ''::text))), plainto_tsquery('melanoma'::text)) > '0'::double precision)\n -> Parallel Seq Scan on core_image (cost=0.00..4153.72 rows=141672 width=50)\n -> Parallel Hash (cost=34709.70..34709.70 rows=119870 width=867)\n -> Parallel Seq Scan on ingest_accession (cost=0.00..34709.70 rows=119870 width=867)\nJIT:\n Functions: 12\n Options: Inlining false, Optimization false, Expressions true, Deforming true" + +In [35]: Collection.objects.text_search('melanoma').explain() +EXPLAIN SELECT "core_collection"."id", + "core_collection"."created", + "core_collection"."modified", + "core_collection"."creator_id", + "core_collection"."name", + "core_collection"."description", + "core_collection"."public", + "core_collection"."pinned", + "core_collection"."doi_id", + "core_collection"."locked", + ts_rank((setweight(to_tsvector(COALESCE("core_collection"."name", '')), 'A') || setweight(to_tsvector(COALESCE("core_collection"."description", '')), 'B')), plainto_tsquery('melanoma')) AS "search_rank" + FROM "core_collection" + WHERE ts_rank((setweight(to_tsvector(COALESCE("core_collection"."name", '')), 'A') || setweight(to_tsvector(COALESCE("core_collection"."description", '')), 'B')), plainto_tsquery('melanoma')) > 0.0 + ORDER BY "search_rank" DESC + +Execution time: 0.003668s [Database: default] +Out[35]: 'Sort (cost=121.71..121.81 rows=38 width=124)\n Sort Key: (ts_rank((setweight(to_tsvector((COALESCE(name, \'\'::character varying))::text), \'A\'::"char") || setweight(to_tsvector(COALESCE(description, \'\'::text)), \'B\'::"char")), plainto_tsquery(\'melanoma\'::text))) DESC\n -> Seq Scan on core_collection (cost=0.00..120.72 rows=38 width=124)\n Filter: (ts_rank((setweight(to_tsvector((COALESCE(name, \'\'::character varying))::text), \'A\'::"char") || setweight(to_tsvector(COALESCE(description, \'\'::text)), \'B\'::"char")), plainto_tsquery(\'melanoma\'::text)) > \'0\'::double precision)' +``` + +These full table scans are completely fine on the Collection and Study tables due to their +cardinality, but require hundreds of ms to full-text search the Images/Accessions. + +In the case of Collections and Studies, we could use a `GENERATED ALWAYS` derived column to +ensure the text search hits the index. However, in the case of Images, there is no way to take +advantage of an indexed full-text search without application-layer denormalization, since its +searched fields span multiple relations. + +**Conclusion**: Given that: + +* The cost of full-text search is dominated by Image search, +* We will require application-layer data duplication somewhere, +* We are already using ElasticSearch anyway, + +if indexed full-text search is a must-have, we should probably leverage +ElasticSearch. However, if we don't care about a full table scan, this postgres approach affords us +a great deal of flexibility, power, and simplicity for minimal effort. + +## ElasticSearch + +...to be continued