Postgres text search experimentation

ImageMarkup · Aug 4, 2023 · 8fd7d2e · 8fd7d2e
1 parent 6d7b2c8
commit 8fd7d2e
Show file tree

Hide file tree

Showing 6 changed files with 151 additions and 2 deletions.
diff --git a/isic/core/models/collection.py b/isic/core/models/collection.py
@@ -1,5 +1,6 @@
 from django.contrib.auth.models import User
 from django.contrib.postgres.indexes import GinIndex, OpClass
+from django.contrib.postgres.search import SearchQuery, SearchRank, SearchVector
 from django.core.exceptions import ValidationError
 from django.db import models
 from django.db.models.constraints import CheckConstraint, UniqueConstraint
@@ -28,6 +29,14 @@ def regular(self):
         # regular means not magic
         return self.filter(cohort=None)
 
+    def text_search(self, value: str, rank_threshold: float = 0.0):
+        vector = SearchVector("name", weight="A") + SearchVector("description", weight="B")
+        return (
+            self.annotate(search_rank=SearchRank(vector, SearchQuery(value)))
+            .order_by("-search_rank")
+            .filter(search_rank__gt=rank_threshold)
+        )
+
 
 class Collection(TimeStampedModel):
     """
@@ -51,7 +60,8 @@ class Meta(TimeStampedModel.Meta):
         ]
         indexes = [
             # icontains uses Upper(name) for searching
-            GinIndex(OpClass(Upper("name"), name="gin_trgm_ops"), name="collection_name_gin")
+            GinIndex(OpClass(Upper("name"), name="gin_trgm_ops"), name="collection_name_gin"),
+            GinIndex(SearchVector("name", "description", config="english"), name="collection_text"),
         ]
 
     creator = models.ForeignKey(User, on_delete=models.PROTECT)

diff --git a/isic/core/models/image.py b/isic/core/models/image.py
@@ -1,6 +1,7 @@
 from django.contrib.auth.models import User
 from django.contrib.postgres.aggregates.general import ArrayAgg
 from django.contrib.postgres.indexes import GinIndex, OpClass
+from django.contrib.postgres.search import SearchQuery, SearchRank, SearchVector
 from django.db import models
 from django.db.models.constraints import CheckConstraint
 from django.db.models.expressions import F
@@ -41,12 +42,22 @@ def public(self):
     def private(self):
         return self.filter(public=False)
 
+    def text_search(self, value: str, rank_threshold: float = 0.0):
+        vector = SearchVector("isic", "accession__metadata__diagnosis")
+        return (
+            self.select_related("accession")
+            .annotate(search_rank=SearchRank(vector, SearchQuery(value)))
+            .order_by("-search_rank")
+            .filter(search_rank__gt=rank_threshold)
+        )
+
 
 class Image(CreationSortedTimeStampedModel):
     class Meta(CreationSortedTimeStampedModel.Meta):
         indexes = [
             # icontains uses Upper(name) for searching
-            GinIndex(OpClass(Upper("isic"), name="gin_trgm_ops"), name="isic_name_gin")
+            GinIndex(OpClass(Upper("isic"), name="gin_trgm_ops"), name="isic_name_gin"),
+            GinIndex(SearchVector("isic", config="english"), name="image_text"),
         ]
 
     accession = models.OneToOneField(

diff --git a/isic/find/find.py b/isic/find/find.py
@@ -2,6 +2,7 @@
 from typing import Optional
 
 from django.contrib.auth.models import AnonymousUser, User
+from django.db.models.query import QuerySet
 from django.db.models.query_utils import Q
 from jaro import jaro_winkler_metric
 
@@ -104,3 +105,22 @@ def default_sort(search, v):
             ret.append(serializer.data)
 
     return ret
+
+
+def text_search(value: str, category_limit: int = 10) -> dict:
+    return {
+        "collections": Collection.objects.text_search(value)[:category_limit],
+        "images": Image.objects.text_search(value)[:category_limit],
+        "studies": Study.objects.text_search(value)[:category_limit],
+    }
+
+
+def smart_text_search(value: str, limit: int = 10) -> QuerySet[Collection | Image | Study]:
+    """Try to figure out the most relevant category based on the given search value."""
+
+    def category_relevance(qs: QuerySet):
+        # Simplistic example algorithm: sum of top k result ranks.
+        # Naturally penalizes categories that return fewer than `limit` results.
+        return sum(obj.search_rank for obj in qs)
+
+    return max(text_search(value, category_limit=limit).values(), key=category_relevance)
diff --git a/isic/ingest/models/accession.py b/isic/ingest/models/accession.py
@@ -6,6 +6,8 @@
 
 import PIL.Image
 from django.contrib.auth.models import User
+from django.contrib.postgres.indexes import GinIndex
+from django.contrib.postgres.search import SearchVector
 from django.core.exceptions import ValidationError
 from django.core.files.uploadedfile import InMemoryUploadedFile
 from django.db import models, transaction
@@ -127,6 +129,10 @@ class Meta(CreationSortedTimeStampedModel.Meta):
             ),
         ]
 
+        indexes = [
+            GinIndex(SearchVector("metadata__diagnosis", config="english"), name="accession_text"),
+        ]
+
     # the creator is either inherited from the zip creator, or directly attached in the
     # case of a single shot upload.
     creator = models.ForeignKey(User, on_delete=models.PROTECT, related_name="accessions")

diff --git a/isic/studies/models.py b/isic/studies/models.py
@@ -4,6 +4,8 @@
 
 from django.contrib.auth.models import User
 from django.contrib.postgres.fields import ArrayField
+from django.contrib.postgres.indexes import GinIndex
+from django.contrib.postgres.search import SearchQuery, SearchRank, SearchVector
 from django.core.exceptions import ValidationError
 from django.db import models
 from django.db.models import Case, CharField, Value, When
@@ -135,10 +137,21 @@ def public(self):
     def private(self):
         return self.filter(public=False)
 
+    def text_search(self, value: str, rank_threshold: float = 0.0):
+        vector = SearchVector("name", weight="A") + SearchVector("description", weight="B")
+        return (
+            self.annotate(search_rank=SearchRank(vector, SearchQuery(value)))
+            .order_by("-search_rank")
+            .filter(search_rank__gt=rank_threshold)
+        )
+
 
 class Study(TimeStampedModel):
     class Meta(TimeStampedModel.Meta):
         verbose_name_plural = "Studies"
+        indexes = [
+            GinIndex(SearchVector("name", "description", config="english"), name="study_text"),
+        ]
 
     creator = models.ForeignKey(User, on_delete=models.PROTECT)
     owners = models.ManyToManyField(User, related_name="owned_studies")

diff --git a/search.md b/search.md
@@ -0,0 +1,89 @@
+# Notes on full-text search experimentation
+
+## Postgres notes
+
+Because we want different search weights, sorting by relevance, and multiple columns searched, a
+simplistic GIN index is not sufficient to avoid full table scans, even for the Study and Collection
+searches. The following `EXPLAIN` output happens even with the presence of ts-vector GIN indexes on
+all relevant columns.
+
+```python
+In [33]: Image.objects.text_search('melanoma').explain()
+EXPLAIN SELECT "core_image"."id",
+       "core_image"."modified",
+       "core_image"."created",
+       "core_image"."accession_id",
+       "core_image"."isic_id",
+       "core_image"."creator_id",
+       "core_image"."public",
+       ts_rank(to_tsvector(COALESCE("core_image"."isic_id", '') || ' ' || COALESCE((("ingest_accession"."metadata" -> 'diagnosis'))::text, '')), plainto_tsquery('melanoma')) AS "search_rank",
+       "ingest_accession"."id",
+       "ingest_accession"."modified",
+       "ingest_accession"."created",
+       "ingest_accession"."creator_id",
+       "ingest_accession"."girder_id",
+       "ingest_accession"."zip_upload_id",
+       "ingest_accession"."cohort_id",
+       "ingest_accession"."original_blob",
+       "ingest_accession"."original_blob_name",
+       "ingest_accession"."original_blob_size",
+       "ingest_accession"."blob",
+       "ingest_accession"."blob_name",
+       "ingest_accession"."blob_size",
+       "ingest_accession"."width",
+       "ingest_accession"."height",
+       "ingest_accession"."status",
+       "ingest_accession"."thumbnail_256",
+       "ingest_accession"."thumbnail_256_size",
+       "ingest_accession"."metadata",
+       "ingest_accession"."unstructured_metadata"
+  FROM "core_image"
+ INNER JOIN "ingest_accession"
+    ON ("core_image"."accession_id" = "ingest_accession"."id")
+ WHERE ts_rank(to_tsvector(COALESCE("core_image"."isic_id", '') || ' ' || COALESCE((("ingest_accession"."metadata" -> 'diagnosis'))::text, '')), plainto_tsquery('melanoma')) > 0.0
+ ORDER BY "search_rank" DESC
+
+Execution time: 0.008215s [Database: default]
+Out[33]: "Gather Merge  (cost=117918.35..123349.11 rows=47224 width=921)\n  Workers Planned: 1  ->  Sort  (cost=116918.34..117036.40 rows=47224 width=921)\n        Sort Key: (ts_rank(to_tsvector((((COALESCE(core_image.isic_id, ''::character varying))::text || ' '::text) || COALESCE(((ingest_accession.metadata -> 'diagnosis'::text))::text, ''::text))), plainto_tsquery('melanoma'::text))) DESC\n        ->  Parallel Hash Join  (cost=49319.08..94044.05 rows=47224 width=921)\n              Hash Cond: (core_image.accession_id = ingest_accession.id)\n              Join Filter: (ts_rank(to_tsvector((((COALESCE(core_image.isic_id, ''::character varying))::text || ' '::text) || COALESCE(((ingest_accession.metadata -> 'diagnosis'::text))::text, ''::text))), plainto_tsquery('melanoma'::text)) > '0'::double precision)\n              ->  Parallel Seq Scan on core_image  (cost=0.00..4153.72 rows=141672 width=50)\n              ->  Parallel Hash  (cost=34709.70..34709.70 rows=119870 width=867)\n                    ->  Parallel Seq Scan on ingest_accession  (cost=0.00..34709.70 rows=119870 width=867)\nJIT:\n  Functions: 12\n  Options: Inlining false, Optimization false, Expressions true, Deforming true"
+
+In [35]: Collection.objects.text_search('melanoma').explain()
+EXPLAIN SELECT "core_collection"."id",
+       "core_collection"."created",
+       "core_collection"."modified",
+       "core_collection"."creator_id",
+       "core_collection"."name",
+       "core_collection"."description",
+       "core_collection"."public",
+       "core_collection"."pinned",
+       "core_collection"."doi_id",
+       "core_collection"."locked",
+       ts_rank((setweight(to_tsvector(COALESCE("core_collection"."name", '')), 'A') || setweight(to_tsvector(COALESCE("core_collection"."description", '')), 'B')), plainto_tsquery('melanoma')) AS "search_rank"
+  FROM "core_collection"
+ WHERE ts_rank((setweight(to_tsvector(COALESCE("core_collection"."name", '')), 'A') || setweight(to_tsvector(COALESCE("core_collection"."description", '')), 'B')), plainto_tsquery('melanoma')) > 0.0
+ ORDER BY "search_rank" DESC
+
+Execution time: 0.003668s [Database: default]
+Out[35]: 'Sort  (cost=121.71..121.81 rows=38 width=124)\n  Sort Key: (ts_rank((setweight(to_tsvector((COALESCE(name, \'\'::character varying))::text), \'A\'::"char") || setweight(to_tsvector(COALESCE(description, \'\'::text)), \'B\'::"char")), plainto_tsquery(\'melanoma\'::text))) DESC\n  ->  Seq Scan on core_collection  (cost=0.00..120.72 rows=38 width=124)\n        Filter: (ts_rank((setweight(to_tsvector((COALESCE(name, \'\'::character varying))::text), \'A\'::"char") || setweight(to_tsvector(COALESCE(description, \'\'::text)), \'B\'::"char")), plainto_tsquery(\'melanoma\'::text)) > \'0\'::double precision)'
+```
+
+These full table scans are completely fine on the Collection and Study tables due to their
+cardinality, but require hundreds of ms to full-text search the Images/Accessions.
+
+In the case of Collections and Studies, we could use a `GENERATED ALWAYS` derived column to
+ensure the text search hits the index. However, in the case of Images, there is no way to take
+advantage of an indexed full-text search without application-layer denormalization, since its
+searched fields span multiple relations.
+
+**Conclusion**: Given that:
+
+* The cost of full-text search is dominated by Image search,
+* We will require application-layer data duplication somewhere,
+* We are already using ElasticSearch anyway,
+
+if indexed full-text search is a must-have, we should probably leverage
+ElasticSearch. However, if we don't care about a full table scan, this postgres approach affords us
+a great deal of flexibility, power, and simplicity for minimal effort.
+
+## ElasticSearch
+
+...to be continued