From af6551e491c3d091ab05a92659e48f551389a4c3 Mon Sep 17 00:00:00 2001 From: Sushil Tiwari Date: Fri, 9 Aug 2024 18:27:55 +0545 Subject: [PATCH] Add new changes and Apis for extracts Change latest prompt with used excerpts Add OpenAiChat class for AzureOpenAI Handle Empty df for Ops and recursive function for summary --- main/permissions.py | 4 + main/urls.py | 1 - per/cache.py | 32 +- per/drf_views.py | 66 ++- per/factories.py | 18 +- .../0121_opslearningcacheresponse_and_more.py | 18 +- per/models.py | 34 +- per/ops_learning_summary.py | 394 ++++++++++++------ per/serializers.py | 78 +++- per/task.py | 66 ++- poetry.lock | 2 +- 11 files changed, 500 insertions(+), 213 deletions(-) diff --git a/main/permissions.py b/main/permissions.py index d58662e96..390c782b3 100644 --- a/main/permissions.py +++ b/main/permissions.py @@ -20,6 +20,10 @@ class DenyGuestUserMutationPermission(permissions.BasePermission): """ def _has_permission(self, request, view): + # Allow all safe methods (GET, HEAD, OPTIONS) which are non-mutating. + if request.method in permissions.SAFE_METHODS: + return True + # For mutation methods (POST, PUT, DELETE, etc.): # Check if the user is authenticated. 
if not bool(request.user and request.user.is_authenticated): diff --git a/main/urls.py b/main/urls.py index 591aae031..430342b8c 100644 --- a/main/urls.py +++ b/main/urls.py @@ -117,7 +117,6 @@ router.register(r"public-per-stats", per_views.CountryPublicPerStatsViewset, basename="public_country_per_stats") router.register(r"per-stats", per_views.CountryPerStatsViewset, basename="country_per_stats") router.register(r"ops-learning", per_views.OpsLearningViewset, basename="ops_learning") -router.register(r"ops-learning-summary", per_views.OpsLearningSummaryViewset, basename="ops_learning_summary") router.register(r"per-document-upload", per_views.PerDocumentUploadViewSet, basename="per_document_upload") router.register(r"personnel_deployment", deployment_views.PersonnelDeploymentViewset, basename="personnel_deployment") diff --git a/per/cache.py b/per/cache.py index e6587abd8..bbdd0f175 100644 --- a/per/cache.py +++ b/per/cache.py @@ -4,8 +4,13 @@ import django_filters from django.core.serializers.json import DjangoJSONEncoder +from django.db.models import Count, Prefetch -from per.models import OpsLearningCacheResponse +from per.models import ( + OpsLearningCacheResponse, + OpsLearningComponentCacheResponse, + OpsLearningSectorCacheResponse, +) class OpslearningSummaryCacheHelper: @@ -47,15 +52,24 @@ def get_or_create( } hash_value = self.generate_hash(filter_data) # Check if the summary is already cached - ops_learning_summary, created = OpsLearningCacheResponse.objects.get_or_create( + # NOTE: count for the related components and sectors are prefetched + ops_learning_summary, created = OpsLearningCacheResponse.objects.prefetch_related( + "used_ops_learning", + Prefetch( + "ops_learning_component", + queryset=OpsLearningComponentCacheResponse.objects.select_related( + "component", + ).annotate(count=Count("used_ops_learning")), + ), + Prefetch( + "ops_learning_sector", + queryset=OpsLearningSectorCacheResponse.objects.select_related( + "sector", + 
).annotate(count=Count("used_ops_learning")), + ), + ).get_or_create( used_filters_hash=hash_value, used_filters=filter_data, - status=OpsLearningCacheResponse.Status.SUCCESS, defaults={"status": OpsLearningCacheResponse.Status.PENDING}, ) - if not created: - return ops_learning_summary - # TODO send a http code of task is pending and return the task id - # transaction.on_commit(lambda: generate_summary.delay(ops_learning_summary, filter_data)) - # return Response({"task_id": ops_learning_summary.id}, status=202) - return OpsLearningCacheResponse.objects.filter(status=OpsLearningCacheResponse.Status.SUCCESS).first() + return ops_learning_summary, filter_data diff --git a/per/drf_views.py b/per/drf_views.py index 23e7e8e72..385959a98 100644 --- a/per/drf_views.py +++ b/per/drf_views.py @@ -2,6 +2,7 @@ import pytz from django.conf import settings +from django.db import transaction from django.db.models import Prefetch, Q from django.http import HttpResponse from django.shortcuts import get_object_or_404 @@ -34,6 +35,7 @@ PerGeneralPermission, PerPermission, ) +from per.task import generate_summary from per.utils import filter_per_queryset_by_user_access from .admin_classes import RegionRestrictedAdmin @@ -54,6 +56,8 @@ NiceDocument, OpsLearning, OpsLearningCacheResponse, + OpsLearningComponentCacheResponse, + OpsLearningSectorCacheResponse, OrganizationTypes, Overview, PerAssessment, @@ -678,6 +682,18 @@ class OpsLearningFilter(filters.FilterSet): widget=CSVWidget, queryset=FormComponent.objects.all(), ) + insight_id = filters.NumberFilter( + label="Base Insight id for used extracts", + method="get_cache_response", + ) + insight_sector_id = filters.NumberFilter(label="Sector insight id for used extracts", method="get_cache_response_sector") + insight_component_id = filters.NumberFilter( + label="Component insight id for used extracts", + method="get_cache_response_component", + ) + # NOTE: overriding the fields for the typing issue + sector_validated = 
filters.NumberFilter(field_name="sector_validated", lookup_expr="exact") + per_component_validated = filters.NumberFilter(field_name="per_component_validated", lookup_expr="exact") class Meta: model = OpsLearning @@ -689,8 +705,6 @@ class Meta: "learning": ("exact", "icontains"), "learning_validated": ("exact", "icontains"), "organization_validated": ("exact",), - "sector_validated": ("exact",), - "per_component_validated": ("exact",), "appeal_code": ("exact", "in"), "appeal_code__code": ("exact", "icontains", "in"), "appeal_code__num_beneficiaries": ("exact", "gt", "gte", "lt", "lte"), @@ -704,6 +718,23 @@ class Meta: "appeal_code__region": ("exact", "in"), } + def get_cache_response(self, queryset, name, value): + if value and (ops_learning_cache_response := OpsLearningCacheResponse.objects.filter(id=value).first()): + return queryset.filter(id__in=ops_learning_cache_response.used_ops_learning.all()) + return queryset + + def get_cache_response_sector(self, queryset, name, value): + if value and (ops_learning_sector_cache_response := OpsLearningSectorCacheResponse.objects.filter(id=value).first()): + return queryset.filter(id__in=ops_learning_sector_cache_response.used_ops_learning.all()) + return queryset + + def get_cache_response_component(self, queryset, name, value): + if value and ( + ops_learning_component_cache_response := OpsLearningComponentCacheResponse.objects.filter(id=value).first() + ): + return queryset.filter(id__in=ops_learning_component_cache_response.used_ops_learning.all()) + return queryset + class OpsLearningViewset(viewsets.ModelViewSet): """ @@ -736,7 +767,13 @@ def get_queryset(self): return qs.select_related( "appeal_code", ).prefetch_related( - "sector", "organization", "per_component", "sector_validated", "organization_validated", "per_component_validated" + "sector", + "organization", + "per_component", + "sector_validated", + "organization_validated", + "per_component_validated", + "appeal_code__event__countries_for_preview", ) 
return ( qs.filter(is_validated=True) @@ -744,7 +781,13 @@ def get_queryset(self): "appeal_code", ) .prefetch_related( - "sector", "organization", "per_component", "sector_validated", "organization_validated", "per_component_validated" + "sector", + "organization", + "per_component", + "sector_validated", + "organization_validated", + "per_component_validated", + "appeal_code__event__countries_for_preview", ) ) @@ -813,7 +856,7 @@ def get_renderer_context(self): @extend_schema( request=None, filters=True, - responses=OpsLearningSummarySerializer(), + responses=OpsLearningSummarySerializer, ) @action( detail=False, @@ -825,7 +868,11 @@ def summary(self, request): """ Get the Ops Learning Summary based on the filters """ - ops_learning_summary_instance = OpslearningSummaryCacheHelper.get_or_create(request, [self.filterset_class]) + ops_learning_summary_instance, filter_data = OpslearningSummaryCacheHelper.get_or_create(request, [self.filterset_class]) + if ops_learning_summary_instance.status == OpsLearningCacheResponse.Status.SUCCESS: + return response.Response(OpsLearningSummarySerializer(ops_learning_summary_instance).data) + + transaction.on_commit(lambda: generate_summary.delay(ops_learning_summary_instance.id, filter_data)) return response.Response(OpsLearningSummarySerializer(ops_learning_summary_instance).data) @@ -839,10 +886,3 @@ def get_queryset(self): queryset = super().get_queryset() user = self.request.user return filter_per_queryset_by_user_access(user, queryset) - - -class OpsLearningSummaryViewset(viewsets.ReadOnlyModelViewSet): - queryset = OpsLearningCacheResponse.objects.all() - serializer_class = OpsLearningSummarySerializer - permission_classes = [permissions.IsAuthenticated] - pagination_class = None diff --git a/per/factories.py b/per/factories.py index a1cf16bf9..d9673ea92 100644 --- a/per/factories.py +++ b/per/factories.py @@ -64,7 +64,7 @@ class Meta: class FormComponentFactory(factory.django.DjangoModelFactory): area = 
factory.SubFactory(FormAreaFactory) - title = fuzzy.FuzzyText(length=50, prefix="component-") + title = factory.Faker("sentence", nb_words=5) class Meta: model = FormComponent @@ -114,12 +114,12 @@ class Meta: class OpsLearningCacheResponseFactory(factory.django.DjangoModelFactory): used_filters_hash = fuzzy.FuzzyText(length=20) - insights1_title = fuzzy.FuzzyText(length=50, prefix="insights1-title-") - insights1_content = fuzzy.FuzzyText(length=100, prefix="insights1-content-") - insights2_title = fuzzy.FuzzyText(length=50, prefix="insights2-title-") - insights2_content = fuzzy.FuzzyText(length=100, prefix="insights2-content-") - insights3_title = fuzzy.FuzzyText(length=50, prefix="insights3-title-") - insights3_content = fuzzy.FuzzyText(length=100, prefix="insights3-content-") + insights1_title = factory.Faker("sentence", nb_words=5) + insights1_content = factory.Faker("sentence", nb_words=20) + insights2_title = factory.Faker("sentence", nb_words=5) + insights2_content = factory.Faker("sentence", nb_words=25) + insights3_title = factory.Faker("sentence", nb_words=10) + insights3_content = factory.Faker("sentence", nb_words=30) class Meta: model = OpsLearningCacheResponse @@ -127,7 +127,7 @@ class Meta: class OpsLearningSectorCacheResponseFactory(factory.django.DjangoModelFactory): filter_response = factory.SubFactory(OpsLearningCacheResponseFactory) - content = fuzzy.FuzzyText(length=50) + content = factory.Faker("sentence", nb_words=30) sector = factory.SubFactory(SectorTagFactory) class Meta: @@ -136,7 +136,7 @@ class Meta: class OpsLearningComponentCacheResponseFactory(factory.django.DjangoModelFactory): filter_response = factory.SubFactory(OpsLearningCacheResponseFactory) - content = fuzzy.FuzzyText(length=50) + content = factory.Faker("sentence", nb_words=30) component = factory.SubFactory(FormComponentFactory) class Meta: diff --git a/per/migrations/0121_opslearningcacheresponse_and_more.py b/per/migrations/0121_opslearningcacheresponse_and_more.py index 
867283872..0d28af841 100644 --- a/per/migrations/0121_opslearningcacheresponse_and_more.py +++ b/per/migrations/0121_opslearningcacheresponse_and_more.py @@ -1,4 +1,4 @@ -# Generated by Django 4.2.14 on 2024-08-02 10:51 +# Generated by Django 4.2.15 on 2024-08-23 03:54 import django.db.models.deletion from django.db import migrations, models @@ -21,7 +21,9 @@ class Migration(migrations.Migration): ( "status", models.IntegerField( - choices=[(0, "pending"), (1, "started"), (2, "success"), (3, "failed")], default=0, verbose_name="status" + choices=[(1, "Pending"), (2, "Started"), (3, "Success"), (4, "No extract available"), (5, "Failed")], + default=1, + verbose_name="status", ), ), ("insights1_content", models.TextField(blank=True, null=True, verbose_name="insights 1")), @@ -32,15 +34,15 @@ class Migration(migrations.Migration): ("insights3_title", models.CharField(blank=True, max_length=255, null=True, verbose_name="insights 3 title")), ( "insights1_confidence_level", - models.CharField(blank=True, null=True, verbose_name="insights 1 confidence level"), + models.CharField(blank=True, max_length=10, null=True, verbose_name="insights 1 confidence level"), ), ( "insights2_confidence_level", - models.CharField(blank=True, null=True, verbose_name="insights 2 confidence level"), + models.CharField(blank=True, max_length=10, null=True, verbose_name="insights 2 confidence level"), ), ( "insights3_confidence_level", - models.CharField(blank=True, null=True, verbose_name="insights 3 confidence level"), + models.CharField(blank=True, max_length=10, null=True, verbose_name="insights 3 confidence level"), ), ("contradictory_reports", models.TextField(blank=True, null=True, verbose_name="contradictory reports")), ("modified_at", models.DateTimeField(auto_now=True, verbose_name="modified_at")), @@ -54,7 +56,7 @@ class Migration(migrations.Migration): ("id", models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), ("prompt_hash", 
models.CharField(max_length=32, verbose_name="used prompt hash")), ("prompt", models.TextField(blank=True, null=True, verbose_name="used prompt")), - ("type", models.IntegerField(choices=[(0, "primary"), (1, "secondary")], verbose_name="type")), + ("type", models.IntegerField(choices=[(1, "Primary"), (2, "Secondary")], verbose_name="type")), ("response", models.JSONField(default=dict, verbose_name="response")), ], ), @@ -62,7 +64,7 @@ class Migration(migrations.Migration): name="OpsLearningSectorCacheResponse", fields=[ ("id", models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), - ("content", models.TextField(verbose_name="content")), + ("content", models.TextField(blank=True, null=True, verbose_name="content")), ( "filter_response", models.ForeignKey( @@ -88,7 +90,7 @@ class Migration(migrations.Migration): name="OpsLearningComponentCacheResponse", fields=[ ("id", models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")), - ("content", models.TextField(verbose_name="content")), + ("content", models.TextField(blank=True, null=True, verbose_name="content")), ( "component", models.ForeignKey( diff --git a/per/models.py b/per/models.py index 3795e2ba4..8752f28ef 100644 --- a/per/models.py +++ b/per/models.py @@ -753,8 +753,8 @@ def __str__(self): class OpsLearningPromptResponseCache(models.Model): class PromptType(models.IntegerChoices): - PRIMARY = 0, _("primary") - SECONDARY = 1, _("secondary") + PRIMARY = 1, _("Primary") + SECONDARY = 2, _("Secondary") prompt_hash = models.CharField(verbose_name=_("used prompt hash"), max_length=32) prompt = models.TextField(verbose_name=_("used prompt"), null=True, blank=True) @@ -768,10 +768,11 @@ def __str__(self) -> str: class OpsLearningCacheResponse(models.Model): class Status(models.IntegerChoices): - PENDING = 0, _("pending") - STARTED = 1, _("started") - SUCCESS = 2, _("success") - FAILED = 3, _("failed") + PENDING = 1, _("Pending") + STARTED = 2, 
_("Started") + SUCCESS = 3, _("Success") + NO_EXTRACT_AVAILABLE = 4, _("No extract available") + FAILED = 5, _("Failed") used_filters_hash = models.CharField(verbose_name=_("used filters hash"), max_length=32) used_filters = models.JSONField(verbose_name=_("used filters"), default=dict) @@ -786,10 +787,15 @@ class Status(models.IntegerChoices): insights2_title = models.CharField(verbose_name=_("insights 2 title"), max_length=255, null=True, blank=True) insights3_title = models.CharField(verbose_name=_("insights 3 title"), max_length=255, null=True, blank=True) - insights1_confidence_level = models.CharField(verbose_name=_("insights 1 confidence level"), null=True, blank=True) - insights2_confidence_level = models.CharField(verbose_name=_("insights 2 confidence level"), null=True, blank=True) - insights3_confidence_level = models.CharField(verbose_name=_("insights 3 confidence level"), null=True, blank=True) - + insights1_confidence_level = models.CharField( + verbose_name=_("insights 1 confidence level"), max_length=10, null=True, blank=True + ) + insights2_confidence_level = models.CharField( + verbose_name=_("insights 2 confidence level"), max_length=10, null=True, blank=True + ) + insights3_confidence_level = models.CharField( + verbose_name=_("insights 3 confidence level"), max_length=10, null=True, blank=True + ) contradictory_reports = models.TextField(verbose_name=_("contradictory reports"), null=True, blank=True) used_ops_learning = models.ManyToManyField( @@ -816,14 +822,14 @@ class OpsLearningSectorCacheResponse(models.Model): on_delete=models.CASCADE, related_name="+", ) - content = models.TextField(verbose_name=_("content")) + content = models.TextField(verbose_name=_("content"), null=True, blank=True) used_ops_learning = models.ManyToManyField( OpsLearning, related_name="+", ) def __str__(self) -> str: - return f"sector - {self.content}" + return f"Summary - sector - {self.sector.title}" class OpsLearningComponentCacheResponse(models.Model): @@ -839,11 
+845,11 @@ class OpsLearningComponentCacheResponse(models.Model): on_delete=models.CASCADE, related_name="+", ) - content = models.TextField(verbose_name=_("content")) + content = models.TextField(verbose_name=_("content"), null=True, blank=True) used_ops_learning = models.ManyToManyField( OpsLearning, related_name="+", ) def __str__(self) -> str: - return f"component - {self.content}" + return f"Summary - component - {self.component.title}" diff --git a/per/ops_learning_summary.py b/per/ops_learning_summary.py index 70dbad9e8..68a9a018d 100644 --- a/per/ops_learning_summary.py +++ b/per/ops_learning_summary.py @@ -1,12 +1,15 @@ import ast -import os +import re import typing -from itertools import chain +from itertools import chain, zip_longest import pandas as pd import tiktoken from django.conf import settings + +# from django.db import transaction from django.db.models import F +from django.utils.functional import cached_property from openai import AzureOpenAI from api.logger import logger @@ -25,6 +28,24 @@ ) +class AzureOpenAiChat: + + @cached_property + def client(self): + return AzureOpenAI( + azure_endpoint=settings.AZURE_OPENAI_ENDPOINT, api_key=settings.AZURE_OPENAI_KEY, api_version="2023-05-15" + ) + + def get_response(self, message): + try: + response = self.client.chat.completions.create( + model=settings.AZURE_OPENAI_DEPLOYMENT_NAME, messages=message, temperature=0.7 + ) + return response.choices[0].message.content + except Exception as e: + logger.error(f"Error while generating response: {e}", exc_info=True) + + class OpsLearningSummaryTask: PROMPT_DATA_LENGTH_LIMIT = 5000 @@ -37,43 +58,54 @@ class OpsLearningSummaryTask: primary_prompt = ( "Please aggregate and summarize the provided data into UP TO THREE structured paragraphs. " "The output MUST strictly adhere to the format below: " - "- Title: Each finding should begin with the main finding TITLE in bold. " + "- *Title*: Each finding should begin with the main finding TITLE in bold. 
" + "Should be a high level summary of the finding below. " + "The length of the title MUST be between 20 and 30 characters." + "- *Excerpts ID*: Identify the ids of the excerpts you took into account for creating the summary. " "- Content: Aggregate findings so that they are supported by evidence from more than one report. " "Always integrate evidence from multiple reports or items into the paragraph, and " "include the year and country of the evidence." - "- Confidence Level: For each finding, based on the number of items/reports connected to the finding, " - "assign a score from 1 to 5 where 1 is the lowest and 5 is the highest. " - "The format should be 'Confidence level: #/5' (e.g., 'Confidence level: 4/5'). " + "- *Confidence Level*: Based on the number of excerpts connected to the finding, " + "assign a score from 1 to 5 where 1 is the lowest and 5 is the highest, e.g. 4/5" "At the end of the summary, please highlight any contradictory country reports. " - "DO NOT use data from any source other than the one provided. Provide your answer in JSON form. " - "Reply with only the answer in valid JSON form and include no other commentary: " + "Important:" + "-- DO NOT mention the excerpts id in the content of the summary." + "-- DO NOT mention the confidence level in the content of the summary." + "-- DO NOT use data from any source other than the one provided." + "Output Format:" + "Provide your answer in valid JSON form. Reply with only the answer in valid JSON form and include no other commentary. " "Example: " - '{"0": {"title": "Flexible and Adaptive Response Planning", ' + '{"0": {"title": "Flexible and Adaptive Response Planning", "excerpts id":"123, 45" ' '"content": "Responses in Honduras, Peru, Ecuador, and Panama highlight the importance of adaptable strategies. 
' "The shift from youth-focused MHPSS to inclusive care in Peru in 2021, the pivot from sanitation infrastructure " "to direct aid in Ecuador in 2022, and the responsive livelihood support in Panama in 2020, " "all underscore the need for continuous reassessment and agile adaptation to the complex, " 'changing needs of disaster-affected communities.", "confidence level": "4/5"}, ' - '"1": {"title": "...", "content": "...", "confidence level": "..."}, ' - '"2": {"title": "...", "content": "...", "confidence level": "..."}, ' + '"1": {"title": "...", "excerpts id":"...", "content": "...", "confidence level": "..."}, ' + '"2": {"title": "...", "excerpts id":"...", "content": "...", "confidence level": "..."}, ' '"contradictory reports": "..."}' ) secondary_prompt = ( "Please aggregate and summarize this data into structured paragraphs (as few as possible, as many as necessary). " "The output SHOULD ALWAYS follow the format below: " - "Type: Whether the paragraph is related to a 'sector' or a 'component'. " - "Subtype: Provides the name of the sector or of the component to which the paragraph refers. " - "Content: A short summary aggregating findings related to the Subtype, so that they are supported by " - "evidence coming from more than one report, " - "and there is ONLY ONE entry per subtype. Always integrate in the paragraph evidence that supports it " - "from the data available from multiple reports or items, " - "include year and country of the evidence. DO NOT use data from any source other than the " - "one provided. Provide your answer in JSON form. 
" - "Reply with ONLY the answer in valid JSON form and include NO OTHER COMMENTARY: " - '{"0": {"type": "sector", "subtype": "shelter", "content": "lorem ipsum"}, ' - '"1": {"type": "component", "subtype": "Information Management (IM)", "content": "lorem ipsum"}, ' - '"2": {"type": "sector", "subtype": "WASH", "content": "lorem ipsum"}}' + "- *Type*: Whether the paragraph is related to a 'sector' or a 'component' " + "- *Subtype*: Provides the name of the sector or of the component to which the paragraph refers." + "- *Excerpts ID*: Identify the ids of the excerpts you took into account for creating the summary." + "*Content*: A short summary aggregating findings related to the Subtype, " + "so that they are supported by evidence coming from more than one report, " + "and there is ONLY ONE entry per subtype. Always integrate in the paragraph evidence that supports " + "it from the data available from multiples reports or items, include year and country of the evidence. " + "The length of each paragraph MUST be between 20 and 30 words." + " Important:" + "- ONLY create one summary per subtype" + "- DO NOT mention the ids of the excerpts in the content of the summary." + "- DO NOT use data from any source other than the one provided. " + "Output Format:" + "Provide your answer in valid JSON form. Reply with ONLY the answer in JSON form and include NO OTHER COMMENTARY." + '{"0": {"type": "sector", "subtype": "shelter", "excerpts id":"43, 1375, 14543", "content": "lorem ipsum"}, ' + '"1": {"type": "component", "subtype": "Information Management", "excerpts id":"23, 235", "content": "lorem ipsum"}, ' + '"2": {"type": "sector", "subtype": "WASH", "excerpts id":"30, 40", "content": "lorem ipsum"}}' ) system_message = ( @@ -88,7 +120,7 @@ class OpsLearningSummaryTask: primary_instruction_prompt = ( "You should:" - "1. Describe, Summarize and Compare: Identify and detail the who, what, where, when and how many." + "1. 
Describe, Summarize and Compare: Identify and detail the who, what, where and when" "2. Explain and Connect: Analyze why events happened and how they are related" "3. Identify gaps: Assess what data is available, what is missing and potential biases" "4. Identify key messages: Determine important stories and signals hidden in the data" @@ -97,17 +129,13 @@ class OpsLearningSummaryTask: secondary_instruction_prompt = ( "You should for each section in the data (TYPE & SUBTYPE combination):" - "1. Describe, Summarize and Compare: Identify and detail the who, what, where, when and how many." + "1. Describe, Summarize and Compare: Identify and detail the who, what, where and when" "2. Explain and Connect: Analyze why events happened and how they are related" "3. Identify gaps: Assess what data is available, what is missing and potential biases" "4. Identify key messages: Determine if there are important stories and signals hidden in the data" "5. Conclude and make your case" ) - client = AzureOpenAI( - azure_endpoint=settings.AZURE_OPENAI_ENDPOINT, api_key=settings.AZURE_OPENAI_KEY, api_version="2023-05-15" - ) - def count_tokens(string, encoding_name): """Returns the number of tokens in a text string.""" encoding = tiktoken.get_encoding(encoding_name) @@ -119,40 +147,127 @@ def change_ops_learning_status(instance: OpsLearningCacheResponse, status: OpsLe instance.status = status instance.save(update_fields=["status"]) + @staticmethod + def add_used_ops_learnings(instance: OpsLearningCacheResponse, used_ops_learnings: typing.List[int]): + """Adds the used OPS learnings to the cache response.""" + instance.used_ops_learning.add(*used_ops_learnings) + + @staticmethod + def add_used_ops_learnings_sector( + instance: OpsLearningCacheResponse, content: str, used_ops_learnings: typing.List[int], sector: str + ): + """Adds the used OPS learnings to the cache response.""" + sector_instance = ( + SectorTag.objects.exclude(is_deprecated=True) + .filter( + title__iexact=sector, + ) 
+ .first() + ) + if not sector_instance: + logger.error(f"Sector '{sector}' not found.", exc_info=True) + return + ops_learning_sector, created = ( + OpsLearningSectorCacheResponse.objects.select_related("filter_response", "sector") + .prefetch_related( + "used_ops_learning", + ) + .get_or_create(sector=sector_instance, filter_response=instance, defaults={"content": content}) + ) + if created: + for ops_learning in used_ops_learnings: + ops_learning_instance = OpsLearning.objects.filter(id=ops_learning).first() + if not ops_learning_instance: + logger.error(f"Excerpt '{ops_learning}' not found.", exc_info=True) + continue + ops_learning_sector.used_ops_learning.add(ops_learning_instance) + + @staticmethod + def add_used_ops_learnings_component( + instance: OpsLearningCacheResponse, content: str, used_ops_learnings: typing.List[int], component: str + ): + """Adds the used OPS learnings to the cache response.""" + component_instance = FormComponent.objects.filter( + title__iexact=component, + ).first() + if not component_instance: + logger.error(f"Component '{component}' not found.", exc_info=True) + return + ops_learning_component, created = ( + OpsLearningComponentCacheResponse.objects.select_related("filter_response", "component") + .prefetch_related( + "used_ops_learning", + ) + .get_or_create(component=component_instance, filter_response=instance, defaults={"content": content}) + ) + if created: + for ops_learning in used_ops_learnings: + ops_learning_instance = OpsLearning.objects.filter(id=ops_learning).first() + if not ops_learning_instance: + logger.error(f"Excerpt '{ops_learning}' not found.", exc_info=True) + continue + ops_learning_component.used_ops_learning.add(ops_learning_instance) + @classmethod def fetch_ops_learnings(self, filter_data): """Fetches the OPS learnings from the database.""" - ops_learning_qs = OpsLearning.objects.annotate( - component_title=F("per_component__title"), - sector_title=F("sector__title"), - ops_learning_id=F("id"), + 
ops_learning_qs = ( + OpsLearning.objects.filter(is_validated=True) + .select_related("per_component", "sector", "appeal_code__country", "appeal_code__region", "appeal_code__dtype") + .annotate( + excerpts_id=F("id"), + component_title=F("per_component__title"), + sector_title=F("sector__title"), + country_id=F("appeal_code__country__id"), + country_name=F("appeal_code__country__name"), + region_id=F("appeal_code__region__id"), + region_name=F("appeal_code__region__label"), + appeal_name=F("appeal_code__name"), + appeal_year=F("appeal_code__start_date"), + dtype_name=F("appeal_code__dtype__name"), + ) ) from per.drf_views import OpsLearningFilter ops_learning_filtered_qs = OpsLearningFilter(filter_data, queryset=ops_learning_qs).qs - ops_learning_df = pd.DataFrame( - list( - ops_learning_filtered_qs.values( + if not ops_learning_filtered_qs.exists(): + logger.info("No OPS learnings found for the given filter.") + ops_learning_df = pd.DataFrame( + columns=[ "id", - "ops_learning_id", - "component_title", + "excerpts_id", + "component", + "sector", "learning", - "appeal_code__country_id", - "appeal_code__country__region_id", - "appeal_code__name", - "appeal_code__start_date", - "sector_title", - ) + "country_id", + "country_name", + "region_id", + "region_name", + "appeal_name", + "appeal_year", + "dtype_name", + ] + ) + return ops_learning_df + ops_learning_df = pd.DataFrame.from_records( + ops_learning_filtered_qs.values( + "id", + "excerpts_id", + "component_title", + "sector_title", + "learning", + "country_id", + "country_name", + "region_id", + "region_name", + "appeal_name", + "appeal_year", ) ) ops_learning_df = ops_learning_df.rename( columns={ "component_title": "component", "sector_title": "sector", - "appeal_code__country_id": "country_id", - "appeal_code__country__region_id": "region_id", - "appeal_code__name": "appeal_name", - "appeal_code__start_date": "appeal_year", } ) ops_learning_df.set_index("id", inplace=True) @@ -303,7 +418,7 @@ def 
_add_new_component(prioritized_components, per_prioritized_components, df): @classmethod def prioritize_components( self, - filter_data: dict, + ops_learning_df: pd.DataFrame, regional_list, global_list, country_list, @@ -329,7 +444,9 @@ def _identify_type_prioritization(df): def _contextualize_learnings(df): """Adds appeal year and event name as a contextualization of the leannings.""" for index, row in df.iterrows(): - df.at[index, "learning"] = f"In {row['appeal_year']} in {row['appeal_name']}: {row['learning']}" + df.at[index, "learning"] = ( + f"{row['excerpts_id']}. In {row['appeal_year']} in {row['appeal_name']}: {row['learning']}" + ) df = df.drop(columns=["appeal_name"]) logger.info("Contextualization added to DataFrame.") @@ -341,7 +458,8 @@ def _contextualize_learnings(df): components_regions = regional_list.to_dict(orient="records") components_regions = {item["region"]: item["components"] for item in components_regions} - ops_learning_df = self.fetch_ops_learnings(filter_data) + # Contextualize the learnings + ops_learning_df = _contextualize_learnings(ops_learning_df) if _need_component_prioritization(ops_learning_df, self.MIN_DIF_COMPONENTS, self.MIN_DIF_EXCERPTS): type_prioritization = _identify_type_prioritization(ops_learning_df) @@ -350,13 +468,12 @@ def _contextualize_learnings(df): ) prioritized_learnings = ops_learning_df logger.info("Prioritization of components completed.") - prioritized_learnings = _contextualize_learnings(prioritized_learnings) return prioritized_learnings @classmethod def slice_dataframe(self, df, limit=2000, encoding_name="cl100k_base"): - df["count_temp"] = [self.count_tokens(x, encoding_name) for x in df["learning"]] - df["cumsum"] = df["count_temp"].cumsum() + df.loc[:, "count_temp"] = [self.count_tokens(x, encoding_name) for x in df["learning"]] + df.loc[:, "cumsum"] = df["count_temp"].cumsum() slice_index = None for i in range(1, len(df)): @@ -376,21 +493,23 @@ def prioritize_excerpts(self, df: pd.DataFrame): 
logger.info("Prioritizing excerpts within token limit.") # Droping duplicates based on 'learning' column for primary DataFrame - primary_learning_df = df.drop_duplicates(subset="learning") - primary_learning_df = primary_learning_df.sort_values(by="appeal_year", ascending=False) - primary_learning_df.reset_index(inplace=True, drop=True) + primary_learning_df = ( + df.drop_duplicates(subset="learning").sort_values(by="appeal_year", ascending=False).reset_index(drop=True) + ) # Droping duplicates based on 'learning' and 'component' columns for secondary DataFrame - secondary_learning_df = df.drop_duplicates(subset=["learning", "component"]) - secondary_learning_df = secondary_learning_df.sort_values(by=["component", "appeal_year"], ascending=[True, False]) + secondary_learning_df = df.drop_duplicates(subset=["learning", "component", "sector"]).sort_values( + by=["component", "appeal_year"], ascending=[True, False] + ) grouped = secondary_learning_df.groupby("component") # Create an interleaved list of rows - interleaved = list(chain(*zip(*[group[1].itertuples(index=False) for group in grouped]))) + interleaved = list(chain(*zip_longest(*[group[1].itertuples(index=False) for group in grouped], fillvalue=None))) # Convert the interleaved list of rows back to a DataFrame - result = pd.DataFrame(interleaved) - result.reset_index(inplace=True, drop=True) + result = ( + pd.DataFrame(interleaved, columns=secondary_learning_df.columns).dropna(subset=["component"]).reset_index(drop=True) + ) # Slice the Primary and secondary dataframes sliced_primary_learning_df = self.slice_dataframe(primary_learning_df, self.PROMPT_DATA_LENGTH_LIMIT, self.ENCODING_NAME) @@ -401,6 +520,7 @@ def prioritize_excerpts(self, df: pd.DataFrame): @classmethod def format_prompt( self, + ops_learning_summary_instance: OpsLearningCacheResponse, primary_learning_df: pd.DataFrame, secondary_learning_df: pd.DataFrame, filter_data: dict, @@ -413,8 +533,7 @@ def _build_intro_section(): return ( "I will 
provide you with a set of instructions, data, and formatting requests in three sections." + " I will pass you the INSTRUCTIONS section, are you ready?" - + os.linesep - + os.linesep + + "\n\n\n\n" ) def _build_instruction_section(request_filter: dict, df: pd.DataFrame, instruction: str): @@ -448,7 +567,7 @@ def _build_instruction_section(request_filter: dict, df: pd.DataFrame, instructi instructions.append("in Emergency Response.") instructions.append("\n\n" + instruction) - instructions.append("\n\nI will pass you the DATA section, are you ready?\n\n") + instructions.append("\n\nI will pass you the DATA section, are you ready?\n\n\n") return "\n".join(instructions) def get_main_sectors(df: pd.DataFrame): @@ -463,7 +582,8 @@ def get_main_sectors(df: pd.DataFrame): return available_sectors def get_main_components(df: pd.DataFrame): - available_components = list(df["component"].unique()) + temp = df[df["component"] != "NS-specific areas of intervention"] + available_components = list(temp["component"].unique()) nb_components = len(available_components) if nb_components == 0: logger.info("There were not specific components") @@ -474,6 +594,10 @@ def get_main_components(df: pd.DataFrame): def process_learnings_sector(sector, df, max_length_per_section): df = df[df["sector"] == sector].dropna() df_sliced = self.slice_dataframe(df, max_length_per_section, self.ENCODING_NAME) + + if df_sliced["learning"].empty: + return "" + learnings_sector = ( "\n----------------\n" + "SUBTYPE: " @@ -486,6 +610,10 @@ def process_learnings_sector(sector, df, max_length_per_section): def process_learnings_component(component, df, max_length_per_section): df = df[df["component"] == component].dropna() df_sliced = self.slice_dataframe(df, max_length_per_section, self.ENCODING_NAME) + + if df_sliced["learning"].empty: + return "" + learnings_component = ( "\n----------------\n" + "SUBTYPE: " @@ -499,10 +627,20 @@ def _build_data_section(primary_df: pd.DataFrame, secondary_df: pd.DataFrame): 
# Primary learnings section primary_learnings_data = "\n----------------\n".join(primary_df["learning"].dropna()) + primary_learning_data = primary_df["excerpts_id"].dropna().tolist() + + self.add_used_ops_learnings( + ops_learning_summary_instance, + used_ops_learnings=primary_learning_data, + ) # Secondary learnings section sectors = get_main_sectors(secondary_df) components = get_main_components(secondary_df) - max_length_per_section = self.PROMPT_DATA_LENGTH_LIMIT / (len(components) + len(sectors)) + max_length_per_section = self.PROMPT_DATA_LENGTH_LIMIT + + if (len(sectors) + len(components)) > 0: + max_length_per_section = self.PROMPT_DATA_LENGTH_LIMIT / (len(components) + len(sectors)) + learnings_sectors = ( "\n----------------\n\n" + "TYPE: SECTORS" @@ -564,17 +702,12 @@ def _summarize(prompt, type: OpsLearningPromptResponseCache.PromptType, system_m logger.warning("The length of the prompt might be too long.") return "{}" - try: - response = self.client.chat.completions.create( - model=settings.AZURE_OPENAI_DEPLOYMENT_NAME, messages=messages, temperature=0.7 - ) - summary = response.choices[0].message.content - return summary - except Exception as e: - logger.error(f"Error in summarizing: {e}") - raise + # Using Azure OpenAI to summarize the prompt + client = AzureOpenAiChat() + response = client.get_response(message=messages) + return response - def _validate_format(summary) -> bool: + def _validate_format(summary, MAX_RETRIES=3): """ Validates the format of the summary and modifies it if necessary. 
""" @@ -595,30 +728,41 @@ def _modify_format(summary) -> str: return formatted_summary except Exception as e: - logger.error(f"Modification failed: {e}") + logger.error(f"Modification failed: {e}", exc_info=True) return "{}" formatted_summary = {} + retires = 0 + # Attempt to parse the summary as a dictionary if _validate_text_is_dictionary(summary): formated_summary = ast.literal_eval(summary) - return formated_summary else: formatted_summary = _modify_format(summary) formatted_summary = ast.literal_eval(formatted_summary) - return formatted_summary + + # Checking if the generated summary is empty + if bool(formated_summary): + return formated_summary + + # NOTE: Generating the summary if summary is empty + while retires < MAX_RETRIES: + self.generate_summary(prompt, type) + retires += 1 + logger.info(f"Retrying.... Attempt {retires}/{MAX_RETRIES}") def _modify_summary(summary: dict) -> dict: """ Checks if the "Confidence level" is present in the primary response and skipping for the secondary summary """ for key, value in summary.items(): - if key == "contradictory reports" or "confidence level" in value: + confidence_level = "confidence level" + if key == "contradictory reports" or confidence_level in value: continue - if "Confidence level" in value["content"]: - confidence_value = value["content"].split("Confidence level:")[-1].strip() - value["content"] = value["content"].split("Confidence level:")[0] - value["confidence level"] = confidence_value + if confidence_level in value["content"].lower(): + parts = re.split(rf"(?i)\b{confidence_level}\b", value["content"]) + value["content"] = parts[0] + value["confidence level"] = parts[1][1:].strip() return summary @@ -635,8 +779,10 @@ def _get_or_create_summary(self, prompt: str, prompt_hash: str, type: OpsLearnin type=type, defaults={"prompt": prompt}, ) - if not created: - summary = instance.response + if not created and not bool(instance.response): + summary = self.generate_summary(prompt=prompt, type=type) + 
instance.response = summary + instance.save(update_fields=["response"]) return summary summary = self.generate_summary(prompt=prompt, type=type) instance.response = summary @@ -652,16 +798,16 @@ def save_to_db( ): logger.info("Saving to database.") # Primary summary - ops_learning_summary_instance.insights1_title = primary_summary["0"]["title"] - ops_learning_summary_instance.insights2_title = primary_summary["1"]["title"] - ops_learning_summary_instance.insights3_title = primary_summary["2"]["title"] - ops_learning_summary_instance.insights1_content = primary_summary["0"]["content"] - ops_learning_summary_instance.insights2_content = primary_summary["1"]["content"] - ops_learning_summary_instance.insights3_content = primary_summary["2"]["content"] - ops_learning_summary_instance.insights1_confidence_level = primary_summary["0"]["confidence level"] - ops_learning_summary_instance.insights2_confidence_level = primary_summary["1"]["confidence level"] - ops_learning_summary_instance.insights3_confidence_level = primary_summary["2"]["confidence level"] - ops_learning_summary_instance.contradictory_reports = primary_summary["contradictory reports"] + ops_learning_summary_instance.insights1_title = primary_summary.get("0", None).get("title", None) + ops_learning_summary_instance.insights2_title = primary_summary.get("1", None).get("title", None) + ops_learning_summary_instance.insights3_title = primary_summary.get("2", None).get("title", None) + ops_learning_summary_instance.insights1_content = primary_summary.get("0", None).get("content", None) + ops_learning_summary_instance.insights2_content = primary_summary.get("1", None).get("content", None) + ops_learning_summary_instance.insights3_content = primary_summary.get("2", None).get("content", None) + ops_learning_summary_instance.insights1_confidence_level = primary_summary.get("0", None).get("confidence level", None) + ops_learning_summary_instance.insights2_confidence_level = primary_summary.get("1", 
None).get("confidence level", None) + ops_learning_summary_instance.insights3_confidence_level = primary_summary.get("2", None).get("confidence level", None) + ops_learning_summary_instance.contradictory_reports = primary_summary.get("contradictory reports", None) ops_learning_summary_instance.save( update_fields=[ "insights1_title", @@ -676,40 +822,31 @@ def save_to_db( "contradictory_reports", ] ) - # Secondary summary - for key, value in secondary_summary.items(): - type = value["type"] - subtype = value["subtype"] - content = value["content"] - - if type == "component": - component_instance = FormComponent.objects.filter( - title__iexact=subtype, - ).first() - if not component_instance: - logger.error(f"Component '{subtype}' not found.") - continue - OpsLearningComponentCacheResponse.objects.create( - component=component_instance, + for _, value in secondary_summary.items(): + type = value["type"].strip() + subtype = value["subtype"].strip() + content = value["content"].strip() + excerpt_ids = value["excerpts id"] + excerpt_id_list = list(set(int(id.strip()) for id in excerpt_ids.split(",") if excerpt_ids != "")) + + if type == "component" and excerpt_id_list: + self.add_used_ops_learnings_component( + instance=ops_learning_summary_instance, content=content, - filter_response=ops_learning_summary_instance, + used_ops_learnings=excerpt_id_list, + component=subtype, ) - elif type == "sector": - sector_instance = SectorTag.objects.filter( - title__iexact=subtype, - ).first() - if not sector_instance: - logger.error(f"Sector '{subtype}' not found.") - continue - OpsLearningSectorCacheResponse.objects.create( - sector=sector_instance, + + elif type == "sector" and excerpt_id_list: + self.add_used_ops_learnings_sector( + instance=ops_learning_summary_instance, content=content, - filter_response=ops_learning_summary_instance, + used_ops_learnings=excerpt_id_list, + sector=subtype, ) else: - logger.error(f"Invalid type '{type}' on secondary summary.") - 
self.change_ops_learning_status(ops_learning_summary_instance, OpsLearningCacheResponse.Status.SUCCESS) + logger.error(f"Type '{type}' of {len(excerpt_id_list)} on secondary summary.", exc_info=True) logger.info("Saved to database.") @classmethod @@ -741,4 +878,9 @@ def get_or_create_summary( ) # Saving into the database - self.save_to_db(ops_learning_summary_instance, primary_summary, secondary_summary) + self.save_to_db( + ops_learning_summary_instance=ops_learning_summary_instance, + primary_summary=primary_summary, + secondary_summary=secondary_summary, + ) + self.change_ops_learning_status(instance=ops_learning_summary_instance, status=OpsLearningCacheResponse.Status.SUCCESS) diff --git a/per/serializers.py b/per/serializers.py index 298853172..9bc76e3b3 100644 --- a/per/serializers.py +++ b/per/serializers.py @@ -6,9 +6,10 @@ from drf_spectacular.utils import extend_schema_field from rest_framework import serializers -from api.models import Appeal, AppealType, Country, Region +from api.models import Appeal, AppealDocument, AppealType, Country, Region from api.serializers import ( MiniCountrySerializer, + MiniEventSerializer, RegoCountrySerializer, UserNameSerializer, ) @@ -927,6 +928,7 @@ class Meta: class FullAppealSerializer(serializers.ModelSerializer): atype = serializers.SerializerMethodField() + event_details = MiniEventSerializer(source="event", read_only=True) @staticmethod def get_atype(obj): @@ -939,6 +941,7 @@ class Meta: class MicroAppealSerializer(serializers.ModelSerializer): atype = serializers.SerializerMethodField() + event_details = MiniEventSerializer(source="event", read_only=True) @staticmethod def get_atype(obj): @@ -946,7 +949,14 @@ def get_atype(obj): class Meta: model = Appeal - fields = ("code", "name", "atype") + fields = ( + "id", + "code", + "name", + "atype", + "event_details", + "country", + ) class OpsLearningCSVSerializer(serializers.ModelSerializer): @@ -1017,18 +1027,25 @@ class Meta: class 
OpsLearningSerializer(serializers.ModelSerializer): - appeal_code = FullAppealSerializer(allow_null=True, read_only=True) - - def to_representation(self, instance): - data = super().to_representation(instance) - data["appeal"] = data.pop("appeal_code") - return data + appeal = FullAppealSerializer(source="appeal_code", allow_null=True, read_only=True) + document_url = serializers.SerializerMethodField() + document_name = serializers.SerializerMethodField() class Meta: model = OpsLearning fields = "__all__" read_only_fields = ("created_at", "modified_at") + @staticmethod + def get_document_url(obj): + if obj.appeal_document_id and (document := AppealDocument.objects.filter(id=obj.appeal_document_id).first()): + return document.document_url + + @staticmethod + def get_document_name(obj): + if obj.appeal_document_id and (document := AppealDocument.objects.filter(id=obj.appeal_document_id).first()): + return document.name + class OpsLearningInSerializer(serializers.ModelSerializer): @@ -1041,13 +1058,28 @@ class PublicOpsLearningSerializer(serializers.ModelSerializer): # We do not extract appeal details here, except appeal type, which is important. # Only the validated items are shown, arriving from get_queryset(). 
- appeal_code = MicroAppealSerializer(allow_null=True, read_only=True) + appeal = MicroAppealSerializer(source="appeal_code", allow_null=True, read_only=True) + document_url = serializers.SerializerMethodField() + document_name = serializers.SerializerMethodField() class Meta: model = OpsLearning - read_only_fields = ("created_at", "modified_at") + read_only_fields = ( + "created_at", + "modified_at", + ) exclude = ("learning", "type", "organization", "sector", "per_component") + @staticmethod + def get_document_url(obj): + if obj.appeal_document_id and (document := AppealDocument.objects.filter(id=obj.appeal_document_id).first()): + return document.document_url + + @staticmethod + def get_document_name(obj): + if obj.appeal_document_id and (document := AppealDocument.objects.filter(id=obj.appeal_document_id).first()): + return document.name + class PerDocumentUploadSerializer(serializers.ModelSerializer): MAX_NUMBER_OF_DOCUMENTS = 10 @@ -1125,23 +1157,39 @@ class Meta: class OpsLearningSectorCacheResponseSerializer(serializers.ModelSerializer): title = serializers.CharField(source="sector.title", read_only=True) + # NOTE: Prefetched count is used here + extract_count = serializers.IntegerField(source="count", read_only=True) class Meta: model = OpsLearningSectorCacheResponse - fields = ["content", "title"] + fields = [ + "id", + "content", + "title", + "extract_count", + ] class OpsLearningComponentCacheResponseSerializer(serializers.ModelSerializer): title = serializers.CharField(source="component.title", read_only=True) + # NOTE: Prefetched count is used here + extract_count = serializers.IntegerField(source="count", read_only=True) class Meta: model = OpsLearningComponentCacheResponse - fields = ["content", "title"] + fields = [ + "id", + "content", + "title", + "extract_count", + ] class OpsLearningSummarySerializer(serializers.ModelSerializer): sectors = OpsLearningSectorCacheResponseSerializer(source="ops_learning_sector", many=True) components = 
OpsLearningComponentCacheResponseSerializer(source="ops_learning_component", many=True) + status_display = serializers.CharField(source="get_status_display", read_only=True) + extract_count = serializers.SerializerMethodField() class Meta: model = OpsLearningCacheResponse @@ -1153,6 +1201,12 @@ class Meta: "insights2_content", "insights3_title", "insights3_content", + "extract_count", + "status", + "status_display", "sectors", "components", ] + + def get_extract_count(self, obj) -> int: + return OpsLearning.objects.values("id").filter(id__in=obj.used_ops_learning.all()).count() diff --git a/per/task.py b/per/task.py index faf80eaf7..a149101a7 100644 --- a/per/task.py +++ b/per/task.py @@ -6,25 +6,51 @@ @shared_task -def generate_summary(ops_learning_summary_instance: OpsLearningCacheResponse, filter_data: dict): - try: - OpsLearningSummaryTask.change_ops_learning_status( - instance=ops_learning_summary_instance, status=OpsLearningCacheResponse.Status.STARTED - ) - regional_list, global_list, country_list = OpsLearningSummaryTask.generate_priotization_list() - prioritized_learnings = OpsLearningSummaryTask.prioritize_components( - filter_data, regional_list, global_list, country_list - ) - primary_learning_df, secondary_learning_df = OpsLearningSummaryTask.prioritize_excerpts(prioritized_learnings) - primary_learning_prompt, secondary_learning_prompt = OpsLearningSummaryTask.format_prompt( - primary_learning_df, secondary_learning_df, filter_data - ) - OpsLearningSummaryTask.get_or_create_summary( - ops_learning_summary_instance, primary_learning_prompt, secondary_learning_prompt - ) - except Exception as e: +def generate_summary(ops_learning_summary_id: int, filter_data: dict): + ops_learning_summary_instance = OpsLearningCacheResponse.objects.filter(id=ops_learning_summary_id).first() + if not ops_learning_summary_instance: + logger.error("Ops learning summary not found", exc_info=True) + return False + + # Change Ops Learning Summary Status to STARTED + 
OpsLearningSummaryTask.change_ops_learning_status( + instance=ops_learning_summary_instance, status=OpsLearningCacheResponse.Status.STARTED + ) + + # Fetch ops-learning/extracts data + ops_learning_df = OpsLearningSummaryTask.fetch_ops_learnings(filter_data=filter_data) + + # Check if ops-learning data is available + if not ops_learning_df.empty: + try: + # Generate prioritization list + regional_list, global_list, country_list = OpsLearningSummaryTask.generate_priotization_list() + + # Prioritize components + prioritized_learnings = OpsLearningSummaryTask.prioritize_components( + ops_learning_df=ops_learning_df, regional_list=regional_list, global_list=global_list, country_list=country_list + ) + primary_learning_df, secondary_learning_df = OpsLearningSummaryTask.prioritize_excerpts(prioritized_learnings) + + # Format prompt + primary_learning_prompt, secondary_learning_prompt = OpsLearningSummaryTask.format_prompt( + ops_learning_summary_instance, primary_learning_df, secondary_learning_df, filter_data + ) + + # Generate summary + OpsLearningSummaryTask.get_or_create_summary( + ops_learning_summary_instance, primary_learning_prompt, secondary_learning_prompt + ) + return True + except Exception: + OpsLearningSummaryTask.change_ops_learning_status( + instance=ops_learning_summary_instance, status=OpsLearningCacheResponse.Status.FAILED + ) + logger.error("Ops learning summary process failed", exc_info=True) + return False + else: OpsLearningSummaryTask.change_ops_learning_status( - instance=ops_learning_summary_instance, status=OpsLearningCacheResponse.Status.FAILED + instance=ops_learning_summary_instance, status=OpsLearningCacheResponse.Status.NO_EXTRACT_AVAILABLE ) - logger.error(e) - raise e + logger.error("No extracts found", exc_info=True) + return False diff --git a/poetry.lock b/poetry.lock index fbb023e2f..322a937d7 100644 --- a/poetry.lock +++ b/poetry.lock @@ -4174,4 +4174,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = "^3.9" 
-content-hash = "d7ce03962d2902b2a528f6c8076b46b3a81ce65192169bdd0b7a8f84939c1b6e" +content-hash = "20605696f885463e7cd329d14f615db1eee638209ce1cbd457f4a369dc16e096"