Add the pipeline for the explanation task and LLM #2190

Open

wants to merge 50 commits into base: main

Changes from all commits (50 commits)
adbca17
Add Task EXPLANATION and the visualization of images with description.
Bepitic Jul 15, 2024
5611ec1
upd dataset task with explanation
Bepitic Jul 15, 2024
8ed23a3
fix tasktype on metrics, depth, cataset, inferencer.
Bepitic Jul 15, 2024
a463b5b
Merge branch 'main' into llm-pipeline
Bepitic Jul 15, 2024
d5baf6b
fix lint on visualization/image
Bepitic Jul 16, 2024
b7c8eaa
Merge branch 'openvinotoolkit:main' into llm-pipeline
Bepitic Jul 18, 2024
5b563d9
Merge branch 'llm-pipeline' of github.com:Bepitic/anomalib into llm-p…
Bepitic Jul 18, 2024
bfd936e
Fix formatting dataset
Bepitic Jul 18, 2024
f541316
fix format data/base/depth
Bepitic Jul 18, 2024
4e392a9
Fix formatting openvino_inferencer
Bepitic Jul 18, 2024
5fc70ba
fix formatting
Bepitic Jul 18, 2024
75099af
Add Explanation to error-msg.
Bepitic Aug 2, 2024
e5040d3
OpenAI - VLM init
Bepitic Aug 3, 2024
86ad803
Add wrapper to run OpenAI
Bepitic Aug 4, 2024
3678f72
add in ppyproject
Bepitic Aug 4, 2024
7413842
Add Test and fix description/title
Bepitic Aug 12, 2024
dc42cbd
Add Readme and fix bug.
Bepitic Aug 13, 2024
5788d22
Update src/anomalib/models/image/openai_vlm/lightning_model.py
Bepitic Aug 13, 2024
e4f6bec
Update src/anomalib/models/image/openai_vlm/__init__.py
Bepitic Aug 13, 2024
5437467
Add fix pipeline bug.
Bepitic Aug 13, 2024
982c9ca
Add test.
Bepitic Aug 13, 2024
642fd26
Merge branch 'OpenAI-VLM' of github.com:Bepitic/anomalib into OpenAI-VLM
Bepitic Aug 13, 2024
b8cacf0
add changes
Bepitic Aug 16, 2024
0929dc9
Add integration test and unit test + skip export.
Bepitic Aug 16, 2024
39cf996
change to LANGUAGE
Bepitic Aug 16, 2024
671693d
Update images in Readme.
Bepitic Aug 17, 2024
224118b
Update src/anomalib/models/image/chatgpt_vision/__init__.py
Bepitic Aug 20, 2024
b703a41
Update src/anomalib/models/image/chatgpt_vision/chatgpt.py
Bepitic Aug 20, 2024
24c5486
Update src/anomalib/models/image/chatgpt_vision/lightning_model.py
Bepitic Aug 20, 2024
68e757e
Update tests/integration/model/test_models.py
Bepitic Aug 20, 2024
86714a1
Update src/anomalib/models/image/chatgpt_vision/lightning_model.py
Bepitic Aug 20, 2024
196d2a3
Update src/anomalib/models/image/chatgpt_vision/lightning_model.py
Bepitic Aug 20, 2024
b7f345a
fix comments
Bepitic Aug 20, 2024
b285d10
remove last file of chatgpt_vision.
Bepitic Aug 20, 2024
a688530
fix tests
Bepitic Aug 20, 2024
0fb5f79
Merge pull request #1 from Bepitic/OpenAI-VLM (GPTVad)
Bepitic Aug 20, 2024
6503543
Merge branch 'main' into llm-pipeline
Bepitic Aug 20, 2024
8e92e5e
Update src/anomalib/models/image/gptvad/chatgpt.py
Bepitic Aug 21, 2024
5ab044d
upd: language -> VISUAL_PROMPTING
Bepitic Aug 21, 2024
3f9ca93
fix visual prompting and model_name
Bepitic Aug 21, 2024
391b4c4
fix GPT for Gpt and the folder of the tests.
Bepitic Aug 21, 2024
ca1a0bb
fix: change import error outside.
Bepitic Aug 21, 2024
022dcb7
fix readme pointing to the right model.
Bepitic Aug 21, 2024
af7b9e9
fix import cycle, and separate usecase by explicit if.
Bepitic Aug 21, 2024
faf334f
upd: add comments to the few shot / zero shot.
Bepitic Aug 21, 2024
3ed8d3f
fix: dataset expected colums
Bepitic Aug 21, 2024
7f454c4
upd: add the same logic of the label on visualize_full.
Bepitic Aug 22, 2024
45bd520
Merge branch 'main' into llm-pipeline
Bepitic Aug 22, 2024
44586d6
Fix in the logic of the code.
Bepitic Aug 22, 2024
7adb835
Merge branch 'llm-pipeline' of github.com:Bepitic/anomalib into llm-p…
Bepitic Aug 22, 2024
Binary file added docs/source/images/gptvad/broken.png
Binary file added docs/source/images/gptvad/good.png
1 change: 1 addition & 0 deletions pyproject.toml
@@ -50,6 +50,7 @@ core = [
     "lightning>=2.2",
     "torch>=2",
     "torchmetrics>=1.3.2",
+    "openai>=1.38.0",
     # NOTE: open-clip-torch throws the following error on v2.26.1
     # torch.onnx.errors.UnsupportedOperatorError: Exporting the operator
     # 'aten::_native_multi_head_attention' to ONNX opset version 14 is not supported
1 change: 1 addition & 0 deletions src/anomalib/__init__.py
@@ -22,3 +22,4 @@ class TaskType(str, Enum):
     CLASSIFICATION = "classification"
     DETECTION = "detection"
     SEGMENTATION = "segmentation"
+    VISUAL_PROMPTING = "visual prompting"
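Since TaskType subclasses str, the new member compares equal to its raw string value and can be constructed from it, for example when a task is read from a config file. A minimal illustrative sketch, not part of the diff:

from anomalib import TaskType

# TaskType is a str enum, so members compare equal to their raw string values.
assert TaskType.VISUAL_PROMPTING == "visual prompting"

# The member can also be built from the raw string, e.g. when parsing a config.
task = TaskType("visual prompting")
assert task is TaskType.VISUAL_PROMPTING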
4 changes: 2 additions & 2 deletions src/anomalib/callbacks/metrics.py
@@ -75,10 +75,10 @@ def setup(
         pixel_metric_names: list[str] | dict[str, dict[str, Any]]
         if self.pixel_metric_names is None:
             pixel_metric_names = []
-        elif self.task == TaskType.CLASSIFICATION:
+        elif self.task in (TaskType.CLASSIFICATION, TaskType.VISUAL_PROMPTING):
             pixel_metric_names = []
             logger.warning(
-                "Cannot perform pixel-level evaluation when task type is classification. "
+                "Cannot perform pixel-level evaluation when task type is classification or visual prompting. "
                 "Ignoring the following pixel-level metrics: %s",
                 self.pixel_metric_names,
             )
4 changes: 3 additions & 1 deletion src/anomalib/data/base/dataset.py
@@ -20,9 +20,11 @@
 from anomalib.data.utils import LabelName, masks_to_boxes, read_image, read_mask

 _EXPECTED_COLUMNS_CLASSIFICATION = ["image_path", "split"]
+_EXPECTED_COLUMNS_VISUAL_PROMPTING = ["image_path", "split"]
 _EXPECTED_COLUMNS_SEGMENTATION = [*_EXPECTED_COLUMNS_CLASSIFICATION, "mask_path"]
 _EXPECTED_COLUMNS_PERTASK = {
     "classification": _EXPECTED_COLUMNS_CLASSIFICATION,
+    "visual prompting": _EXPECTED_COLUMNS_VISUAL_PROMPTING,
     "segmentation": _EXPECTED_COLUMNS_SEGMENTATION,
     "detection": _EXPECTED_COLUMNS_SEGMENTATION,
 }

@@ -169,7 +171,7 @@ def __getitem__(self, index: int) -> dict[str, str | torch.Tensor]:
         image = read_image(image_path, as_tensor=True)
         item = {"image_path": image_path, "label": label_index}

-        if self.task == TaskType.CLASSIFICATION:
+        if self.task in (TaskType.CLASSIFICATION, TaskType.VISUAL_PROMPTING):
             item["image"] = self.transform(image) if self.transform else image
         elif self.task in (TaskType.DETECTION, TaskType.SEGMENTATION):
             # Only Anomalous (1) images have masks in anomaly datasets
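The new task reuses the classification column contract: a samples DataFrame needs only "image_path" and "split". A hypothetical illustration of a frame that would pass this validation (the paths are made up for the example):

import pandas as pd

# Hypothetical samples frame; only "image_path" and "split" are required
# for the "visual prompting" task, mirroring the classification contract.
samples = pd.DataFrame(
    {
        "image_path": ["datasets/bottle/good/000.png", "datasets/bottle/broken/001.png"],
        "split": ["train", "test"],
        "label_index": [0, 1],  # extra columns are allowed
    },
)

expected_columns = ["image_path", "split"]
assert all(column in samples.columns for column in expected_columns)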
2 changes: 1 addition & 1 deletion src/anomalib/data/base/depth.py
@@ -48,7 +48,7 @@ def __getitem__(self, index: int) -> dict[str, str | torch.Tensor]:
         depth_image = to_tensor(read_depth_image(depth_path))
         item = {"image_path": image_path, "depth_path": depth_path, "label": label_index}

-        if self.task == TaskType.CLASSIFICATION:
+        if self.task in (TaskType.CLASSIFICATION, TaskType.VISUAL_PROMPTING):
             item["image"], item["depth_image"] = (
                 self.transform(image, depth_image) if self.transform else (image, depth_image)
             )
2 changes: 1 addition & 1 deletion src/anomalib/deploy/inferencers/openvino_inferencer.py
@@ -277,7 +277,7 @@ def post_process(self, predictions: np.ndarray, metadata: dict | DictConfig | No
         pred_idx = pred_score >= metadata["image_threshold"]
         pred_label = LabelName.ABNORMAL if pred_idx else LabelName.NORMAL

-        if task == TaskType.CLASSIFICATION:
+        if task in (TaskType.CLASSIFICATION, TaskType.VISUAL_PROMPTING):
             _, pred_score = self._normalize(pred_scores=pred_score, metadata=metadata)
         elif task in (TaskType.SEGMENTATION, TaskType.DETECTION):
             if "pixel_threshold" in metadata:
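In deployment, the new task type follows the classification post-processing path: only the image-level score is normalized, with no pixel-level thresholding. A hypothetical inference sketch, assuming a previously exported OpenVINO model; the artifact and image paths are made up:

from anomalib.deploy import OpenVINOInferencer

# Hypothetical artifact paths from a previous export; metadata records the task.
inferencer = OpenVINOInferencer(path="weights/model.xml", metadata="weights/metadata.json")
prediction = inferencer.predict(image="examples/broken.png")
print(prediction.pred_score, prediction.pred_label)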
2 changes: 2 additions & 0 deletions src/anomalib/models/__init__.py
@@ -24,6 +24,7 @@
     Fastflow,
     Fre,
     Ganomaly,
+    GptVad,
     Padim,
     Patchcore,
     ReverseDistillation,

@@ -51,6 +52,7 @@ class UnknownModelError(ModuleNotFoundError):
     "Fastflow",
     "Fre",
     "Ganomaly",
+    "GptVad",
     "Padim",
     "Patchcore",
     "ReverseDistillation",
2 changes: 2 additions & 0 deletions src/anomalib/models/image/__init__.py
@@ -14,6 +14,7 @@
 from .fastflow import Fastflow
 from .fre import Fre
 from .ganomaly import Ganomaly
+from .gptvad import GptVad
 from .padim import Padim
 from .patchcore import Patchcore
 from .reverse_distillation import ReverseDistillation

@@ -34,6 +35,7 @@
     "Fastflow",
     "Fre",
     "Ganomaly",
+    "GptVad",
     "Padim",
     "Patchcore",
     "ReverseDistillation",
7 changes: 7 additions & 0 deletions src/anomalib/models/image/gptvad/__init__.py
@@ -0,0 +1,7 @@
"""Generative Pre-Trained Transformer (GPT) based Large Language Model (LLM)."""
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

from .lightning_model import GptVad

__all__ = ["GptVad"]
169 changes: 169 additions & 0 deletions src/anomalib/models/image/gptvad/chatgpt.py
@@ -0,0 +1,169 @@
"""Wrapper for the OpenAI calls to the VLM model."""
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import logging
import os
from typing import Any

import openai


class APIKeyError(Exception):
"""APIKeyError error."""


class GPTWrapper:
"""A wrapper class for making API calls to OpenAI's GPT-4 model to detect anomalies in images.

Environment variable OPENAI_API_KEY (str): API key for OpenAI.
https://platform.openai.com/docs/quickstart/step-2-set-up-your-api-key
Other possible models: https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4
All models with vision capabilities: 'gpt-4-turbo-2024-04-09', 'gpt-4-turbo',
all versions of 'gpt-4o-mini', and 'gpt-4o'

Args:
images (list[str]): List of base64 images. If only one image is provided,
it is treated as the anomalous image. If multiple images are provided,
the last one is considered anomalous, and the rest are treated as normal examples.
model_name (str): Model name for OpenAI API VLM. Default "gpt-4o"
detail (bool): If the images will be sended with high detail or low detail.

"""

def __init__(self, model_name: str = "gpt-4o", detail: bool = True) -> None:
openai_key = os.getenv("OPENAI_API_KEY")
self.model_name = model_name
self.detail = detail
if not openai_key:
msg = "OpenAI environment key not found.(OPENAI_API_KEY)"
raise APIKeyError(msg)

def api_call(
self,
images: list[str],
extension: str = "png",
) -> str:
"""Makes an API call to OpenAI's GPT-4 model to detect anomalies in an image.

Args:
images (list[str]): List of base64 images. If only one image is provided,
it is treated as the anomalous image. If multiple images are provided,
the last one is considered anomalous, and the rest are treated as normal examples.
extension (str): Extension of the group of images that needs to be checked for anomalies. Default = 'png'

Returns:
str: The response from the GPT-4 model indicating whether the image has anomalies or not.
It returns 'NO' if there are no anomalies and 'YES: description' if there are anomalies,
where 'description' provides details of the anomaly and its position.

Raises:
openai.error.OpenAIError: If there is an error during the API call.
"""
prompt: str = ""

detail_img = "high" if self.detail else "low"
messages: list[dict[str, Any]] = []

if len(images) > 0:
# If multiple images are provided, the last one is considered anomalous,
# and the rest are treated as normal examples.
prompt = """
You will receive a group of images that are going to be an example
of the typical image without any anomaly,
and the last image that you need to decide if it has an anomaly or not.
Answer with a 'NO' if it does not have any anomalies and 'YES: description'
where description is a description of the anomaly provided, position.
"""

messages.append(
{
"role": "system",
"content": prompt,
},
)
for image in images:
image_message = [
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": f"data:image/{extension};base64,{image}",
"detail": detail_img,
},
},
],
},
]
messages.extend(image_message)

elif len(images) == 1:
# If only one image is provided,
# it is treated as the anomalous image.
prompt = """
Examine the provided image carefully to determine if there is an obvious anomaly present.
Anomalies may include mechanical malfunctions, unexpected objects, safety hazards, structural damages,
or unusual patterns or defects in the objects.

Instructions:

1. Thoroughly inspect the image for any irregularities or deviations from normal operating conditions.

2. Clearly state if an obvious anomaly is detected.
- If an anomaly is detected, begin with 'YES,' followed by a detailed description of the anomaly.
- If no anomaly is detected, simply state 'NO' and end the analysis.

Example Output Structure:

'YES:
- Description: Conveyor belt misalignment causing potential blockages.
This may result in production delays and equipment damage.
Immediate realignment and inspection are recommended.'

'NO'

Considerations:

- Ensure accuracy in identifying anomalies to prevent overlooking critical issues.
- Provide clear and concise descriptions for any detected anomalies.
- Focus on obvious anomalies that could impact final use of the object operation or safety.
"""
messages.append(
{
"role": "system",
"content": prompt,
},
)
# Add the single image
messages.append(
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": f"data:image/{extension};base64,{images[0]}",
"detail": detail_img,
},
},
],
},
)
else:
msg = "No images provided for anomaly detection."
raise ValueError(msg)

try:
# Make the API call using the openai library
response = openai.chat.completions.create(
model=self.model_name,
messages=messages,
max_tokens=300,
)
return response.choices[-1].message.content or ""
except Exception:
msg = "Error generating a response with OpenAI API."
logging.exception(msg)
raise
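For reference, a minimal usage sketch of the wrapper above. The image paths are hypothetical, and OPENAI_API_KEY must be set in the environment:

import base64
from pathlib import Path

from anomalib.models.image.gptvad.chatgpt import GPTWrapper

def encode(path: str) -> str:
    """Base64-encode an image file for the OpenAI vision API."""
    return base64.b64encode(Path(path).read_bytes()).decode("utf-8")

wrapper = GPTWrapper(model_name="gpt-4o", detail=False)

# Zero-shot: a single image is treated as the image to inspect.
print(wrapper.api_call([encode("examples/broken.png")]))

# Few-shot: earlier images are normal references; the last one is inspected.
print(wrapper.api_call([encode("examples/good.png"), encode("examples/broken.png")]))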