Feat/sentry integration #2

Open · wants to merge 13 commits into base: develop
1 change: 1 addition & 0 deletions .github/workflows/terraform-plan.yml
@@ -4,6 +4,7 @@ on:
push:
branches:
- initial_setup
- feat/sentry-integration

jobs:
terraform:
6 changes: 5 additions & 1 deletion dev.tfvars
@@ -5,6 +5,7 @@ environment = "dev"
# api gateway
api_gateway_name = "rapi"
vpce_id = "vpce-02c7bb08b571074e1"
vpc_id = "vpc-0e65245d5e4c2deaf"

# models
model_endpoint_name = "test-all-models-rsh"
@@ -16,4 +17,7 @@ model_info_fn_name = "model_info"
docs_extract_fn_image_name = "extractor-tool"

# docs convert lambda
docs_convert_lambda_fn_name = "libreoffice-dev-libreoffice"
docs_convert_lambda_fn_name = "libreoffice-dev-libreoffice"

# sentry url
sentry_url = "https://[email protected]/1223576"
7 changes: 7 additions & 0 deletions lambda_fns/entry_predict/app.py
@@ -6,6 +6,8 @@
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed

import sentry_sdk

from postprocess_raw_preds import get_predictions_all, get_clean_thresholds, get_clean_ratios

logging.getLogger().setLevel(logging.INFO)
@@ -19,13 +21,18 @@
RELIABILITY_FN_NAME = os.environ.get("RELIABILITY_FN_NAME")
MODEL_INFO_FN_NAME = os.environ.get("MODEL_INFO_FN_NAME")

SENTRY_URL = os.environ.get("SENTRY_URL")
ENVIRONMENT = os.environ.get("ENVIRONMENT")

sqs_client = boto3.client('sqs', region_name=AWS_REGION)

sagemaker_rt = boto3.client("runtime.sagemaker", region_name="us-east-1") # todo: update the region later.
geolocation_client = boto3.client("lambda", region_name="us-east-1")
reliability_client = boto3.client("lambda", region_name="us-east-1")
model_info_client = boto3.client("lambda", region_name=AWS_REGION)

sentry_sdk.init(SENTRY_URL, environment=ENVIRONMENT, attach_stacktrace=True, traces_sample_rate=1.0)


class PredictionStatus(Enum):
FAILED = 0
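Note: the Sentry bootstrap added to this lambda recurs verbatim in the other lambdas in this PR. A minimal standalone sketch of the pattern (the comments and DSN description are illustrative, not from the diff):

    import os

    import sentry_sdk

    # Both values come from the lambda's environment configuration.
    SENTRY_URL = os.environ.get("SENTRY_URL")    # a Sentry DSN
    ENVIRONMENT = os.environ.get("ENVIRONMENT")  # e.g. "dev"

    # attach_stacktrace attaches stack traces to plain captured messages;
    # traces_sample_rate=1.0 records every transaction, which is acceptable
    # for low-traffic lambdas but worth lowering at scale.
    sentry_sdk.init(
        SENTRY_URL,
        environment=ENVIRONMENT,
        attach_stacktrace=True,
        traces_sample_rate=1.0,
    )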
3 changes: 2 additions & 1 deletion lambda_fns/entry_predict/requirements.txt
@@ -1 +1,2 @@
requests==2.26.0
requests==2.26.0
sentry-sdk==1.5.8
7 changes: 7 additions & 0 deletions lambda_fns/entry_predict_output_request/app.py
@@ -1,13 +1,20 @@
import os
import requests
import json
import logging
import sentry_sdk
from mappings.tags_mapping import get_all_mappings, get_categories
try:
from lambda_fns.model_info.app import lambda_handler
model_info_mock_data = json.loads(lambda_handler({"mock": True}, None)["body"])
except ImportError:
pass

SENTRY_URL = os.environ.get("SENTRY_URL")
ENVIRONMENT = os.environ.get("ENVIRONMENT")

sentry_sdk.init(SENTRY_URL, environment=ENVIRONMENT, attach_stacktrace=True, traces_sample_rate=1.0)

logging.getLogger().setLevel(logging.INFO)

mappings = get_all_mappings()
3 changes: 2 additions & 1 deletion lambda_fns/entry_predict_output_request/requirements.txt
@@ -1 +1,2 @@
requests==2.26.0
requests==2.26.0
sentry-sdk==1.5.8
23 changes: 19 additions & 4 deletions lambda_fns/extract_docs/app.py
@@ -12,6 +12,8 @@
import tempfile
import signal

import sentry_sdk

from deep_parser import TextFromFile
from deep_parser import TextFromWeb

@@ -32,12 +34,21 @@

domain_name = os.environ.get("EXTRACTOR_DOMAIN_NAME", "http://extractor:8001")

SENTRY_URL = os.environ.get("SENTRY_URL")
ENVIRONMENT = os.environ.get("ENVIRONMENT")

s3_client = boto3.client('s3', region_name=aws_region)
sqs_client = boto3.client('sqs', region_name=aws_region)
lambda_client = boto3.client('lambda', region_name="us-east-1")

extract_content_type = ExtractContentType()

sentry_sdk.init(SENTRY_URL, environment=ENVIRONMENT, attach_stacktrace=True, traces_sample_rate=1.0)

REQ_HEADERS = {
'user-agent': ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1')
}


class ExtractionStatus(Enum):
FAILED = 0
@@ -245,7 +256,7 @@ def get_extracted_text_web_links(link, file_name, mock=False):
def handle_urls(url, mock=False):
file_name = None

content_type = extract_content_type.get_content_type(url)
content_type = extract_content_type.get_content_type(url, REQ_HEADERS)

file_name = EXTRACTED_FILE_NAME

@@ -270,7 +281,7 @@ def handle_urls(url, mock=False):
s3_file_path = None
s3_images_path = None

response = requests.get(url, stream=True)
response = requests.get(url, headers=REQ_HEADERS, stream=True)
with tempfile.NamedTemporaryFile(mode='w+b') as temp:
for chunk in response.iter_content(chunk_size=128):
temp.write(chunk)
@@ -298,7 +309,7 @@ def handle_urls(url, mock=False):

ext_type = content_type

response = requests.get(url, stream=True)
response = requests.get(url, headers=REQ_HEADERS, stream=True)

with tempfile.NamedTemporaryFile(mode='w+b') as temp:
for chunk in response.iter_content(chunk_size=128):
@@ -339,6 +350,10 @@ def handle_urls(url, mock=False):
logging.error(f"Exception occurred {e}")
s3_file_path, s3_images_path, total_pages, total_words_count = None, None, -1, -1
extraction_status = ExtractionStatus.FAILED.value
elif content_type == UrlTypes.IMG.value:
logging.warn("Text extraction from Images is not available.")
s3_file_path, s3_images_path, total_pages, total_words_count = None, None, -1, -1
extraction_status = ExtractionStatus.FAILED.value
else:
raise NotImplementedError

@@ -388,7 +403,7 @@ def process_docs(event, context):
}
send_message2sqs(**sqs_message)
except Exception as e:
logging.error(f"Exception is {e}")
logging.error(e, exc_info=True)

signal.alarm(0)

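Note: the REQ_HEADERS constant introduced above threads a browser-like User-Agent through both the HEAD probe and the streaming GETs. A condensed sketch of the streaming download path (the function name and chunk handling are illustrative):

    import tempfile

    import requests

    # Same spoofed browser UA as in the diff, so hosts that reject the
    # default python-requests user agent still serve the file.
    REQ_HEADERS = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) ...'}

    def fetch_to_tempfile(url):
        # Stream in small chunks so large documents never sit fully in memory.
        response = requests.get(url, headers=REQ_HEADERS, stream=True)
        with tempfile.NamedTemporaryFile(mode='w+b') as temp:
            for chunk in response.iter_content(chunk_size=128):
                temp.write(chunk)
            temp.seek(0)
            # hand `temp` off to the extractor while the file still exists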
39 changes: 35 additions & 4 deletions lambda_fns/extract_docs/content_types.py
@@ -1,6 +1,11 @@
import os
import requests
from enum import Enum
import logging
try:
from wget import download
except ImportError:
from .wget import download

logging.getLogger().setLevel(logging.INFO)

@@ -14,6 +19,7 @@ class UrlTypes(str, Enum):
MSWORD = 'doc'
XLSX = 'xlsx'
XLS = 'xls'
IMG = 'img'


class ExtractContentType:
@@ -27,10 +33,11 @@ def __init__(self):
self.content_types_ppt = ('application/vnd.ms-powerpoint')
self.content_types_xlsx = ('application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')
self.content_types_xls = ('application/vnd.ms-excel')
self.content_types_img = ('image/jpeg', 'image/gif', 'image/png', 'image/svg+xml', 'image/webp', 'image/bmp', 'image/tiff')

def get_content_type(self, url):
def get_content_type(self, url, req_headers):
try:
response = requests.head(url)
response = requests.head(url, headers=req_headers)
content_type = response.headers['Content-Type']

logging.info(f'The content type of {url} is {content_type}')
@@ -53,9 +60,33 @@ def get_content_type(self, url):
return UrlTypes.PPTX.value
elif url.endswith(".ppt") or content_type in self.content_types_ppt:
return UrlTypes.PPT.value
elif url.endswith(".jpg") or url.endswith(".jpeg") or url.endswith(".png") or \
url.endswith(".gif") or url.endswith(".bmp") or content_type in self.content_types_img:
return UrlTypes.IMG.value
else:
logging.warn(f'Could not determine the content-type of the {url}')
return None
temp_filepath = download(url, out="/tmp/")
if os.path.exists(temp_filepath):
os.remove(temp_filepath)
if temp_filepath.endswith(".pdf"):
return UrlTypes.PDF.value
elif temp_filepath.endswith(".docx"):
return UrlTypes.DOCX.value
elif temp_filepath.endswith(".doc"):
return UrlTypes.MSWORD.value
elif temp_filepath.endswith(".xlsx"):
return UrlTypes.XLSX.value
elif temp_filepath.endswith(".xls"):
return UrlTypes.XLS.value
elif temp_filepath.endswith(".pptx"):
return UrlTypes.PPTX.value
elif temp_filepath.endswith(".ppt"):
return UrlTypes.PPT.value
elif temp_filepath.endswith(".jpg") or temp_filepath.endswith(".jpeg") or temp_filepath.endswith(".png") or \
temp_filepath.endswith(".gif") or temp_filepath.endswith(".bmp"):
return UrlTypes.IMG.value
else:
logging.warn(f'Could not determine the content-type of the {url}')
return None
except requests.exceptions.RequestException:
logging.error(f'Exception occurred. Could not determine the content-type of the {url}')
return None
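Note: the fallback added above only runs when neither the URL suffix nor the Content-Type header is conclusive: it downloads the file with wget, reads the type off the saved filename, and deletes the temp file. A compact sketch of that two-stage idea (the helper name and extension map are illustrative):

    import os

    from wget import download

    EXT_TO_TYPE = {
        '.pdf': 'pdf', '.docx': 'docx', '.doc': 'doc',
        '.xlsx': 'xlsx', '.xls': 'xls', '.pptx': 'pptx',
        '.ppt': 'ppt', '.jpg': 'img', '.jpeg': 'img', '.png': 'img',
    }

    def sniff_by_download(url):
        # Last resort: fetch the file and let the saved name reveal its type.
        temp_filepath = download(url, out='/tmp/')
        try:
            _, ext = os.path.splitext(temp_filepath)
            return EXT_TO_TYPE.get(ext.lower())  # None if still unknown
        finally:
            if os.path.exists(temp_filepath):
                os.remove(temp_filepath)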
3 changes: 2 additions & 1 deletion lambda_fns/extract_docs/requirements.txt
@@ -10,4 +10,5 @@ six==1.16.0
urllib3==1.26.6
Pillow==8.3.2
lxml==4.7.1
deep-parser @ git+https://github.com/the-deep/deepExt
sentry-sdk==1.5.8
deep-parser @ git+https://github.com/the-deep/deepex