Feat/sentry integration #2

Open · wants to merge 13 commits into base: develop
1 change: 1 addition & 0 deletions .github/workflows/terraform-plan.yml
@@ -4,6 +4,7 @@ on:
push:
branches:
- initial_setup
- feat/sentry-integration

jobs:
terraform:
6 changes: 5 additions & 1 deletion dev.tfvars
@@ -5,6 +5,7 @@ environment = "dev"
# api gateway
api_gateway_name = "rapi"
vpce_id = "vpce-02c7bb08b571074e1"
vpc_id = "vpc-0e65245d5e4c2deaf"

# models
model_endpoint_name = "test-all-models-rsh"
@@ -16,4 +17,7 @@ model_info_fn_name = "model_info"
docs_extract_fn_image_name = "extractor-tool"

# docs convert lambda
docs_convert_lambda_fn_name = "libreoffice-dev-libreoffice"
docs_convert_lambda_fn_name = "libreoffice-dev-libreoffice"

# sentry url
sentry_url = "https://[email protected]/1223576"
7 changes: 7 additions & 0 deletions lambda_fns/entry_predict/app.py
@@ -6,6 +6,8 @@
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed

import sentry_sdk

from postprocess_raw_preds import get_predictions_all, get_clean_thresholds, get_clean_ratios

logging.getLogger().setLevel(logging.INFO)
@@ -19,13 +21,18 @@
RELIABILITY_FN_NAME = os.environ.get("RELIABILITY_FN_NAME")
MODEL_INFO_FN_NAME = os.environ.get("MODEL_INFO_FN_NAME")

SENTRY_URL = os.environ.get("SENTRY_URL")
ENVIRONMENT = os.environ.get("ENVIRONMENT")

sqs_client = boto3.client('sqs', region_name=AWS_REGION)

sagemaker_rt = boto3.client("runtime.sagemaker", region_name="us-east-1") # todo: update the region later.
geolocation_client = boto3.client("lambda", region_name="us-east-1")
reliability_client = boto3.client("lambda", region_name="us-east-1")
model_info_client = boto3.client("lambda", region_name=AWS_REGION)

sentry_sdk.init(SENTRY_URL, environment=ENVIRONMENT, attach_stacktrace=True, traces_sample_rate=1.0)


class PredictionStatus(Enum):
FAILED = 0
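Note: the Sentry bootstrap added to this lambda recurs verbatim in the other lambdas in this PR. A minimal standalone sketch of the pattern (the comments and DSN description are illustrative, not from the diff):

    import os

    import sentry_sdk

    # Both values come from the lambda's environment configuration.
    SENTRY_URL = os.environ.get("SENTRY_URL")    # a Sentry DSN
    ENVIRONMENT = os.environ.get("ENVIRONMENT")  # e.g. "dev"

    # attach_stacktrace attaches stack traces to plain captured messages;
    # traces_sample_rate=1.0 records every transaction, which is acceptable
    # for low-traffic lambdas but worth lowering at scale.
    sentry_sdk.init(
        SENTRY_URL,
        environment=ENVIRONMENT,
        attach_stacktrace=True,
        traces_sample_rate=1.0,
    )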
3 changes: 2 additions & 1 deletion lambda_fns/entry_predict/requirements.txt
@@ -1 +1,2 @@
requests==2.26.0
requests==2.26.0
sentry-sdk==1.5.8
7 changes: 7 additions & 0 deletions lambda_fns/entry_predict_output_request/app.py
@@ -1,13 +1,20 @@
import os
import requests
import json
import logging
import sentry_sdk
from mappings.tags_mapping import get_all_mappings, get_categories
try:
from lambda_fns.model_info.app import lambda_handler
model_info_mock_data = json.loads(lambda_handler({"mock": True}, None)["body"])
except ImportError:
pass

SENTRY_URL = os.environ.get("SENTRY_URL")
ENVIRONMENT = os.environ.get("ENVIRONMENT")

sentry_sdk.init(SENTRY_URL, environment=ENVIRONMENT, attach_stacktrace=True, traces_sample_rate=1.0)

logging.getLogger().setLevel(logging.INFO)

mappings = get_all_mappings()
3 changes: 2 additions & 1 deletion lambda_fns/entry_predict_output_request/requirements.txt
@@ -1 +1,2 @@
requests==2.26.0
requests==2.26.0
sentry-sdk==1.5.8
23 changes: 19 additions & 4 deletions lambda_fns/extract_docs/app.py
@@ -12,6 +12,8 @@
import tempfile
import signal

import sentry_sdk

from deep_parser import TextFromFile
from deep_parser import TextFromWeb

@@ -32,12 +34,21 @@

domain_name = os.environ.get("EXTRACTOR_DOMAIN_NAME", "http://extractor:8001")

SENTRY_URL = os.environ.get("SENTRY_URL")
ENVIRONMENT = os.environ.get("ENVIRONMENT")

s3_client = boto3.client('s3', region_name=aws_region)
sqs_client = boto3.client('sqs', region_name=aws_region)
lambda_client = boto3.client('lambda', region_name="us-east-1")

extract_content_type = ExtractContentType()

sentry_sdk.init(SENTRY_URL, environment=ENVIRONMENT, attach_stacktrace=True, traces_sample_rate=1.0)

REQ_HEADERS = {
'user-agent': ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1')
}


class ExtractionStatus(Enum):
FAILED = 0
@@ -245,7 +256,7 @@ def get_extracted_text_web_links(link, file_name, mock=False):
def handle_urls(url, mock=False):
file_name = None

content_type = extract_content_type.get_content_type(url)
content_type = extract_content_type.get_content_type(url, REQ_HEADERS)

file_name = EXTRACTED_FILE_NAME

@@ -270,7 +281,7 @@ def handle_urls(url, mock=False):
s3_file_path = None
s3_images_path = None

response = requests.get(url, stream=True)
response = requests.get(url, headers=REQ_HEADERS, stream=True)
with tempfile.NamedTemporaryFile(mode='w+b') as temp:
for chunk in response.iter_content(chunk_size=128):
temp.write(chunk)
@@ -298,7 +309,7 @@ def handle_urls(url, mock=False):

ext_type = content_type

response = requests.get(url, stream=True)
response = requests.get(url, headers=REQ_HEADERS, stream=True)

with tempfile.NamedTemporaryFile(mode='w+b') as temp:
for chunk in response.iter_content(chunk_size=128):
@@ -339,6 +350,10 @@ def handle_urls(url, mock=False):
logging.error(f"Exception occurred {e}")
s3_file_path, s3_images_path, total_pages, total_words_count = None, None, -1, -1
extraction_status = ExtractionStatus.FAILED.value
elif content_type == UrlTypes.IMG.value:
logging.warn("Text extraction from Images is not available.")
s3_file_path, s3_images_path, total_pages, total_words_count = None, None, -1, -1
extraction_status = ExtractionStatus.FAILED.value
else:
raise NotImplementedError

@@ -388,7 +403,7 @@ def process_docs(event, context):
}
send_message2sqs(**sqs_message)
except Exception as e:
logging.error(f"Exception is {e}")
logging.error(e, exc_info=True)

signal.alarm(0)

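Note: the REQ_HEADERS constant introduced above threads a browser-like User-Agent through both the HEAD probe and the streaming GETs. A condensed sketch of the streaming download path (the function name and chunk handling are illustrative):

    import tempfile

    import requests

    # Same spoofed browser UA as in the diff, so hosts that reject the
    # default python-requests user agent still serve the file.
    REQ_HEADERS = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) ...'}

    def fetch_to_tempfile(url):
        # Stream in small chunks so large documents never sit fully in memory.
        response = requests.get(url, headers=REQ_HEADERS, stream=True)
        with tempfile.NamedTemporaryFile(mode='w+b') as temp:
            for chunk in response.iter_content(chunk_size=128):
                temp.write(chunk)
            temp.seek(0)
            # hand `temp` off to the extractor while the file still exists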
39 changes: 35 additions & 4 deletions lambda_fns/extract_docs/content_types.py
@@ -1,6 +1,11 @@
import os
import requests
from enum import Enum
import logging
try:
from wget import download
except ImportError:
from .wget import download

logging.getLogger().setLevel(logging.INFO)

@@ -14,6 +19,7 @@ class UrlTypes(str, Enum):
MSWORD = 'doc'
XLSX = 'xlsx'
XLS = 'xls'
IMG = 'img'


class ExtractContentType:
@@ -27,10 +33,11 @@ def __init__(self):
self.content_types_ppt = ('application/vnd.ms-powerpoint')
self.content_types_xlsx = ('application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')
self.content_types_xls = ('application/vnd.ms-excel')
self.content_types_img = ('image/jpeg', 'image/gif', 'image/png', 'image/svg+xml', 'image/webp', 'image/bmp', 'image/tiff')

def get_content_type(self, url):
def get_content_type(self, url, req_headers):
try:
response = requests.head(url)
response = requests.head(url, headers=req_headers)
content_type = response.headers['Content-Type']

logging.info(f'The content type of {url} is {content_type}')
@@ -53,9 +60,33 @@ def get_content_type(self, url):
return UrlTypes.PPTX.value
elif url.endswith(".ppt") or content_type in self.content_types_ppt:
return UrlTypes.PPT.value
elif url.endswith(".jpg") or url.endswith(".jpeg") or url.endswith(".png") or \
url.endswith(".gif") or url.endswith(".bmp") or content_type in self.content_types_img:
return UrlTypes.IMG.value
else:
logging.warn(f'Could not determine the content-type of the {url}')
return None
temp_filepath = download(url, out="/tmp/")
if os.path.exists(temp_filepath):
os.remove(temp_filepath)
if temp_filepath.endswith(".pdf"):
return UrlTypes.PDF.value
elif temp_filepath.endswith(".docx"):
return UrlTypes.DOCX.value
elif temp_filepath.endswith(".doc"):
return UrlTypes.MSWORD.value
elif temp_filepath.endswith(".xlsx"):
return UrlTypes.XLSX.value
elif temp_filepath.endswith(".xls"):
return UrlTypes.XLS.value
elif temp_filepath.endswith(".pptx"):
return UrlTypes.PPTX.value
elif temp_filepath.endswith(".ppt"):
return UrlTypes.PPT.value
elif temp_filepath.endswith(".jpg") or temp_filepath.endswith(".jpeg") or temp_filepath.endswith(".png") or \
temp_filepath.endswith(".gif") or temp_filepath.endswith(".bmp"):
return UrlTypes.IMG.value
else:
logging.warn(f'Could not determine the content-type of the {url}')
return None
except requests.exceptions.RequestException:
logging.error(f'Exception occurred. Could not determine the content-type of the {url}')
return None
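Note: the fallback added above only runs when neither the URL suffix nor the Content-Type header is conclusive: it downloads the file with wget, reads the type off the saved filename, and deletes the temp file. A compact sketch of that two-stage idea (the helper name and extension map are illustrative):

    import os

    from wget import download

    EXT_TO_TYPE = {
        '.pdf': 'pdf', '.docx': 'docx', '.doc': 'doc',
        '.xlsx': 'xlsx', '.xls': 'xls', '.pptx': 'pptx',
        '.ppt': 'ppt', '.jpg': 'img', '.jpeg': 'img', '.png': 'img',
    }

    def sniff_by_download(url):
        # Last resort: fetch the file and let the saved name reveal its type.
        temp_filepath = download(url, out='/tmp/')
        try:
            _, ext = os.path.splitext(temp_filepath)
            return EXT_TO_TYPE.get(ext.lower())  # None if still unknown
        finally:
            if os.path.exists(temp_filepath):
                os.remove(temp_filepath)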
3 changes: 2 additions & 1 deletion lambda_fns/extract_docs/requirements.txt
@@ -10,4 +10,5 @@ six==1.16.0
urllib3==1.26.6
Pillow==8.3.2
lxml==4.7.1
deep-parser @ git+https://github.com/the-deep/deepExt
sentry-sdk==1.5.8
deep-parser @ git+https://github.com/the-deep/deepex