From adad0a67beffb829fa89b5810c800596c0fa8c9f Mon Sep 17 00:00:00 2001
From: Ranjan Shrestha <ranjan.shrestha.np@gmail.com>
Date: Wed, 27 Apr 2022 11:42:42 +0545
Subject: [PATCH 01/13] lambda functions updated with sentry

---
 lambda_fns/entry_predict/app.py                          | 7 +++++++
 lambda_fns/entry_predict/requirements.txt                | 3 ++-
 lambda_fns/entry_predict_output_request/app.py           | 7 +++++++
 lambda_fns/entry_predict_output_request/requirements.txt | 3 ++-
 lambda_fns/extract_docs/app.py                           | 7 +++++++
 lambda_fns/extract_docs/requirements.txt                 | 3 ++-
 lambda_fns/extract_docs_output_request/app.py            | 7 +++++++
 lambda_fns/extract_docs_output_request/requirements.txt  | 3 ++-
 8 files changed, 36 insertions(+), 4 deletions(-)

diff --git a/lambda_fns/entry_predict/app.py b/lambda_fns/entry_predict/app.py
index e50fdded..319c7c10 100644
--- a/lambda_fns/entry_predict/app.py
+++ b/lambda_fns/entry_predict/app.py
@@ -6,6 +6,8 @@
 import logging
 from concurrent.futures import ThreadPoolExecutor, as_completed
 
+import sentry_sdk
+
 from postprocess_raw_preds import get_predictions_all, get_clean_thresholds, get_clean_ratios
 
 logging.getLogger().setLevel(logging.INFO)
@@ -19,6 +21,9 @@
 RELIABILITY_FN_NAME = os.environ.get("RELIABILITY_FN_NAME")
 MODEL_INFO_FN_NAME = os.environ.get("MODEL_INFO_FN_NAME")
 
+SENTRY_URL = os.environ.get("SENTRY_URL")
+ENVIRONMENT = os.environ.get("ENVIRONMENT")
+
 sqs_client = boto3.client('sqs', region_name=AWS_REGION)
 
 sagemaker_rt = boto3.client("runtime.sagemaker", region_name="us-east-1")  # todo: update the region later.
@@ -26,6 +31,8 @@
 reliability_client = boto3.client("lambda", region_name="us-east-1")
 model_info_client = boto3.client("lambda", region_name=AWS_REGION)
 
+sentry_sdk.init(SENTRY_URL, environment=ENVIRONMENT, traces_sample_rate=1.0)
+
 
 class PredictionStatus(Enum):
     FAILED = 0
diff --git a/lambda_fns/entry_predict/requirements.txt b/lambda_fns/entry_predict/requirements.txt
index 4f5b8993..a8661b56 100644
--- a/lambda_fns/entry_predict/requirements.txt
+++ b/lambda_fns/entry_predict/requirements.txt
@@ -1 +1,2 @@
-requests==2.26.0
\ No newline at end of file
+requests==2.26.0
+sentry-sdk==1.5.8
\ No newline at end of file
diff --git a/lambda_fns/entry_predict_output_request/app.py b/lambda_fns/entry_predict_output_request/app.py
index 721446c5..ffb3d2fd 100644
--- a/lambda_fns/entry_predict_output_request/app.py
+++ b/lambda_fns/entry_predict_output_request/app.py
@@ -1,6 +1,8 @@
+import os
 import requests
 import json
 import logging
+import sentry_sdk
 from mappings.tags_mapping import get_all_mappings, get_categories
 try:
     from lambda_fns.model_info.app import lambda_handler
@@ -8,6 +10,11 @@
 except ImportError:
     pass
 
+SENTRY_URL = os.environ.get("SENTRY_URL")
+ENVIRONMENT = os.environ.get("ENVIRONMENT")
+
+sentry_sdk.init(SENTRY_URL, environment=ENVIRONMENT, traces_sample_rate=1.0)
+
 logging.getLogger().setLevel(logging.INFO)
 
 mappings = get_all_mappings()
diff --git a/lambda_fns/entry_predict_output_request/requirements.txt b/lambda_fns/entry_predict_output_request/requirements.txt
index 4f5b8993..a8661b56 100644
--- a/lambda_fns/entry_predict_output_request/requirements.txt
+++ b/lambda_fns/entry_predict_output_request/requirements.txt
@@ -1 +1,2 @@
-requests==2.26.0
\ No newline at end of file
+requests==2.26.0
+sentry-sdk==1.5.8
\ No newline at end of file
diff --git a/lambda_fns/extract_docs/app.py b/lambda_fns/extract_docs/app.py
index 337c258b..41df9fe0 100644
--- a/lambda_fns/extract_docs/app.py
+++ b/lambda_fns/extract_docs/app.py
@@ -12,6 +12,8 @@
 import tempfile
 import signal
 
+import sentry_sdk
+
 from deep_parser import TextFromFile
 from deep_parser import TextFromWeb
 
@@ -32,12 +34,17 @@
 
 domain_name = os.environ.get("EXTRACTOR_DOMAIN_NAME", "http://extractor:8001")
 
+SENTRY_URL = os.environ.get("SENTRY_URL")
+ENVIRONMENT = os.environ.get("ENVIRONMENT")
+
 s3_client = boto3.client('s3', region_name=aws_region)
 sqs_client = boto3.client('sqs', region_name=aws_region)
 lambda_client = boto3.client('lambda', region_name="us-east-1")
 
 extract_content_type = ExtractContentType()
 
+sentry_sdk.init(SENTRY_URL, environment=ENVIRONMENT, traces_sample_rate=1.0)
+
 
 class ExtractionStatus(Enum):
     FAILED = 0
diff --git a/lambda_fns/extract_docs/requirements.txt b/lambda_fns/extract_docs/requirements.txt
index b7d263c1..a016f5d9 100644
--- a/lambda_fns/extract_docs/requirements.txt
+++ b/lambda_fns/extract_docs/requirements.txt
@@ -10,4 +10,5 @@ six==1.16.0
 urllib3==1.26.6
 Pillow==8.3.2
 lxml==4.7.1
-deep-parser @ git+https://github.com/the-deep/deepExt
\ No newline at end of file
+deep-parser @ git+https://github.com/the-deep/deepExt
+sentry-sdk==1.5.8
\ No newline at end of file
diff --git a/lambda_fns/extract_docs_output_request/app.py b/lambda_fns/extract_docs_output_request/app.py
index d4166b55..7123f848 100644
--- a/lambda_fns/extract_docs_output_request/app.py
+++ b/lambda_fns/extract_docs_output_request/app.py
@@ -6,6 +6,8 @@
 from botocore.exceptions import ClientError
 from botocore.client import Config
 
+import sentry_sdk
+
 logging.getLogger().setLevel(logging.INFO)
 
 REQUEST_TIMEOUT = 60
@@ -14,6 +16,11 @@
 aws_region = os.environ.get("AWS_REGION", DEFAULT_AWS_REGION)
 signed_url_expiry_secs = os.environ.get("SIGNED_URL_EXPIRY_SECS")
 
+SENTRY_URL = os.environ.get("SENTRY_URL")
+ENVIRONMENT = os.environ.get("ENVIRONMENT")
+
+sentry_sdk.init(SENTRY_URL, environment=ENVIRONMENT, traces_sample_rate=1.0)
+
 s3_client = boto3.client(
     's3',
     region_name=aws_region,
diff --git a/lambda_fns/extract_docs_output_request/requirements.txt b/lambda_fns/extract_docs_output_request/requirements.txt
index 4f5b8993..a8661b56 100644
--- a/lambda_fns/extract_docs_output_request/requirements.txt
+++ b/lambda_fns/extract_docs_output_request/requirements.txt
@@ -1 +1,2 @@
-requests==2.26.0
\ No newline at end of file
+requests==2.26.0
+sentry-sdk==1.5.8
\ No newline at end of file

From 423a13a220dd381cfdf6dd77f7d5e3071f32cf96 Mon Sep 17 00:00:00 2001
From: Ranjan Shrestha <ranjan.shrestha.np@gmail.com>
Date: Wed, 27 Apr 2022 11:52:39 +0545
Subject: [PATCH 02/13] terraform configs updated with sentry_url passed as env
 variable; updated the configs in gateway for prod, staging envs;

---
 main.tf                                       |  9 +++++++
 modules/api_gateway/main.tf                   |  7 +++---
 modules/api_gateway/variables.tf              |  1 +
 modules/reserved_lambda_entry_predict/main.tf | 25 ++++++++++++-------
 .../variables.tf                              |  4 ++-
 .../reserved_sqs_lambda_extract_docs/main.tf  | 24 ++++++++++--------
 .../variables.tf                              |  4 ++-
 modules/sqs_lambda_entry_predict/main.tf      |  8 ++++++
 modules/sqs_lambda_entry_predict/variables.tf |  4 ++-
 modules/sqs_lambda_extract_docs/main.tf       |  6 ++++-
 modules/sqs_lambda_extract_docs/variables.tf  |  4 ++-
 variables.tf                                  |  6 ++++-
 12 files changed, 74 insertions(+), 28 deletions(-)

diff --git a/main.tf b/main.tf
index 38f4ed20..8f124655 100644
--- a/main.tf
+++ b/main.tf
@@ -44,6 +44,8 @@ module "sqs_lambda_module" {
   reserved_input_queue_arn = "${module.reserved_sqs_lambda_module.reserved_input_queue_arn}"
 
   environment = var.environment
+
+  sentry_url = var.sentry_url
 }
 
 module "reserved_sqs_lambda_module" {
@@ -57,6 +59,8 @@ module "reserved_sqs_lambda_module" {
   docs_convert_lambda_fn_name = var.docs_convert_lambda_fn_name
 
   environment = var.environment
+
+  sentry_url = var.sentry_url
 }
 
 module "sqs_lambda_predict_module" {
@@ -73,6 +77,8 @@ module "sqs_lambda_predict_module" {
 
   aws_region = var.aws_region
   environment = var.environment
+
+  sentry_url = var.sentry_url
 }
 
 module "reserved_sqs_lambda_predict_module" {
@@ -86,6 +92,8 @@ module "reserved_sqs_lambda_predict_module" {
 
   aws_region = var.aws_region
   environment = var.environment
+
+  sentry_url = var.sentry_url
 }
 
 module "apigateway_module" {
@@ -94,6 +102,7 @@ module "apigateway_module" {
   api_gateway_name = var.api_gateway_name
 
   vpce_id = var.vpce_id
+  vpc_id = var.vpc_id
 
   predict_entry_invoke_arn = "${module.sqs_lambda_predict_module.entry_input_pred_request_predict_invoke_arn}"
   process_doc_invoke_arn = "${module.sqs_lambda_module.extract_doc_invoke_arn}"
diff --git a/modules/api_gateway/main.tf b/modules/api_gateway/main.tf
index 3b4f06da..2e60f135 100644
--- a/modules/api_gateway/main.tf
+++ b/modules/api_gateway/main.tf
@@ -137,10 +137,11 @@ resource "aws_api_gateway_rest_api_policy" "api_policy" {
         Effect = "Deny",
         Principal = "*",
         Action = "execute-api:Invoke",
-        Resource = "execute-api:/*/*/*",
+        Resource = "execute-api:/${var.environment}/*/*",
         Condition = {
           StringNotEquals = {
-            "aws:sourceVpce": var.vpce_id
+            "aws:sourceVpce": var.vpce_id,
+            "aws:sourceVpc": var.vpc_id
           }
         }
       },
@@ -148,7 +149,7 @@ resource "aws_api_gateway_rest_api_policy" "api_policy" {
         Effect = "Allow",
         Principal = "*",
         Action = "execute-api:Invoke",
-        Resource = "execute-api:/*/*/*"
+        Resource = "execute-api:/${var.environment}/*/*"
       }
     ]
   })
diff --git a/modules/api_gateway/variables.tf b/modules/api_gateway/variables.tf
index 0f631683..f7bde9d9 100644
--- a/modules/api_gateway/variables.tf
+++ b/modules/api_gateway/variables.tf
@@ -23,6 +23,7 @@ variable "input_te_lambda_fn_alias_arn" {}
 variable "input_te_lambda_fn_alias_name" {}
 
 variable "vpce_id" {}
+variable "vpc_id" {}
 
 variable "predict_entry_invoke_arn" {}
 
diff --git a/modules/reserved_lambda_entry_predict/main.tf b/modules/reserved_lambda_entry_predict/main.tf
index 9e782061..ebfc6924 100644
--- a/modules/reserved_lambda_entry_predict/main.tf
+++ b/modules/reserved_lambda_entry_predict/main.tf
@@ -131,7 +131,7 @@ module "reserved_predict_entry_fn" {
         ]
     })
 
-    provisioned_concurrent_executions = 10
+    provisioned_concurrent_executions = var.environment == "dev" ? 1 : 10
     reserved_concurrent_executions = 30
 
     environment_variables = {
@@ -140,12 +140,14 @@ module "reserved_predict_entry_fn" {
         GEOLOCATION_FN_NAME = var.geolocation_fn_name
         RELIABILITY_FN_NAME = var.reliability_fn_name
         MODEL_INFO_FN_NAME = "${var.model_info_fn_name}-${var.environment}"
+        ENVIRONMENT = "${var.environment}"
+        SENTRY_URL = "${var.sentry_url}"
     }
 }
 
 resource "aws_appautoscaling_target" "reserved_predict_entry_fn_autoscale" {
-    max_capacity       = 10
-    min_capacity       = 5
+    max_capacity       = var.environment == "dev" ? 1 : 10
+    min_capacity       = var.environment == "dev" ? 1 : 5
     resource_id        = "function:${module.reserved_predict_entry_fn.lambda_function_name}:${module.reserved_predict_entry_fn.lambda_function_version}"
     scalable_dimension = "lambda:function:ProvisionedConcurrency"
     service_namespace  = "lambda"
@@ -200,13 +202,18 @@ module "reserved_entry_predict_output_fn" {
         ]
     })
 
-    provisioned_concurrent_executions = 5
+    provisioned_concurrent_executions = var.environment == "dev" ? 1 : 5
 
     layers = ["${aws_lambda_layer_version.reserved_lambda_layer_mappings.arn}"]
 
     build_in_docker = true
     #store_on_s3 = true
     #s3_bucket = "${var.processed_docs_bucket}"
+
+    environment_variables = {
+        ENVIRONMENT = "${var.environment}"
+        SENTRY_URL = "${var.sentry_url}"
+    }
 }
 
 resource "aws_lambda_layer_version" "reserved_lambda_layer_mappings" {
@@ -218,8 +225,8 @@ resource "aws_lambda_layer_version" "reserved_lambda_layer_mappings" {
 }
 
 resource "aws_appautoscaling_target" "reserved_entry_predict_output_fn_autoscale" {
-    max_capacity       = 5
-    min_capacity       = 2
+    max_capacity       = var.environment == "dev" ? 1 : 5
+    min_capacity       = var.environment == "dev" ? 1 : 2
     resource_id        = "function:${module.reserved_entry_predict_output_fn.lambda_function_name}:${module.reserved_entry_predict_output_fn.lambda_function_version}"
     scalable_dimension = "lambda:function:ProvisionedConcurrency"
     service_namespace  = "lambda"
@@ -266,7 +273,7 @@ module "reserved_entry_predict_transfer_dlq_msg" {
         ]
     })
 
-    provisioned_concurrent_executions = 5
+    provisioned_concurrent_executions = var.environment == "dev" ? 1 : 5
 
     environment_variables = {
         PREDICTION_QUEUE = aws_sqs_queue.reserved_entry_input_processed_queue_predict.id
@@ -274,8 +281,8 @@ module "reserved_entry_predict_transfer_dlq_msg" {
 }
 
 resource "aws_appautoscaling_target" "reserved_entry_predict_transfer_dlq_msg_autoscale" {
-    max_capacity       = 5
-    min_capacity       = 2
+    max_capacity       = var.environment == "dev" ? 1 : 5
+    min_capacity       = var.environment == "dev" ? 1 : 2
     resource_id        = "function:${module.reserved_entry_predict_transfer_dlq_msg.lambda_function_name}:${module.reserved_entry_predict_transfer_dlq_msg.lambda_function_version}"
     scalable_dimension = "lambda:function:ProvisionedConcurrency"
     service_namespace  = "lambda"
diff --git a/modules/reserved_lambda_entry_predict/variables.tf b/modules/reserved_lambda_entry_predict/variables.tf
index caa3db1e..d02dcd0b 100644
--- a/modules/reserved_lambda_entry_predict/variables.tf
+++ b/modules/reserved_lambda_entry_predict/variables.tf
@@ -10,4 +10,6 @@ variable reliability_fn_name {}
 
 variable model_info_fn_name {}
 
-variable processed_docs_bucket {}
\ No newline at end of file
+variable processed_docs_bucket {}
+
+variable sentry_url {}
\ No newline at end of file
diff --git a/modules/reserved_sqs_lambda_extract_docs/main.tf b/modules/reserved_sqs_lambda_extract_docs/main.tf
index 19fa3342..62fc6f7e 100644
--- a/modules/reserved_sqs_lambda_extract_docs/main.tf
+++ b/modules/reserved_sqs_lambda_extract_docs/main.tf
@@ -131,15 +131,17 @@ module "reserved_extract_docs_fn" {
         ]
     })
 
-    provisioned_concurrent_executions = 10
+    provisioned_concurrent_executions = var.environment == "dev" ? 1 : 10
     reserved_concurrent_executions = 30
 
     build_in_docker = true
     environment_variables = {
         INPUT_QUEUE = aws_sqs_queue.reserved_input_queue.id
         DEST_S3_BUCKET = "${var.processed_docs_bucket}"
-        PROCESSED_QUEUE = aws_sqs_queue.reserved_processed_queue.id,
+        PROCESSED_QUEUE = aws_sqs_queue.reserved_processed_queue.id
         DOCS_CONVERT_LAMBDA_FN_NAME = "${var.docs_convert_lambda_fn_name}"
+        ENVIRONMENT = "${var.environment}"
+        SENTRY_URL = "${var.sentry_url}"
     }
 }
 
@@ -156,8 +158,8 @@ module "reserved_extract_docs_fn" {
 # }
 
 resource "aws_appautoscaling_target" "reserved_extract_docs_autoscale" {
-    max_capacity       = 10
-    min_capacity       = 5
+    max_capacity       = var.environment == "dev" ? 1 : 10
+    min_capacity       = var.environment == "dev" ? 1 : 5
     resource_id        = "function:${module.reserved_extract_docs_fn.lambda_function_name}:${module.reserved_extract_docs_fn.lambda_function_version}"
     scalable_dimension = "lambda:function:ProvisionedConcurrency"
     service_namespace  = "lambda"
@@ -204,7 +206,7 @@ module "reserved_output_request_fn" {
         ]
     })
 
-    provisioned_concurrent_executions = 5
+    provisioned_concurrent_executions = var.environment == "dev" ? 1 : 5
 
     build_in_docker = true
     #store_on_s3 = true
@@ -212,12 +214,14 @@ module "reserved_output_request_fn" {
 
     environment_variables = {
         SIGNED_URL_EXPIRY_SECS = "${var.signed_url_expiry_secs}"
+        ENVIRONMENT = "${var.environment}"
+        SENTRY_URL = "${var.sentry_url}"
     }
 }
 
 resource "aws_appautoscaling_target" "reserved_output_fn_autoscale" {
-    max_capacity       = 5
-    min_capacity       = 2
+    max_capacity       = var.environment == "dev" ? 1 : 5
+    min_capacity       = var.environment == "dev" ? 1 : 2
     resource_id        = "function:${module.reserved_output_request_fn.lambda_function_name}:${module.reserved_output_request_fn.lambda_function_version}"
     scalable_dimension = "lambda:function:ProvisionedConcurrency"
     service_namespace  = "lambda"
@@ -265,7 +269,7 @@ module "reserved_transfer_dlq_msg" {
         ]
     })
 
-    provisioned_concurrent_executions = 5
+    provisioned_concurrent_executions = var.environment == "dev" ? 1 : 5
 
     environment_variables = {
         PROCESSED_QUEUE = aws_sqs_queue.reserved_processed_queue.id
@@ -273,8 +277,8 @@ module "reserved_transfer_dlq_msg" {
 }
 
 resource "aws_appautoscaling_target" "reserved_transfer_dlq_msg_autoscale" {
-    max_capacity       = 5
-    min_capacity       = 2
+    max_capacity       = var.environment == "dev" ? 1 : 5
+    min_capacity       = var.environment == "dev" ? 1 : 2
     resource_id        = "function:${module.reserved_transfer_dlq_msg.lambda_function_name}:${module.reserved_transfer_dlq_msg.lambda_function_version}"
     scalable_dimension = "lambda:function:ProvisionedConcurrency"
     service_namespace  = "lambda"
diff --git a/modules/reserved_sqs_lambda_extract_docs/variables.tf b/modules/reserved_sqs_lambda_extract_docs/variables.tf
index 7d28e964..2b183b0b 100644
--- a/modules/reserved_sqs_lambda_extract_docs/variables.tf
+++ b/modules/reserved_sqs_lambda_extract_docs/variables.tf
@@ -13,4 +13,6 @@ variable "processed_docs_bucket_arn" {}
 
 variable "docs_extract_fn_image_name" {}
 
-variable "docs_convert_lambda_fn_name" {}
\ No newline at end of file
+variable "docs_convert_lambda_fn_name" {}
+
+variable sentry_url {}
\ No newline at end of file
diff --git a/modules/sqs_lambda_entry_predict/main.tf b/modules/sqs_lambda_entry_predict/main.tf
index 6693d3ff..7c944d2d 100644
--- a/modules/sqs_lambda_entry_predict/main.tf
+++ b/modules/sqs_lambda_entry_predict/main.tf
@@ -106,6 +106,7 @@ module "predict_entry_fn" {
     source_path = [
     {
         path = "${path.module}/../../lambda_fns/entry_predict"
+        pip_requirements = "${path.module}/../../lambda_fns/entry_predict/requirements.txt"
     }
     ]
 
@@ -145,6 +146,8 @@ module "predict_entry_fn" {
         GEOLOCATION_FN_NAME = var.geolocation_fn_name
         RELIABILITY_FN_NAME = var.reliability_fn_name
         MODEL_INFO_FN_NAME = "${var.model_info_fn_name}-${var.environment}"
+        ENVIRONMENT = "${var.environment}"
+        SENTRY_URL = "${var.sentry_url}"
     }
 }
 
@@ -200,6 +203,11 @@ module "entry_predict_output_fn" {
     build_in_docker = true
     #store_on_s3 = true
     #s3_bucket = "${var.processed_docs_bucket}"
+
+    environment_variables = {
+        ENVIRONMENT = "${var.environment}"
+        SENTRY_URL = "${var.sentry_url}"
+    }
 }
 
 resource "aws_lambda_layer_version" "lambda_layer_mappings" {
diff --git a/modules/sqs_lambda_entry_predict/variables.tf b/modules/sqs_lambda_entry_predict/variables.tf
index 3ba01b47..2fd12100 100644
--- a/modules/sqs_lambda_entry_predict/variables.tf
+++ b/modules/sqs_lambda_entry_predict/variables.tf
@@ -14,4 +14,6 @@ variable processed_docs_bucket {}
 
 variable reserved_entry_input_queue_predict_id {}
 
-variable reserved_entry_input_queue_predict_arn {}
\ No newline at end of file
+variable reserved_entry_input_queue_predict_arn {}
+
+variable sentry_url {}
\ No newline at end of file
diff --git a/modules/sqs_lambda_extract_docs/main.tf b/modules/sqs_lambda_extract_docs/main.tf
index 98ead9ca..73c8b620 100644
--- a/modules/sqs_lambda_extract_docs/main.tf
+++ b/modules/sqs_lambda_extract_docs/main.tf
@@ -143,8 +143,10 @@ module "extract_docs_fn" {
     environment_variables = {
         INPUT_QUEUE = aws_sqs_queue.input_queue.id
         DEST_S3_BUCKET = "${var.processed_docs_bucket}"
-        PROCESSED_QUEUE = aws_sqs_queue.processed_queue.id,
+        PROCESSED_QUEUE = aws_sqs_queue.processed_queue.id
         DOCS_CONVERT_LAMBDA_FN_NAME = "${var.docs_convert_lambda_fn_name}"
+        ENVIRONMENT = "${var.environment}"
+        SENTRY_URL = "${var.sentry_url}"
     }
 }
 
@@ -194,6 +196,8 @@ module "output_request_fn" {
 
     environment_variables = {
         SIGNED_URL_EXPIRY_SECS = "${var.signed_url_expiry_secs}"
+        ENVIRONMENT = "${var.environment}"
+        SENTRY_URL = "${var.sentry_url}"
     }
 }
 
diff --git a/modules/sqs_lambda_extract_docs/variables.tf b/modules/sqs_lambda_extract_docs/variables.tf
index 421d90b8..7bf2f820 100644
--- a/modules/sqs_lambda_extract_docs/variables.tf
+++ b/modules/sqs_lambda_extract_docs/variables.tf
@@ -16,4 +16,6 @@ variable "docs_extract_fn_image_name" {}
 variable "docs_convert_lambda_fn_name" {}
 
 variable reserved_input_queue_id {}
-variable reserved_input_queue_arn {}
\ No newline at end of file
+variable reserved_input_queue_arn {}
+
+variable sentry_url {}
\ No newline at end of file
diff --git a/variables.tf b/variables.tf
index a061186b..fadc34df 100644
--- a/variables.tf
+++ b/variables.tf
@@ -8,6 +8,8 @@ variable api_gateway_name {}
 
 variable vpce_id {}
 
+variable vpc_id {}
+
 variable model_endpoint_name {}
 
 variable geolocation_fn_name {}
@@ -18,4 +20,6 @@ variable model_info_fn_name {}
 
 variable docs_extract_fn_image_name {}
 
-variable docs_convert_lambda_fn_name {}
\ No newline at end of file
+variable docs_convert_lambda_fn_name {}
+
+variable sentry_url {}
\ No newline at end of file

From 8235f168796f558c8677aa1caf166c609dddbf3a Mon Sep 17 00:00:00 2001
From: Ranjan Shrestha <ranjan.shrestha.np@gmail.com>
Date: Wed, 27 Apr 2022 11:54:08 +0545
Subject: [PATCH 03/13] values updated for dev, staging, prod environments in
 their variable files;

---
 dev.tfvars     |  6 +++++-
 prod.tfvars    | 10 +++++++---
 staging.tfvars | 23 +++++++++++++++++++++++
 3 files changed, 35 insertions(+), 4 deletions(-)
 create mode 100644 staging.tfvars

diff --git a/dev.tfvars b/dev.tfvars
index 663b308e..60c204ed 100644
--- a/dev.tfvars
+++ b/dev.tfvars
@@ -5,6 +5,7 @@ environment = "dev"
 # api gateway
 api_gateway_name = "rapi"
 vpce_id = "vpce-02c7bb08b571074e1"
+vpc_id = "vpc-0e65245d5e4c2deaf"
 
 # models
 model_endpoint_name = "test-all-models-rsh"
@@ -16,4 +17,7 @@ model_info_fn_name = "model_info"
 docs_extract_fn_image_name = "extractor-tool"
 
 # docs convert lambda
-docs_convert_lambda_fn_name = "libreoffice-dev-libreoffice"
\ No newline at end of file
+docs_convert_lambda_fn_name = "libreoffice-dev-libreoffice"
+
+# sentry url
+sentry_url = "https://3b273f4c61ac4d94af28e85a66ea0b5a@o158798.ingest.sentry.io/1223576"
\ No newline at end of file
diff --git a/prod.tfvars b/prod.tfvars
index f6a31e95..02be5089 100644
--- a/prod.tfvars
+++ b/prod.tfvars
@@ -1,10 +1,11 @@
-aws_region = "eu-west-3"
+aws_region = "us-east-1"
 aws_profile = "default"
 environment = "prod"
 
 # api gateway
 api_gateway_name = "rapi"
-vpce_id = "vpce-000796c803825026c"
+vpce_id = "vpce-05d8c268ef4c0c443"
+vpc_id = "vpc-0947f040a9d4692a7"
 
 # models
 model_endpoint_name = "test-all-models-rsh"
@@ -16,4 +17,7 @@ model_info_fn_name = "model_info"
 docs_extract_fn_image_name = "extract-tool"
 
 # docs convert lambda
-docs_convert_lambda_fn_name = "libreoffice-dev-libreoffice"
\ No newline at end of file
+docs_convert_lambda_fn_name = "libreoffice-dev-libreoffice"
+
+# sentry url
+sentry_url = "https://3b273f4c61ac4d94af28e85a66ea0b5a@o158798.ingest.sentry.io/1223576"
\ No newline at end of file
diff --git a/staging.tfvars b/staging.tfvars
new file mode 100644
index 00000000..9f7b840b
--- /dev/null
+++ b/staging.tfvars
@@ -0,0 +1,23 @@
+aws_region = "us-east-1"
+aws_profile = "default"
+environment = "staging"
+
+# api gateway
+api_gateway_name = "rapi"
+vpce_id = "vpce-02c7bb08b571074e1"
+vpc_id = "vpc-0e65245d5e4c2deaf"
+
+# models
+model_endpoint_name = "test-all-models-rsh"
+geolocation_fn_name = "geolocations"
+reliability_fn_name = "reliability"
+model_info_fn_name = "model_info"
+
+# ecr image name
+docs_extract_fn_image_name = "extract-tool"
+
+# docs convert lambda
+docs_convert_lambda_fn_name = "libreoffice-dev-libreoffice"
+
+# sentry url
+sentry_url = "https://3b273f4c61ac4d94af28e85a66ea0b5a@o158798.ingest.sentry.io/1223576"
\ No newline at end of file

From 150791be840ab82428fa28efa50ba81782f7dbd2 Mon Sep 17 00:00:00 2001
From: Ranjan Shrestha <ranjan.shrestha.np@gmail.com>
Date: Wed, 27 Apr 2022 16:29:47 +0545
Subject: [PATCH 04/13] added the missing package dependencies in terraform
 config;

---
 .github/workflows/terraform-plan.yml          | 1 +
 modules/reserved_lambda_entry_predict/main.tf | 1 +
 2 files changed, 2 insertions(+)

diff --git a/.github/workflows/terraform-plan.yml b/.github/workflows/terraform-plan.yml
index bd6e95ed..8facf22d 100644
--- a/.github/workflows/terraform-plan.yml
+++ b/.github/workflows/terraform-plan.yml
@@ -4,6 +4,7 @@ on:
   push:
     branches:
       - initial_setup
+      - feat/sentry-integration
 
 jobs:
   terraform:
diff --git a/modules/reserved_lambda_entry_predict/main.tf b/modules/reserved_lambda_entry_predict/main.tf
index ebfc6924..a8c96fc6 100644
--- a/modules/reserved_lambda_entry_predict/main.tf
+++ b/modules/reserved_lambda_entry_predict/main.tf
@@ -98,6 +98,7 @@ module "reserved_predict_entry_fn" {
     source_path = [
     {
         path = "${path.module}/../../lambda_fns/entry_predict"
+        pip_requirements = "${path.module}/../../lambda_fns/entry_predict/requirements.txt"
     }
     ]
 

From 1e6b69ea9ce6f2013df267cbad7a5c1a870326ca Mon Sep 17 00:00:00 2001
From: Ranjan Shrestha <ranjan.shrestha.np@gmail.com>
Date: Thu, 28 Apr 2022 10:19:32 +0545
Subject: [PATCH 05/13] updated the parser tool url;

---
 lambda_fns/extract_docs/requirements.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lambda_fns/extract_docs/requirements.txt b/lambda_fns/extract_docs/requirements.txt
index a016f5d9..4eb5689c 100644
--- a/lambda_fns/extract_docs/requirements.txt
+++ b/lambda_fns/extract_docs/requirements.txt
@@ -10,5 +10,5 @@ six==1.16.0
 urllib3==1.26.6
 Pillow==8.3.2
 lxml==4.7.1
-deep-parser @ git+https://github.com/the-deep/deepExt
-sentry-sdk==1.5.8
\ No newline at end of file
+sentry-sdk==1.5.8
+deep-parser @ git+https://github.com/the-deep/deepex

From f16855605c9e2a8fd275eda9c1459341db7fe5ee Mon Sep 17 00:00:00 2001
From: Ranjan Shrestha <ranjan.shrestha.np@gmail.com>
Date: Thu, 28 Apr 2022 11:44:59 +0545
Subject: [PATCH 06/13] enabled stacktrace in sentry; handles image links (and
 discards them, no extraction happens);

---
 lambda_fns/entry_predict/app.py                | 2 +-
 lambda_fns/entry_predict_output_request/app.py | 2 +-
 lambda_fns/extract_docs/app.py                 | 6 +++++-
 lambda_fns/extract_docs/content_types.py       | 5 +++++
 lambda_fns/extract_docs_output_request/app.py  | 2 +-
 5 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/lambda_fns/entry_predict/app.py b/lambda_fns/entry_predict/app.py
index 319c7c10..e4d9da91 100644
--- a/lambda_fns/entry_predict/app.py
+++ b/lambda_fns/entry_predict/app.py
@@ -31,7 +31,7 @@
 reliability_client = boto3.client("lambda", region_name="us-east-1")
 model_info_client = boto3.client("lambda", region_name=AWS_REGION)
 
-sentry_sdk.init(SENTRY_URL, environment=ENVIRONMENT, traces_sample_rate=1.0)
+sentry_sdk.init(SENTRY_URL, environment=ENVIRONMENT, attach_stacktrace=True, traces_sample_rate=1.0)
 
 
 class PredictionStatus(Enum):
diff --git a/lambda_fns/entry_predict_output_request/app.py b/lambda_fns/entry_predict_output_request/app.py
index ffb3d2fd..b2a5f90b 100644
--- a/lambda_fns/entry_predict_output_request/app.py
+++ b/lambda_fns/entry_predict_output_request/app.py
@@ -13,7 +13,7 @@
 SENTRY_URL = os.environ.get("SENTRY_URL")
 ENVIRONMENT = os.environ.get("ENVIRONMENT")
 
-sentry_sdk.init(SENTRY_URL, environment=ENVIRONMENT, traces_sample_rate=1.0)
+sentry_sdk.init(SENTRY_URL, environment=ENVIRONMENT, attach_stacktrace=True, traces_sample_rate=1.0)
 
 logging.getLogger().setLevel(logging.INFO)
 
diff --git a/lambda_fns/extract_docs/app.py b/lambda_fns/extract_docs/app.py
index 41df9fe0..7c351853 100644
--- a/lambda_fns/extract_docs/app.py
+++ b/lambda_fns/extract_docs/app.py
@@ -43,7 +43,7 @@
 
 extract_content_type = ExtractContentType()
 
-sentry_sdk.init(SENTRY_URL, environment=ENVIRONMENT, traces_sample_rate=1.0)
+sentry_sdk.init(SENTRY_URL, environment=ENVIRONMENT, attach_stacktrace=True, traces_sample_rate=1.0)
 
 
 class ExtractionStatus(Enum):
@@ -346,6 +346,10 @@ def handle_urls(url, mock=False):
                 logging.error(f"Exception occurred {e}")
                 s3_file_path, s3_images_path, total_pages, total_words_count = None, None, -1, -1
                 extraction_status = ExtractionStatus.FAILED.value
+    elif content_type == UrlTypes.IMG.value:
+        logging.warn("Text extraction from Images is not available.")
+        s3_file_path, s3_images_path, total_pages, total_words_count = None, None, -1, -1
+        extraction_status = ExtractionStatus.FAILED.value
     else:
         raise NotImplementedError
 
diff --git a/lambda_fns/extract_docs/content_types.py b/lambda_fns/extract_docs/content_types.py
index 533f818f..6f7f37d2 100644
--- a/lambda_fns/extract_docs/content_types.py
+++ b/lambda_fns/extract_docs/content_types.py
@@ -14,6 +14,7 @@ class UrlTypes(str, Enum):
     MSWORD = 'doc'
     XLSX = 'xlsx'
     XLS = 'xls'
+    IMG = 'img'
 
 
 class ExtractContentType:
@@ -27,6 +28,7 @@ def __init__(self):
         self.content_types_ppt = ('application/vnd.ms-powerpoint')
         self.content_types_xlsx = ('application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')
         self.content_types_xls = ('application/vnd.ms-excel')
+        self.content_types_img = ('image/jpeg', 'image/gif', 'image/png', 'image/svg+xml', 'image/webp', 'image/bmp', 'image/tiff')
 
     def get_content_type(self, url):
         try:
@@ -53,6 +55,9 @@ def get_content_type(self, url):
                 return UrlTypes.PPTX.value
             elif url.endswith(".ppt") or content_type in self.content_types_ppt:
                 return UrlTypes.PPT.value
+            elif url.endswith(".jpg") or url.endswith(".jpeg") or url.endswith(".png") or \
+                url.endswith(".gif") or url.endswith(".bmp") or content_type in self.content_types_img:
+                return UrlTypes.IMG.value
             else:
                 logging.warn(f'Could not determine the content-type of the {url}')
                 return None
diff --git a/lambda_fns/extract_docs_output_request/app.py b/lambda_fns/extract_docs_output_request/app.py
index 7123f848..8ba82606 100644
--- a/lambda_fns/extract_docs_output_request/app.py
+++ b/lambda_fns/extract_docs_output_request/app.py
@@ -19,7 +19,7 @@
 SENTRY_URL = os.environ.get("SENTRY_URL")
 ENVIRONMENT = os.environ.get("ENVIRONMENT")
 
-sentry_sdk.init(SENTRY_URL, environment=ENVIRONMENT, traces_sample_rate=1.0)
+sentry_sdk.init(SENTRY_URL, environment=ENVIRONMENT, attach_stacktrace=True, traces_sample_rate=1.0)
 
 s3_client = boto3.client(
     's3',

From 74e353ad55b6030604553a795a000c4440aa1603 Mon Sep 17 00:00:00 2001
From: Ranjan Shrestha <ranjan.shrestha.np@gmail.com>
Date: Thu, 28 Apr 2022 15:10:53 +0545
Subject: [PATCH 07/13] enabled error stack trace logging in process_docs;

---
 lambda_fns/extract_docs/app.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lambda_fns/extract_docs/app.py b/lambda_fns/extract_docs/app.py
index 7c351853..0a750919 100644
--- a/lambda_fns/extract_docs/app.py
+++ b/lambda_fns/extract_docs/app.py
@@ -399,7 +399,7 @@ def process_docs(event, context):
                 }
                 send_message2sqs(**sqs_message)
         except Exception as e:
-            logging.error(f"Exception is {e}")
+            logging.error(e, exc_info=True)
 
         signal.alarm(0)
 

From 9e09b2a22e438c230224bda4a2f9e182ad9b514a Mon Sep 17 00:00:00 2001
From: Ranjan Shrestha <ranjan.shrestha.np@gmail.com>
Date: Thu, 28 Apr 2022 16:41:14 +0545
Subject: [PATCH 08/13] updated the hashicorp aws version;

---
 main.tf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.tf b/main.tf
index 8f124655..00b5d9f7 100644
--- a/main.tf
+++ b/main.tf
@@ -2,7 +2,7 @@ terraform {
   required_providers {
     aws = {
       source  = "hashicorp/aws"
-      version = "4.8.0"
+      version = "4.9.0"
     }
   }
   required_version = "1.1.2"

From 5eb386212c9043be6803abeff9629c6011b0dc67 Mon Sep 17 00:00:00 2001
From: Ranjan Shrestha <ranjan.shrestha.np@gmail.com>
Date: Wed, 27 Apr 2022 11:52:39 +0545
Subject: [PATCH 09/13] terraform configs updated with sentry_url passed as env
 variable; updated the configs in gateway for prod, staging envs;


From 4f98fa5d87d44b847e6d3e379bfe488ed768397c Mon Sep 17 00:00:00 2001
From: Ranjan Shrestha <ranjan.shrestha.np@gmail.com>
Date: Thu, 28 Apr 2022 15:10:53 +0545
Subject: [PATCH 10/13] enabled err stack;


From 54e86e73c927d7b1ad416ec9c03f13374bf6f202 Mon Sep 17 00:00:00 2001
From: Ranjan Shrestha <ranjan.shrestha.np@gmail.com>
Date: Thu, 28 Apr 2022 16:41:14 +0545
Subject: [PATCH 11/13] updated the hashicorp aws version;


From 4274ac80858390f9411f04b0d93da8d61dee1fe5 Mon Sep 17 00:00:00 2001
From: Ranjan Shrestha <ranjan.shrestha.np@gmail.com>
Date: Thu, 5 May 2022 11:35:57 +0545
Subject: [PATCH 12/13] if content type is not determined with head requests,
 downloads the file to find the extension of that file;

---
 lambda_fns/extract_docs/content_types.py |  30 +-
 lambda_fns/extract_docs/wget.py          | 404 +++++++++++++++++++++++
 2 files changed, 432 insertions(+), 2 deletions(-)
 create mode 100644 lambda_fns/extract_docs/wget.py

diff --git a/lambda_fns/extract_docs/content_types.py b/lambda_fns/extract_docs/content_types.py
index 6f7f37d2..7144baaa 100644
--- a/lambda_fns/extract_docs/content_types.py
+++ b/lambda_fns/extract_docs/content_types.py
@@ -1,6 +1,11 @@
+import os
 import requests
 from enum import Enum
 import logging
+try:
+    from wget import download
+except ImportError:
+    from .wget import download
 
 logging.getLogger().setLevel(logging.INFO)
 
@@ -59,8 +64,29 @@ def get_content_type(self, url):
                 url.endswith(".gif") or url.endswith(".bmp") or content_type in self.content_types_img:
                 return UrlTypes.IMG.value
             else:
-                logging.warn(f'Could not determine the content-type of the {url}')
-                return None
+                temp_filepath = download(url, out="/tmp/")
+                if os.path.exists(temp_filepath):
+                    os.remove(temp_filepath)
+                if temp_filepath.endswith(".pdf"):
+                    return UrlTypes.PDF.value
+                elif temp_filepath.endswith(".docx"):
+                    return UrlTypes.DOCX.value
+                elif temp_filepath.endswith(".doc"):
+                    return UrlTypes.MSWORD.value
+                elif temp_filepath.endswith(".xlsx"):
+                    return UrlTypes.XLSX.value
+                elif temp_filepath.endswith(".xls"):
+                    return UrlTypes.XLS.value
+                elif temp_filepath.endswith(".pptx"):
+                    return UrlTypes.PPTX.value
+                elif temp_filepath.endswith(".ppt"):
+                    return UrlTypes.PPT.value
+                elif temp_filepath.endswith(".jpg") or temp_filepath.endswith(".jpeg") or temp_filepath.endswith(".png") or \
+                    temp_filepath.endswith(".gif") or temp_filepath.endswith(".bmp"):
+                    return UrlTypes.IMG.value
+                else:
+                    logging.warn(f'Could not determine the content-type of the {url}')
+                    return None
         except requests.exceptions.RequestException:
             logging.error(f'Exception occurred. Could not determine the content-type of the {url}')
             return None
diff --git a/lambda_fns/extract_docs/wget.py b/lambda_fns/extract_docs/wget.py
new file mode 100644
index 00000000..04b31916
--- /dev/null
+++ b/lambda_fns/extract_docs/wget.py
@@ -0,0 +1,404 @@
+#!/usr/bin/env python
+"""
+Download utility as an easy way to get file from the net
+ 
+  python -m wget <URL>
+  python wget.py <URL>
+
+Downloads: http://pypi.python.org/pypi/wget/
+Development: http://bitbucket.org/techtonik/python-wget/
+
+wget.py is not option compatible with Unix wget utility,
+to make command line interface intuitive for new people.
+
+Public domain by anatoly techtonik <techtonik@gmail.com>
+Also available under the terms of MIT license
+Copyright (c) 2010-2014 anatoly techtonik
+
+** Modified by Ranjan Shrestha: line 302 (the tempfile.mkstemp call in download) creates its temp file in /tmp so that it works in an AWS Lambda function **
+"""
+
+
+import sys, shutil, os
+import tempfile
+import math
+
+PY3K = sys.version_info >= (3, 0)
+if PY3K:
+  import urllib.request as urllib
+  import urllib.parse as urlparse
+else:
+  import urllib
+  import urlparse
+
+
+__version__ = "2.3-beta1"
+
+
+def filename_from_url(url):
+    """:return: basename of the URL path, or None when it is blank or dots only"""
+    candidate = os.path.basename(urlparse.urlparse(url).path)
+    # a name made up solely of whitespace and dots is not a usable filename
+    has_name = bool(candidate.strip(" \n\t."))
+    return candidate if has_name else None
+
+def filename_from_headers(headers):
+    """Detect filename from Content-Disposition headers if present.
+    http://greenbytes.de/tech/tc2231/
+
+    :param headers: headers as dict, list of "Name: value" strings, or string
+    :return: basename of the filename from the Content-Disposition header,
+             or None when it is absent, ambiguous or empty
+    """
+    if isinstance(headers, str):
+        headers = headers.splitlines()
+    if isinstance(headers, list):
+        headers = dict([x.split(':', 1) for x in headers])
+    cdisp = headers.get("Content-Disposition")
+    if not cdisp:
+        return None
+    cdtype = cdisp.split(';')
+    if len(cdtype) == 1:
+        return None
+    if cdtype[0].strip().lower() not in ('inline', 'attachment'):
+        return None
+    fnames = [x for x in cdtype[1:] if x.strip().startswith('filename=')]
+    # exactly one filename param is usable: none used to raise IndexError,
+    # and several is illegal
+    if len(fnames) != 1:
+        return None
+    # split once so that '=' inside the filename itself is preserved
+    name = os.path.basename(fnames[0].split('=', 1)[1].strip(' \t"'))
+    return name or None
+
+def filename_fix_existing(filename):
+    """Expands name portion of filename with numeric ' (x)' suffix to
+    return filename that doesn't exist already.
+
+    Existing siblings are searched in the directory part of `filename`
+    (current directory for a bare name), and a missing extension is allowed.
+    """
+    dirname, basename = os.path.split(filename)
+    name, dot, ext = basename.rpartition('.')
+    if not dot:  # no extension: rpartition leaves the whole name in `ext`
+        name, ext = ext, ''
+    stems = [x.rsplit('.', 1)[0] for x in os.listdir(dirname or '.')]
+    # keep the 'x' part of every sibling stem shaped like 'name (x)'
+    suffixes = [x[len(name) + 2:-1] for x in stems
+                if x.startswith(name + ' (') and x.endswith(')')]
+    indexes = [int(x) for x in suffixes if x and set(x) <= set('0123456789')]
+    idx = max(indexes) + 1 if indexes else 1
+    return os.path.join(dirname, '%s (%d)%s%s' % (name, idx, dot, ext))
+
+
+# --- terminal/console output helpers ---
+
+def get_console_width():
+    """Return width of available window area. Autodetection works for
+       Windows and POSIX platforms. Returns 80 for others, and 0 when
+       the console query fails on Windows or POSIX.
+
+       Code from http://bitbucket.org/techtonik/python-pager
+    """
+    if os.name == 'nt':
+        STD_INPUT_HANDLE  = -10
+        STD_OUTPUT_HANDLE = -11
+        STD_ERROR_HANDLE  = -12
+
+        # get console handle
+        from ctypes import windll, Structure, byref
+        try:
+            from ctypes.wintypes import SHORT, WORD, DWORD
+        except ImportError:
+            # workaround for missing types in Python 2.5
+            from ctypes import (
+                c_short as SHORT, c_ushort as WORD, c_ulong as DWORD)
+        console_handle = windll.kernel32.GetStdHandle(STD_OUTPUT_HANDLE)
+
+        # CONSOLE_SCREEN_BUFFER_INFO Structure
+        class COORD(Structure):
+            _fields_ = [("X", SHORT), ("Y", SHORT)]
+
+        class SMALL_RECT(Structure):
+            _fields_ = [("Left", SHORT), ("Top", SHORT),
+                        ("Right", SHORT), ("Bottom", SHORT)]
+
+        class CONSOLE_SCREEN_BUFFER_INFO(Structure):
+            _fields_ = [("dwSize", COORD),
+                        ("dwCursorPosition", COORD),
+                        ("wAttributes", WORD),
+                        ("srWindow", SMALL_RECT),
+                        ("dwMaximumWindowSize", DWORD)]
+
+        sbi = CONSOLE_SCREEN_BUFFER_INFO()
+        ret = windll.kernel32.GetConsoleScreenBufferInfo(console_handle, byref(sbi))
+        if ret == 0:  # the query call reported failure
+            return 0
+        return sbi.srWindow.Right+1  # Right is a zero-based column index
+
+    elif os.name == 'posix':
+        from fcntl import ioctl
+        from termios import TIOCGWINSZ
+        from array import array
+
+        winsize = array("H", [0] * 4)  # zero-filled buffer of four unsigned shorts
+        try:
+            ioctl(sys.stdout.fileno(), TIOCGWINSZ, winsize)
+        except IOError:
+            pass  # keep the zero-filled buffer, so 0 is returned below
+        return (winsize[1], winsize[0])[0]  # evaluates to winsize[1]
+
+    return 80
+
+
+def bar_thermometer(current, total, width=80):
+    """Return thermometer style progress bar string: `width` - 2 cells
+    between brackets, dotted in proportion to `current` / `total`.
+    `total` argument can not be zero. Example:
+
+        [..........            ]
+
+    Control and trailing symbols (carriage return and spaces) are not
+    included. See `bar_adaptive` for more information.
+    """
+    scale = width - 2  # cells left once both brackets are placed
+    filled = int(math.floor(float(current) / total * scale))
+    return '[%s%s]' % ('.' * filled, ' ' * (scale - filled))
+
+def bar_adaptive(current, total, width=80):
+    """Return progress bar string for given values in one of three
+    styles depending on available width:
+
+        [..  ] downloaded / total
+        downloaded / total
+        [.. ]
+
+    if total value is unknown or <= 0, show bytes counter using two
+    adaptive styles:
+
+        %s / unknown
+        %s
+
+    if there is not enough space on the screen, do not display anything
+    (an empty string is returned)
+
+    returned string doesn't include control characters like carriage
+    return used to place cursor at the beginning of the line to erase
+    previous content.
+
+    this function leaves one free character at the end of string to
+    avoid automatic linefeed on Windows.
+
+    :param current: amount downloaded so far
+    :param total: expected amount; None, 0 or negative when unknown
+    :param width: number of character cells available for the output
+    :return: rendered progress string, possibly empty
+    """
+    # process special case when total size is unknown and return immediately
+    if not total or total < 0:
+        msg = "%s / unknown" % current
+        if len(msg) < width:    # leaves one character to avoid linefeed
+            return msg
+        if len("%s" % current) < width:
+            return "%s" % current
+        # nothing fits; the layout below divides by `total`, so stop here
+        return ''
+
+    # --- adaptive layout algorithm ---
+    #
+    #  10% [.. ]  10/100
+    # pppp bbbbb sssssss
+    #
+    # fields are considered in priority order; a field is shown when its
+    # minimal width is smaller than the space still available
+
+    min_width = {
+        'percent': 4,  # 100%
+        'bar': 3,      # [.]
+        'size': len("%s" % total)*2 + 3, # 'xxxx / yyyy'
+    }
+    priority = ['percent', 'bar', 'size']
+
+    # select elements to show
+    selected = []
+    avail = width
+    for field in priority:
+        if min_width[field] < avail:
+            selected.append(field)
+            # +1 is for separator or for reserved space at the end of
+            # line to avoid linefeed on Windows
+            avail -= min_width[field] + 1
+
+    # render each selected field; the bar absorbs all leftover space
+    rendered = []
+    for field in selected:
+        if field == 'percent':
+            # fixed size width for percentage
+            text = '%s%%' % (100 * current // total)
+            rendered.append(text.rjust(min_width['percent']))
+        elif field == 'bar':  # [. ]
+            # bar takes its min width + all available space
+            rendered.append(bar_thermometer(current, total, min_width['bar']+avail))
+        elif field == 'size':
+            # size field has a constant width (min == max)
+            rendered.append(("%s / %s" % (current, total)).rjust(min_width['size']))
+    return ' '.join(rendered)  # single space as field separator
+
+# --/ console helpers
+
+
+__current_size = 0  # global state variable, which exists solely as a
+                    # workaround against Python 3.3.0 regression
+                    # http://bugs.python.org/issue16409
+                    # fixed in Python 3.3.1
+def callback_progress(blocks, block_size, total_size, bar_function):
+    """callback function for urlretrieve that is called when connection is
+    created and then once for each block
+
+    draws adaptive progress bar in terminal/console
+
+    use sys.stdout.write() instead of "print,", because it allows one more
+    symbol at the line end without linefeed on Windows
+
+    :param blocks: number of blocks transferred so far
+    :param block_size: in bytes
+    :param total_size: in bytes, can be -1 if server doesn't return it
+    :param bar_function: another callback function to visualize progress
+    """
+    global __current_size
+    width = min(100, get_console_width())
+    if sys.version_info[:3] == (3, 3, 0):  # regression workaround
+        if blocks == 0:  # first call
+            __current_size = 0
+        else:
+            __current_size += block_size
+        current_size = __current_size
+    else:
+        current_size = blocks*block_size
+        if total_size > 0:  # clamp only to a known size, not to the -1 marker
+            current_size = min(current_size, total_size)
+    progress = bar_function(current_size, total_size, width)
+    if progress:
+        sys.stdout.write("\r" + progress)
+
+class ThrowOnErrorOpener(urllib.FancyURLopener):  # opener whose default HTTP error hook raises
+    def http_error_default(self, url, fp, errcode, errmsg, headers):
+        raise Exception("{0}: {1}".format(errcode, errmsg))
+
+def download(url, out=None, bar=bar_adaptive):
+    """High level function, which downloads URL into tmp file in /tmp
+    and then renames it to filename autodetected from either HTTP headers
+    or URL ("download.wget" when neither provides a name).
+
+    :param url: address to fetch
+    :param bar: function to track download progress (visualize etc.),
+                or a false value to disable progress output
+    :param out: output filename or directory
+    :return:    filename where URL is downloaded to
+    :raises Exception: on HTTP errors reported via ThrowOnErrorOpener
+    """
+    names = dict()
+    names["out"] = out or ''
+    names["url"] = filename_from_url(url)
+    # prefix must be a bare name: mkstemp() joins it onto `dir`, so a path
+    # such as "/tmp/" coming from `out` would point to a missing directory
+    prefix = (names["url"] or os.path.basename(names["out"]) or ".") + "."
+    (fd, tmpfile) = tempfile.mkstemp(".tmp", prefix=prefix, dir="/tmp")
+    os.close(fd)
+    os.unlink(tmpfile)
+
+    # set progress monitoring callback
+    def callback_charged(blocks, block_size, total_size):
+        # 'closure' to set bar drawing function in callback
+        callback_progress(blocks, block_size, total_size, bar_function=bar)
+    callback = callback_charged if bar else None
+
+    (tmpfile, headers) = ThrowOnErrorOpener().retrieve(url, tmpfile, callback)
+    names["header"] = filename_from_headers(headers)
+    # fall back to a fixed name when neither headers nor URL carry one
+    detected = names["header"] or names["url"] or "download.wget"
+    if os.path.isdir(names["out"]):
+        filename = os.path.join(names["out"], detected)
+    else:
+        filename = names["out"] or detected
+    # add numeric ' (x)' suffix if filename already exists
+    if os.path.exists(filename):
+        filename = filename_fix_existing(filename)
+    shutil.move(tmpfile, filename)
+    return filename
+
+
+usage = """\
+usage: wget.py [options] URL
+
+options:
+  -o --output FILE|DIR   output filename or directory
+  -h --help
+  --version
+"""
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2 or "-h" in sys.argv or "--help" in sys.argv:
+        sys.exit(usage)
+    if "--version" in sys.argv:
+        sys.exit("wget.py " + __version__)
+
+    from optparse import OptionParser
+    parser = OptionParser()
+    parser.add_option("-o", "--output", dest="output")
+    (options, args) = parser.parse_args()
+    if not args:  # only options were given (e.g. "-o FILE"), URL is missing
+        sys.exit(usage)
+
+    filename = download(args[0], out=options.output)
+    print("")
+    print("Saved under %s" % filename)
+
+r"""
+features that require more tuits for urlretrieve API
+http://www.python.org/doc/2.6/library/urllib.html#urllib.urlretrieve
+
+[x] autodetect filename from URL
+[x] autodetect filename from headers - Content-Disposition
+    http://greenbytes.de/tech/tc2231/
+[ ] make HEAD request to detect temp filename from Content-Disposition
+[ ] process HTTP status codes (i.e. 404 error)
+    http://ftp.de.debian.org/debian/pool/iso-codes_3.24.2.orig.tar.bz2
+[ ] catch KeyboardInterrupt
+[ ] optionally preserve incomplete file
+[x] create temp file in current directory
+[ ] resume download (broken connection)
+[ ] resume download (incomplete file)
+[x] show progress indicator
+    http://mail.python.org/pipermail/tutor/2005-May/038797.html
+[x] do not overwrite downloaded file
+ [x] rename file automatically if exists
+[x] optionally specify path for downloaded file
+
+[ ] options plan
+ [x] -h, --help, --version (CHAOS speccy)
+[ ] clpbar progress bar style
+_ 30.0Mb at  3.0 Mbps  eta:   0:00:20   30% [=====         ]
+[ ] test "bar \r" print with \r at the end of line on Windows
+[ ] process Python 2.x urllib.ContentTooShortError exception gracefully
+    (ideally retry and continue download)
+
+    (tmpfile, headers) = urllib.urlretrieve(url, tmpfile, callback_progress)
+  File "C:\Python27\lib\urllib.py", line 93, in urlretrieve
+    return _urlopener.retrieve(url, filename, reporthook, data)
+  File "C:\Python27\lib\urllib.py", line 283, in retrieve
+    "of %i bytes" % (read, size), result)
+urllib.ContentTooShortError: retrieval incomplete: got only 15239952 out of 24807571 bytes
+
+[ ] find out if urlretrieve may return unicode headers
+[ ] test suite for unsafe filenames from url and from headers
+
+[ ] security checks
+  [ ] filename_from_url
+  [ ] filename_from_headers
+  [ ] MITM redirect from https URL
+  [ ] https certificate check
+  [ ] size+hash check helpers
+    [ ] fail if size is known and mismatch
+    [ ] fail if hash mismatch
+"""

From 2ddbcddc4e8af4888c5235e1635f8998b96985d3 Mon Sep 17 00:00:00 2001
From: Ranjan Shrestha <ranjan.shrestha.np@gmail.com>
Date: Fri, 6 May 2022 14:03:15 +0545
Subject: [PATCH 13/13] user agent added while sending get or head request;

---
 lambda_fns/extract_docs/app.py           | 10 +++++++---
 lambda_fns/extract_docs/content_types.py |  4 ++--
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/lambda_fns/extract_docs/app.py b/lambda_fns/extract_docs/app.py
index 0a750919..f193f21d 100644
--- a/lambda_fns/extract_docs/app.py
+++ b/lambda_fns/extract_docs/app.py
@@ -45,6 +45,10 @@
 
 sentry_sdk.init(SENTRY_URL, environment=ENVIRONMENT, attach_stacktrace=True, traces_sample_rate=1.0)
 
+REQ_HEADERS = {
+    'user-agent': ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1')
+}
+
 
 class ExtractionStatus(Enum):
     FAILED = 0
@@ -252,7 +256,7 @@ def get_extracted_text_web_links(link, file_name, mock=False):
 def handle_urls(url, mock=False):
     file_name = None
 
-    content_type = extract_content_type.get_content_type(url)
+    content_type = extract_content_type.get_content_type(url, REQ_HEADERS)
 
     file_name = EXTRACTED_FILE_NAME
 
@@ -277,7 +281,7 @@ def handle_urls(url, mock=False):
         s3_file_path = None
         s3_images_path = None
 
-        response = requests.get(url, stream=True)
+        response = requests.get(url, headers=REQ_HEADERS, stream=True)
         with tempfile.NamedTemporaryFile(mode='w+b') as temp:
             for chunk in response.iter_content(chunk_size=128):
                 temp.write(chunk)
@@ -305,7 +309,7 @@ def handle_urls(url, mock=False):
 
         ext_type = content_type
 
-        response = requests.get(url, stream=True)
+        response = requests.get(url, headers=REQ_HEADERS, stream=True)
 
         with tempfile.NamedTemporaryFile(mode='w+b') as temp:
             for chunk in response.iter_content(chunk_size=128):
diff --git a/lambda_fns/extract_docs/content_types.py b/lambda_fns/extract_docs/content_types.py
index 7144baaa..7716e55a 100644
--- a/lambda_fns/extract_docs/content_types.py
+++ b/lambda_fns/extract_docs/content_types.py
@@ -35,9 +35,9 @@ def __init__(self):
         self.content_types_xls = ('application/vnd.ms-excel')
         self.content_types_img = ('image/jpeg', 'image/gif', 'image/png', 'image/svg+xml', 'image/webp', 'image/bmp', 'image/tiff')
 
-    def get_content_type(self, url):
+    def get_content_type(self, url, req_headers):
         try:
-            response = requests.head(url)
+            response = requests.head(url, headers=req_headers)
             content_type = response.headers['Content-Type']
 
             logging.info(f'The content type of {url} is {content_type}')