From adad0a67beffb829fa89b5810c800596c0fa8c9f Mon Sep 17 00:00:00 2001 From: Ranjan Shrestha Date: Wed, 27 Apr 2022 11:42:42 +0545 Subject: [PATCH 01/13] lambda functions updated with sentry --- lambda_fns/entry_predict/app.py | 7 +++++++ lambda_fns/entry_predict/requirements.txt | 3 ++- lambda_fns/entry_predict_output_request/app.py | 7 +++++++ lambda_fns/entry_predict_output_request/requirements.txt | 3 ++- lambda_fns/extract_docs/app.py | 7 +++++++ lambda_fns/extract_docs/requirements.txt | 3 ++- lambda_fns/extract_docs_output_request/app.py | 7 +++++++ lambda_fns/extract_docs_output_request/requirements.txt | 3 ++- 8 files changed, 36 insertions(+), 4 deletions(-) diff --git a/lambda_fns/entry_predict/app.py b/lambda_fns/entry_predict/app.py index e50fdded..319c7c10 100644 --- a/lambda_fns/entry_predict/app.py +++ b/lambda_fns/entry_predict/app.py @@ -6,6 +6,8 @@ import logging from concurrent.futures import ThreadPoolExecutor, as_completed +import sentry_sdk + from postprocess_raw_preds import get_predictions_all, get_clean_thresholds, get_clean_ratios logging.getLogger().setLevel(logging.INFO) @@ -19,6 +21,9 @@ RELIABILITY_FN_NAME = os.environ.get("RELIABILITY_FN_NAME") MODEL_INFO_FN_NAME = os.environ.get("MODEL_INFO_FN_NAME") +SENTRY_URL = os.environ.get("SENTRY_URL") +ENVIRONMENT = os.environ.get("ENVIRONMENT") + sqs_client = boto3.client('sqs', region_name=AWS_REGION) sagemaker_rt = boto3.client("runtime.sagemaker", region_name="us-east-1") # todo: update the region later. @@ -26,6 +31,8 @@ reliability_client = boto3.client("lambda", region_name="us-east-1") model_info_client = boto3.client("lambda", region_name=AWS_REGION) +sentry_sdk.init(SENTRY_URL, environment=ENVIRONMENT, traces_sample_rate=1.0) + class PredictionStatus(Enum): FAILED = 0 diff --git a/lambda_fns/entry_predict/requirements.txt b/lambda_fns/entry_predict/requirements.txt index 4f5b8993..a8661b56 100644 --- a/lambda_fns/entry_predict/requirements.txt +++ b/lambda_fns/entry_predict/requirements.txt @@ -1 +1,2 @@ -requests==2.26.0 \ No newline at end of file +requests==2.26.0 +sentry-sdk==1.5.8 \ No newline at end of file diff --git a/lambda_fns/entry_predict_output_request/app.py b/lambda_fns/entry_predict_output_request/app.py index 721446c5..ffb3d2fd 100644 --- a/lambda_fns/entry_predict_output_request/app.py +++ b/lambda_fns/entry_predict_output_request/app.py @@ -1,6 +1,8 @@ +import os import requests import json import logging +import sentry_sdk from mappings.tags_mapping import get_all_mappings, get_categories try: from lambda_fns.model_info.app import lambda_handler @@ -8,6 +10,11 @@ except ImportError: pass +SENTRY_URL = os.environ.get("SENTRY_URL") +ENVIRONMENT = os.environ.get("ENVIRONMENT") + +sentry_sdk.init(SENTRY_URL, environment=ENVIRONMENT, traces_sample_rate=1.0) + logging.getLogger().setLevel(logging.INFO) mappings = get_all_mappings() diff --git a/lambda_fns/entry_predict_output_request/requirements.txt b/lambda_fns/entry_predict_output_request/requirements.txt index 4f5b8993..a8661b56 100644 --- a/lambda_fns/entry_predict_output_request/requirements.txt +++ b/lambda_fns/entry_predict_output_request/requirements.txt @@ -1 +1,2 @@ -requests==2.26.0 \ No newline at end of file +requests==2.26.0 +sentry-sdk==1.5.8 \ No newline at end of file diff --git a/lambda_fns/extract_docs/app.py b/lambda_fns/extract_docs/app.py index 337c258b..41df9fe0 100644 --- a/lambda_fns/extract_docs/app.py +++ b/lambda_fns/extract_docs/app.py @@ -12,6 +12,8 @@ import tempfile import signal +import sentry_sdk + from deep_parser import TextFromFile from deep_parser import TextFromWeb @@ -32,12 +34,17 @@ domain_name = os.environ.get("EXTRACTOR_DOMAIN_NAME", "http://extractor:8001") +SENTRY_URL = os.environ.get("SENTRY_URL") +ENVIRONMENT = os.environ.get("ENVIRONMENT") + s3_client = boto3.client('s3', region_name=aws_region) sqs_client = boto3.client('sqs', region_name=aws_region) lambda_client = boto3.client('lambda', region_name="us-east-1") extract_content_type = ExtractContentType() +sentry_sdk.init(SENTRY_URL, environment=ENVIRONMENT, traces_sample_rate=1.0) + class ExtractionStatus(Enum): FAILED = 0 diff --git a/lambda_fns/extract_docs/requirements.txt b/lambda_fns/extract_docs/requirements.txt index b7d263c1..a016f5d9 100644 --- a/lambda_fns/extract_docs/requirements.txt +++ b/lambda_fns/extract_docs/requirements.txt @@ -10,4 +10,5 @@ six==1.16.0 urllib3==1.26.6 Pillow==8.3.2 lxml==4.7.1 -deep-parser @ git+https://github.com/the-deep/deepExt \ No newline at end of file +deep-parser @ git+https://github.com/the-deep/deepExt +sentry-sdk==1.5.8 \ No newline at end of file diff --git a/lambda_fns/extract_docs_output_request/app.py b/lambda_fns/extract_docs_output_request/app.py index d4166b55..7123f848 100644 --- a/lambda_fns/extract_docs_output_request/app.py +++ b/lambda_fns/extract_docs_output_request/app.py @@ -6,6 +6,8 @@ from botocore.exceptions import ClientError from botocore.client import Config +import sentry_sdk + logging.getLogger().setLevel(logging.INFO) REQUEST_TIMEOUT = 60 @@ -14,6 +16,11 @@ aws_region = os.environ.get("AWS_REGION", DEFAULT_AWS_REGION) signed_url_expiry_secs = os.environ.get("SIGNED_URL_EXPIRY_SECS") +SENTRY_URL = os.environ.get("SENTRY_URL") +ENVIRONMENT = os.environ.get("ENVIRONMENT") + +sentry_sdk.init(SENTRY_URL, environment=ENVIRONMENT, traces_sample_rate=1.0) + s3_client = boto3.client( 's3', region_name=aws_region, diff --git a/lambda_fns/extract_docs_output_request/requirements.txt b/lambda_fns/extract_docs_output_request/requirements.txt index 4f5b8993..a8661b56 100644 --- a/lambda_fns/extract_docs_output_request/requirements.txt +++ b/lambda_fns/extract_docs_output_request/requirements.txt @@ -1 +1,2 @@ -requests==2.26.0 \ No newline at end of file +requests==2.26.0 +sentry-sdk==1.5.8 \ No newline at end of file From 423a13a220dd381cfdf6dd77f7d5e3071f32cf96 Mon Sep 17 00:00:00 2001 From: Ranjan Shrestha Date: Wed, 27 Apr 2022 11:52:39 +0545 Subject: [PATCH 02/13] terraform configs updated with sentry_url passed as env variable; updated the configs in gateway for prod, staging envs; --- main.tf | 9 +++++++ modules/api_gateway/main.tf | 7 +++--- modules/api_gateway/variables.tf | 1 + modules/reserved_lambda_entry_predict/main.tf | 25 ++++++++++++------- .../variables.tf | 4 ++- .../reserved_sqs_lambda_extract_docs/main.tf | 24 ++++++++++-------- .../variables.tf | 4 ++- modules/sqs_lambda_entry_predict/main.tf | 8 ++++++ modules/sqs_lambda_entry_predict/variables.tf | 4 ++- modules/sqs_lambda_extract_docs/main.tf | 6 ++++- modules/sqs_lambda_extract_docs/variables.tf | 4 ++- variables.tf | 6 ++++- 12 files changed, 74 insertions(+), 28 deletions(-) diff --git a/main.tf b/main.tf index 38f4ed20..8f124655 100644 --- a/main.tf +++ b/main.tf @@ -44,6 +44,8 @@ module "sqs_lambda_module" { reserved_input_queue_arn = "${module.reserved_sqs_lambda_module.reserved_input_queue_arn}" environment = var.environment + + sentry_url = var.sentry_url } module "reserved_sqs_lambda_module" { @@ -57,6 +59,8 @@ module "reserved_sqs_lambda_module" { docs_convert_lambda_fn_name = var.docs_convert_lambda_fn_name environment = var.environment + + sentry_url = var.sentry_url } module "sqs_lambda_predict_module" { @@ -73,6 +77,8 @@ module "sqs_lambda_predict_module" { aws_region = var.aws_region environment = var.environment + + sentry_url = var.sentry_url } module "reserved_sqs_lambda_predict_module" { @@ -86,6 +92,8 @@ module "reserved_sqs_lambda_predict_module" { aws_region = var.aws_region environment = var.environment + + sentry_url = var.sentry_url } module "apigateway_module" { @@ -94,6 +102,7 @@ module "apigateway_module" { api_gateway_name = var.api_gateway_name vpce_id = var.vpce_id + vpc_id = var.vpc_id predict_entry_invoke_arn = "${module.sqs_lambda_predict_module.entry_input_pred_request_predict_invoke_arn}" process_doc_invoke_arn = "${module.sqs_lambda_module.extract_doc_invoke_arn}" diff --git a/modules/api_gateway/main.tf b/modules/api_gateway/main.tf index 3b4f06da..2e60f135 100644 --- a/modules/api_gateway/main.tf +++ b/modules/api_gateway/main.tf @@ -137,10 +137,11 @@ resource "aws_api_gateway_rest_api_policy" "api_policy" { Effect = "Deny", Principal = "*", Action = "execute-api:Invoke", - Resource = "execute-api:/*/*/*", + Resource = "execute-api:/${var.environment}/*/*", Condition = { StringNotEquals = { - "aws:sourceVpce": var.vpce_id + "aws:sourceVpce": var.vpce_id, + "aws:sourceVpc": var.vpc_id } } }, @@ -148,7 +149,7 @@ resource "aws_api_gateway_rest_api_policy" "api_policy" { Effect = "Allow", Principal = "*", Action = "execute-api:Invoke", - Resource = "execute-api:/*/*/*" + Resource = "execute-api:/${var.environment}/*/*" } ] }) diff --git a/modules/api_gateway/variables.tf b/modules/api_gateway/variables.tf index 0f631683..f7bde9d9 100644 --- a/modules/api_gateway/variables.tf +++ b/modules/api_gateway/variables.tf @@ -23,6 +23,7 @@ variable "input_te_lambda_fn_alias_arn" {} variable "input_te_lambda_fn_alias_name" {} variable "vpce_id" {} +variable "vpc_id" {} variable "predict_entry_invoke_arn" {} diff --git a/modules/reserved_lambda_entry_predict/main.tf b/modules/reserved_lambda_entry_predict/main.tf index 9e782061..ebfc6924 100644 --- a/modules/reserved_lambda_entry_predict/main.tf +++ b/modules/reserved_lambda_entry_predict/main.tf @@ -131,7 +131,7 @@ module "reserved_predict_entry_fn" { ] }) - provisioned_concurrent_executions = 10 + provisioned_concurrent_executions = var.environment == "dev" ? 1 : 10 reserved_concurrent_executions = 30 environment_variables = { @@ -140,12 +140,14 @@ module "reserved_predict_entry_fn" { GEOLOCATION_FN_NAME = var.geolocation_fn_name RELIABILITY_FN_NAME = var.reliability_fn_name MODEL_INFO_FN_NAME = "${var.model_info_fn_name}-${var.environment}" + ENVIRONMENT = "${var.environment}" + SENTRY_URL = "${var.sentry_url}" } } resource "aws_appautoscaling_target" "reserved_predict_entry_fn_autoscale" { - max_capacity = 10 - min_capacity = 5 + max_capacity = var.environment == "dev" ? 1 : 10 + min_capacity = var.environment == "dev" ? 1 : 5 resource_id = "function:${module.reserved_predict_entry_fn.lambda_function_name}:${module.reserved_predict_entry_fn.lambda_function_version}" scalable_dimension = "lambda:function:ProvisionedConcurrency" service_namespace = "lambda" @@ -200,13 +202,18 @@ module "reserved_entry_predict_output_fn" { ] }) - provisioned_concurrent_executions = 5 + provisioned_concurrent_executions = var.environment == "dev" ? 1 : 5 layers = ["${aws_lambda_layer_version.reserved_lambda_layer_mappings.arn}"] build_in_docker = true #store_on_s3 = true #s3_bucket = "${var.processed_docs_bucket}" + + environment_variables = { + ENVIRONMENT = "${var.environment}" + SENTRY_URL = "${var.sentry_url}" + } } resource "aws_lambda_layer_version" "reserved_lambda_layer_mappings" { @@ -218,8 +225,8 @@ resource "aws_lambda_layer_version" "reserved_lambda_layer_mappings" { } resource "aws_appautoscaling_target" "reserved_entry_predict_output_fn_autoscale" { - max_capacity = 5 - min_capacity = 2 + max_capacity = var.environment == "dev" ? 1 : 5 + min_capacity = var.environment == "dev" ? 1 : 2 resource_id = "function:${module.reserved_entry_predict_output_fn.lambda_function_name}:${module.reserved_entry_predict_output_fn.lambda_function_version}" scalable_dimension = "lambda:function:ProvisionedConcurrency" service_namespace = "lambda" @@ -266,7 +273,7 @@ module "reserved_entry_predict_transfer_dlq_msg" { ] }) - provisioned_concurrent_executions = 5 + provisioned_concurrent_executions = var.environment == "dev" ? 1 : 5 environment_variables = { PREDICTION_QUEUE = aws_sqs_queue.reserved_entry_input_processed_queue_predict.id @@ -274,8 +281,8 @@ module "reserved_entry_predict_transfer_dlq_msg" { } resource "aws_appautoscaling_target" "reserved_entry_predict_transfer_dlq_msg_autoscale" { - max_capacity = 5 - min_capacity = 2 + max_capacity = var.environment == "dev" ? 1 : 5 + min_capacity = var.environment == "dev" ? 1 : 2 resource_id = "function:${module.reserved_entry_predict_transfer_dlq_msg.lambda_function_name}:${module.reserved_entry_predict_transfer_dlq_msg.lambda_function_version}" scalable_dimension = "lambda:function:ProvisionedConcurrency" service_namespace = "lambda" diff --git a/modules/reserved_lambda_entry_predict/variables.tf b/modules/reserved_lambda_entry_predict/variables.tf index caa3db1e..d02dcd0b 100644 --- a/modules/reserved_lambda_entry_predict/variables.tf +++ b/modules/reserved_lambda_entry_predict/variables.tf @@ -10,4 +10,6 @@ variable reliability_fn_name {} variable model_info_fn_name {} -variable processed_docs_bucket {} \ No newline at end of file +variable processed_docs_bucket {} + +variable sentry_url {} \ No newline at end of file diff --git a/modules/reserved_sqs_lambda_extract_docs/main.tf b/modules/reserved_sqs_lambda_extract_docs/main.tf index 19fa3342..62fc6f7e 100644 --- a/modules/reserved_sqs_lambda_extract_docs/main.tf +++ b/modules/reserved_sqs_lambda_extract_docs/main.tf @@ -131,15 +131,17 @@ module "reserved_extract_docs_fn" { ] }) - provisioned_concurrent_executions = 10 + provisioned_concurrent_executions = var.environment == "dev" ? 1 : 10 reserved_concurrent_executions = 30 build_in_docker = true environment_variables = { INPUT_QUEUE = aws_sqs_queue.reserved_input_queue.id DEST_S3_BUCKET = "${var.processed_docs_bucket}" - PROCESSED_QUEUE = aws_sqs_queue.reserved_processed_queue.id, + PROCESSED_QUEUE = aws_sqs_queue.reserved_processed_queue.id DOCS_CONVERT_LAMBDA_FN_NAME = "${var.docs_convert_lambda_fn_name}" + ENVIRONMENT = "${var.environment}" + SENTRY_URL = "${var.sentry_url}" } } @@ -156,8 +158,8 @@ module "reserved_extract_docs_fn" { # } resource "aws_appautoscaling_target" "reserved_extract_docs_autoscale" { - max_capacity = 10 - min_capacity = 5 + max_capacity = var.environment == "dev" ? 1 : 10 + min_capacity = var.environment == "dev" ? 1 : 5 resource_id = "function:${module.reserved_extract_docs_fn.lambda_function_name}:${module.reserved_extract_docs_fn.lambda_function_version}" scalable_dimension = "lambda:function:ProvisionedConcurrency" service_namespace = "lambda" @@ -204,7 +206,7 @@ module "reserved_output_request_fn" { ] }) - provisioned_concurrent_executions = 5 + provisioned_concurrent_executions = var.environment == "dev" ? 1 : 5 build_in_docker = true #store_on_s3 = true @@ -212,12 +214,14 @@ module "reserved_output_request_fn" { environment_variables = { SIGNED_URL_EXPIRY_SECS = "${var.signed_url_expiry_secs}" + ENVIRONMENT = "${var.environment}" + SENTRY_URL = "${var.sentry_url}" } } resource "aws_appautoscaling_target" "reserved_output_fn_autoscale" { - max_capacity = 5 - min_capacity = 2 + max_capacity = var.environment == "dev" ? 1 : 5 + min_capacity = var.environment == "dev" ? 1 : 2 resource_id = "function:${module.reserved_output_request_fn.lambda_function_name}:${module.reserved_output_request_fn.lambda_function_version}" scalable_dimension = "lambda:function:ProvisionedConcurrency" service_namespace = "lambda" @@ -265,7 +269,7 @@ module "reserved_transfer_dlq_msg" { ] }) - provisioned_concurrent_executions = 5 + provisioned_concurrent_executions = var.environment == "dev" ? 1 : 5 environment_variables = { PROCESSED_QUEUE = aws_sqs_queue.reserved_processed_queue.id @@ -273,8 +277,8 @@ module "reserved_transfer_dlq_msg" { } resource "aws_appautoscaling_target" "reserved_transfer_dlq_msg_autoscale" { - max_capacity = 5 - min_capacity = 2 + max_capacity = var.environment == "dev" ? 1 : 5 + min_capacity = var.environment == "dev" ? 1 : 2 resource_id = "function:${module.reserved_transfer_dlq_msg.lambda_function_name}:${module.reserved_transfer_dlq_msg.lambda_function_version}" scalable_dimension = "lambda:function:ProvisionedConcurrency" service_namespace = "lambda" diff --git a/modules/reserved_sqs_lambda_extract_docs/variables.tf b/modules/reserved_sqs_lambda_extract_docs/variables.tf index 7d28e964..2b183b0b 100644 --- a/modules/reserved_sqs_lambda_extract_docs/variables.tf +++ b/modules/reserved_sqs_lambda_extract_docs/variables.tf @@ -13,4 +13,6 @@ variable "processed_docs_bucket_arn" {} variable "docs_extract_fn_image_name" {} -variable "docs_convert_lambda_fn_name" {} \ No newline at end of file +variable "docs_convert_lambda_fn_name" {} + +variable sentry_url {} \ No newline at end of file diff --git a/modules/sqs_lambda_entry_predict/main.tf b/modules/sqs_lambda_entry_predict/main.tf index 6693d3ff..7c944d2d 100644 --- a/modules/sqs_lambda_entry_predict/main.tf +++ b/modules/sqs_lambda_entry_predict/main.tf @@ -106,6 +106,7 @@ module "predict_entry_fn" { source_path = [ { path = "${path.module}/../../lambda_fns/entry_predict" + pip_requirements = "${path.module}/../../lambda_fns/entry_predict/requirements.txt" } ] @@ -145,6 +146,8 @@ module "predict_entry_fn" { GEOLOCATION_FN_NAME = var.geolocation_fn_name RELIABILITY_FN_NAME = var.reliability_fn_name MODEL_INFO_FN_NAME = "${var.model_info_fn_name}-${var.environment}" + ENVIRONMENT = "${var.environment}" + SENTRY_URL = "${var.sentry_url}" } } @@ -200,6 +203,11 @@ module "entry_predict_output_fn" { build_in_docker = true #store_on_s3 = true #s3_bucket = "${var.processed_docs_bucket}" + + environment_variables = { + ENVIRONMENT = "${var.environment}" + SENTRY_URL = "${var.sentry_url}" + } } resource "aws_lambda_layer_version" "lambda_layer_mappings" { diff --git a/modules/sqs_lambda_entry_predict/variables.tf b/modules/sqs_lambda_entry_predict/variables.tf index 3ba01b47..2fd12100 100644 --- a/modules/sqs_lambda_entry_predict/variables.tf +++ b/modules/sqs_lambda_entry_predict/variables.tf @@ -14,4 +14,6 @@ variable processed_docs_bucket {} variable reserved_entry_input_queue_predict_id {} -variable reserved_entry_input_queue_predict_arn {} \ No newline at end of file +variable reserved_entry_input_queue_predict_arn {} + +variable sentry_url {} \ No newline at end of file diff --git a/modules/sqs_lambda_extract_docs/main.tf b/modules/sqs_lambda_extract_docs/main.tf index 98ead9ca..73c8b620 100644 --- a/modules/sqs_lambda_extract_docs/main.tf +++ b/modules/sqs_lambda_extract_docs/main.tf @@ -143,8 +143,10 @@ module "extract_docs_fn" { environment_variables = { INPUT_QUEUE = aws_sqs_queue.input_queue.id DEST_S3_BUCKET = "${var.processed_docs_bucket}" - PROCESSED_QUEUE = aws_sqs_queue.processed_queue.id, + PROCESSED_QUEUE = aws_sqs_queue.processed_queue.id DOCS_CONVERT_LAMBDA_FN_NAME = "${var.docs_convert_lambda_fn_name}" + ENVIRONMENT = "${var.environment}" + SENTRY_URL = "${var.sentry_url}" } } @@ -194,6 +196,8 @@ module "output_request_fn" { environment_variables = { SIGNED_URL_EXPIRY_SECS = "${var.signed_url_expiry_secs}" + ENVIRONMENT = "${var.environment}" + SENTRY_URL = "${var.sentry_url}" } } diff --git a/modules/sqs_lambda_extract_docs/variables.tf b/modules/sqs_lambda_extract_docs/variables.tf index 421d90b8..7bf2f820 100644 --- a/modules/sqs_lambda_extract_docs/variables.tf +++ b/modules/sqs_lambda_extract_docs/variables.tf @@ -16,4 +16,6 @@ variable "docs_extract_fn_image_name" {} variable "docs_convert_lambda_fn_name" {} variable reserved_input_queue_id {} -variable reserved_input_queue_arn {} \ No newline at end of file +variable reserved_input_queue_arn {} + +variable sentry_url {} \ No newline at end of file diff --git a/variables.tf b/variables.tf index a061186b..fadc34df 100644 --- a/variables.tf +++ b/variables.tf @@ -8,6 +8,8 @@ variable api_gateway_name {} variable vpce_id {} +variable vpc_id {} + variable model_endpoint_name {} variable geolocation_fn_name {} @@ -18,4 +20,6 @@ variable model_info_fn_name {} variable docs_extract_fn_image_name {} -variable docs_convert_lambda_fn_name {} \ No newline at end of file +variable docs_convert_lambda_fn_name {} + +variable sentry_url {} \ No newline at end of file From 8235f168796f558c8677aa1caf166c609dddbf3a Mon Sep 17 00:00:00 2001 From: Ranjan Shrestha Date: Wed, 27 Apr 2022 11:54:08 +0545 Subject: [PATCH 03/13] values updated for dev, staging, prod environments in their variable files; --- dev.tfvars | 6 +++++- prod.tfvars | 10 +++++++--- staging.tfvars | 23 +++++++++++++++++++++++ 3 files changed, 35 insertions(+), 4 deletions(-) create mode 100644 staging.tfvars diff --git a/dev.tfvars b/dev.tfvars index 663b308e..60c204ed 100644 --- a/dev.tfvars +++ b/dev.tfvars @@ -5,6 +5,7 @@ environment = "dev" # api gateway api_gateway_name = "rapi" vpce_id = "vpce-02c7bb08b571074e1" +vpc_id = "vpc-0e65245d5e4c2deaf" # models model_endpoint_name = "test-all-models-rsh" @@ -16,4 +17,7 @@ model_info_fn_name = "model_info" docs_extract_fn_image_name = "extractor-tool" # docs convert lambda -docs_convert_lambda_fn_name = "libreoffice-dev-libreoffice" \ No newline at end of file +docs_convert_lambda_fn_name = "libreoffice-dev-libreoffice" + +# sentry url +sentry_url = "https://3b273f4c61ac4d94af28e85a66ea0b5a@o158798.ingest.sentry.io/1223576" \ No newline at end of file diff --git a/prod.tfvars b/prod.tfvars index f6a31e95..02be5089 100644 --- a/prod.tfvars +++ b/prod.tfvars @@ -1,10 +1,11 @@ -aws_region = "eu-west-3" +aws_region = "us-east-1" aws_profile = "default" environment = "prod" # api gateway api_gateway_name = "rapi" -vpce_id = "vpce-000796c803825026c" +vpce_id = "vpce-05d8c268ef4c0c443" +vpc_id = "vpc-0947f040a9d4692a7" # models model_endpoint_name = "test-all-models-rsh" @@ -16,4 +17,7 @@ model_info_fn_name = "model_info" docs_extract_fn_image_name = "extract-tool" # docs convert lambda -docs_convert_lambda_fn_name = "libreoffice-dev-libreoffice" \ No newline at end of file +docs_convert_lambda_fn_name = "libreoffice-dev-libreoffice" + +# sentry url +sentry_url = "https://3b273f4c61ac4d94af28e85a66ea0b5a@o158798.ingest.sentry.io/1223576" \ No newline at end of file diff --git a/staging.tfvars b/staging.tfvars new file mode 100644 index 00000000..9f7b840b --- /dev/null +++ b/staging.tfvars @@ -0,0 +1,23 @@ +aws_region = "us-east-1" +aws_profile = "default" +environment = "staging" + +# api gateway +api_gateway_name = "rapi" +vpce_id = "vpce-02c7bb08b571074e1" +vpc_id = "vpc-0e65245d5e4c2deaf" + +# models +model_endpoint_name = "test-all-models-rsh" +geolocation_fn_name = "geolocations" +reliability_fn_name = "reliability" +model_info_fn_name = "model_info" + +# ecr image name +docs_extract_fn_image_name = "extract-tool" + +# docs convert lambda +docs_convert_lambda_fn_name = "libreoffice-dev-libreoffice" + +# sentry url +sentry_url = "https://3b273f4c61ac4d94af28e85a66ea0b5a@o158798.ingest.sentry.io/1223576" \ No newline at end of file From 150791be840ab82428fa28efa50ba81782f7dbd2 Mon Sep 17 00:00:00 2001 From: Ranjan Shrestha Date: Wed, 27 Apr 2022 16:29:47 +0545 Subject: [PATCH 04/13] added the missing package dependencies in terraform config; --- .github/workflows/terraform-plan.yml | 1 + modules/reserved_lambda_entry_predict/main.tf | 1 + 2 files changed, 2 insertions(+) diff --git a/.github/workflows/terraform-plan.yml b/.github/workflows/terraform-plan.yml index bd6e95ed..8facf22d 100644 --- a/.github/workflows/terraform-plan.yml +++ b/.github/workflows/terraform-plan.yml @@ -4,6 +4,7 @@ on: push: branches: - initial_setup + - feat/sentry-integration jobs: terraform: diff --git a/modules/reserved_lambda_entry_predict/main.tf b/modules/reserved_lambda_entry_predict/main.tf index ebfc6924..a8c96fc6 100644 --- a/modules/reserved_lambda_entry_predict/main.tf +++ b/modules/reserved_lambda_entry_predict/main.tf @@ -98,6 +98,7 @@ module "reserved_predict_entry_fn" { source_path = [ { path = "${path.module}/../../lambda_fns/entry_predict" + pip_requirements = "${path.module}/../../lambda_fns/entry_predict/requirements.txt" } ] From 1e6b69ea9ce6f2013df267cbad7a5c1a870326ca Mon Sep 17 00:00:00 2001 From: Ranjan Shrestha Date: Thu, 28 Apr 2022 10:19:32 +0545 Subject: [PATCH 05/13] updated the parser tool url; --- lambda_fns/extract_docs/requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lambda_fns/extract_docs/requirements.txt b/lambda_fns/extract_docs/requirements.txt index a016f5d9..4eb5689c 100644 --- a/lambda_fns/extract_docs/requirements.txt +++ b/lambda_fns/extract_docs/requirements.txt @@ -10,5 +10,5 @@ six==1.16.0 urllib3==1.26.6 Pillow==8.3.2 lxml==4.7.1 -deep-parser @ git+https://github.com/the-deep/deepExt -sentry-sdk==1.5.8 \ No newline at end of file +sentry-sdk==1.5.8 +deep-parser @ git+https://github.com/the-deep/deepex From f16855605c9e2a8fd275eda9c1459341db7fe5ee Mon Sep 17 00:00:00 2001 From: Ranjan Shrestha Date: Thu, 28 Apr 2022 11:44:59 +0545 Subject: [PATCH 06/13] enabled stacktrace in sentry; handles image links(and discards them, no extraction happens); --- lambda_fns/entry_predict/app.py | 2 +- lambda_fns/entry_predict_output_request/app.py | 2 +- lambda_fns/extract_docs/app.py | 6 +++++- lambda_fns/extract_docs/content_types.py | 5 +++++ lambda_fns/extract_docs_output_request/app.py | 2 +- 5 files changed, 13 insertions(+), 4 deletions(-) diff --git a/lambda_fns/entry_predict/app.py b/lambda_fns/entry_predict/app.py index 319c7c10..e4d9da91 100644 --- a/lambda_fns/entry_predict/app.py +++ b/lambda_fns/entry_predict/app.py @@ -31,7 +31,7 @@ reliability_client = boto3.client("lambda", region_name="us-east-1") model_info_client = boto3.client("lambda", region_name=AWS_REGION) -sentry_sdk.init(SENTRY_URL, environment=ENVIRONMENT, traces_sample_rate=1.0) +sentry_sdk.init(SENTRY_URL, environment=ENVIRONMENT, attach_stacktrace=True, traces_sample_rate=1.0) class PredictionStatus(Enum): diff --git a/lambda_fns/entry_predict_output_request/app.py b/lambda_fns/entry_predict_output_request/app.py index ffb3d2fd..b2a5f90b 100644 --- a/lambda_fns/entry_predict_output_request/app.py +++ b/lambda_fns/entry_predict_output_request/app.py @@ -13,7 +13,7 @@ SENTRY_URL = os.environ.get("SENTRY_URL") ENVIRONMENT = os.environ.get("ENVIRONMENT") -sentry_sdk.init(SENTRY_URL, environment=ENVIRONMENT, traces_sample_rate=1.0) +sentry_sdk.init(SENTRY_URL, environment=ENVIRONMENT, attach_stacktrace=True, traces_sample_rate=1.0) logging.getLogger().setLevel(logging.INFO) diff --git a/lambda_fns/extract_docs/app.py b/lambda_fns/extract_docs/app.py index 41df9fe0..7c351853 100644 --- a/lambda_fns/extract_docs/app.py +++ b/lambda_fns/extract_docs/app.py @@ -43,7 +43,7 @@ extract_content_type = ExtractContentType() -sentry_sdk.init(SENTRY_URL, environment=ENVIRONMENT, traces_sample_rate=1.0) +sentry_sdk.init(SENTRY_URL, environment=ENVIRONMENT, attach_stacktrace=True, traces_sample_rate=1.0) class ExtractionStatus(Enum): @@ -346,6 +346,10 @@ def handle_urls(url, mock=False): logging.error(f"Exception occurred {e}") s3_file_path, s3_images_path, total_pages, total_words_count = None, None, -1, -1 extraction_status = ExtractionStatus.FAILED.value + elif content_type == UrlTypes.IMG.value: + logging.warn("Text extraction from Images is not available.") + s3_file_path, s3_images_path, total_pages, total_words_count = None, None, -1, -1 + extraction_status = ExtractionStatus.FAILED.value else: raise NotImplementedError diff --git a/lambda_fns/extract_docs/content_types.py b/lambda_fns/extract_docs/content_types.py index 533f818f..6f7f37d2 100644 --- a/lambda_fns/extract_docs/content_types.py +++ b/lambda_fns/extract_docs/content_types.py @@ -14,6 +14,7 @@ class UrlTypes(str, Enum): MSWORD = 'doc' XLSX = 'xlsx' XLS = 'xls' + IMG = 'img' class ExtractContentType: @@ -27,6 +28,7 @@ def __init__(self): self.content_types_ppt = ('application/vnd.ms-powerpoint') self.content_types_xlsx = ('application/vnd.openxmlformats-officedocument.spreadsheetml.sheet') self.content_types_xls = ('application/vnd.ms-excel') + self.content_types_img = ('image/jpeg', 'image/gif', 'image/png', 'image/svg+xml', 'image/webp', 'image/bmp', 'image/tiff') def get_content_type(self, url): try: @@ -53,6 +55,9 @@ def get_content_type(self, url): return UrlTypes.PPTX.value elif url.endswith(".ppt") or content_type in self.content_types_ppt: return UrlTypes.PPT.value + elif url.endswith(".jpg") or url.endswith(".jpeg") or url.endswith(".png") or \ + url.endswith(".gif") or url.endswith(".bmp") or content_type in self.content_types_img: + return UrlTypes.IMG.value else: logging.warn(f'Could not determine the content-type of the {url}') return None diff --git a/lambda_fns/extract_docs_output_request/app.py b/lambda_fns/extract_docs_output_request/app.py index 7123f848..8ba82606 100644 --- a/lambda_fns/extract_docs_output_request/app.py +++ b/lambda_fns/extract_docs_output_request/app.py @@ -19,7 +19,7 @@ SENTRY_URL = os.environ.get("SENTRY_URL") ENVIRONMENT = os.environ.get("ENVIRONMENT") -sentry_sdk.init(SENTRY_URL, environment=ENVIRONMENT, traces_sample_rate=1.0) +sentry_sdk.init(SENTRY_URL, environment=ENVIRONMENT, attach_stacktrace=True, traces_sample_rate=1.0) s3_client = boto3.client( 's3', From 74e353ad55b6030604553a795a000c4440aa1603 Mon Sep 17 00:00:00 2001 From: Ranjan Shrestha Date: Thu, 28 Apr 2022 15:10:53 +0545 Subject: [PATCH 07/13] enabled err stack; --- lambda_fns/extract_docs/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lambda_fns/extract_docs/app.py b/lambda_fns/extract_docs/app.py index 7c351853..0a750919 100644 --- a/lambda_fns/extract_docs/app.py +++ b/lambda_fns/extract_docs/app.py @@ -399,7 +399,7 @@ def process_docs(event, context): } send_message2sqs(**sqs_message) except Exception as e: - logging.error(f"Exception is {e}") + logging.error(e, exc_info=True) signal.alarm(0) From 9e09b2a22e438c230224bda4a2f9e182ad9b514a Mon Sep 17 00:00:00 2001 From: Ranjan Shrestha Date: Thu, 28 Apr 2022 16:41:14 +0545 Subject: [PATCH 08/13] updated the hashicorp aws version; --- main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.tf b/main.tf index 8f124655..00b5d9f7 100644 --- a/main.tf +++ b/main.tf @@ -2,7 +2,7 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = "4.8.0" + version = "4.9.0" } } required_version = "1.1.2" From 5eb386212c9043be6803abeff9629c6011b0dc67 Mon Sep 17 00:00:00 2001 From: Ranjan Shrestha Date: Wed, 27 Apr 2022 11:52:39 +0545 Subject: [PATCH 09/13] terraform configs updated with sentry_url passed as env variable; updated the configs in gateway for prod, staging envs; From 4f98fa5d87d44b847e6d3e379bfe488ed768397c Mon Sep 17 00:00:00 2001 From: Ranjan Shrestha Date: Thu, 28 Apr 2022 15:10:53 +0545 Subject: [PATCH 10/13] enabled err stack; From 54e86e73c927d7b1ad416ec9c03f13374bf6f202 Mon Sep 17 00:00:00 2001 From: Ranjan Shrestha Date: Thu, 28 Apr 2022 16:41:14 +0545 Subject: [PATCH 11/13] updated the hashicorp aws version; From 4274ac80858390f9411f04b0d93da8d61dee1fe5 Mon Sep 17 00:00:00 2001 From: Ranjan Shrestha Date: Thu, 5 May 2022 11:35:57 +0545 Subject: [PATCH 12/13] if content type is not determined with head requests, downloads the file to find the extension of that file; --- lambda_fns/extract_docs/content_types.py | 30 +- lambda_fns/extract_docs/wget.py | 404 +++++++++++++++++++++++ 2 files changed, 432 insertions(+), 2 deletions(-) create mode 100644 lambda_fns/extract_docs/wget.py diff --git a/lambda_fns/extract_docs/content_types.py b/lambda_fns/extract_docs/content_types.py index 6f7f37d2..7144baaa 100644 --- a/lambda_fns/extract_docs/content_types.py +++ b/lambda_fns/extract_docs/content_types.py @@ -1,6 +1,11 @@ +import os import requests from enum import Enum import logging +try: + from wget import download +except ImportError: + from .wget import download logging.getLogger().setLevel(logging.INFO) @@ -59,8 +64,29 @@ def get_content_type(self, url): url.endswith(".gif") or url.endswith(".bmp") or content_type in self.content_types_img: return UrlTypes.IMG.value else: - logging.warn(f'Could not determine the content-type of the {url}') - return None + temp_filepath = download(url, out="/tmp/") + if os.path.exists(temp_filepath): + os.remove(temp_filepath) + if temp_filepath.endswith(".pdf"): + return UrlTypes.PDF.value + elif temp_filepath.endswith(".docx"): + return UrlTypes.DOCX.value + elif temp_filepath.endswith(".doc"): + return UrlTypes.MSWORD.value + elif temp_filepath.endswith(".xlsx"): + return UrlTypes.XLSX.value + elif temp_filepath.endswith(".xls"): + return UrlTypes.XLS.value + elif temp_filepath.endswith(".pptx"): + return UrlTypes.PPTX.value + elif temp_filepath.endswith(".ppt"): + return UrlTypes.PPT.value + elif temp_filepath.endswith(".jpg") or temp_filepath.endswith(".jpeg") or temp_filepath.endswith(".png") or \ + temp_filepath.endswith(".gif") or temp_filepath.endswith(".bmp"): + return UrlTypes.IMG.value + else: + logging.warn(f'Could not determine the content-type of the {url}') + return None except requests.exceptions.RequestException: logging.error(f'Exception occurred. Could not determine the content-type of the {url}') return None diff --git a/lambda_fns/extract_docs/wget.py b/lambda_fns/extract_docs/wget.py new file mode 100644 index 00000000..04b31916 --- /dev/null +++ b/lambda_fns/extract_docs/wget.py @@ -0,0 +1,404 @@ +#!/usr/bin/env python +""" +Download utility as an easy way to get file from the net + + python -m wget + python wget.py + +Downloads: http://pypi.python.org/pypi/wget/ +Development: http://bitbucket.org/techtonik/python-wget/ + +wget.py is not option compatible with Unix wget utility, +to make command line interface intuitive for new people. + +Public domain by anatoly techtonik +Also available under the terms of MIT license +Copyright (c) 2010-2014 anatoly techtonik + +** Modified by Ranjan Shrestha Line number 302 to make it work on aws lambda function ** +""" + + +import sys, shutil, os +import tempfile +import math + +PY3K = sys.version_info >= (3, 0) +if PY3K: + import urllib.request as urllib + import urllib.parse as urlparse +else: + import urllib + import urlparse + + +__version__ = "2.3-beta1" + + +def filename_from_url(url): + """:return: detected filename or None""" + fname = os.path.basename(urlparse.urlparse(url).path) + if len(fname.strip(" \n\t.")) == 0: + return None + return fname + +def filename_from_headers(headers): + """Detect filename from Content-Disposition headers if present. + http://greenbytes.de/tech/tc2231/ + + :param: headers as dict, list or string + :return: filename from content-disposition header or None + """ + if type(headers) == str: + headers = headers.splitlines() + if type(headers) == list: + headers = dict([x.split(':', 1) for x in headers]) + cdisp = headers.get("Content-Disposition") + if not cdisp: + return None + cdtype = cdisp.split(';') + if len(cdtype) == 1: + return None + if cdtype[0].strip().lower() not in ('inline', 'attachment'): + return None + # several filename params is illegal, but just in case + fnames = [x for x in cdtype[1:] if x.strip().startswith('filename=')] + if len(fnames) > 1: + return None + name = fnames[0].split('=')[1].strip(' \t"') + name = os.path.basename(name) + if not name: + return None + return name + +def filename_fix_existing(filename): + """Expands name portion of filename with numeric ' (x)' suffix to + return filename that doesn't exist already. + """ + dirname = '.' + name, ext = filename.rsplit('.', 1) + names = [x for x in os.listdir(dirname) if x.startswith(name)] + names = [x.rsplit('.', 1)[0] for x in names] + suffixes = [x.replace(name, '') for x in names] + # filter suffixes that match ' (x)' pattern + suffixes = [x[2:-1] for x in suffixes + if x.startswith(' (') and x.endswith(')')] + indexes = [int(x) for x in suffixes + if set(x) <= set('0123456789')] + idx = 1 + if indexes: + idx += sorted(indexes)[-1] + return '%s (%d).%s' % (name, idx, ext) + + +# --- terminal/console output helpers --- + +def get_console_width(): + """Return width of available window area. Autodetection works for + Windows and POSIX platforms. Returns 80 for others + + Code from http://bitbucket.org/techtonik/python-pager + """ + + if os.name == 'nt': + STD_INPUT_HANDLE = -10 + STD_OUTPUT_HANDLE = -11 + STD_ERROR_HANDLE = -12 + + # get console handle + from ctypes import windll, Structure, byref + try: + from ctypes.wintypes import SHORT, WORD, DWORD + except ImportError: + # workaround for missing types in Python 2.5 + from ctypes import ( + c_short as SHORT, c_ushort as WORD, c_ulong as DWORD) + console_handle = windll.kernel32.GetStdHandle(STD_OUTPUT_HANDLE) + + # CONSOLE_SCREEN_BUFFER_INFO Structure + class COORD(Structure): + _fields_ = [("X", SHORT), ("Y", SHORT)] + + class SMALL_RECT(Structure): + _fields_ = [("Left", SHORT), ("Top", SHORT), + ("Right", SHORT), ("Bottom", SHORT)] + + class CONSOLE_SCREEN_BUFFER_INFO(Structure): + _fields_ = [("dwSize", COORD), + ("dwCursorPosition", COORD), + ("wAttributes", WORD), + ("srWindow", SMALL_RECT), + ("dwMaximumWindowSize", DWORD)] + + sbi = CONSOLE_SCREEN_BUFFER_INFO() + ret = windll.kernel32.GetConsoleScreenBufferInfo(console_handle, byref(sbi)) + if ret == 0: + return 0 + return sbi.srWindow.Right+1 + + elif os.name == 'posix': + from fcntl import ioctl + from termios import TIOCGWINSZ + from array import array + + winsize = array("H", [0] * 4) + try: + ioctl(sys.stdout.fileno(), TIOCGWINSZ, winsize) + except IOError: + pass + return (winsize[1], winsize[0])[0] + + return 80 + + +def bar_thermometer(current, total, width=80): + """Return thermometer style progress bar string. `total` argument + can not be zero. The minimum size of bar returned is 3. Example: + + [.......... ] + + Control and trailing symbols (\r and spaces) are not included. + See `bar_adaptive` for more information. + """ + # number of dots on thermometer scale + avail_dots = width-2 + shaded_dots = int(math.floor(float(current) / total * avail_dots)) + return '[' + '.'*shaded_dots + ' '*(avail_dots-shaded_dots) + ']' + +def bar_adaptive(current, total, width=80): + """Return progress bar string for given values in one of three + styles depending on available width: + + [.. ] downloaded / total + downloaded / total + [.. ] + + if total value is unknown or <= 0, show bytes counter using two + adaptive styles: + + %s / unknown + %s + + if there is not enough space on the screen, do not display anything + + returned string doesn't include control characters like \r used to + place cursor at the beginning of the line to erase previous content. + + this function leaves one free character at the end of string to + avoid automatic linefeed on Windows. + """ + + # process special case when total size is unknown and return immediately + if not total or total < 0: + msg = "%s / unknown" % current + if len(msg) < width: # leaves one character to avoid linefeed + return msg + if len("%s" % current) < width: + return "%s" % current + + # --- adaptive layout algorithm --- + # + # [x] describe the format of the progress bar + # [x] describe min width for each data field + # [x] set priorities for each element + # [x] select elements to be shown + # [x] choose top priority element min_width < avail_width + # [x] lessen avail_width by value if min_width + # [x] exclude element from priority list and repeat + + # 10% [.. ] 10/100 + # pppp bbbbb sssssss + + min_width = { + 'percent': 4, # 100% + 'bar': 3, # [.] + 'size': len("%s" % total)*2 + 3, # 'xxxx / yyyy' + } + priority = ['percent', 'bar', 'size'] + + # select elements to show + selected = [] + avail = width + for field in priority: + if min_width[field] < avail: + selected.append(field) + avail -= min_width[field]+1 # +1 is for separator or for reserved space at + # the end of line to avoid linefeed on Windows + # render + output = '' + for field in selected: + + if field == 'percent': + # fixed size width for percentage + output += ('%s%%' % (100 * current // total)).rjust(min_width['percent']) + elif field == 'bar': # [. ] + # bar takes its min width + all available space + output += bar_thermometer(current, total, min_width['bar']+avail) + elif field == 'size': + # size field has a constant width (min == max) + output += ("%s / %s" % (current, total)).rjust(min_width['size']) + + selected = selected[1:] + if selected: + output += ' ' # add field separator + + return output + +# --/ console helpers + + +__current_size = 0 # global state variable, which exists solely as a + # workaround against Python 3.3.0 regression + # http://bugs.python.org/issue16409 + # fixed in Python 3.3.1 +def callback_progress(blocks, block_size, total_size, bar_function): + """callback function for urlretrieve that is called when connection is + created and when once for each block + + draws adaptive progress bar in terminal/console + + use sys.stdout.write() instead of "print,", because it allows one more + symbol at the line end without linefeed on Windows + + :param blocks: number of blocks transferred so far + :param block_size: in bytes + :param total_size: in bytes, can be -1 if server doesn't return it + :param bar_function: another callback function to visualize progress + """ + global __current_size + + width = min(100, get_console_width()) + + if sys.version_info[:3] == (3, 3, 0): # regression workaround + if blocks == 0: # first call + __current_size = 0 + else: + __current_size += block_size + current_size = __current_size + else: + current_size = min(blocks*block_size, total_size) + progress = bar_function(current_size, total_size, width) + if progress: + sys.stdout.write("\r" + progress) + +class ThrowOnErrorOpener(urllib.FancyURLopener): + def http_error_default(self, url, fp, errcode, errmsg, headers): + raise Exception("%s: %s" % (errcode, errmsg)) + +def download(url, out=None, bar=bar_adaptive): + """High level function, which downloads URL into tmp file in current + directory and then renames it to filename autodetected from either URL + or HTTP headers. + + :param bar: function to track download progress (visualize etc.) + :param out: output filename or directory + :return: filename where URL is downloaded to + """ + names = dict() + names["out"] = out or '' + names["url"] = filename_from_url(url) + # get filename for temp file in current directory + prefix = (names["url"] or names["out"] or ".") + "." + (fd, tmpfile) = tempfile.mkstemp(".tmp", prefix=prefix, dir="/tmp") + os.close(fd) + os.unlink(tmpfile) + + # set progress monitoring callback + def callback_charged(blocks, block_size, total_size): + # 'closure' to set bar drawing function in callback + callback_progress(blocks, block_size, total_size, bar_function=bar) + if bar: + callback = callback_charged + else: + callback = None + + (tmpfile, headers) = ThrowOnErrorOpener().retrieve(url, tmpfile, callback) + names["header"] = filename_from_headers(headers) + if os.path.isdir(names["out"]): + filename = names["header"] or names["url"] + filename = names["out"] + "/" + filename + else: + filename = names["out"] or names["header"] or names["url"] + # add numeric ' (x)' suffix if filename already exists + if os.path.exists(filename): + filename = filename_fix_existing(filename) + shutil.move(tmpfile, filename) + + #print headers + return filename + + +usage = """\ +usage: wget.py [options] URL + +options: + -o --output FILE|DIR output filename or directory + -h --help + --version +""" + +if __name__ == "__main__": + if len(sys.argv) < 2 or "-h" in sys.argv or "--help" in sys.argv: + sys.exit(usage) + if "--version" in sys.argv: + sys.exit("wget.py " + __version__) + + from optparse import OptionParser + parser = OptionParser() + parser.add_option("-o", "--output", dest="output") + (options, args) = parser.parse_args() + + url = sys.argv[1] + filename = download(args[0], out=options.output) + + print("") + print("Saved under %s" % filename) + +r""" +features that require more tuits for urlretrieve API +http://www.python.org/doc/2.6/library/urllib.html#urllib.urlretrieve + +[x] autodetect filename from URL +[x] autodetect filename from headers - Content-Disposition + http://greenbytes.de/tech/tc2231/ +[ ] make HEAD request to detect temp filename from Content-Disposition +[ ] process HTTP status codes (i.e. 404 error) + http://ftp.de.debian.org/debian/pool/iso-codes_3.24.2.orig.tar.bz2 +[ ] catch KeyboardInterrupt +[ ] optionally preserve incomplete file +[x] create temp file in current directory +[ ] resume download (broken connection) +[ ] resume download (incomplete file) +[x] show progress indicator + http://mail.python.org/pipermail/tutor/2005-May/038797.html +[x] do not overwrite downloaded file + [x] rename file automatically if exists +[x] optionally specify path for downloaded file + +[ ] options plan + [x] -h, --help, --version (CHAOS speccy) +[ ] clpbar progress bar style +_ 30.0Mb at 3.0 Mbps eta: 0:00:20 30% [===== ] +[ ] test "bar \r" print with \r at the end of line on Windows +[ ] process Python 2.x urllib.ContentTooShortError exception gracefully + (ideally retry and continue download) + + (tmpfile, headers) = urllib.urlretrieve(url, tmpfile, callback_progress) + File "C:\Python27\lib\urllib.py", line 93, in urlretrieve + return _urlopener.retrieve(url, filename, reporthook, data) + File "C:\Python27\lib\urllib.py", line 283, in retrieve + "of %i bytes" % (read, size), result) +urllib.ContentTooShortError: retrieval incomplete: got only 15239952 out of 24807571 bytes + +[ ] find out if urlretrieve may return unicode headers +[ ] test suite for unsafe filenames from url and from headers + +[ ] security checks + [ ] filename_from_url + [ ] filename_from_headers + [ ] MITM redirect from https URL + [ ] https certificate check + [ ] size+hash check helpers + [ ] fail if size is known and mismatch + [ ] fail if hash mismatch +""" From 2ddbcddc4e8af4888c5235e1635f8998b96985d3 Mon Sep 17 00:00:00 2001 From: Ranjan Shrestha Date: Fri, 6 May 2022 14:03:15 +0545 Subject: [PATCH 13/13] user agent added while sending get or head request; --- lambda_fns/extract_docs/app.py | 10 +++++++--- lambda_fns/extract_docs/content_types.py | 4 ++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/lambda_fns/extract_docs/app.py b/lambda_fns/extract_docs/app.py index 0a750919..f193f21d 100644 --- a/lambda_fns/extract_docs/app.py +++ b/lambda_fns/extract_docs/app.py @@ -45,6 +45,10 @@ sentry_sdk.init(SENTRY_URL, environment=ENVIRONMENT, attach_stacktrace=True, traces_sample_rate=1.0) +REQ_HEADERS = { + 'user-agent': ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1') +} + class ExtractionStatus(Enum): FAILED = 0 @@ -252,7 +256,7 @@ def get_extracted_text_web_links(link, file_name, mock=False): def handle_urls(url, mock=False): file_name = None - content_type = extract_content_type.get_content_type(url) + content_type = extract_content_type.get_content_type(url, REQ_HEADERS) file_name = EXTRACTED_FILE_NAME @@ -277,7 +281,7 @@ def handle_urls(url, mock=False): s3_file_path = None s3_images_path = None - response = requests.get(url, stream=True) + response = requests.get(url, headers=REQ_HEADERS, stream=True) with tempfile.NamedTemporaryFile(mode='w+b') as temp: for chunk in response.iter_content(chunk_size=128): temp.write(chunk) @@ -305,7 +309,7 @@ def handle_urls(url, mock=False): ext_type = content_type - response = requests.get(url, stream=True) + response = requests.get(url, headers=REQ_HEADERS, stream=True) with tempfile.NamedTemporaryFile(mode='w+b') as temp: for chunk in response.iter_content(chunk_size=128): diff --git a/lambda_fns/extract_docs/content_types.py b/lambda_fns/extract_docs/content_types.py index 7144baaa..7716e55a 100644 --- a/lambda_fns/extract_docs/content_types.py +++ b/lambda_fns/extract_docs/content_types.py @@ -35,9 +35,9 @@ def __init__(self): self.content_types_xls = ('application/vnd.ms-excel') self.content_types_img = ('image/jpeg', 'image/gif', 'image/png', 'image/svg+xml', 'image/webp', 'image/bmp', 'image/tiff') - def get_content_type(self, url): + def get_content_type(self, url, req_headers): try: - response = requests.head(url) + response = requests.head(url, headers=req_headers) content_type = response.headers['Content-Type'] logging.info(f'The content type of {url} is {content_type}')