-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Feat/parser in ecs #3
base: develop
Are you sure you want to change the base?
Conversation
…ated the configs in gateway for prod, staging envs;
…no extraction happens);
…ated the configs in gateway for prod, staging envs;
…ile to find the extension of that file;
…bucket/key payload;
local_temp_directory = pathlib.Path('/tmp', file_name) | ||
local_temp_directory.mkdir(parents=True) if not local_temp_directory.exists() else None | ||
# Note: commented for now | ||
# images.save_images(directory_path=local_temp_directory) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Comment these lines
|
||
s3_file_path = f"s3://{DEST_BUCKET_NAME}/{str(s3_path_prefix)}/{file_name}" | ||
s3_images_path = f"s3://{DEST_BUCKET_NAME}/{str(s3_path_prefix)}/images" | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Make a general utility function for this
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done
|
||
extracted_text = extracted_text.replace("\x00", "") # remove null chars | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Let's make a util function for this
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done
s3_file_path = f"s3://{DEST_BUCKET_NAME}/{str(s3_path_prefix)}/{file_name}" | ||
return s3_file_path, None, total_pages, total_words_count # No images extraction (lib doesn't support?) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Let's make a util function
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done
url, file_name | ||
) | ||
if s3_file_path: | ||
extraction_status = ExtractionStatus.SUCCESS.value | ||
else: | ||
extraction_status = ExtractionStatus.FAILED.value | ||
except Exception: | ||
logging.error(f"Error occurred during text extraction. {str(e)}", exc_info=True) | ||
s3_file_path, s3_images_path, total_pages, total_words_count = None, None, -1, -1 | ||
extraction_status = ExtractionStatus.FAILED.value | ||
elif content_type == UrlTypes.DOCX.value or content_type == UrlTypes.MSWORD.value or \ | ||
content_type == UrlTypes.XLSX.value or content_type == UrlTypes.XLS.value or \ | ||
content_type == UrlTypes.PPTX.value or content_type == UrlTypes.PPT.value: | ||
|
||
ext_type = content_type | ||
tmp_filename = f"{uuid.uuid4().hex}.{ext_type}" | ||
flag = False | ||
if upload_file_to_s3(url, key=tmp_filename, bucketname=DOCS_CONVERSION_BUCKET_NAME): | ||
payload = json.dumps({ | ||
"file": tmp_filename, | ||
"bucket": DOCS_CONVERSION_BUCKET_NAME, | ||
"ext": ext_type, | ||
"fromS3": 1 | ||
}) | ||
|
||
docs_conversion_lambda_response = lambda_client.invoke( | ||
FunctionName=DOCS_CONVERT_LAMBDA_FN_NAME, | ||
InvocationType="RequestResponse", | ||
Payload=payload | ||
) | ||
docs_conversion_lambda_response_json = json.loads( | ||
docs_conversion_lambda_response["Payload"].read().decode("utf-8") | ||
) | ||
|
||
if "statusCode" in docs_conversion_lambda_response_json and \ | ||
docs_conversion_lambda_response_json["statusCode"] == 200: | ||
bucket_name = docs_conversion_lambda_response_json["bucket"] | ||
file_path = docs_conversion_lambda_response_json["file"] | ||
filename = file_path.split("/")[-1] | ||
|
||
if download_file(file_path, bucket_name, f"/tmp/{filename}"): | ||
s3_file_path, s3_images_path, total_pages, total_words_count = get_extracted_content_links( | ||
f"/tmp/{filename}", file_name | ||
) | ||
if s3_file_path: | ||
extraction_status = ExtractionStatus.SUCCESS.value | ||
else: | ||
extraction_status = ExtractionStatus.FAILED.value | ||
else: | ||
flag = True | ||
else: | ||
logging.error(f"Error occurred during file conversion. {docs_conversion_lambda_response_json['error']}") | ||
flag = True | ||
else: | ||
logging.warn("Could not upload the file to s3.") | ||
flag = True | ||
|
||
if flag: | ||
s3_file_path, s3_images_path, total_pages, total_words_count = None, None, -1, -1 | ||
extraction_status = ExtractionStatus.FAILED.value | ||
elif content_type == UrlTypes.IMG.value: | ||
logging.warn("Text extraction from Images is not available.") | ||
s3_file_path, s3_images_path, total_pages, total_words_count = None, None, -1, -1 | ||
extraction_status = ExtractionStatus.FAILED.value | ||
else: | ||
logging.error(f"Text extraction is not available for this content type - {content_type}") | ||
s3_file_path, s3_images_path, total_pages, total_words_count = None, None, -1, -1 | ||
extraction_status = ExtractionStatus.FAILED.value | ||
|
||
logging.info(f"The extracted file path is {s3_file_path}") | ||
logging.info(f"The extracted image path is {s3_images_path}") | ||
logging.info(f"The status of the extraction is {str(extraction_status)}") |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Let's improve this function
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done
lambda_fns/entry_predict/app.py
Outdated
SENTRY_URL = os.environ.get("SENTRY_URL") | ||
ENVIRONMENT = os.environ.get("ENVIRONMENT") |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe use django-environ for env configuration and load all configurations from a single app like config.py?
https://django-environ.readthedocs.io/en/latest/
example: https://github.com/the-deep/server/blob/develop/deep/settings.py#L22-L85
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think this is important enough to use; it just adds an extra dependency. But I moved all the envs to a config.py file.
dev.tfvars
Outdated
|
||
# sentry url | ||
sentry_url = "https://[email protected]/1223576" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is sensitive info.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
stored in aws secrets.
SENTRY_URL = os.environ.get("SENTRY_URL") | ||
ENVIRONMENT = os.environ.get("ENVIRONMENT") |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Let's define all the config in a single file.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done.
lambda_fns/extract_docs/Dockerfile
Outdated
&& apt-get autoremove -y \ | ||
&& rm -rf /var/lib/apt/lists/* | ||
|
||
COPY deepex_ecs/app.py content_types.py wget.py /code/ |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We don't need other files?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done.
lambda_fns/extract_docs/app.py
Outdated
key="temporaryfile.pdf", | ||
bucketname="deep-large-docs-conversion" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Let's not define the default value here or use the value from config.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done.
lambda_fns/extract_docs/app.py
Outdated
key="temporaryfile.pdf", | ||
bucketname="deep-large-docs-conversion" | ||
): | ||
try: |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe define a class/object for handling upload/download/URL.
Something like this:
Definition: https://github.com/the-deep/serverless/blob/develop/src/common/s3.py
Usages: https://github.com/the-deep/serverless/blob/develop/src/functions/source_extract/models.py#L92-L109
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done
elif url.endswith(".jpg") or url.endswith(".jpeg") or url.endswith(".png") or \ | ||
url.endswith(".gif") or url.endswith(".bmp") or content_type in self.content_types_img: |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe use any here.
elif url.endswith(".jpg") or url.endswith(".jpeg") or url.endswith(".png") or \ | |
url.endswith(".gif") or url.endswith(".bmp") or content_type in self.content_types_img: | |
elif ( | |
content_type in self.content_types_img or | |
any([ | |
url.endswith(f".{extension}") for extension in [ | |
"jpg", "jpeg", "png", "gif", "bmp" | |
] | |
]) | |
): |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done.
if temp_filepath.endswith(".pdf"): | ||
return UrlTypes.PDF.value | ||
elif temp_filepath.endswith(".docx"): | ||
return UrlTypes.DOCX.value | ||
elif temp_filepath.endswith(".doc"): | ||
return UrlTypes.MSWORD.value | ||
elif temp_filepath.endswith(".xlsx"): | ||
return UrlTypes.XLSX.value | ||
elif temp_filepath.endswith(".xls"): | ||
return UrlTypes.XLS.value | ||
elif temp_filepath.endswith(".pptx"): | ||
return UrlTypes.PPTX.value | ||
elif temp_filepath.endswith(".ppt"): | ||
return UrlTypes.PPT.value | ||
elif temp_filepath.endswith(".jpg") or temp_filepath.endswith(".jpeg") or temp_filepath.endswith(".png") or \ | ||
temp_filepath.endswith(".gif") or temp_filepath.endswith(".bmp"): | ||
return UrlTypes.IMG.value | ||
else: | ||
logging.warn(f'Could not determine the content-type of the {url}') | ||
return None |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe do this using dict.
if temp_filepath.endswith(".pdf"): | |
return UrlTypes.PDF.value | |
elif temp_filepath.endswith(".docx"): | |
return UrlTypes.DOCX.value | |
elif temp_filepath.endswith(".doc"): | |
return UrlTypes.MSWORD.value | |
elif temp_filepath.endswith(".xlsx"): | |
return UrlTypes.XLSX.value | |
elif temp_filepath.endswith(".xls"): | |
return UrlTypes.XLS.value | |
elif temp_filepath.endswith(".pptx"): | |
return UrlTypes.PPTX.value | |
elif temp_filepath.endswith(".ppt"): | |
return UrlTypes.PPT.value | |
elif temp_filepath.endswith(".jpg") or temp_filepath.endswith(".jpeg") or temp_filepath.endswith(".png") or \ | |
temp_filepath.endswith(".gif") or temp_filepath.endswith(".bmp"): | |
return UrlTypes.IMG.value | |
else: | |
logging.warn(f'Could not determine the content-type of the {url}') | |
return None | |
EXTENSION_TO_ENUM_MAP = { | |
"pdf": UrlTypes.PDF, | |
"docx": UrlTypes.DOCX, | |
"doc": UrlTypes.MSWORD, | |
"xlsx": UrlTypes.XLSX, | |
"xls": UrlTypes.XLS, | |
"pptx": UrlTypes.PPTX, | |
"ppt": UrlTypes.PPT, | |
# Images | |
"jpg": UrlTypes.IMG, | |
"jpeg": UrlTypes.IMG, | |
"png": UrlTypes.IMG, | |
"gif": UrlTypes.IMG, | |
"bmp": UrlTypes.IMG, | |
} | |
file_extension = temp_filepath.split('.')[-1] | |
if file_extension not in EXTENSION_TO_ENUM_MAP: | |
logging.warn(f'Could not determine the content-type of the {url}') | |
return None | |
return EXTENSION_TO_ENUM_MAP[file_extension].value | |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done.
Addresses xxxxxx
Depends on xxxxxx
Changes
Mention related users here if any.
This PR doesn't introduce any:
print
This PR contains valid: