From c049dedc7e095d8de083d6fa23b08e43fd45eca7 Mon Sep 17 00:00:00 2001 From: chloedia Date: Fri, 22 Nov 2024 17:52:24 +0100 Subject: [PATCH 1/4] add: megaparse sdk with nats --- core/pyproject.toml | 2 +- .../processor/implementations/megaparse_processor.py | 9 ++++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/core/pyproject.toml b/core/pyproject.toml index d57730ec2cbf..466566e9f5a9 100644 --- a/core/pyproject.toml +++ b/core/pyproject.toml @@ -23,7 +23,7 @@ dependencies = [ "faiss-cpu>=1.8.0.post1", "rapidfuzz>=3.10.1", "markupsafe>=2.1.5", - "megaparse[all]== 0.0.43", + "megaparse-sdk==0.1.5" ] readme = "README.md" requires-python = ">= 3.11" diff --git a/core/quivr_core/processor/implementations/megaparse_processor.py b/core/quivr_core/processor/implementations/megaparse_processor.py index ee5dc53b8eaf..6c0b740f01c0 100644 --- a/core/quivr_core/processor/implementations/megaparse_processor.py +++ b/core/quivr_core/processor/implementations/megaparse_processor.py @@ -3,8 +3,7 @@ import tiktoken from langchain_core.documents import Document from langchain_text_splitters import RecursiveCharacterTextSplitter, TextSplitter -from megaparse.core.megaparse import MegaParse -from megaparse.core.parser.unstructured_parser import UnstructuredParser +from megaparse_sdk.client import MegaParseNATSClient from quivr_core.config import MegaparseConfig from quivr_core.files.file import QuivrFile @@ -75,9 +74,9 @@ def processor_metadata(self): async def process_file_inner(self, file: QuivrFile) -> list[Document]: logger.info(f"Uploading file {file.path} to MegaParse") - parser = UnstructuredParser(**self.megaparse_config.model_dump()) - megaparse = MegaParse(parser) - response = await megaparse.aload(file.path) + client = MegaParseNATSClient() + response = await client.parse_file(file=file.path) + client.close() logger.info(f"File : {response}") document = Document( page_content=response, From 5bc430a9291630a4f31b1c56eff210bb34cfb514 Mon Sep 17 00:00:00 2001 From: chloedia Date: Mon, 25 Nov 2024 15:12:59 +0100 Subject: [PATCH 2/4] add: nats --- core/pyproject.toml | 2 +- .../implementations/megaparse_processor.py | 29 ++----------------- examples/simple_question_megaparse.py | 4 ++- 3 files changed, 6 insertions(+), 29 deletions(-) diff --git a/core/pyproject.toml b/core/pyproject.toml index 466566e9f5a9..115db9ddd7bd 100644 --- a/core/pyproject.toml +++ b/core/pyproject.toml @@ -23,7 +23,7 @@ dependencies = [ "faiss-cpu>=1.8.0.post1", "rapidfuzz>=3.10.1", "markupsafe>=2.1.5", - "megaparse-sdk==0.1.5" + "megaparse-sdk==0.1.7" ] readme = "README.md" requires-python = ">= 3.11" diff --git a/core/quivr_core/processor/implementations/megaparse_processor.py b/core/quivr_core/processor/implementations/megaparse_processor.py index 6c0b740f01c0..3a851e614d39 100644 --- a/core/quivr_core/processor/implementations/megaparse_processor.py +++ b/core/quivr_core/processor/implementations/megaparse_processor.py @@ -4,6 +4,7 @@ from langchain_core.documents import Document from langchain_text_splitters import RecursiveCharacterTextSplitter, TextSplitter from megaparse_sdk.client import MegaParseNATSClient +from megaparse_sdk.config import ClientNATSConfig from quivr_core.config import MegaparseConfig from quivr_core.files.file import QuivrFile @@ -74,9 +75,8 @@ def processor_metadata(self): async def process_file_inner(self, file: QuivrFile) -> list[Document]: logger.info(f"Uploading file {file.path} to MegaParse") - client = MegaParseNATSClient() + client = MegaParseNATSClient(ClientNATSConfig()) response = await client.parse_file(file=file.path) - client.close() logger.info(f"File : {response}") document = Document( page_content=response, @@ -86,28 +86,3 @@ async def process_file_inner(self, file: QuivrFile) -> list[Document]: for doc in docs: doc.metadata = {"chunk_size": len(self.enc.encode(doc.page_content))} return docs - - # async def process_file_inner(self, file: QuivrFile) -> list[Document]: - # api_key = str(os.getenv("MEGAPARSE_API_KEY")) - # megaparse = MegaParseSDK(api_key) - # logger.info(f"Uploading file {file.path} to MegaParse") - # data = { - # "method": self.megaparse_config.method, - # "strategy": self.megaparse_config.strategy, - # "check_table": self.megaparse_config.check_table, - # "parsing_instruction": self.megaparse_config.parsing_instruction, - # "model_name": self.megaparse_config.model_name, - # } - # response = await megaparse.file.upload( - # file_path=str(file.path), - # **data, - # ) - # document = Document( - # page_content=response["result"], - # ) - # if len(response) > self.splitter_config.chunk_size: - # docs = self.text_splitter.split_documents([document]) - # for doc in docs: - # doc.metadata = {"chunk_size": len(self.enc.encode(doc.page_content))} - # return docs - # return [document] diff --git a/examples/simple_question_megaparse.py b/examples/simple_question_megaparse.py index 0d3c229e960e..31d8897ab44f 100644 --- a/examples/simple_question_megaparse.py +++ b/examples/simple_question_megaparse.py @@ -11,7 +11,9 @@ if __name__ == "__main__": brain = Brain.from_files( name="test_brain", - file_paths=["./tests/processor/docx/demo.docx"], + file_paths=[ + "/Users/chloed./Documents/quivr/quivr/enterprise/backend/core/core/tests/processor/pdf/sample.pdf" + ], llm=LLMEndpoint( llm_config=LLMEndpointConfig(model="gpt-4o"), llm=ChatOpenAI(model="gpt-4o", api_key=str(os.getenv("OPENAI_API_KEY"))), From 1b91281e0c5bfd2bac8c201e52da3529f3e844fa Mon Sep 17 00:00:00 2001 From: chloedia Date: Mon, 25 Nov 2024 15:15:38 +0100 Subject: [PATCH 3/4] add: Async with --- .../processor/implementations/megaparse_processor.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/core/quivr_core/processor/implementations/megaparse_processor.py b/core/quivr_core/processor/implementations/megaparse_processor.py index 3a851e614d39..2c46cec105a9 100644 --- a/core/quivr_core/processor/implementations/megaparse_processor.py +++ b/core/quivr_core/processor/implementations/megaparse_processor.py @@ -75,8 +75,9 @@ def processor_metadata(self): async def process_file_inner(self, file: QuivrFile) -> list[Document]: logger.info(f"Uploading file {file.path} to MegaParse") - client = MegaParseNATSClient(ClientNATSConfig()) - response = await client.parse_file(file=file.path) + async with MegaParseNATSClient(ClientNATSConfig()) as client: + response = await client.parse_file(file=file.path) + logger.info(f"File : {response}") document = Document( page_content=response, From 9437e7134d8b45c0783ba3dd7f2f84ac2e4649ca Mon Sep 17 00:00:00 2001 From: chloedia Date: Mon, 25 Nov 2024 15:26:39 +0100 Subject: [PATCH 4/4] fix: rm absolute path --- examples/simple_question_megaparse.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/simple_question_megaparse.py b/examples/simple_question_megaparse.py index 31d8897ab44f..a46267f92f12 100644 --- a/examples/simple_question_megaparse.py +++ b/examples/simple_question_megaparse.py @@ -11,9 +11,7 @@ if __name__ == "__main__": brain = Brain.from_files( name="test_brain", - file_paths=[ - "/Users/chloed./Documents/quivr/quivr/enterprise/backend/core/core/tests/processor/pdf/sample.pdf" - ], + file_paths=["./tests/processor/pdf/sample.pdf"], llm=LLMEndpoint( llm_config=LLMEndpointConfig(model="gpt-4o"), llm=ChatOpenAI(model="gpt-4o", api_key=str(os.getenv("OPENAI_API_KEY"))),