diff --git a/core/pyproject.toml b/core/pyproject.toml index d57730ec2cbf..115db9ddd7bd 100644 --- a/core/pyproject.toml +++ b/core/pyproject.toml @@ -23,7 +23,7 @@ dependencies = [ "faiss-cpu>=1.8.0.post1", "rapidfuzz>=3.10.1", "markupsafe>=2.1.5", - "megaparse[all]== 0.0.43", + "megaparse-sdk==0.1.7" ] readme = "README.md" requires-python = ">= 3.11" diff --git a/core/quivr_core/processor/implementations/megaparse_processor.py b/core/quivr_core/processor/implementations/megaparse_processor.py index ee5dc53b8eaf..2c46cec105a9 100644 --- a/core/quivr_core/processor/implementations/megaparse_processor.py +++ b/core/quivr_core/processor/implementations/megaparse_processor.py @@ -3,8 +3,8 @@ import tiktoken from langchain_core.documents import Document from langchain_text_splitters import RecursiveCharacterTextSplitter, TextSplitter -from megaparse.core.megaparse import MegaParse -from megaparse.core.parser.unstructured_parser import UnstructuredParser +from megaparse_sdk.client import MegaParseNATSClient +from megaparse_sdk.config import ClientNATSConfig from quivr_core.config import MegaparseConfig from quivr_core.files.file import QuivrFile @@ -75,9 +75,9 @@ def processor_metadata(self): async def process_file_inner(self, file: QuivrFile) -> list[Document]: logger.info(f"Uploading file {file.path} to MegaParse") - parser = UnstructuredParser(**self.megaparse_config.model_dump()) - megaparse = MegaParse(parser) - response = await megaparse.aload(file.path) + async with MegaParseNATSClient(ClientNATSConfig()) as client: + response = await client.parse_file(file=file.path) + logger.info(f"File : {response}") document = Document( page_content=response, @@ -87,28 +87,3 @@ async def process_file_inner(self, file: QuivrFile) -> list[Document]: for doc in docs: doc.metadata = {"chunk_size": len(self.enc.encode(doc.page_content))} return docs - - # async def process_file_inner(self, file: QuivrFile) -> list[Document]: - # api_key = str(os.getenv("MEGAPARSE_API_KEY")) - # megaparse = MegaParseSDK(api_key) - # logger.info(f"Uploading file {file.path} to MegaParse") - # data = { - # "method": self.megaparse_config.method, - # "strategy": self.megaparse_config.strategy, - # "check_table": self.megaparse_config.check_table, - # "parsing_instruction": self.megaparse_config.parsing_instruction, - # "model_name": self.megaparse_config.model_name, - # } - # response = await megaparse.file.upload( - # file_path=str(file.path), - # **data, - # ) - # document = Document( - # page_content=response["result"], - # ) - # if len(response) > self.splitter_config.chunk_size: - # docs = self.text_splitter.split_documents([document]) - # for doc in docs: - # doc.metadata = {"chunk_size": len(self.enc.encode(doc.page_content))} - # return docs - # return [document] diff --git a/examples/simple_question_megaparse.py b/examples/simple_question_megaparse.py index 0d3c229e960e..a46267f92f12 100644 --- a/examples/simple_question_megaparse.py +++ b/examples/simple_question_megaparse.py @@ -11,7 +11,7 @@ if __name__ == "__main__": brain = Brain.from_files( name="test_brain", - file_paths=["./tests/processor/docx/demo.docx"], + file_paths=["./tests/processor/pdf/sample.pdf"], llm=LLMEndpoint( llm_config=LLMEndpointConfig(model="gpt-4o"), llm=ChatOpenAI(model="gpt-4o", api_key=str(os.getenv("OPENAI_API_KEY"))),