diff --git a/wenling/archiver.py b/wenling/archiver.py
index 7899458..b84b6b5 100644
--- a/wenling/archiver.py
+++ b/wenling/archiver.py
@@ -74,23 +74,23 @@ async def _auto_tagging(self, paragraphs: List[Dict[str, Any]]) -> List[str]:
         """Leverage the LLM to auto-generate the tags based on the contents."""
         contents_str = "\n".join([paragraph.get("text", "") for paragraph in paragraphs if type(paragraph) == dict])
         prompt = f"""
-        Please help generate the tags based on the contents below:
-        ---
-        {contents_str}
-        ---
-
-        Some suggested tags:
-        0. No more than 3 tags.
-        1. If this article is about building agent, please add the tag Agent.
-        2. If this article is about LLM, please add the tag LLM.
-        3. If this article is about deep learning in general, please add the tag Deep Learning.
-        4. If this article is about tech philosophy, please add the tag Tech Philosophy.
-        5. Please use any other tags that you think are relevant.
-
-        Please generate the tags in the same language as the contents, and return in below json format:
-        {{
-            "tags": ["tag1", "tag2", "tag3"]
-        }}
+            Please help generate the tags based on the contents below:
+            ---
+            {contents_str}
+            ---
+
+            Tagging guidelines:
+            0. Use no more than 3 tags.
+            1. If this article is about building agents, add the tag Agent.
+            2. If this article is about LLMs, add the tag LLM.
+            3. If this article is about deep learning in general, add the tag Deep Learning.
+            4. If this article is about tech philosophy, add the tag Tech Philosophy.
+            5. Add any other tags that you think are relevant.
+
+            Please generate the tags in the same language as the contents, and return them in the JSON format below:
+            {{
+                "tags": ["tag1", "tag2", "tag3"]
+            }}
         """
         json_response_str = self.model.inference(
             user_prompt=prompt, max_tokens=256, temperature=0.0, response_format="json_object"
@@ -708,7 +708,8 @@ async def _archive(self, url: str, notes: str | None = None) -> Coroutine[Any, A
         """Get the content block from the web page with the path div#content.
         Parse the elements and put them into a json object with list of elements.
         """
-        summary_obj = json.loads(pdf_paper_summary(url))
+        summary_obj = json.loads(pdf_paper_summary(logger=self.logger, pdf_url=url))
+        self.logger.info("Summarized the paper.")
         article_json_obj: Dict[str, Any] = {
             "properties": {},
             "children": [],
@@ -719,25 +720,7 @@ async def _archive(self, url: str, notes: str | None = None) -> Coroutine[Any, A
         article_json_obj["properties"]["type"] = "Arxiv"
         article_json_obj["properties"]["datetime"] = get_datetime()

-        author_dict = summary_obj.get("authors", {})
-        # Convert the author dictionary to a string, including first_two and others.
-        author_str = ""
-        if author_dict.get("first_two", ""):
-            first_two_obj = author_dict["first_two"]
-            for author in first_two_obj:
-                author_str += author.get("name", "")
-                if author.get("affiliation", ""):
-                    affliation = author.get("affiliation", "")
-                    author_str += f" ({affliation})"
-                author_str += "\n"
-        if author_dict.get("others", ""):
-            others_obj = author_dict["others"]
-            for author in others_obj:
-                author_str += f", {author.get('name', '')}"
-                if author.get("affiliation", ""):
-                    affliation = author.get("affiliation", "")
-                    author_str += f" ({affliation})"
-                author_str += "\n"
+        author_str = summary_obj.get("authors", "")

         paragraphs: List[Dict[str, Any]] = [
             {"type": "h1", "text": summary_obj.get("title", "")},
diff --git a/wenling/common/model_utils.py b/wenling/common/model_utils.py
index f4fe983..7ac7c7b 100644
--- a/wenling/common/model_utils.py
+++ b/wenling/common/model_utils.py
@@ -84,15 +84,16 @@ def inference(
     return result


-def pdf_paper_summary(pdf_url: str, truncate_size: int = 4000):
+def pdf_paper_summary(logger: Logger, pdf_url: str, truncate_size: int = 4000):
     try:
         # Download PDF.
+        logger.info(f"Downloading PDF from {pdf_url}...")
         pdf_path = download_pdf(pdf_url)
         pdf_path = os.path.expanduser(pdf_path)

         # Parse PDF into paragraphs.
+        logger.info("Partitioning PDF into paragraphs...")
         elements = partition_pdf(
             filename=pdf_path,
-            infer_table_structure=True,
             strategy="fast",
         )
@@ -102,6 +103,7 @@
         # Concatenate paragraphs into a single string
         text = "\n".join(paragraphs.values())[:truncate_size]

+        logger.info("Starting to summarize the paper...")
         openai = OpenAIChatModel()
         sys_prompt = """
         You will receive the paper text snippets (may have some noise text). Please return the paper according to the following rules for each paper name or url.
@@ -117,7 +119,7 @@

         Extract a TL;DR of the content in 50 words or less, including who is presenting and the content being discussed into a section called SUMMARY.

-        AUTHOR Section: List the first 2 authors and other notable authors, each with their affiliation.
+        AUTHOR Section: A single string that lists the first 2 authors and other notable authors, each with their affiliation in parentheses.

         Extract the primary paper unique contribution into a bulleted list of no more than 50 words per bullet into a section called CONTRIBUTIONS.

@@ -152,6 +154,15 @@

         OUTPUT INSTRUCTIONS

         Create the output using the formatting above. And put them into a json format. And each blob of text should be in markdown format.
+        For example:
+        {{
+            "title": "The title of the paper",
+            "authors": "The authors of the paper",
+            "summary": "The summary of the paper",
+            "contributions": "The contributions of the paper",
+            "experiment": "The experiment of the paper",
+            "conclusion": "The conclusion of the paper"
+        }}

         The output should have the keys of title, authors, summary, contributions, experiment, conclusion.
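Note for reviewers: a minimal sketch of how the reworked helper is consumed end to end after this change. The `pdf_paper_summary` signature and the `json.loads(...)` / `summary_obj.get("authors", "")` wiring come from the diff above; the logger setup and the example arXiv URL are illustrative assumptions, not part of this patch.

    import json
    import logging

    from wenling.common.model_utils import pdf_paper_summary

    # Placeholder logger for this sketch; inside the archiver this is self.logger.
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("wenling")

    # The helper downloads the PDF, partitions it (now with the "fast" strategy
    # only, since table-structure inference was dropped), and asks the model for
    # a JSON summary; the caller parses that JSON string.
    summary_obj = json.loads(
        pdf_paper_summary(logger=logger, pdf_url="https://arxiv.org/pdf/1706.03762")
    )

    # "authors" now arrives as one pre-formatted string with affiliations in
    # parentheses, replacing the old {"first_two": [...], "others": [...]} dict,
    # which is why the manual string-building was removed from archiver.py.
    title = summary_obj.get("title", "")
    author_str = summary_obj.get("authors", "")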