small-thinking · yxjiang · Feb 18, 2024 · Feb 18, 2024
diff --git a/wenling/archiver.py b/wenling/archiver.py
@@ -74,23 +74,23 @@ async def _auto_tagging(self, paragraphs: List[Dict[str, Any]]) -> List[str]:
         """Leverage the LLM to auto-generate the tags based on the contents."""
         contents_str = "\n".join([paragraph.get("text", "") for paragraph in paragraphs if type(paragraph) == dict])
         prompt = f"""
-        Please help generate the tags based on the contents below:
-        ---
-        {contents_str}
-        ---
-
-        Some suggested tags:
-        0. No more than 3 tags.
-        1. If this article is about building agent, please add the tag Agent.
-        2. If this article is about LLM, please add the tag LLM.
-        3. If this article is about deep learning in general, please add the tag Deep Learning.
-        4. If this article is about tech philosophy, please add the tag Tech Philosophy.
-        5. Please use any other tags that you think are relevant.
-
-        Please generate the tags in the same language as the contents, and return in below json format:
-        {{
-            "tags": ["tag1", "tag2", "tag3"]
-        }}
+            Please help generate the tags based on the contents below:
+            ---
+            {contents_str}
+            ---
+            
+            Some suggested tags:
+            0. No more than 3 tags.
+            1. If this article is about building agent, please add the tag Agent.
+            2. If this article is about LLM, please add the tag LLM.
+            3. If this article is about deep learning in general, please add the tag Deep Learning.
+            4. If this article is about tech philosophy, please add the tag Tech Philosophy.
+            5. Please use any other tags that you think are relevant.
+            
+            Please generate the tags in the same language as the contents, and return in below json format:
+            {{
+                "tags": ["tag1", "tag2", "tag3"]
+            }}
         """
         json_response_str = self.model.inference(
             user_prompt=prompt, max_tokens=256, temperature=0.0, response_format="json_object"
@@ -708,7 +708,8 @@ async def _archive(self, url: str, notes: str | None = None) -> Coroutine[Any, A
         """Get the content block from the web page with the path div#content.
         Parse the elements and put them into a json object with list of elements.
         """
-        summary_obj = json.loads(pdf_paper_summary(url))
+        summary_obj = json.loads(pdf_paper_summary(logger=self.logger, pdf_url=url))
+        self.logger.info("Summarized the paper.")
         article_json_obj: Dict[str, Any] = {
             "properties": {},
             "children": [],
@@ -719,25 +720,7 @@ async def _archive(self, url: str, notes: str | None = None) -> Coroutine[Any, A
         article_json_obj["properties"]["type"] = "Arxiv"
         article_json_obj["properties"]["datetime"] = get_datetime()
 
-        author_dict = summary_obj.get("authors", {})
-        # Convert the author dictionary to a string, including first_two  and others.
-        author_str = ""
-        if author_dict.get("first_two", ""):
-            first_two_obj = author_dict["first_two"]
-            for author in first_two_obj:
-                author_str += author.get("name", "")
-                if author.get("affiliation", ""):
-                    affliation = author.get("affiliation", "")
-                    author_str += f" ({affliation})"
-                author_str += "\n"
-        if author_dict.get("others", ""):
-            others_obj = author_dict["others"]
-            for author in others_obj:
-                author_str += f", {author.get('name', '')}"
-                if author.get("affiliation", ""):
-                    affliation = author.get("affiliation", "")
-                    author_str += f" ({affliation})"
-                author_str += "\n"
+        author_str = summary_obj.get("authors", "")
 
         paragraphs: List[Dict[str, Any]] = [
             {"type": "h1", "text": summary_obj.get("title", "")},

diff --git a/wenling/common/model_utils.py b/wenling/common/model_utils.py
@@ -84,15 +84,16 @@ def inference(
         return result
 
 
-def pdf_paper_summary(pdf_url: str, truncate_size: int = 4000):
+def pdf_paper_summary(logger: Logger, pdf_url: str, truncate_size: int = 4000):
     try:
         # Download PDF.
+        logger.info(f"Downloading PDF from {pdf_url}...")
         pdf_path = download_pdf(pdf_url)
         pdf_path = os.path.expanduser(pdf_path)
         # Parse PDF into paragraphs.
+        logger.info(f"Partitioning PDF into paragraphs...")
         elements = partition_pdf(
             filename=pdf_path,
-            infer_table_structure=True,
             strategy="fast",
         )
 
@@ -102,6 +103,7 @@ def pdf_paper_summary(pdf_url: str, truncate_size: int = 4000):
 
         # Concatenate paragraphs into a single string
         text = "\n".join(paragraphs.values())[:truncate_size]
+        logger.info("Start to summarize the paper...")
         openai = OpenAIChatModel()
         sys_prompt = """
             You will receive the paper text snippets (may have some noise text). Please return the paper according to the following rules for each paper name or url.
@@ -117,7 +119,7 @@ def pdf_paper_summary(pdf_url: str, truncate_size: int = 4000):
 
             Extract a TL;DR of the content in 50 words or less, including who is presenting and the content being discussed into a section called SUMMARY.
 
-            AUTHOR Section: List the first 2 authors and other notable authors, each with their affiliation.
+            AUTHOR Section: List of string includes the first 2 authors and other notable authors, each with their affiliation in parenthesis.
 
             Extract the primary paper unique contribution into a bulleted list of no more than 50 words per bullet into a section called CONTRIBUTIONS.
 
@@ -152,6 +154,15 @@ def pdf_paper_summary(pdf_url: str, truncate_size: int = 4000):
 
             OUTPUT INSTRUCTIONS
             Create the output using the formatting above. And put them into a json format. And each blob of text should be in markdown format.
+            For example:
+            {{
+                "title": "The title of the paper",
+                "authors": "The authors of the paper",
+                "summary": "The summary of the paper",
+                "contributions": "The contributions of the paper",
+                "experiment": "The experiment of the paper",
+                "conclusion": "The conclusion of the paper"
+            }}
 
             The output should have the keys of title, authors, summary, contributions, experiment, conclusion.
         """