Skip to content

Commit

Permalink
Merge pull request #31 from small-thinking/archive-all-pdf
Browse files Browse the repository at this point in the history
Support arbitrary pdf
  • Loading branch information
yxjiang authored Feb 19, 2024
2 parents 1b7ae47 + 4510bcf commit 9d86b18
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 30 deletions.
31 changes: 21 additions & 10 deletions wenling/archiver.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@ def __init__(self):
"archiver": SubstackArticleArchiver(name="SubstackArticleArchiver"),
},
{
# Match url has arxiv.org
"match_regex": r"^https://arxiv\.org/.*$",
"archiver": ArxivPaperArchiver(name="ArxivPaperArchiver"),
# Match arxiv.org urls, or any url that ends with .pdf.
"match_regex": r"^https://arxiv\.org/abs/.*$|^https://arxiv\.org/pdf/.*\.pdf$|.*\.pdf$",
"archiver": PdfPaperArchiver(name="PdfPaperArchiver"),
},
]
self.default_archiver = WebPageArchiver()
Expand Down Expand Up @@ -699,7 +699,7 @@ async def _archive(self, url: str, notes: Optional[str] = None) -> Dict[str, Any
raise ValueError(f"Error parsing content. Details: {str(e)}")


class ArxivPaperArchiver(Archiver):
class PdfPaperArchiver(Archiver):
def __init__(self, name: str, vendor_type: str = "openai", **kwargs):
super().__init__(name=name, vendor_type=vendor_type)
self.root_css_selector = "body"
Expand All @@ -710,8 +710,10 @@ async def _archive(self, url: str, notes: str | None = None) -> Coroutine[Any, A
"""
if "abs" in url:
pdf_url = url.replace("abs", "pdf") + ".pdf"
else:
elif url.endswith(".pdf"):
pdf_url = url
else:
raise ValueError(f"The url {url} is not a valid pdf url.")
summary_obj = json.loads(pdf_paper_summary(logger=self.logger, pdf_url=pdf_url))
self.logger.info("Summarized the paper.")
article_json_obj: Dict[str, Any] = {
Expand All @@ -725,22 +727,31 @@ async def _archive(self, url: str, notes: str | None = None) -> Coroutine[Any, A
article_json_obj["properties"]["datetime"] = get_datetime()

author_str = summary_obj.get("authors", "")
contributions: List[Dict[str, Any]] = []
for contribution in summary_obj.get("contributions", []):
contributions.append({"type": "text", "text": contribution})
self.logger.info(f"Contributions: {contributions}")

paragraphs: List[Dict[str, Any]] = [
{"type": "h1", "text": summary_obj.get("title", "")},
{"type": "h3", "text": author_str},
{"type": "h2", "text": "Summary"},
{"type": "text", "text": summary_obj.get("summary", "")},
{"type": "h2", "text": "Contributions"},
{"type": "text", "text": summary_obj.get("contributions", "")},
{"type": "h2", "text": "Conclusions"},
{"type": "text", "text": summary_obj.get("conclusions", "")},
]
paragraphs.extend(contributions)
paragraphs.extend(
[
{"type": "h2", "text": "Conclusions"},
{"type": "text", "text": summary_obj.get("conclusions", "")},
]
)

article_json_obj["children"] = paragraphs

self.logger.info(f"Auto-generate tags based on the contents...")
tags = await self._auto_tagging(paragraphs=article_json_obj["children"])
article_json_obj["properties"]["tags"] = tags
# tags = await self._auto_tagging(paragraphs=article_json_obj["children"])
article_json_obj["properties"]["tags"] = []

if os.environ.get("VERBOSE") == "True":
json_object_str = json.dumps(article_json_obj, indent=2)
Expand Down
33 changes: 13 additions & 20 deletions wenling/common/model_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,13 +109,22 @@ def pdf_paper_summary(logger: Logger, pdf_url: str, truncate_size: int = 8000):
You will receive the paper text snippets (may have some noise text).
You are a research paper analysis service focused on determining the primary findings of the paper and analyzing its scientific quality.
Take a deep breath and think step by step about how to best accomplish this goal using the following steps.
OUTPUT SECTIONS in json format.
Take a deep breath and think step by step about how to best accomplish this goal using the following json structure:
{{
"title": "The title of the paper",
"authors": "The authors of the paper",
"summary": "The summary of the paper",
"contributions": ["Contribution 1", "Contribution 2", "Contribution 3"],
"experiment": "The experiment of the paper",
"conclusion": "The conclusion of the paper"
}}
Title: Extract the title of the paper.
Authors: List of strings including the first 2 authors and other notable authors, each with their affiliation in parentheses.
Summary: What problem does the paper solve? What is the core idea of the paper? What is the main result?
Contributions: Extract the primary paper unique contribution into a bulleted list of no more than 50 words per bullet.
Expand Down Expand Up @@ -163,23 +172,7 @@ def pdf_paper_summary(logger: Logger, pdf_url: str, truncate_size: int = 8000):
---
Conclusion:
You output a 50 word summary of the quality of the paper and its likelihood of being replicated in
future work as one of three levels: High, Medium, or Low.
You put that sentence and rating into a section called SUMMARY.
OUTPUT INSTRUCTIONS
Create the output using the formatting above. And put them into a json format.
And each blob of text should be in markdown format.
For example:
{{
"title": "The title of the paper",
"authors": "The authors of the paper",
"summary": "The summary of the paper",
"contributions": "The contributions of the paper",
"experiment": "The experiment of the paper",
"conclusion": "The conclusion of the paper"
}}
You output a 50 word summary of the paper.
"""

summary = openai.inference(
Expand Down

0 comments on commit 9d86b18

Please sign in to comment.