Skip to content

Commit

Permalink
Merge pull request #31 from small-thinking/archive-all-pdf
Browse files Browse the repository at this point in the history
Support arbitrary pdf
  • Loading branch information
yxjiang authored Feb 19, 2024
2 parents 1b7ae47 + 4510bcf commit 9d86b18
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 30 deletions.
31 changes: 21 additions & 10 deletions wenling/archiver.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@ def __init__(self):
"archiver": SubstackArticleArchiver(name="SubstackArticleArchiver"),
},
{
# Match url has arxiv.org
"match_regex": r"^https://arxiv\.org/.*$",
"archiver": ArxivPaperArchiver(name="ArxivPaperArchiver"),
# Match arxiv.org urls, or any url that ends with .pdf.
"match_regex": r"^https://arxiv\.org/abs/.*$|^https://arxiv\.org/pdf/.*\.pdf$|.*\.pdf$",
"archiver": PdfPaperArchiver(name="PdfPaperArchiver"),
},
]
self.default_archiver = WebPageArchiver()
Expand Down Expand Up @@ -699,7 +699,7 @@ async def _archive(self, url: str, notes: Optional[str] = None) -> Dict[str, Any
raise ValueError(f"Error parsing content. Details: {str(e)}")


class ArxivPaperArchiver(Archiver):
class PdfPaperArchiver(Archiver):
def __init__(self, name: str, vendor_type: str = "openai", **kwargs):
super().__init__(name=name, vendor_type=vendor_type)
self.root_css_selector = "body"
Expand All @@ -710,8 +710,10 @@ async def _archive(self, url: str, notes: str | None = None) -> Coroutine[Any, A
"""
if "abs" in url:
pdf_url = url.replace("abs", "pdf") + ".pdf"
else:
elif url.endswith(".pdf"):
pdf_url = url
else:
raise ValueError(f"The url {url} is not a valid pdf url.")
summary_obj = json.loads(pdf_paper_summary(logger=self.logger, pdf_url=pdf_url))
self.logger.info("Summarized the paper.")
article_json_obj: Dict[str, Any] = {
Expand All @@ -725,22 +727,31 @@ async def _archive(self, url: str, notes: str | None = None) -> Coroutine[Any, A
article_json_obj["properties"]["datetime"] = get_datetime()

author_str = summary_obj.get("authors", "")
contributions: List[Dict[str, Any]] = []
for contribution in summary_obj.get("contributions", []):
contributions.append({"type": "text", "text": contribution})
self.logger.info(f"Contributions: {contributions}")

paragraphs: List[Dict[str, Any]] = [
{"type": "h1", "text": summary_obj.get("title", "")},
{"type": "h3", "text": author_str},
{"type": "h2", "text": "Summary"},
{"type": "text", "text": summary_obj.get("summary", "")},
{"type": "h2", "text": "Contributions"},
{"type": "text", "text": summary_obj.get("contributions", "")},
{"type": "h2", "text": "Conclusions"},
{"type": "text", "text": summary_obj.get("conclusions", "")},
]
paragraphs.extend(contributions)
paragraphs.extend(
[
{"type": "h2", "text": "Conclusions"},
{"type": "text", "text": summary_obj.get("conclusions", "")},
]
)

article_json_obj["children"] = paragraphs

self.logger.info(f"Auto-generate tags based on the contents...")
tags = await self._auto_tagging(paragraphs=article_json_obj["children"])
article_json_obj["properties"]["tags"] = tags
# tags = await self._auto_tagging(paragraphs=article_json_obj["children"])
article_json_obj["properties"]["tags"] = []

if os.environ.get("VERBOSE") == "True":
json_object_str = json.dumps(article_json_obj, indent=2)
Expand Down
33 changes: 13 additions & 20 deletions wenling/common/model_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,13 +109,22 @@ def pdf_paper_summary(logger: Logger, pdf_url: str, truncate_size: int = 8000):
You will receive the paper text snippets (may have some noise text).
You are a research paper analysis service focused on determining the primary findings of the paper and analyzing its scientific quality.
Take a deep breath and think step by step about how to best accomplish this goal using the following steps.
OUTPUT SECTIONS in json format.
Take a deep breath and think step by step about how to best accomplish this goal using the following json structure:
{{
"title": "The title of the paper",
"authors": "The authors of the paper",
"summary": "The summary of the paper",
"contributions": ["Contribution 1", "Contribution 2", "Contribution 3"],
"experiment": "The experiment of the paper",
"conclusion": "The conclusion of the paper"
}}
Title: Extract the title of the paper.
Authors: List of strings including the first 2 authors and other notable authors, each with their affiliation in parentheses.
Summary: What problem does the paper solve? What is the core idea of the paper? What is the main result?
Contributions: Extract the primary paper unique contribution into a bulleted list of no more than 50 words per bullet.
Expand Down Expand Up @@ -163,23 +172,7 @@ def pdf_paper_summary(logger: Logger, pdf_url: str, truncate_size: int = 8000):
---
Conclusion:
You output a 50 word summary of the quality of the paper and its likelihood of being replicated in
future work as one of three levels: High, Medium, or Low.
You put that sentence and rating into a section called SUMMARY.
OUTPUT INSTRUCTIONS
Create the output using the formatting above. And put them into a json format.
And each blob of text should be in markdown format.
For example:
{{
"title": "The title of the paper",
"authors": "The authors of the paper",
"summary": "The summary of the paper",
"contributions": "The contributions of the paper",
"experiment": "The experiment of the paper",
"conclusion": "The conclusion of the paper"
}}
You output a 50 word summary of the paper.
"""

summary = openai.inference(
Expand Down

0 comments on commit 9d86b18

Please sign in to comment.