Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Simplify author parse #29

Merged
merged 1 commit into from
Feb 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 20 additions & 37 deletions wenling/archiver.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,23 +74,23 @@ async def _auto_tagging(self, paragraphs: List[Dict[str, Any]]) -> List[str]:
"""Leverage the LLM to auto-generate the tags based on the contents."""
contents_str = "\n".join([paragraph.get("text", "") for paragraph in paragraphs if type(paragraph) == dict])
prompt = f"""
Please help generate the tags based on the contents below:
---
{contents_str}
---

Some suggested tags:
0. No more than 3 tags.
1. If this article is about building agent, please add the tag Agent.
2. If this article is about LLM, please add the tag LLM.
3. If this article is about deep learning in general, please add the tag Deep Learning.
4. If this article is about tech philosophy, please add the tag Tech Philosophy.
5. Please use any other tags that you think are relevant.

Please generate the tags in the same language as the contents, and return in below json format:
{{
"tags": ["tag1", "tag2", "tag3"]
}}
Please help generate the tags based on the contents below:
---
{contents_str}
---
Some suggested tags:
0. No more than 3 tags.
1. If this article is about building agent, please add the tag Agent.
2. If this article is about LLM, please add the tag LLM.
3. If this article is about deep learning in general, please add the tag Deep Learning.
4. If this article is about tech philosophy, please add the tag Tech Philosophy.
5. Please use any other tags that you think are relevant.
Please generate the tags in the same language as the contents, and return in below json format:
{{
"tags": ["tag1", "tag2", "tag3"]
}}
"""
json_response_str = self.model.inference(
user_prompt=prompt, max_tokens=256, temperature=0.0, response_format="json_object"
Expand Down Expand Up @@ -708,7 +708,8 @@ async def _archive(self, url: str, notes: str | None = None) -> Coroutine[Any, A
"""Get the content block from the web page with the path div#content.
Parse the elements and put them into a json object with list of elements.
"""
summary_obj = json.loads(pdf_paper_summary(url))
summary_obj = json.loads(pdf_paper_summary(logger=self.logger, pdf_url=url))
self.logger.info("Summarized the paper.")
article_json_obj: Dict[str, Any] = {
"properties": {},
"children": [],
Expand All @@ -719,25 +720,7 @@ async def _archive(self, url: str, notes: str | None = None) -> Coroutine[Any, A
article_json_obj["properties"]["type"] = "Arxiv"
article_json_obj["properties"]["datetime"] = get_datetime()

author_dict = summary_obj.get("authors", {})
# Convert the author dictionary to a string, including first_two and others.
author_str = ""
if author_dict.get("first_two", ""):
first_two_obj = author_dict["first_two"]
for author in first_two_obj:
author_str += author.get("name", "")
if author.get("affiliation", ""):
affliation = author.get("affiliation", "")
author_str += f" ({affliation})"
author_str += "\n"
if author_dict.get("others", ""):
others_obj = author_dict["others"]
for author in others_obj:
author_str += f", {author.get('name', '')}"
if author.get("affiliation", ""):
affliation = author.get("affiliation", "")
author_str += f" ({affliation})"
author_str += "\n"
author_str = summary_obj.get("authors", "")

paragraphs: List[Dict[str, Any]] = [
{"type": "h1", "text": summary_obj.get("title", "")},
Expand Down
17 changes: 14 additions & 3 deletions wenling/common/model_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,15 +84,16 @@ def inference(
return result


def pdf_paper_summary(pdf_url: str, truncate_size: int = 4000):
def pdf_paper_summary(logger: Logger, pdf_url: str, truncate_size: int = 4000):
try:
# Download PDF.
logger.info(f"Downloading PDF from {pdf_url}...")
pdf_path = download_pdf(pdf_url)
pdf_path = os.path.expanduser(pdf_path)
# Parse PDF into paragraphs.
logger.info(f"Partitioning PDF into paragraphs...")
elements = partition_pdf(
filename=pdf_path,
infer_table_structure=True,
strategy="fast",
)

Expand All @@ -102,6 +103,7 @@ def pdf_paper_summary(pdf_url: str, truncate_size: int = 4000):

# Concatenate paragraphs into a single string
text = "\n".join(paragraphs.values())[:truncate_size]
logger.info("Start to summarize the paper...")
openai = OpenAIChatModel()
sys_prompt = """
You will receive the paper text snippets (may have some noise text). Please return the paper according to the following rules for each paper name or url.
Expand All @@ -117,7 +119,7 @@ def pdf_paper_summary(pdf_url: str, truncate_size: int = 4000):

Extract a TL;DR of the content in 50 words or less, including who is presenting and the content being discussed into a section called SUMMARY.

AUTHOR Section: List the first 2 authors and other notable authors, each with their affiliation.
AUTHOR Section: List of string includes the first 2 authors and other notable authors, each with their affiliation in parenthesis.

Extract the primary paper unique contribution into a bulleted list of no more than 50 words per bullet into a section called CONTRIBUTIONS.

Expand Down Expand Up @@ -152,6 +154,15 @@ def pdf_paper_summary(pdf_url: str, truncate_size: int = 4000):

OUTPUT INSTRUCTIONS
Create the output using the formatting above. And put them into a json format. And each blob of text should be in markdown format.
For example:
{{
"title": "The title of the paper",
"authors": "The authors of the paper",
"summary": "The summary of the paper",
"contributions": "The contributions of the paper",
"experiment": "The experiment of the paper",
"conclusion": "The conclusion of the paper"
}}

The output should have the keys of title, authors, summary, contributions, experiment, conclusion.
"""
Expand Down
Loading