Skip to content

Commit

Permalink
Merge pull request #4 from ben-hudson/master
Browse files Browse the repository at this point in the history
PDF metadata is set according to scraped paper data
  • Loading branch information
MarkHershey authored Sep 15, 2023
2 parents a96de36 + 1081d39 commit 3851fb9
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 0 deletions.
27 changes: 27 additions & 0 deletions arxiv_dl/helpers.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import json
import logging
import os
import pypdf
import re
import shlex
import string
Expand Down Expand Up @@ -89,6 +90,8 @@ def download_pdf(
if out.is_file():
logger.debug(f'[Done] Paper saved to "{download_path}"')

add_pdf_metadata(paper_data, download_path)

return None


Expand Down Expand Up @@ -180,6 +183,30 @@ def command_exists(command: str) -> bool:
return which(command) is not None


def add_pdf_metadata(paper_data: PaperData, download_path: Path) -> None:
    """Embed the scraped paper details in the PDF's document information.

    The PDF at ``download_path`` is read, its pages and any existing
    metadata are copied into a new document, the ``/Author``, ``/Title`` and
    ``/Subject`` entries are set from ``paper_data``, and the result is
    written back to the same path.

    Args:
        paper_data: Scraped paper record. ``authors`` (joined with ", "),
            ``title`` and ``abstract`` are the fields used.
        download_path: Location of the downloaded PDF; rewritten in place.
    """
    import io

    reader = pypdf.PdfReader(download_path)
    writer = pypdf.PdfWriter()

    # Copy every page of the original document into the writer.
    for page in reader.pages:
        writer.add_page(page)

    # Carry over the old metadata. ``reader.metadata`` is None when the PDF
    # has no document information dictionary, so guard before copying.
    existing_metadata = reader.metadata
    if existing_metadata is not None:
        writer.add_metadata(existing_metadata)

    # Overlay the scraped values; keys set here override the copied ones.
    writer.add_metadata({
        "/Author": ", ".join(paper_data.authors),
        "/Title": paper_data.title,
        "/Subject": paper_data.abstract,
    })

    # Serialize in memory before touching the file: opening with "wb"
    # truncates it, so a failure while building the output would otherwise
    # destroy the downloaded PDF.
    buffer = io.BytesIO()
    writer.write(buffer)
    with open(download_path, "wb") as f:
        f.write(buffer.getvalue())


def add_to_paper_list(paper_data: PaperData, download_dir: Union[str, Path]) -> None:
paper_list_path: Path = Path(download_dir) / "000_Paper_List.json"
paper_dict = paper_data.dict()
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
beautifulsoup4
colorlog
pydantic
pypdf
requests
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ def update_version_file():
"colorlog>=4.1.0",
"requests",
"pydantic",
"pypdf>=3.10.0",
"beautifulsoup4",
],
extras_require={
Expand Down

0 comments on commit 3851fb9

Please sign in to comment.