Various improvements to speed and errors (#58)
    Replaces pypdf with PyMuPDF for speed
    Adds a fallback to SerpAPI when Crossref returns a 404 for BibTeX
    Greatly simplifies the throttled client and fixes the leaky-bucket logic (I hope)
    Changes paperId in gsearch to be based on the DOI instead of the link
blackadad authored Apr 8, 2024
1 parent fc8bc39 commit 084d4bd
Showing 4 changed files with 52 additions and 45 deletions.
32 changes: 21 additions & 11 deletions paperscraper/lib.py
@@ -324,21 +324,30 @@ async def parse_google_scholar_metadata(
"""Parse raw paper metadata from Google Scholar into a more rich format."""
doi: str | None = paper["externalIds"].get("DOI", None)
if doi:
bibtex = await doi_to_bibtex(doi, session)
key: str = bibtex.split("{")[1].split(",")[0]
citation = format_bibtex(bibtex, key, clean=False)
else:
try:
bibtex = await doi_to_bibtex(doi, session)
key: str = bibtex.split("{")[1].split(",")[0]
citation = format_bibtex(bibtex, key, clean=False)
except DOINotFoundError:
doi = None
if not doi:
# get citation by following link
# SLOW SLOW Using SerpAPI for this
async with session.get(
paper["inline_links"]["serpapi_cite_link"]
+ f"&api_key={os.environ['SERPAPI_API_KEY']}"
) as r:
# we raise here, because something really is wrong.
r.raise_for_status()
data = await r.json()
citation = next(c["snippet"] for c in data["citations"] if c["title"] == "MLA")
bibtex_link = next(c["link"] for c in data["links"] if c["name"] == "BibTeX")
async with session.get(bibtex_link) as r:
# we may have a 443 - link expired
if r.status == 443: # noqa: PLR2004
raise RuntimeError(
f"Google scholar blocking bibtex link at {bibtex_link}"
)
r.raise_for_status()
bibtex = await r.text()
key = bibtex.split("{")[1].split(",")[0]
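
The citation key handed to format_bibtex is simply the token between the first "{" and the first "," of the BibTeX record. A minimal sketch of that extraction, run against a made-up record:

bibtex = "@article{smith2020example,\n  title={An Example},\n  author={Smith, J.}\n}"
key = bibtex.split("{")[1].split(",")[0]
print(key)  # -> smith2020example
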
@@ -387,7 +396,10 @@ async def doi_to_bibtex(doi: str, session: ClientSession) -> str:
# get DOI via crossref
url = f"https://api.crossref.org/works/{doi}/transform/application/x-bibtex"
async with session.get(url) as r:
r.raise_for_status()
if not r.ok:
raise DOINotFoundError(
f"Per HTTP status code {r.status_code}, could not resolve DOI {doi}."
)
data = await r.text()
# must make new key
key = data.split("{")[1].split(",")[0]
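
For context, a standalone sketch of the Crossref content-negotiation call that the new DOINotFoundError branch guards; fetch_bibtex, the ValueError, and the example DOI are illustrative assumptions rather than code from this commit:

import asyncio

import aiohttp


async def fetch_bibtex(doi: str) -> str:
    # Crossref's transform endpoint returns the work formatted as BibTeX.
    url = f"https://api.crossref.org/works/{doi}/transform/application/x-bibtex"
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as r:
            if not r.ok:
                raise ValueError(f"Could not resolve DOI {doi} (HTTP {r.status}).")
            return await r.text()


# print(asyncio.run(fetch_bibtex("10.1038/nature14539")))  # illustrative DOI
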
@@ -779,7 +791,6 @@ async def a_gsearch_papers( # noqa: C901, PLR0915
"engine": "google_scholar",
"num": _limit,
"start": _offset,
# TODO - add offset and limit here # noqa: TD004
}

if year is not None:
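
With the TODO removed, pagination is driven entirely by num and start. A hedged sketch of how the full request parameters might be assembled; q and api_key are standard SerpAPI parameters but are assumptions relative to this hunk:

import os


def scholar_params(query: str, limit: int, offset: int) -> dict:
    # Illustrative only: parameters for paging through SerpAPI's Google Scholar engine.
    return {
        "q": query,
        "engine": "google_scholar",
        "num": limit,     # page size for this request
        "start": offset,  # zero-based offset of the first result
        "api_key": os.environ["SERPAPI_API_KEY"],
    }
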
@@ -865,13 +876,12 @@ async def process(paper):
else:
paper["citationCount"] = int(paper["inline_links"]["cited_by"]["total"])

# set paperId to be hex digest of link
paper["paperId"] = hashlib.md5( # noqa: S324
paper["link"].encode()
).hexdigest()[0:16]
# set paperId to be hex digest of doi
paper["paperId"] = hashlib.md5(doi.encode()).hexdigest()[0:16] # noqa: S324
return paper

papers = await asyncio.gather(*[process(p) for p in papers])
# we only process papers that have a link
papers = await asyncio.gather(*[process(p) for p in papers if "link" in p])
total_papers = data["search_information"].get("total_results", 1)
logger.info(
f"Found {total_papers} papers, analyzing {_offset} to {_offset + len(papers)}"
59 changes: 28 additions & 31 deletions paperscraper/utils.py
@@ -9,7 +9,7 @@
from typing import Optional, Union

import aiohttp
import pypdf
import fitz


class ThrottledClientSession(aiohttp.ClientSession):
@@ -20,7 +20,6 @@ class ThrottledClientSession(aiohttp.ClientSession):
replace `session = aiohttp.ClientSession()`
with `session = ThrottledClientSession(rate_limit=15)`
see https://stackoverflow.com/a/60357775/107049
"""

MIN_SLEEP = 0.001
@@ -37,7 +36,7 @@ def __init__(
if rate_limit is not None:
if rate_limit <= 0:
raise ValueError("rate_limit must be positive")
self._queue = asyncio.Queue(min(2, int(rate_limit) + 1))
self._queue = asyncio.Queue(max(2, int(rate_limit)))
self._fillerTask = asyncio.create_task(self._filler(rate_limit))

def _get_sleep(self) -> Optional[float]: # noqa: FA100
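
The queue-size change is the heart of the leaky-bucket fix: the old expression capped the bucket at two tokens regardless of rate, while the new one sizes it to roughly one second of requests, with a floor of two. A quick worked example at the docstring's suggested rate of 15:

rate_limit = 15
old_size = min(2, int(rate_limit) + 1)  # -> 2: the bucket never grew with the rate
new_size = max(2, int(rate_limit))      # -> 15: about one second of burst capacity
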
@@ -60,27 +59,23 @@ async def _filler(self, rate_limit: float = 1):
return
self.rate_limit = rate_limit
sleep = self._get_sleep()
updated_at = time.monotonic()
fraction = 0
extra_increment = 0
for i in range(self._queue.maxsize):
self._queue.put_nowait(i)
updated_at = time.perf_counter()
while True:
if not self._queue.full():
now = time.monotonic()
increment = rate_limit * (now - updated_at)
fraction += increment % 1
extra_increment = fraction // 1
items_2_add = int(
min(
self._queue.maxsize - self._queue.qsize(),
int(increment) + extra_increment,
)
)
fraction = fraction % 1
for i in range(items_2_add):
self._queue.put_nowait(i)
updated_at = now
now = time.perf_counter()
# Calculate how many tokens to add to the bucket based on elapsed time.
tokens_to_add = int((now - updated_at) * rate_limit)
# Calculate available space in the queue to avoid overfilling it.
available_space = self._queue.maxsize - self._queue.qsize()
tokens_to_add = min(
tokens_to_add, available_space
) # Only add as many tokens as there is space.

for _ in range(tokens_to_add):
self._queue.put_nowait(
None
) # Insert a token (just None) into the queue to represent a request.

updated_at = now
await asyncio.sleep(sleep)
except asyncio.CancelledError:
pass
@@ -89,9 +84,6 @@ async def _filler(self, rate_limit: float = 1):

async def _allow(self) -> None:
if self._queue is not None:
# debug
# if self._start_time == None:
# self._start_time = time.time()
await self._queue.get()
self._queue.task_done()
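
Putting the simplified filler and _allow together, a self-contained sketch of the intended token-bucket behavior; the names, rate, and demo loop are assumptions, not code from this commit. It deviates slightly from the hunk above by advancing the reference time only when whole tokens are credited, so fractional elapsed time is not dropped:

import asyncio
import time


async def filler(queue: asyncio.Queue, rate_limit: float) -> None:
    # Credit tokens in proportion to elapsed time, capped by the space left in the queue.
    updated_at = time.perf_counter()
    while True:
        now = time.perf_counter()
        tokens_to_add = int((now - updated_at) * rate_limit)
        if tokens_to_add > 0:
            available_space = queue.maxsize - queue.qsize()
            for _ in range(min(tokens_to_add, available_space)):
                queue.put_nowait(None)
            updated_at = now  # only advance once at least one whole token is owed
        await asyncio.sleep(1 / rate_limit)


async def throttled_call(queue: asyncio.Queue, i: int) -> None:
    # Consumer side, mirroring _allow: each call first takes a token from the bucket.
    await queue.get()
    queue.task_done()
    print(f"call {i} released at {time.perf_counter():.2f}")


async def main() -> None:
    rate_limit = 5.0  # requests per second (illustrative)
    queue: asyncio.Queue = asyncio.Queue(max(2, int(rate_limit)))
    fill_task = asyncio.create_task(filler(queue, rate_limit))
    await asyncio.gather(*(throttled_call(queue, i) for i in range(10)))
    fill_task.cancel()


# asyncio.run(main())
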

@@ -110,17 +102,22 @@ async def _request(self, *args, **kwargs) -> aiohttp.ClientResponse:
return response


def check_pdf(path, verbose: Union[bool, Logger] = False) -> bool: # noqa: FA100
def check_pdf(path: str, verbose: Union[bool, Logger] = False) -> bool: # noqa: FA100
if not os.path.exists(path):
return False

try:
pdf = pypdf.PdfReader(path) # noqa: F841
except (pypdf.errors.PyPdfError, ValueError) as e:
# Open the PDF file using fitz
with fitz.open(path):
pass # For now, just opening the file is our basic check

except fitz.FileDataError as e:
if verbose and isinstance(verbose, bool):
print(f"PDF at {path} is corrupt: {e}")
print(f"PDF at {path} is corrupt or unreadable: {e}")
elif verbose:
verbose.exception(f"PDF at {path} is corrupt.", exc_info=e)
verbose.exception(f"PDF at {path} is corrupt or unreadable.", exc_info=e)
return False

return True
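
A usage sketch for the reworked check_pdf, plus the kind of PyMuPDF text extraction the pypdf swap enables elsewhere; the path, logger name, and read_pdf_text helper are assumptions, not part of the commit:

import logging

import fitz  # PyMuPDF is imported under the name "fitz"

from paperscraper.utils import check_pdf

logger = logging.getLogger("paperscraper")

if not check_pdf("downloads/example.pdf", verbose=logger):
    logger.warning("PDF is missing or unreadable; the download may need a retry.")


def read_pdf_text(path: str) -> str:
    # Illustrative only: extract all page text with PyMuPDF (not part of this diff).
    with fitz.open(path) as doc:
        return "\n".join(page.get_text() for page in doc)
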


4 changes: 2 additions & 2 deletions pyproject.toml
@@ -19,9 +19,9 @@ classifiers = [
"Programming Language :: Python",
]
dependencies = [
"PyMuPDF",
"aiohttp",
"pybtex",
"pypdf",
]
description = "LLM Chain for answering questions from docs"
keywords = ["question answering"]
@@ -34,7 +34,7 @@ name = "paper-scraper"
readme = "README.md"
requires-python = ">=3.8"
urls = {repository = "https://github.com/blackadad/paper-scraper"}
version = "1.6.1"
version = "1.7.0"

[tool.codespell]
check-filenames = true
2 changes: 1 addition & 1 deletion tests/test_paperscraper.py
@@ -116,7 +116,7 @@ async def test_gsearch(self):
assert paper["citationCount"]
assert paper["title"]

async def test_high_limit(self) -> None:
async def test_gsearch_high_limit(self) -> None:
papers = await paperscraper.a_gsearch_papers(
"molecular dynamics", year="2019-2023", limit=45
)
