Various improvements to speed and errors (#58)
    Replaces pypdf with PyMuPDF for speed
    Adds a fallback to SerpAPI when Crossref returns a 404 for BibTeX
    Greatly simplifies the throttled client and fixes the leaky-bucket logic (I hope)
    Changes paperId in gsearch to be based on the DOI instead of the link
blackadad authored Apr 8, 2024
1 parent fc8bc39 commit 084d4bd
Showing 4 changed files with 52 additions and 45 deletions.
32 changes: 21 additions & 11 deletions paperscraper/lib.py
@@ -324,21 +324,30 @@ async def parse_google_scholar_metadata(
"""Parse raw paper metadata from Google Scholar into a more rich format."""
doi: str | None = paper["externalIds"].get("DOI", None)
if doi:
bibtex = await doi_to_bibtex(doi, session)
key: str = bibtex.split("{")[1].split(",")[0]
citation = format_bibtex(bibtex, key, clean=False)
else:
try:
bibtex = await doi_to_bibtex(doi, session)
key: str = bibtex.split("{")[1].split(",")[0]
citation = format_bibtex(bibtex, key, clean=False)
except DOINotFoundError:
doi = None
if not doi:
# get citation by following link
# SLOW SLOW Using SerpAPI for this
async with session.get(
paper["inline_links"]["serpapi_cite_link"]
+ f"&api_key={os.environ['SERPAPI_API_KEY']}"
) as r:
# we raise here, because something really is wrong.
r.raise_for_status()
data = await r.json()
citation = next(c["snippet"] for c in data["citations"] if c["title"] == "MLA")
bibtex_link = next(c["link"] for c in data["links"] if c["name"] == "BibTeX")
async with session.get(bibtex_link) as r:
# we may have a 443 - link expired
if r.status == 443: # noqa: PLR2004
raise RuntimeError(
f"Google scholar blocking bibtex link at {bibtex_link}"
)
r.raise_for_status()
bibtex = await r.text()
key = bibtex.split("{")[1].split(",")[0]
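
The citation key handed to format_bibtex is simply the token between the first "{" and the first "," of the BibTeX record. A minimal sketch of that extraction, run against a made-up record:

bibtex = "@article{smith2020example,\n  title={An Example},\n  author={Smith, J.}\n}"
key = bibtex.split("{")[1].split(",")[0]
print(key)  # -> smith2020example
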
@@ -387,7 +396,10 @@ async def doi_to_bibtex(doi: str, session: ClientSession) -> str:
# get DOI via crossref
url = f"https://api.crossref.org/works/{doi}/transform/application/x-bibtex"
async with session.get(url) as r:
r.raise_for_status()
if not r.ok:
raise DOINotFoundError(
f"Per HTTP status code {r.status_code}, could not resolve DOI {doi}."
)
data = await r.text()
# must make new key
key = data.split("{")[1].split(",")[0]
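
For context, a standalone sketch of the Crossref content-negotiation call that the new DOINotFoundError branch guards; fetch_bibtex, the ValueError, and the example DOI are illustrative assumptions rather than code from this commit:

import asyncio

import aiohttp


async def fetch_bibtex(doi: str) -> str:
    # Crossref's transform endpoint returns the work formatted as BibTeX.
    url = f"https://api.crossref.org/works/{doi}/transform/application/x-bibtex"
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as r:
            if not r.ok:
                raise ValueError(f"Could not resolve DOI {doi} (HTTP {r.status}).")
            return await r.text()


# print(asyncio.run(fetch_bibtex("10.1038/nature14539")))  # illustrative DOI
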
@@ -779,7 +791,6 @@ async def a_gsearch_papers( # noqa: C901, PLR0915
"engine": "google_scholar",
"num": _limit,
"start": _offset,
# TODO - add offset and limit here # noqa: TD004
}

if year is not None:
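
With the TODO removed, pagination is driven entirely by num and start. A hedged sketch of how the full request parameters might be assembled; q and api_key are standard SerpAPI parameters but are assumptions relative to this hunk:

import os


def scholar_params(query: str, limit: int, offset: int) -> dict:
    # Illustrative only: parameters for paging through SerpAPI's Google Scholar engine.
    return {
        "q": query,
        "engine": "google_scholar",
        "num": limit,     # page size for this request
        "start": offset,  # zero-based offset of the first result
        "api_key": os.environ["SERPAPI_API_KEY"],
    }
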
@@ -865,13 +876,12 @@ async def process(paper):
else:
paper["citationCount"] = int(paper["inline_links"]["cited_by"]["total"])

# set paperId to be hex digest of link
paper["paperId"] = hashlib.md5( # noqa: S324
paper["link"].encode()
).hexdigest()[0:16]
# set paperId to be hex digest of doi
paper["paperId"] = hashlib.md5(doi.encode()).hexdigest()[0:16] # noqa: S324
return paper

papers = await asyncio.gather(*[process(p) for p in papers])
# we only process papers that have a link
papers = await asyncio.gather(*[process(p) for p in papers if "link" in p])
total_papers = data["search_information"].get("total_results", 1)
logger.info(
f"Found {total_papers} papers, analyzing {_offset} to {_offset + len(papers)}"
59 changes: 28 additions & 31 deletions paperscraper/utils.py
@@ -9,7 +9,7 @@
from typing import Optional, Union

import aiohttp
import pypdf
import fitz


class ThrottledClientSession(aiohttp.ClientSession):
@@ -20,7 +20,6 @@ class ThrottledClientSession(aiohttp.ClientSession):
replace `session = aiohttp.ClientSession()`
with `session = ThrottledClientSession(rate_limit=15)`
see https://stackoverflow.com/a/60357775/107049
"""

MIN_SLEEP = 0.001
@@ -37,7 +36,7 @@ def __init__(
if rate_limit is not None:
if rate_limit <= 0:
raise ValueError("rate_limit must be positive")
self._queue = asyncio.Queue(min(2, int(rate_limit) + 1))
self._queue = asyncio.Queue(max(2, int(rate_limit)))
self._fillerTask = asyncio.create_task(self._filler(rate_limit))

def _get_sleep(self) -> Optional[float]: # noqa: FA100
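
The queue-size change is the heart of the leaky-bucket fix: the old expression capped the bucket at two tokens regardless of rate, while the new one sizes it to roughly one second of requests, with a floor of two. A quick worked example at the docstring's suggested rate of 15:

rate_limit = 15
old_size = min(2, int(rate_limit) + 1)  # -> 2: the bucket never grew with the rate
new_size = max(2, int(rate_limit))      # -> 15: about one second of burst capacity
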
@@ -60,27 +59,23 @@ async def _filler(self, rate_limit: float = 1):
return
self.rate_limit = rate_limit
sleep = self._get_sleep()
updated_at = time.monotonic()
fraction = 0
extra_increment = 0
for i in range(self._queue.maxsize):
self._queue.put_nowait(i)
updated_at = time.perf_counter()
while True:
if not self._queue.full():
now = time.monotonic()
increment = rate_limit * (now - updated_at)
fraction += increment % 1
extra_increment = fraction // 1
items_2_add = int(
min(
self._queue.maxsize - self._queue.qsize(),
int(increment) + extra_increment,
)
)
fraction = fraction % 1
for i in range(items_2_add):
self._queue.put_nowait(i)
updated_at = now
now = time.perf_counter()
# Calculate how many tokens to add to the bucket based on elapsed time.
tokens_to_add = int((now - updated_at) * rate_limit)
# Calculate available space in the queue to avoid overfilling it.
available_space = self._queue.maxsize - self._queue.qsize()
tokens_to_add = min(
tokens_to_add, available_space
) # Only add as many tokens as there is space.

for _ in range(tokens_to_add):
self._queue.put_nowait(
None
) # Insert a token (just None) into the queue to represent a request.

updated_at = now
await asyncio.sleep(sleep)
except asyncio.CancelledError:
pass
@@ -89,9 +84,6 @@ async def _filler(self, rate_limit: float = 1):

async def _allow(self) -> None:
if self._queue is not None:
# debug
# if self._start_time == None:
# self._start_time = time.time()
await self._queue.get()
self._queue.task_done()
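
Putting the simplified filler and _allow together, a self-contained sketch of the intended token-bucket behavior; the names, rate, and demo loop are assumptions, not code from this commit. It deviates slightly from the hunk above by advancing the reference time only when whole tokens are credited, so fractional elapsed time is not dropped:

import asyncio
import time


async def filler(queue: asyncio.Queue, rate_limit: float) -> None:
    # Credit tokens in proportion to elapsed time, capped by the space left in the queue.
    updated_at = time.perf_counter()
    while True:
        now = time.perf_counter()
        tokens_to_add = int((now - updated_at) * rate_limit)
        if tokens_to_add > 0:
            available_space = queue.maxsize - queue.qsize()
            for _ in range(min(tokens_to_add, available_space)):
                queue.put_nowait(None)
            updated_at = now  # only advance once at least one whole token is owed
        await asyncio.sleep(1 / rate_limit)


async def throttled_call(queue: asyncio.Queue, i: int) -> None:
    # Consumer side, mirroring _allow: each call first takes a token from the bucket.
    await queue.get()
    queue.task_done()
    print(f"call {i} released at {time.perf_counter():.2f}")


async def main() -> None:
    rate_limit = 5.0  # requests per second (illustrative)
    queue: asyncio.Queue = asyncio.Queue(max(2, int(rate_limit)))
    fill_task = asyncio.create_task(filler(queue, rate_limit))
    await asyncio.gather(*(throttled_call(queue, i) for i in range(10)))
    fill_task.cancel()


# asyncio.run(main())
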

@@ -110,17 +102,22 @@ async def _request(self, *args, **kwargs) -> aiohttp.ClientResponse:
return response


def check_pdf(path, verbose: Union[bool, Logger] = False) -> bool: # noqa: FA100
def check_pdf(path: str, verbose: Union[bool, Logger] = False) -> bool: # noqa: FA100
if not os.path.exists(path):
return False

try:
pdf = pypdf.PdfReader(path) # noqa: F841
except (pypdf.errors.PyPdfError, ValueError) as e:
# Open the PDF file using fitz
with fitz.open(path):
pass # For now, just opening the file is our basic check

except fitz.FileDataError as e:
if verbose and isinstance(verbose, bool):
print(f"PDF at {path} is corrupt: {e}")
print(f"PDF at {path} is corrupt or unreadable: {e}")
elif verbose:
verbose.exception(f"PDF at {path} is corrupt.", exc_info=e)
verbose.exception(f"PDF at {path} is corrupt or unreadable.", exc_info=e)
return False

return True
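
A usage sketch for the reworked check_pdf, plus the kind of PyMuPDF text extraction the pypdf swap enables elsewhere; the path, logger name, and read_pdf_text helper are assumptions, not part of the commit:

import logging

import fitz  # PyMuPDF is imported under the name "fitz"

from paperscraper.utils import check_pdf

logger = logging.getLogger("paperscraper")

if not check_pdf("downloads/example.pdf", verbose=logger):
    logger.warning("PDF is missing or unreadable; the download may need a retry.")


def read_pdf_text(path: str) -> str:
    # Illustrative only: extract all page text with PyMuPDF (not part of this diff).
    with fitz.open(path) as doc:
        return "\n".join(page.get_text() for page in doc)
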


4 changes: 2 additions & 2 deletions pyproject.toml
@@ -19,9 +19,9 @@ classifiers = [
"Programming Language :: Python",
]
dependencies = [
"PyMuPDF",
"aiohttp",
"pybtex",
"pypdf",
]
description = "LLM Chain for answering questions from docs"
keywords = ["question answering"]
@@ -34,7 +34,7 @@ name = "paper-scraper"
readme = "README.md"
requires-python = ">=3.8"
urls = {repository = "https://github.com/blackadad/paper-scraper"}
version = "1.6.1"
version = "1.7.0"

[tool.codespell]
check-filenames = true
2 changes: 1 addition & 1 deletion tests/test_paperscraper.py
@@ -116,7 +116,7 @@ async def test_gsearch(self):
assert paper["citationCount"]
assert paper["title"]

async def test_high_limit(self) -> None:
async def test_gsearch_high_limit(self) -> None:
papers = await paperscraper.a_gsearch_papers(
"molecular dynamics", year="2019-2023", limit=45
)
