Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CC-News benchmark #600

Draft
wants to merge 3 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 29 additions & 2 deletions src/fundus/scraping/article.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,42 @@
from datetime import datetime
from textwrap import TextWrapper, dedent
from typing import Any, Dict, List, Mapping, Optional
from typing import Any, Dict, List, Mapping, Optional, Tuple

import langdetect
import lxml.html
from colorama import Fore, Style

from fundus.logging import create_logger
from fundus.parser import ArticleBody
from fundus.scraping.filter import guarded_bool
from fundus.scraping.html import HTML
from fundus.utils.serialization import JSONVal, is_jsonable

logger = create_logger(__name__)


class Stat(Dict[str, bool]):
@property
def missing_attributes(self) -> Tuple[str, ...]:
return tuple(key for key, value in self.items() if value is False)

@property
def completeness(self) -> float:
return (len(self) - len(self.missing_attributes)) / len(self)

def __repr__(self) -> str:
if (completeness := self.completeness) < 1:
return f"{completeness:.2%} extracted"
else:
return "Fully extracted"

def __str__(self) -> str:
return self.__repr__()

def __bool__(self):
return not bool(self.missing_attributes)


class AttributeView:
def __init__(self, key: str, extraction: Mapping[str, Any]):
self.ref = extraction
Expand Down Expand Up @@ -107,10 +130,14 @@ def lang(self) -> Optional[str]:

return language

@property
def complete(self) -> Stat:
    """Extraction status of this article, one boolean flag per extracted attribute."""
    status = {name: guarded_bool(attribute) for name, attribute in self.__extraction__.items()}
    return Stat(status)

def to_json(self, *attributes: str) -> Dict[str, JSONVal]:
"""Converts article object into a JSON serializable dictionary.

One can specify which attributes should be included by passing attribute names as parameters.
One can specify which attributes should be included by passing attribute names as parameters.
Default: title, plaintext, authors, publishing_date, topics, free_access + unvalidated attributes

Args:
Expand Down
158 changes: 148 additions & 10 deletions src/fundus/scraping/crawler.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,20 @@
from __future__ import annotations

import contextlib
import datetime as dt
import gzip
import itertools
import json
import logging.config
import math
import multiprocessing
import os
import pickle
import random
import re
import threading
import time
from abc import ABC, abstractmethod
from datetime import datetime
from functools import lru_cache, partial, wraps
from multiprocessing import Manager
from multiprocessing.context import TimeoutError
Expand All @@ -22,10 +25,13 @@
from typing import (
Any,
Callable,
Dict,
Generic,
Iterator,
List,
Literal,
Mapping,
NamedTuple,
Optional,
Pattern,
Set,
Expand All @@ -37,25 +43,28 @@
)
from urllib.parse import urljoin, urlparse

import dateutil.rrule as rrule
import dill
import fastwarc.stream_io
import matplotlib.pyplot as plt
import more_itertools
import numpy as np
import requests
import urllib3.exceptions
from dateutil.rrule import MONTHLY, rrule
from more_itertools import roundrobin
from tqdm import tqdm
from typing_extensions import ParamSpec, TypeAlias
from typing_extensions import ParamSpec, Self, TypeAlias

from fundus.logging import create_logger, get_current_config
from fundus.publishers.base_objects import Publisher, PublisherGroup
from fundus.scraping.article import Article
from fundus.scraping.article import Article, Stat
from fundus.scraping.delay import Delay
from fundus.scraping.filter import ExtractionFilter, Requires, RequiresAll, URLFilter
from fundus.scraping.html import CCNewsSource
from fundus.scraping.html import HTML, CCNewsSource
from fundus.scraping.scraper import CCNewsScraper, WebScraper
from fundus.scraping.session import session_handler
from fundus.scraping.url import URLSource
from fundus.utils.serialization import JSONVal
from fundus.utils.timeout import Timeout

logger = create_logger(__name__)
Expand Down Expand Up @@ -435,8 +444,8 @@ class CCNewsCrawler(CrawlerBase):
def __init__(
self,
*publishers: PublisherType,
start: datetime = datetime(2016, 8, 1),
end: datetime = datetime.now(),
start: dt.datetime = dt.datetime(2016, 8, 1),
end: dt.datetime = dt.datetime.now(),
processes: int = -1,
retries: int = 3,
disable_tqdm: bool = False,
Expand Down Expand Up @@ -561,13 +570,13 @@ def _get_warc_paths(self) -> List[str]:
if self.start >= self.end:
raise ValueError("Start date has to be < end date.")

if self.start < datetime(2016, 8, 1):
if self.start < dt.datetime(2016, 8, 1):
raise ValueError("The default, and earliest possible, start date is 2016/08/01.")

if self.end > datetime.now():
if self.end > dt.datetime.now():
raise ValueError("The specified end date is in the future. We don't want to give spoilers, do we?")

date_sequence: List[datetime] = list(rrule(MONTHLY, dtstart=self.start, until=self.end))
date_sequence: List[dt.datetime] = list(rrule.rrule(rrule.MONTHLY, dtstart=self.start, until=self.end))
urls: List[str] = [
f"{self.server_address}crawl-data/CC-NEWS/{date.strftime('%Y/%m')}/warc.paths.gz" for date in date_sequence
]
Expand Down Expand Up @@ -629,3 +638,132 @@ def _build_article_iterator(
yield from self._single_crawl(warc_paths, article_task)
else:
yield from self._parallel_crawl(warc_paths, article_task)

def benchmark(
    self, sample_rate: int = rrule.MONTHLY, sample_size: Optional[int] = 1000, keep_html: bool = True
) -> Benchmark:
    """Sample CC-NEWS articles across the crawler's date range and collect extraction stats.

    Args:
        sample_rate: dateutil rrule frequency constant at which sample days are drawn
            (e.g. ``rrule.MONTHLY``, ``rrule.WEEKLY``, ``rrule.DAILY``).
        sample_size: Maximum number of articles crawled per sample day; ``None`` means unlimited.
        keep_html: If True, the raw HTML of every sampled article is stored in the benchmark.

    Returns:
        A Benchmark aggregating per-publisher extraction completeness per sample day.

    Raises:
        ValueError: If ``sample_rate`` is finer than ``rrule.DAILY``.
    """
    # dateutil frequency constants grow as they get finer (YEARLY=0 ... DAILY=3, HOURLY=4, ...),
    # so values greater than DAILY are the sub-daily frequencies we reject.
    if sample_rate > rrule.DAILY:
        raise ValueError("Sample rates finer than rrule.DAILY are not supported")

    benchmark = Benchmark(*self.publishers, keep_html=keep_html)

    dates = list(rrule.rrule(freq=sample_rate, dtstart=self.start, until=self.end))

    # TODO: add date filter
    # Iterate newest-first so recent coverage shows up early in long runs.
    for date in tqdm(reversed(dates), total=len(dates), desc="samples", position=0, disable=self.disable_tqdm):
        # One throw-away crawler per sample day; incomplete articles are kept on purpose
        # (only_complete=False) since completeness is exactly what we are measuring.
        crawler = CCNewsCrawler(*self.publishers, start=date, end=date + dt.timedelta(days=1), disable_tqdm=True)

        for article in tqdm(
            crawler.crawl(max_articles=sample_size, only_complete=False),
            total=sample_size,
            desc="articles",
            position=1,
            leave=False,
            disable=self.disable_tqdm,
        ):
            benchmark.add(article)

    return benchmark


class Entry(NamedTuple):
    """One benchmarked article: its extraction stat plus (optionally) the raw HTML."""

    stat: Stat
    html: Optional[HTML] = None

    def __repr__(self) -> str:
        # Delegate entirely to the stat's repr (e.g. "87.50% extracted").
        return repr(self.stat)


class Series(List[Entry]):
    """Sequence of benchmark entries belonging to a single publisher.

    Note: the original no-op ``__init__(self)`` blocked list's own initializer;
    removing it lets ``Series(iterable)`` work while ``Series()`` behaves as before.
    """

    @property
    def avg(self) -> float:
        """Average extraction completeness over all entries; 0.0 for an empty series
        (instead of raising ZeroDivisionError)."""
        if not self:
            return 0.0
        return sum(entry.stat.completeness for entry in self) / len(self)

    def __repr__(self) -> str:
        return f"{self.avg:.2%}"


class TimeFrame(Dict[str, Series]):
    """Benchmark data for one sample day, mapping publisher name -> Series of entries."""

    def __init__(self, *publishers: str, keep_html: bool = True):
        super().__init__({publisher: Series() for publisher in publishers})
        # When False, Entry.html is dropped to keep pickled benchmarks small.
        self._keep_html = keep_html

    def add(self, article: Article) -> None:
        """Record an article's extraction stat under its publisher."""
        # NOTE(review): keys are publisher name strings, but this indexes with
        # `article.publishers` (plural) — confirm this is the intended attribute.
        self[article.publishers].append(Entry(article.complete, article.html if self._keep_html else None))

    def squeeze(self, threshold: float) -> Self:
        """Keep only entries with completeness <= threshold (i.e. the problematic ones)."""
        for publisher, series in self.items():
            kept = Series()
            kept.extend(entry for entry in series if entry.stat.completeness <= threshold)
            self[publisher] = kept
        return self

    def reduce(self, percentage: float) -> Self:
        """Drop entries, largest series first, until only `percentage` of them remain."""
        target = math.ceil(len(self) * percentage)
        while len(self) > target:
            # Always pop from the currently longest series to shrink evenly.
            max(self.values(), key=len).pop()
        return self

    def trim(self, max_length: int) -> Self:
        """Randomly subsample every series down to at most `max_length` entries."""
        for publisher, series in self.items():
            if len(series) <= max_length:
                continue
            random.shuffle(series)
            # Re-wrap in Series: a bare list slice would violate the declared
            # Dict[str, Series] value type and lose the .avg property.
            trimmed = Series()
            trimmed.extend(series[:max_length])
            self[publisher] = trimmed
        return self

    def __len__(self) -> int:
        # Total number of entries across all publishers, NOT the number of keys.
        return sum(len(entries) for entries in self.values())


class Benchmark(Dict[dt.date, TimeFrame]):
    """Per-day extraction-completeness statistics, keyed by crawl date."""

    def __init__(self, *publishers: Publisher, keep_html: bool = True):
        self.keep_html = keep_html
        self._publishers = {publisher.name for publisher in publishers}
        super().__init__()

    def add(self, article: Article) -> None:
        """File an article under the calendar day it was crawled."""
        self[article.html.crawl_date.date()].add(article)

    def squeeze(self, threshold: float) -> Self:
        """Apply TimeFrame.squeeze(threshold) to every day; returns self for chaining."""
        for frame in self.values():
            frame.squeeze(threshold)
        return self

    def reduce(self, percentage: float) -> Self:
        """Apply TimeFrame.reduce(percentage) to every day; returns self for chaining."""
        for frame in self.values():
            frame.reduce(percentage)
        return self

    def trim(self, max_length: int) -> Self:
        """Apply TimeFrame.trim(max_length) to every day; returns self for chaining."""
        for frame in self.values():
            frame.trim(max_length)
        return self

    def save(self, path: Union[Path, str]) -> None:
        """Pickle and gzip-compress this benchmark to `path`."""
        with gzip.open(path, "wb") as file:
            pickle.dump(self, file)

    @classmethod
    def load(cls, path: Union[Path, str]) -> Benchmark:
        """Load a benchmark written by save().

        Warning: deserializes with pickle — only load files from trusted sources.
        """
        with gzip.open(path, "rb") as file:
            return pickle.load(file)

    def plot(self, path: Union[Path, str]) -> None:
        """Plot average completeness per publisher over time and save the figure to `path`.

        (The previous implementation plotted the dict object itself and never
        wrote anything to `path`.)
        """
        dates = sorted(self)
        publishers = sorted({publisher for frame in self.values() for publisher in frame})
        fig, ax = plt.subplots()
        for publisher in publishers:
            values = [
                self[date][publisher].avg if publisher in self[date] and len(self[date][publisher]) else math.nan
                for date in dates
            ]
            ax.plot(dates, values, label=publisher)
        ax.set_xlabel("date")
        ax.set_ylabel("avg completeness")
        ax.legend()
        fig.savefig(path)
        plt.close(fig)

    def __len__(self) -> int:
        # Total entries across all days (TimeFrame.__len__ already sums its series).
        return sum(len(record) for record in self.values())

    def __missing__(self, key: dt.date) -> TimeFrame:
        # Lazily create a TimeFrame for a day on first access.
        new = TimeFrame(*self._publishers, keep_html=self.keep_html)
        self[key] = new
        return new
6 changes: 3 additions & 3 deletions src/fundus/scraping/filter.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import re
from typing import Any, Callable, Dict, Protocol, cast
from typing import Any, Callable, Dict, Protocol

from typing_extensions import ParamSpec

Expand Down Expand Up @@ -119,7 +119,7 @@ def __bool__(self) -> bool:
return bool(self.missing_attributes)


def _guarded_bool(value: Any):
def guarded_bool(value: Any) -> bool:
if isinstance(value, bool):
return True
else:
Expand Down Expand Up @@ -151,7 +151,7 @@ def __init__(self, *required_attributes: str, eval_booleans: bool = True) -> Non
"""
self.required_attributes = set(required_attributes)
# somehow mypy does not recognize bool as callable :(
self._eval: Callable[[Any], bool] = bool if eval_booleans else _guarded_bool # type: ignore[assignment]
self._eval: Callable[[Any], bool] = bool if eval_booleans else guarded_bool # type: ignore[assignment]

def __call__(self, extraction: Dict[str, Any]) -> FilterResultWithMissingAttributes:
missing_attributes = [
Expand Down
Loading