links_and_dests.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from urllib.request import build_opener, HTTPCookieProcessor, HTTPSHandler, Request
from urllib.error import URLError, HTTPError
from os.path import relpath
from collections import deque
import sys
from time import sleep
import ssl
from io import BytesIO, StringIO
import json
import re

try:
    from bs4 import BeautifulSoup
    import html5lib
    import certifi
except ImportError:
    print("Please install the BeautifulSoup, html5lib, and certifi libraries using `pip install bs4 html5lib certifi`.\n")
    raise

if [int(v) for v in certifi.__version__.split('.')] < [2021, 5, 30]:
    print("Please upgrade certifi using `pip install --upgrade certifi`.\n")
    sys.exit(1)

GITHUB_LINE_FRAGMENT = re.compile('L[0-9]+')


def get_links_and_destinations_from_pdf(f):
    try:
        from PyPDF2 import PdfFileReader
    except ImportError:
        print("Please install the PyPDF2 library using `pip install PyPDF2`.\n")
        raise

    # Based on <https://stackoverflow.com/a/5978161/393146>
    pdf = PdfFileReader(f)

    links = set()
    for pg in range(pdf.getNumPages()):
        obj = pdf.getPage(pg).getObject()

        for annotation in obj.get('/Annots', []):
            uri = annotation.getObject().get('/A', {}).get('/URI', None)
            if uri is not None and uri not in links:
                links.add(uri)

    dests = pdf.getNamedDestinations().keys()

    return (links, dests)


def get_links_and_destinations_from_html(f):
    links = set()
    internal = set()
    dests = set()

    soup = BeautifulSoup(f.read(), "html5lib")

    # First try to find this: <script type="application/json" data-target="react-app.embeddedData">
    # If it exists, its content is some JSON that we need to parse to get the real content.
    for script in soup.find_all('script'):
        if script.get('data-target') == "react-app.embeddedData":
            content = json.loads(script.string).get('payload', {}).get('blob', {}).get('richText')
            if content is not None:
                (links, dests) = get_links_and_destinations_from_html(StringIO(content))
                break

    for link in soup.find_all('a'):
        if link.has_attr('href'):
            url = link['href']
            (internal if url.startswith('#') else links).add(url)

        if link.has_attr('name'):
            dests.add(link['name'])

    for link in soup.find_all(id=True):
        dests.add(link['id'])
        # GitHub's rendering of .mediawiki files puts 'id="user-content-<ANCHOR>"' in the source
        # and dynamically creates a corresponding link #<ANCHOR>.
        if link['id'].startswith("user-content-"):
            dests.add(link['id'][13:])

    internal.difference_update(['#' + d for d in dests])  # ignore internal links satisfied by a dest
    links.update(internal)
    return (links, dests)


def main(args):
    if len(args) < 2:
        print("Usage: ./links_and_dests.py [--check] [--print-dests] <file.pdf|html|xhtml>")
        return 1

    check = '--check' in args[1:]
    print_dests = '--print-dests' in args[1:]
    paths = [arg for arg in args[1:] if not arg.startswith('--')]

    all_links = {}  # url -> pdf_paths
    all_dests = {}  # url -> dests

    errors = deque()

    print("Reading files...")
    for path in paths:
        print(path, end=" ")
        sys.stdout.flush()

        with open(path, 'rb') as f:
            if path.endswith(".html") or path.endswith(".xhtml"):
                (links, dests) = get_links_and_destinations_from_html(f)
            elif path.endswith(".pdf"):
                (links, dests) = get_links_and_destinations_from_pdf(f)
            else:
                errors.append("Unrecognized file type: " + path)
                continue

        path = relpath(path)
        for l in links:
            refs = all_links.get(l, None)
            if refs is None:
                all_links[l] = refs = deque()
            refs.append(path)

        all_dests["https://zips.z.cash/" + path] = dests
        if path.endswith(".html"):
            all_dests["https://zips.z.cash/" + path[:-5]] = dests

    print("\n")
    print("Links:")

    last_url = None
    content = None
    content_type = None
    dests = None

    for (l, p) in sorted(all_links.items()):
        print(l, end=" ")
        sys.stdout.flush()
        what = "%s (occurs in %s)" % (l, " and ".join(p)) if len(paths) > 1 else l
        status = ""

        if ":" not in l:
            l = "https://zips.z.cash/" + l

        if l.startswith("mailto:"):
            status = "(not checked)"
        elif l.startswith("https:") or l.startswith("HTTP:"):  # use uppercase HTTP: for links with no https: equivalent
            (url, _, fragment) = l.partition("#")

            if url in all_dests:
                if fragment and fragment not in all_dests[url]:
                    errors.append("Missing link target: " + what)
                    status = "❌"
                else:
                    status = "✓"
            elif check:
                # If url == last_url, there is no need to refetch content. This is an optimization when
                # checking URLs with the same site but different fragments (which will be sorted together).
                if url != last_url:
                    headers = {"User-Agent": "Mozilla/5.0"}
                    https_handler = HTTPSHandler(context=ssl.create_default_context(cafile=certifi.where()))

                    # Some DOI links (i.e. to https://doi.org/) redirect to link.springer.com
                    # in a way that requires cookies (booo!). We allow this for DOI links,
                    # but for all other links we simulate a client that never sets cookies.
                    if l.startswith("https://doi.org/"):
                        opener = build_opener(HTTPCookieProcessor(), https_handler)
                    else:
                        opener = build_opener(https_handler)

                    for retry in range(2):
                        try:
                            response = opener.open(Request(url=l, headers=headers))
                            content_type = response.info().get_content_type()
                            content = response.read()
                            last_url = url
                        except URLError as e:
                            if retry == 0 and isinstance(e, HTTPError) and e.code == 429:
                                try:
                                    delay = int(e.headers['Retry-After'], 10) + 1
                                except Exception:
                                    delay = 60

                                print("(waiting %ds due to rate limiting)" % (delay,), end=" ")
                                sys.stdout.flush()
                                sleep(delay)
                                continue

                            errors.append("Could not open link: %s due to %r" % (what, e))
                            status = "❌"
                            content_type = None
                            content = None
                            last_url = None

                        dests = None
                        break

                if content is not None:
                    if fragment:
                        if dests is None:
                            if content_type in ('text/html', 'application/xhtml+xml'):
                                (_, dests) = get_links_and_destinations_from_html(BytesIO(content))
                            elif content_type == 'application/pdf':
                                (_, dests) = get_links_and_destinations_from_pdf(BytesIO(content))

                        if dests is None:
                            print("(link target not checked)", end=" ")
                            status = "✓"
                        elif fragment not in dests:
                            # Filter out known false positive GitHub fragments that we can't check.
                            if last_url.startswith("https://github.com/") and (fragment.startswith('diff-') or GITHUB_LINE_FRAGMENT.match(fragment) is not None):
                                print("(link target not checked)", end=" ")
                                status = "✓"
                            else:
                                errors.append("Missing link target: " + what)
                                status = "❌"
                        else:
                            status = "✓"
                    else:
                        status = "✓"
        else:
            errors.append("Insecure or unrecognized protocol in link: " + what)
            status = "❌"

        print(status)

    if print_dests:
        for (path, dests) in all_dests.items():
            if path + ".html" not in all_dests:  # avoid duplication
                print("\nDestinations for %s:" % (path,))
                for d in dests:
                    print(d)

    if errors:
        print("\nErrors:")
        for e in errors:
            print(e)

    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))