-
Notifications
You must be signed in to change notification settings - Fork 156
/
links_and_dests.py
executable file
·244 lines (197 loc) · 8.94 KB
/
links_and_dests.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from urllib.request import build_opener, HTTPCookieProcessor, HTTPSHandler, Request
from urllib.error import URLError, HTTPError
from os.path import relpath
from collections import deque
import sys
from time import sleep
import ssl
from io import BytesIO, StringIO
import json
import re
# Third-party dependencies. A missing package is reported with an install hint
# before the original ImportError is re-raised.
try:
    from bs4 import BeautifulSoup
    import html5lib
    import certifi
except ImportError:
    print("Please install the BeautifulSoup, html5lib, and certifi libraries using `pip install bs4 html5lib certifi`.\n")
    raise

# Refuse to run with a certifi release older than 2021.5.30. The dotted version
# string is compared component-wise as integers.
if tuple(int(component) for component in certifi.__version__.split('.')) < (2021, 5, 30):
    print("Please upgrade certifi using `pip install --upgrade certifi`.\n")
    sys.exit(1)

# Pattern for fragments that begin with "L<digits>". It is applied with
# `.match`, so only the start of the fragment has to fit.
GITHUB_LINE_FRAGMENT = re.compile('L[0-9]+')
def get_links_and_destinations_from_pdf(f):
    """Collect the link URIs and named destinations of a PDF document.

    `f` is the file-like object handed to PyPDF2's `PdfFileReader`.

    Returns a pair `(links, dests)`: `links` is the set of `/URI` values found
    in the `/A` entries of the page annotations, and `dests` is the keys view
    of the reader's named destinations.

    PyPDF2 is imported inside the function; if it is missing, an install hint
    is printed and the ImportError is re-raised.
    """
    try:
        from PyPDF2 import PdfFileReader
    except ImportError:
        print("Please install the PyPDF2 library using `pip install PyPDF2`.\n")
        raise

    # Based on <https://stackoverflow.com/a/5978161/393146>
    reader = PdfFileReader(f)
    links = set()
    for page_number in range(reader.getNumPages()):
        page = reader.getPage(page_number).getObject()
        for annotation in page.get('/Annots', []):
            action = annotation.getObject().get('/A', {})
            target = action.get('/URI', None)
            if target is None:
                continue
            # `links` is a set, so repeated URIs collapse automatically.
            links.add(target)

    return (links, reader.getNamedDestinations().keys())
def get_links_and_destinations_from_html(f):
    """Collect the links and link destinations of an HTML document.

    `f` is a file-like object whose `read()` yields the markup; it is parsed
    with BeautifulSoup using the "html5lib" parser.

    Returns a pair of sets `(links, dests)`:

    * `links` holds every `href` of an `<a>` element, except same-document
      links (`#...`) whose target is among `dests`;
    * `dests` holds every `name` of an `<a>` element and every `id` in the
      document. Ids starting with "user-content-" are also added with that
      prefix removed.
    """
    soup = BeautifulSoup(f.read(), "html5lib")

    links = set()
    dests = set()

    # First try to find this: <script type="application/json" data-target="react-app.embeddedData">
    # If it exists, its content is some JSON that we need to parse to get the real content.
    for script in soup.find_all('script'):
        if script.get('data-target') != "react-app.embeddedData":
            continue
        payload = json.loads(script.string).get('payload', {})
        rich_text = payload.get('blob', {}).get('richText')
        if rich_text is not None:
            # Seed the results with those of the embedded document; the outer
            # document's own anchors and ids are still added below.
            (links, dests) = get_links_and_destinations_from_html(StringIO(rich_text))
            break

    # Same-document links are kept apart until all destinations are known.
    internal = set()
    for anchor in soup.find_all('a'):
        if anchor.has_attr('href'):
            href = anchor['href']
            if href.startswith('#'):
                internal.add(href)
            else:
                links.add(href)

        if anchor.has_attr('name'):
            dests.add(anchor['name'])

    # GitHub's rendering of .mediawiki files puts 'id="user-content-<ANCHOR>"' in the source
    # and dynamically creates a corresponding link #<ANCHOR>.
    github_prefix = "user-content-"
    for element in soup.find_all(id=True):
        element_id = element['id']
        dests.add(element_id)
        if element_id.startswith(github_prefix):
            dests.add(element_id[len(github_prefix):])

    # Ignore internal links satisfied by a dest; report the rest as links.
    satisfied = {'#' + dest for dest in dests}
    links.update(internal - satisfied)
    return (links, dests)
def main(args):
    """Extract links from HTML/XHTML/PDF files and report on their validity.

    `args` is an argv-style list: `args[0]` is the program name and the rest
    are file paths and/or these flags:

    * `--check`: also fetch links that do not point at one of the given files,
      and verify their fragments against the fetched content where possible;
    * `--print-dests`: print the destinations found in each given file.

    Links without a scheme are resolved against "https://zips.z.cash/", and the
    given files are treated as being served from that base, so links between
    them are checked without network access.

    Progress and errors are printed to stdout. Returns 1 when no arguments are
    given and 0 otherwise; detected link errors do not change the return value.
    """
    if len(args) < 2:
        print("Usage: ./links_and_dests.py [--check] [--print-dests] <file.pdf|html|xhtml>")
        return 1

    options = args[1:]
    check = '--check' in options
    print_dests = '--print-dests' in options
    paths = [arg for arg in options if not arg.startswith('--')]

    all_links = {}  # url -> paths of the files containing that link
    all_dests = {}  # url -> dests

    errors = deque()

    print("Reading files...")
    for path in paths:
        print(path, end=" ")
        sys.stdout.flush()

        with open(path, 'rb') as f:
            if path.endswith((".html", ".xhtml")):
                (links, dests) = get_links_and_destinations_from_html(f)
            elif path.endswith(".pdf"):
                (links, dests) = get_links_and_destinations_from_pdf(f)
            else:
                errors.append("Unrecognized file type: " + path)
                continue

        path = relpath(path)
        for link in links:
            all_links.setdefault(link, deque()).append(path)

        all_dests["https://zips.z.cash/" + path] = dests
        if path.endswith(".html"):
            # The page is also reachable without its ".html" suffix.
            all_dests["https://zips.z.cash/" + path[:-5]] = dests

    print("\n")
    print("Links:")

    # Cache of the most recently fetched remote page, and of its destinations.
    last_url = None
    content = None
    content_type = None
    dests = None

    for (link, occurrences) in sorted(all_links.items()):
        print(link, end=" ")
        sys.stdout.flush()
        what = "%s (occurs in %s)" % (link, " and ".join(occurrences)) if len(paths) > 1 else link
        status = ""

        target = link
        if ":" not in target:
            target = "https://zips.z.cash/" + target

        if target.startswith("mailto:"):
            status = "(not checked)"
        elif target.startswith("https:") or target.startswith("HTTP:"):  # use uppercase HTTP: for links with no https: equivalent
            (url, _, fragment) = target.partition("#")

            if url in all_dests:
                if fragment and fragment not in all_dests[url]:
                    errors.append("Missing link target: " + what)
                    status = "❌"
                else:
                    status = "✓"
            elif check:
                # If url == last_url, there is no need to refetch content. This is an optimization when
                # checking URLs with the same site but different fragments (which will be sorted together).
                if url != last_url:
                    # The cached destinations describe the previously fetched page; drop them
                    # so that this URL's fragment is never checked against another page.
                    dests = None

                    headers = {"User-Agent": "Mozilla/5.0"}
                    https_handler = HTTPSHandler(context=ssl.create_default_context(cafile=certifi.where()))

                    # Some DOI links (i.e. to https://doi.org/) redirect to link.springer.com
                    # in a way that requires cookies (booo!). We allow this for DOI links,
                    # but for all other links we simulate a client that never sets cookies.
                    if target.startswith("https://doi.org/"):
                        opener = build_opener(HTTPCookieProcessor(), https_handler)
                    else:
                        opener = build_opener(https_handler)

                    # At most two attempts: the second happens only after a 429 response.
                    for retry in range(2):
                        try:
                            # The context manager closes the connection once the body is read.
                            with opener.open(Request(url=target, headers=headers)) as response:
                                content_type = response.info().get_content_type()
                                content = response.read()
                            last_url = url
                        except URLError as e:
                            if retry == 0 and isinstance(e, HTTPError) and e.code == 429:
                                try:
                                    delay = int(e.headers['Retry-After'], 10) + 1
                                except Exception:
                                    # Header missing or not a plain number of seconds.
                                    delay = 60

                                print("(waiting %ds due to rate limiting)" % (delay,), end=" ")
                                sys.stdout.flush()
                                sleep(delay)
                                continue

                            errors.append("Could not open link: %s due to %r" % (what, e))
                            status = "❌"
                            content_type = None
                            content = None
                            last_url = None
                            dests = None

                        break

                if content is not None:
                    if fragment:
                        if dests is None:
                            if content_type in ('text/html', 'application/xhtml+xml'):
                                (_, dests) = get_links_and_destinations_from_html(BytesIO(content))
                            elif content_type == 'application/pdf':
                                (_, dests) = get_links_and_destinations_from_pdf(BytesIO(content))

                        if dests is None:
                            # Content type we cannot extract destinations from.
                            print("(link target not checked)", end=" ")
                            status = "✓"
                        elif fragment not in dests:
                            # Filter out known false positive GitHub fragments that we can't check.
                            if last_url.startswith("https://github.com/") and (fragment.startswith('diff-') or GITHUB_LINE_FRAGMENT.match(fragment) is not None):
                                print("(link target not checked)", end=" ")
                                status = "✓"
                            else:
                                errors.append("Missing link target: " + what)
                                status = "❌"
                        else:
                            status = "✓"
                    else:
                        status = "✓"
        else:
            errors.append("Insecure or unrecognized protocol in link: " + what)
            status = "❌"

        print(status)

    if print_dests:
        for (dest_url, file_dests) in all_dests.items():
            if dest_url + ".html" not in all_dests:  # avoid duplication
                print("\nDestinations for %s:" % (dest_url,))
                for d in file_dests:
                    print(d)

    if errors:
        print("\nErrors:")
        for e in errors:
            print(e)

    return 0
# Script entry point: run main() on the command line and use its return value
# as the process exit status.
if __name__ == '__main__':
    exit_status = main(sys.argv)
    sys.exit(exit_status)