-
Notifications
You must be signed in to change notification settings - Fork 16
/
oreilly_free.py
executable file
·94 lines (82 loc) · 3.14 KB
/
oreilly_free.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#!/usr/bin/env python3
"""
Created on 5/21/16
@author: '[email protected]'
This code mainly parse & download free books from oreily(www.oreilly.com/programming/free/)
To get the latest code, please visit my github: https://github.com/Jiezhi/just-a-spider
Reference:
"""
import os
import requests
from bs4 import BeautifulSoup
import threading
import re
def get_keyword(url):
    """
    Extract the catalog and book name from an O'Reilly free-book page URL.

    :param url: e.g. http://www.oreilly.com/programming/free/some-book.csp
    :return: (catalog, book_name) tuple, e.g. ('programming', 'some-book')
    :raises ValueError: if the URL does not match the expected pattern
    """
    # Accept both http and https; group 1 is the catalog path segment,
    # group 2 is the book slug before ".csp".
    reg = r"https?://www\.oreilly\.com/([^/]+)/free/([^.]+)\.csp.*"
    m = re.match(reg, url)
    if m is None:
        # Explicit error instead of an opaque AttributeError on None.groups()
        raise ValueError("not a recognized O'Reilly free-book URL: %s" % url)
    return m.groups()
def download_file(url):
    """
    Download the file at *url* into oreilly/<catalog>/ and return its path.

    Streams the response in chunks so large books are not loaded fully into
    memory. Snippet adapted from http://stackoverflow.com/a/16696317/5425709

    :param url: The file url
    :return: The downloaded file name (path on disk)
    :raises requests.HTTPError: if the server responds with an error status
    """
    local_filename = url.split('/')[-1]
    # URL shape is .../<catalog>/free/files/<name>.<ext>; index -4 is the catalog.
    dir_name = 'oreilly' + os.path.sep + url.split('/')[-4]
    # exist_ok avoids the exists()/makedirs() race — this runs in several
    # threads at once and two of them may target the same catalog dir.
    os.makedirs(dir_name, exist_ok=True)
    local_filename = os.path.join(dir_name, local_filename)
    if os.path.exists(local_filename):
        print('file already downloaded: ', local_filename)
        return local_filename
    print('downloading ', url)
    # NOTE the stream=True parameter; the with-block ensures the connection
    # is released even if writing fails.
    with requests.get(url, stream=True) as r:
        # Fail loudly rather than silently saving an HTML error page as a book.
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)
    return local_filename
def get_free_book(content, file_format='pdf'):
    """
    Parse free book links from a listing page and download them concurrently.

    :param content: HTML content of an O'Reilly free-book listing page
    :param file_format: 'epub', 'mobi' or 'pdf'
    :return: None
    """
    soup = BeautifulSoup(content, 'lxml')
    # Each book cover is an <a data-toggle="popover"> linking to its .csp page.
    books = soup.find_all('a', {'data-toggle': 'popover'})
    print('Find %d book(s)...' % len(books))
    threads = []
    for book in books:
        # .get() returns None for anchors with no href instead of raising
        # KeyError outside the try block below.
        href = book.get('href')
        # Skip video-player links and anything that is not a book page.
        if not href or 'player.oreilly.com' in href or '.csp' not in href:
            print("this page will be igored: ", href)
            continue
        try:
            catelog, book_name = get_keyword(href)
            book_url = 'http://www.oreilly.com/%s/free/files/%s.%s' % (catelog, book_name, file_format)
            t = threading.Thread(target=download_file, args=(book_url,))
            t.start()
            threads.append(t)
        except Exception as e:
            print("Downloading from {} failed".format(href), e)
    # Wait for this page's downloads to finish so their output/errors surface
    # before the caller moves on to the next listing page.
    for t in threads:
        t.join()
if __name__ == '__main__':
    # Free-book listing pages, one per O'Reilly topic area.
    free_oreilly = ['http://www.oreilly.com/programming/free/',
                    'http://www.oreilly.com/web-platform/free/',
                    'http://www.oreilly.com/security/free/',
                    'http://www.oreilly.com/business/free/',
                    'http://www.oreilly.com/data/free/',
                    'http://www.oreilly.com/iot/free/',
                    'http://www.oreilly.com/design/free/',
                    'http://www.oreilly.com/webops-perf/free/',
                    ]
    for free in free_oreilly:
        try:
            # Timeout so a dead endpoint cannot hang the whole run; check the
            # status so we do not try to parse an error page.
            html = requests.get(free, timeout=30)
            html.raise_for_status()
        except requests.RequestException as e:
            # One broken listing page should not abort the remaining ones.
            print("Fetching {} failed".format(free), e)
            continue
        get_free_book(html.content)