import requests
from bs4 import BeautifulSoup
import re
import random
import string
import time
from utils import email
from database import *
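
# NOTE: the crawler below relies on several names pulled in by the wildcard
# import from database.py (init_database, SELECT_STATUS, SELECT_ID, INSERT,
# UPDATE, csv_writer). Those definitions are not shown in this file; a minimal
# sketch of what they are assumed to look like (the real templates in
# database.py may differ) is:
#
#   SELECT_STATUS = "SELECT ID FROM MOVIE WHERE STATUS = {status}"
#   SELECT_ID     = "SELECT ID FROM MOVIE WHERE ID = {id}"
#   INSERT        = "INSERT INTO MOVIE (ID, STATUS) VALUES ({id}, 0)"
#   UPDATE        = "UPDATE MOVIE SET STATUS = 1 WHERE ID = {id}"
#
#   def init_database(path='test.db'):
#       connection = sqlite3.connect(path)
#       return connection, connection.cursor()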

def write_notion(id, movie_info):
    token = 'secret_feFoTGfTuwtdjxCzpsdutFWQtzW5stxZQkRucn1AUGC'
    database_id = '38a93f2a26284b19bc749cbf7464dabc'
    headers = {
        'Notion-Version': '2021-05-13',  # the newer API requires an explicit version header
        'Authorization': 'Bearer ' + token,  # the bearer token is also mandatory
    }
    # Build the payload for the new page
    body = {
        'parent': {'type': 'database_id', 'database_id': database_id},
    }
    # Fetch an existing page to reuse its property structure as a template
    default_page = "0fcffd8d-9743-46fc-ad5f-4144cfe1a44e"
    url_default_page = "https://api.notion.com/v1/pages/" + default_page
    notion_response = requests.get(url_default_page, headers=headers)
    # Fill in the movie data
    title, year, subtitle, rating, poster, voter = movie_info
    properties = notion_response.json()['properties']
    properties['评分']['rich_text'][0]['text']['content'] = rating
    properties['ID']['title'][0]['text']['content'] = id
    properties['标题']['rich_text'][0]['text']['content'] = title
    properties['年份']['rich_text'][0]['text']['content'] = year
    properties['副标题']['rich_text'][0]['text']['content'] = subtitle
    properties['海报']['rich_text'][0]['text']['content'] = poster
    properties['评分人数']['rich_text'][0]['text']['content'] = voter
    body['properties'] = properties
    url_notion_additem = 'https://api.notion.com/v1/pages'
    notion_additem = requests.post(url_notion_additem, headers=headers, json=body)
    print(notion_additem.json())
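
# For reference, write_notion depends on the template page already containing
# one rich_text / title fragment per column; the JSON posted to
# https://api.notion.com/v1/pages then looks roughly like the sketch below
# (illustrative values only, not taken from the real template page):
#
#   {
#     "parent": {"type": "database_id", "database_id": "38a93f2a26284b19bc749cbf7464dabc"},
#     "properties": {
#       "ID":   {"title":     [{"text": {"content": "1292403"}}]},
#       "标题": {"rich_text": [{"text": {"content": "Example Title"}}]},
#       "评分": {"rich_text": [{"text": {"content": "9.0"}}]}
#     }
#   }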

# Parse the movie page
def analysis_movie(soup):
    # Title
    heading = soup.find(name='h1')
    if heading is None:
        exit()
    heading = heading.findAll(name='span')
    # Year
    year = ' ' if len(heading) == 1 else heading[1].string[1:5]
    title = heading[0].string
    # Directors
    directors_temp = soup.findAll(name='a', attrs={"rel": "v:directedBy"})
    if len(directors_temp) == 0 or directors_temp[0].string is None:
        directors = ' '
    else:
        directors = [director.string for director in directors_temp]
    # Number of voters
    voter_temp = soup.find(name='span', attrs={"property": "v:votes"})
    voter = 0 if voter_temp is None else voter_temp.string
    # Actors
    actors_temp = soup.findAll(name='a', attrs={"rel": "v:starring"})
    actors = ' ' if len(actors_temp) == 0 else [actor.string for actor in actors_temp if actor.string is not None]
    print(actors)
    # Genres
    kinds_temp = soup.findAll(name='span', attrs={"property": "v:genre"})
    kinds = ' ' if len(kinds_temp) == 0 else [kind.string for kind in kinds_temp]
    # Country/region of production
    country = soup.find_all(name='span', text='制片国家/地区:')
    country = ' ' if len(country) == 0 else country[0].next_sibling
    # Assemble the subtitle
    subtitle = " {} / {} / {} / {} ".format(
        ",".join(country.split(" / ")),
        ','.join(kinds),
        ','.join(directors),
        ','.join(actors))
    # Rating
    rating = soup.find(id='interest_sectl').find(name='strong').string
    # Poster
    poster_temp = soup.find(id='mainpic')
    poster = ' ' if poster_temp is None else poster_temp.find(name='img').attrs['src']
    return title, year, subtitle, rating, poster, voter

def find_movie_recommendations(soup):
    # Collect links to related movies from the recommendations block;
    # skip silently if the block is missing
    recommendations_block = soup.find(id='recommendations')
    if recommendations_block is None:
        return []
    recommendations = recommendations_block.findAll('a')
    all_subject_id = []
    for item in recommendations:
        item_url = item.attrs['href']
        subject_id = re.match(r".*subject/(\d+)", item_url)[1]
        all_subject_id.append(int(subject_id))
    return all_subject_id

def fetch_one_subject(timeout=20):
    # Pick one not-yet-crawled entry from the database;
    # if there is none, fall back to a fixed 7-digit subject id.
    result = curs.execute(SELECT_STATUS.format(status=0))
    result = list(result)
    if len(result) > 0:
        seed = result[0][0]
    else:
        seed = 1292403
    url = url_base + str(seed)
    print(url)
    # Request the page; if the request fails, abort this run and retry next time.
    headers['Cookie'] = Cookie.format(bid="bid=%s" % "".join(random.sample(string.ascii_letters + string.digits, 11)))
    # headers['Cookie'] = ''
    try:
        response_movie = requests.get(url=url, headers=headers, timeout=timeout)
    except requests.RequestException:
        exit()
    if response_movie.status_code != 200:
        print(response_movie)
        exit()
    # Initialize the soup
    soup = BeautifulSoup(response_movie.text, 'html.parser')
    # Parse the page and export the data
    movie_info = analysis_movie(soup)
    write_notion(str(seed), movie_info)
    csv_writer(str(seed), movie_info)
    print(movie_info)
    # Update the database:
    # insert the entry first if it is not there yet
    result = curs.execute(SELECT_ID.format(id=seed))
    if len(list(result)) == 0:
        curs.execute(INSERT.format(id=seed))
        connection.commit()
    # then mark this entry as crawled
    curs.execute(UPDATE.format(id=seed))
    connection.commit()
    # Queue the other subject ids found on the page,
    # skipping any that already exist in the database
    all_subject_id = find_movie_recommendations(soup)
    for id in all_subject_id:
        result = curs.execute(SELECT_ID.format(id=id))
        if len(list(result)) == 0:
            curs.execute(INSERT.format(id=id))
            connection.commit()

# Request configuration
url_base = "https://movie.douban.com/subject/"
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.52',
    'Cookie': "ll=\"118371\"; bid=ujBcNwnZ9Lc; __utmz=30149280.1668276173.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __gads=ID=b6acfc64c8d441ac-22b7559f40d80002:T=1668276176:RT=1668276176:S=ALNI_MZdL-d_l65y7cW4EOfY4WKGi83LKw; push_noty_num=0; push_doumail_num=0; __yadk_uid=3qqwL8vX4Y42dTT0SerwrQAVqnfcTES4; __utmv=30149280.6250; __gpi=UID=00000b7ae4562923:T=1668276176:RT=1668350550:S=ALNI_MaVl14ymNIBMp9Kma3BBYiS7-E2Cw; _pk_id.100001.8cb4=78c23fb0f9637844.1668306835.6.1668434978.1668350547.; _pk_ses.100001.8cb4=*; __utma=30149280.657680781.1668276173.1668350552.1668434981.12; __utmc=30149280; __utmt=1; __utmb=30149280.1.10.1668434981"
}
# Cookie template; {bid:s} is replaced with a freshly randomized bid on each request
Cookie = "ll=\"118371\"; {bid:s}; __utmz=30149280.1668276173.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __gads=ID=b6acfc64c8d441ac-22b7559f40d80002:T=1668276176:RT=1668276176:S=ALNI_MZdL-d_l65y7cW4EOfY4WKGi83LKw; push_noty_num=0; push_doumail_num=0; __yadk_uid=3qqwL8vX4Y42dTT0SerwrQAVqnfcTES4; __utmv=30149280.6250; __gpi=UID=00000b7ae4562923:T=1668276176:RT=1668350550:S=ALNI_MaVl14ymNIBMp9Kma3BBYiS7-E2Cw; _pk_id.100001.8cb4=78c23fb0f9637844.1668306835.6.1668434978.1668350547.; _pk_ses.100001.8cb4=*; __utma=30149280.657680781.1668276173.1668350552.1668434981.12; __utmc=30149280; __utmt=1; __utmb=30149280.1.10.1668434981"
proxies = {
    'http': 'http://127.0.0.1:7890',
    'https': 'http://127.0.0.1:7890'
}
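
# Note: the proxies dict above is defined but never passed to requests in this
# script; if routing through the local proxy is desired, it would have to be
# supplied per request, e.g. (sketch, not part of the original flow):
#
#   response_movie = requests.get(url, headers=headers, proxies=proxies, timeout=timeout)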

connection, curs = init_database()
for i in range(200):
    # time.sleep(2)
    result = curs.execute("SELECT id, status from MOVIE")
    message = "Number of items: " + str(len(list(result)))
    print(i, message)
    if i % 100 == 0:
        # subject line: "[Douban crawler] number of crawled index entries"
        email(message, '【豆瓣爬虫】爬取到的索引数量', '马鹏飞', '马鹏飞', '[email protected]')
    fetch_one_subject()
result = curs.execute("SELECT id, status from MOVIE")
message = "Number of items: " + str(len(list(result)))
print(message)

# One-off setup that was used to create the MOVIE table (kept for reference):
# import sqlite3
# connection = sqlite3.connect('test.db')
# print("Database opened successfully")
# c = connection.cursor()
# c.execute('''CREATE TABLE MOVIE
#              (ID INT PRIMARY KEY NOT NULL,
#               STATUS INT NOT NULL);''')
# print("Table created successfully")
# connection.commit()
# connection.close()
#
# connection = sqlite3.connect('test.db')
# c = connection.cursor()
# print("Database opened successfully")
# c.execute("INSERT INTO MOVIE (ID, STATUS) VALUES (100, 0)")
# connection.commit()
# connection.close()
#
# result = c.execute("SELECT id, status from MOVIE")
# for row in result:
#     print("ID = ", row[0])
#     print("STATUS = ", row[1])
# print("Query finished successfully")
# connection.close()