-
Notifications
You must be signed in to change notification settings - Fork 49
/
main.py
99 lines (91 loc) · 3.29 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import requests
import re
import sys
import json
import os
import time
# The function to run is chosen according to the document (file) type.
def doc(url):
    """Download a "doc"-type document and append its text to ``<doc_id>.txt``.

    The document id is taken from the ``view/<id>.html`` part of *url*.  The
    view page is fetched and scanned for ``https...0.json...`` data URLs;
    only the first half of the matches is downloaded (presumably the page
    lists every URL twice -- TODO confirm).  From each downloaded payload the
    ``"c"`` / ``"y"`` value pairs are extracted.  Every ``"c"`` value is
    printed and appended to the output file, preceded by a line break
    whenever its ``"y"`` value differs from the previous fragment's.

    Args:
        url: Address of the document view page.

    Raises:
        IndexError: If *url* does not contain ``view/<id>.html``.
    """
    doc_id = re.findall('view/(.*).html', url)[0]
    # Bound up front so the final message never refers to an unset name,
    # even when the page yields no data URLs.
    filename = doc_id + '.txt'
    html = requests.get(url, timeout=30).text
    matches = re.findall('(https.*?0.json.*?)\\\\x22}', html)
    # The URLs are embedded with escaping backslashes; strip them before use.
    page_urls = [m.replace('\\', '') for m in matches[:len(matches) // 2]]

    # Tracks the "y" value of the previous fragment for this call only
    # (previously a module-level global that leaked between calls).  None
    # never equals a matched string, so the first fragment always starts
    # on a new line, as before.
    last_y = None
    # Open the output once instead of once per fragment.
    with open(filename, 'a', encoding='utf-8') as f:
        for page_url in page_urls:
            payload = requests.get(page_url, timeout=30).text
            fragments = re.findall('"c":"(.*?)".*?"y":(.*?),', payload)
            for raw_text, y_value in fragments:
                # Turn the \uXXXX escapes of the raw payload into characters.
                text = raw_text.encode('utf-8').decode('unicode_escape', 'ignore')
                print(text)
                if y_value != last_y:
                    last_y = y_value
                    separator = '\n'
                else:
                    separator = ''
                f.write(separator + text.replace('\\', ''))
    print("DOC文档保存在" + filename)
def ppt(url):
    """Download the page images of a "ppt"-type document into ``<doc_id>/``.

    The document id is taken from the ``view/<id>.html`` part of *url*.  The
    ``getbcsurl`` endpoint is queried for that id, every ``"zoom"`` URL in
    the response is downloaded, and the bytes are stored as
    ``img<index>.jpg`` inside a folder named after the document id.

    Args:
        url: Address of the document view page.

    Raises:
        IndexError: If *url* does not contain ``view/<id>.html``.
    """
    doc_id = re.findall('view/(.*).html', url)[0]
    api_url = "https://wenku.baidu.com/browse/getbcsurl?doc_id="+doc_id+"&pn=1&rn=99999&type=ppt"
    listing = requests.get(api_url, timeout=30).text
    # The URLs come back with escaping backslashes; strip them before use.
    image_urls = [
        found.replace("\\", '')
        for found in re.findall('{"zoom":"(.*?)","page"', listing)
    ]

    # Tolerate an already existing folder without hiding other errors
    # (the previous bare "except: pass" swallowed everything).
    os.makedirs(doc_id, exist_ok=True)

    for index, image_url in enumerate(image_urls):
        image = requests.get(image_url, timeout=30).content
        # os.path.join replaces the hard-coded '\img' path, which was an
        # invalid escape sequence and only acted as a separator on Windows.
        target = os.path.join(doc_id, 'img' + str(index) + '.jpg')
        with open(target, 'wb') as out:
            out.write(image)
    print("PPT图片保存在" + doc_id +"文件夹")
def _collect_text(node, found):
    """Append every string stored under a ``"c"`` key inside *node* to *found*.

    Dicts and lists are walked recursively in their stored order; any other
    value is ignored.
    """
    if isinstance(node, dict):
        for key, value in node.items():
            if key == 'c' and isinstance(value, str):
                found.append(value)
            else:
                _collect_text(value, found)
    elif isinstance(node, list):
        for item in node:
            _collect_text(item, found)


def txt(url):
    """Download a "txt"-type document and append its text to ``<doc_id>.txt``.

    The document id is taken from the ``view/<id>.html`` part of *url*.  The
    ``getdocinfo`` endpoint supplies the ``md5sum``, ``totalPageNum`` and
    ``rsign`` values needed to build the text URL; the JSON returned by that
    URL is parsed and every string stored under a ``"c"`` key is printed
    (as one list) and appended to the output file.

    Args:
        url: Address of the document view page.

    Raises:
        IndexError: If *url* does not contain ``view/<id>.html`` or the
            document info lacks one of the required fields.
        ValueError: If the text endpoint does not return valid JSON.
    """
    doc_id = re.findall('view/(.*).html', url)[0]
    info_url = "https://wenku.baidu.com/api/doc/getdocinfo?callback=cb&doc_id="+doc_id
    info = requests.get(info_url, timeout=30).text
    md5 = re.findall('"md5sum":"(.*?)"', info)[0]
    pn = re.findall('"totalPageNum":"(.*?)"', info)[0]
    rsign = re.findall('"rsign":"(.*?)"', info)[0]
    # md5 is appended directly after "type=txt", exactly as before;
    # presumably the md5sum field carries its own "&..." prefix -- TODO confirm.
    text_url = 'https://wkretype.bdimg.com/retype/text/'+doc_id+'?rn='+pn+'&type=txt'+md5+'&rsign='+rsign
    payload = json.loads(requests.get(text_url, timeout=30).text)

    # Walk the parsed JSON instead of regex-matching its repr(): the old
    # approach silently dropped any paragraph containing an apostrophe
    # (repr switches to double quotes) and left repr escapes such as "\t"
    # in the output.
    texts = []
    _collect_text(payload, texts)
    print(texts)

    filename = doc_id+'.txt'
    with open(filename, 'a', encoding='utf-8') as f:
        f.writelines(texts)
    print("TXT文档保存在" + filename)
def pdf(url):
    """Download the page images of a "pdf"-type document into ``<doc_id>/``.

    The document id is taken from the ``view/<id>.html`` part of *url*.  The
    ``getbcsurl`` endpoint is queried for that id, every ``"zoom"`` URL in
    the response is downloaded, and the bytes are stored as
    ``img<index>.jpg`` inside a folder named after the document id.

    Args:
        url: Address of the document view page.

    Raises:
        IndexError: If *url* does not contain ``view/<id>.html``.
    """
    doc_id = re.findall('view/(.*).html', url)[0]
    # NOTE(review): the query uses type=ppt for PDFs as well; kept unchanged
    # -- confirm this is intended.
    api_url = "https://wenku.baidu.com/browse/getbcsurl?doc_id="+doc_id+"&pn=1&rn=99999&type=ppt"
    listing = requests.get(api_url, timeout=30).text
    # The URLs come back with escaping backslashes; strip them before use.
    image_urls = [
        found.replace("\\", '')
        for found in re.findall('{"zoom":"(.*?)","page"', listing)
    ]

    # Tolerate an already existing folder without hiding other errors
    # (the previous bare "except: pass" swallowed everything).
    os.makedirs(doc_id, exist_ok=True)

    for index, image_url in enumerate(image_urls):
        image = requests.get(image_url, timeout=30).content
        # os.path.join replaces the hard-coded '\img' path, which was an
        # invalid escape sequence and only acted as a separator on Windows.
        target = os.path.join(doc_id, 'img' + str(index) + '.jpg')
        with open(target, 'wb') as out:
            out.write(image)
    print("PDF图片保存在" + doc_id + "文件夹")
def get_type(url):
    """Return the document type string found in the page at *url*.

    The page is fetched and its text searched for ``'docType': '<value>',``;
    the first captured value is returned.

    Raises:
        IndexError: If the page text contains no such entry.
    """
    page_text = requests.get(url).text
    return re.findall("'docType': '(.*?)',", page_text)[0]
# Script entry: ask for a document URL, detect its type, run the matching
# downloader, then pause briefly before exiting.
url = input("URL:")
# Detect the type once; it was previously fetched twice (one extra request).
doc_type = get_type(url)
print("文档类型:"+doc_type)
# NOTE(security): this used to be eval(get_type(url))(url), i.e. eval() on a
# string scraped from a remote page, which would execute whatever expression
# that page supplied.  An explicit table of the known handlers is used instead.
handlers = {'doc': doc, 'ppt': ppt, 'txt': txt, 'pdf': pdf}
handler = handlers.get(doc_type)
if handler is None:
    # Previously an unknown type surfaced as a bare NameError from eval().
    raise SystemExit("不支持的文档类型:"+doc_type)
handler(url)
print("3秒后退出")
time.sleep(3)