-
Notifications
You must be signed in to change notification settings - Fork 10
/
seeddeal.py
80 lines (57 loc) · 1.74 KB
/
seeddeal.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#coding:utf-8
import daemon
# Project-local URL filter object used by filter1().
# NOTE: this name shadows the builtin `filter` for the rest of the module.
filter = daemon.Filter()
from pybloomfilter import BloomFilter
# Path of the Bloom filter file that filterdone() tests URLs against.
done_sites_fname='done_sites.bin'
# Opened at import time; presumably holds the sites that were already processed -- confirm.
bfdone = BloomFilter.open(done_sites_fname)
def filter_fix(urls):
    """Keep only URLs built from plain lowercase URL characters or Chinese characters.

    A URL is kept when it starts with "http" and every one of its characters
    is either in ``/.abcdefghijklmnopqrstuvwxyz0123456789-_[]:`` or is a code
    point in the range U+4E00..U+9FA5. Any other character (uppercase
    letters, ``?``, ``%``, other scripts, ...) causes the URL to be dropped.

    Args:
        urls: iterable of URLs, given either as UTF-8 encoded byte strings
            or as already-decoded text.

    Returns:
        A set containing the accepted URLs exactly as they were passed in.
        Byte strings that are not valid UTF-8 are silently skipped.
    """
    # Built once per call so the per-character test is a constant-time set lookup.
    allowed = frozenset(u"/.abcdefghijklmnopqrstuvwxyz0123456789-_[]:")
    kept = set()
    for url in urls:
        if isinstance(url, bytes):
            try:
                text = url.decode('utf8')
            except UnicodeError:
                # The URL is malformed (not valid UTF-8), so it is ignored.
                continue
        else:
            # Already text: decoding again would fail or mangle non-ASCII URLs.
            text = url
        if not text.startswith(u"http"):
            continue
        # Besides the normal URL characters, only Chinese characters are allowed.
        if all(c in allowed or 0x4e00 <= ord(c) <= 0x9fa5 for c in text):
            kept.add(url)
    return kept
def filter1():
    """Print the URLs from ./okinwrong.txt that survive both filters.

    The file is read as one URL per line. The list is first passed to
    ``filter.filter_urls`` together with ``'www.baidu.com'`` (the meaning of
    that argument is defined by the ``daemon`` module and is not visible
    here), then narrowed with ``filter_fix``. Each remaining URL is printed
    on its own line.

    Returns:
        None.
    """
    # Read inside a context manager so the file handle is always closed.
    with open('./okinwrong.txt') as fp:
        urls = fp.read().strip().split('\n')
    ret = filter.filter_urls('www.baidu.com', urls)
    for url in filter_fix(ret):
        print(url)
    # The former export of the result to "seed995k.txt" sat after an
    # unconditional return and never ran; it is dropped here, so the
    # observable behaviour (printing only) is unchanged.
def filterdone():
    """Count the URLs in okinwrong.txt that are already present in ``bfdone``.

    The file is read as one URL per line. The first seven characters of each
    line are removed before the membership test against the module-level
    Bloom filter ``bfdone``. An empty line is printed first, and the number
    of matching entries is printed at the end.

    Returns:
        None.
    """
    print("")
    # Read inside a context manager so the file handle is always closed.
    with open('okinwrong.txt') as fp:
        urls = fp.read().strip().split('\n')
    cnt = 0
    for url in urls:
        # NOTE(review): 7 == len("http://"), so this presumably strips the
        # scheme; an "https://" line would keep a leading "/" -- confirm
        # against how the entries in bfdone were stored.
        if url[7:] in bfdone:
            cnt += 1
    print(cnt)
# Unguarded module-level call: the count runs whenever this file is executed or imported.
filterdone()