-
Notifications
You must be signed in to change notification settings - Fork 1
/
newegg.py
105 lines (77 loc) · 2.76 KB
/
newegg.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import re
def newegg(page_soup):
# create a new file in write mode
f = open('data/newegg-laptops.csv', 'w')
# write a header, columns name
f.write('Brand,Name,Price-was,Current-Price,Discount(%),RAM(GB),Storage(GB/TB),Refurbished(0/1),URL\n')
# grabs each item-container
conts = page_soup.findAll('div', {'class': 'item-container'})
for cont in conts:
# getting brand name from the title of img tag
try:
brand = cont.find('div', 'item-info').div.a.img['title']
except:
brand = 'NaN'
# to filter products pop up shown in recently viewed items at the end of the page
# print(brand)
# if not brand:
# continue
# getting title/name of the product
title = cont.findAll('a', {'class': 'item-title'})[0].text
# if product is refurbished/renewed
if ('refurbished' in title.lower()) or ('renewed' in title.lower()):
refurb = '1'
else:
refurb = '0'
# link of the product
link = cont.findAll('a', {'class': 'item-title'})[0]['href']
# process title to extract ram, storage
features = title.replace(' ', '')
features = re.search('.*(?:\)|\w|-)(\d+)GB.*(?:y|\+|-|M)(\d+)', features)
# some products doesn't contain this inforamtion
try:
ram = features.group(1)
hdd = features.group(2)
except:
ram = 'NaN'
hdd = 'NaN'
# PRICE
try:
# previous price, if available
pw = cont.find('div', 'item-info').find('div','item-action').ul.li.span.text
# discount on the product, if available
dc = d = cont.find('div', 'item-info').find('div','item-action').ul.find('li','price-save').find('span','price-save-percent').text[:-1]
except:
pw = 'NaN'
dc = '0'
# current price of the product
try:
pc = cont.find('div', 'item-info').find('div','item-action').ul.find('li','price-current').text
pc = pc.replace(',', '')
pc = re.search('.+\s([0-9]+).+', pc).group(1)
except:
pc = 'NaN'
# write the row into csv file
f.write(brand + ',' + title.replace(',', ' ')
+ ',' + pw + ',' + pc + ',' + dc + ',' + ram + ',' + hdd + ',' + refurb + ',' + link + '\n')
f.close()
print('Scraped Newegg\'s page')
def eggUrls(page_soup, url):
'''
Extract the last page number from page_soup and create urls upto that page using predefined template of the newegg urls
'''
# get the last page number
last = eggLast(page_soup)
urls = list()
# Create urls to return using url template of flipkart
for i in range(2, last+1):
url = 'https://www.newegg.com/global/in-en/Laptops-Notebooks/SubCategory/ID-32/Page-' + str(i) + '?Tid=1297918'
urls.append(url)
return urls
def eggLast(page_soup):
'''
Return the last page number from the page
'''
last = page_soup.findAll('div', {'class': 'page_NavigationBar'})[1]
last = last.select_one('div:nth-of-type(10)').text.strip()
return int(last)