#!/usr/bin/env python
# encoding: utf-8
# ---------------------------------------------------------------------------------------------------------------------
# Name: scraper.py
# Version: 0.1.3
# Summary: Zap Imóveis Scraper
# A scraper that gathers data from Zap Imóveis website using BeautifulSoup.
#
# Author: Alexsander Lopes Camargos
# Author-email: [email protected]
#
# License: MIT
# ---------------------------------------------------------------------------------------------------------------------
"""
Zap Imóveis Scraper
A scraper that gathers data from Zap Imóveis website using BeautifulSoup.
"""
import json
import logging
import re
import time
from enum import Enum
from random import randint

import pandas as pd
import urllib3
from bs4 import BeautifulSoup
from slugify import slugify
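
# Third-party dependencies (assumed PyPI package names): pandas, urllib3,
# beautifulsoup4, python-slugify.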

STATES = ['mg']
# TOWNS = ['Belo Horizonte', 'Uberlândia', 'Contagem', 'Juiz de Fora', 'Betim']
TOWNS = ['Belo Horizonte']
# URL template used to build searches.
DOMAIN_NAME = 'www.zapimoveis.com.br'
PATH = '/%(action)s/%(unit_type)s/%(state)s+%(city)s/?pagina=%(page)s'
PORT = 443
CERT_REQS = 'CERT_NONE'
# Number of result pages to fetch per town/unit type combination.
QUANTITY_TO_FETCH = 2
# Bounds, in seconds, for the random delay between requests.
RANDINT_STARTING = 1
RANDINT_FINAL = 1
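
# For reference, an illustrative expansion of PATH (values assume the defaults
# above with BusinessFilter.Comprar and UnitType.Casas):
#   PATH % {'action': slugify('Venda'), 'unit_type': slugify('Casas'),
#           'state': 'mg', 'city': slugify('Belo Horizonte'), 'page': 1}
#   -> '/venda/casas/mg+belo-horizonte/?pagina=1'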


class BusinessFilter(Enum):
    """Business type filter enumeration."""
Comprar = 'Venda'
Alugar = 'Aluguel'
Lancamentos = 'Lançamentos'


class UnitType(Enum):
    """Unit type filter enumeration."""
Todos = 'Imoveis'
Casas = 'Casas'
Apartamentos = 'Apartamentos'
Quitinetes = 'Quitinetes'


class ListedItem:
    """A listed item on zapimoveis.com.br."""
Price = None
Condominium_fee = None
IPTU_fee = None
Floor_size = None
Number_bedrooms = None
Number_bathrooms = None
Parking_spaces = None
Address = None
Neighborhood = None
City = None
State = None
Longitude = None
Latitude = None
Title = None
Description = None
Link = None
Publisher = None
Item_ID = None
Created = None
Unit_types = None


class DataScraper:
    """Extract selected data from zapimoveis.com.br and save it for analysis."""

    def __init__(self):
        """Initialize an empty DataScraper."""
        super().__init__()

    @staticmethod
    def __parser_description(description):
        """Clean up a listing description by removing whitespace-only lines."""
        # Blank out lines that contain only whitespace.
        description_parsed = re.sub(r'^\s*$', '', description, flags=re.MULTILINE)
        # Collapse consecutive newlines into a single newline.
        description_parsed = re.sub(r'\n\s*\n', '\n', description_parsed, flags=re.MULTILINE)
        return description_parsed
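
    # An illustrative example (the caller converts <br> tags to newlines first):
    #   __parser_description('Casa ampla.\n   \n\nQuintal grande.')
    #   -> 'Casa ampla.\nQuintal grande.'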

    @staticmethod
    def __fetch_data(action, unit_type, city, page):
        """Fetch data from the zapimoveis.com.br website using the selection criteria."""
        # Construct the query path from the selection criteria.
        query_path = PATH % {'action': slugify(action),
                             'unit_type': slugify(unit_type),
                             'state': STATES[0],
                             'city': slugify(city),
                             'page': page}
        pool = urllib3.HTTPSConnectionPool(DOMAIN_NAME, port=PORT, cert_reqs=CERT_REQS)
        # Make the request.
        response = pool.request('GET', query_path)
        logging.info(f'GET >> {pool.host}{query_path}\tSTATUS: {response.status}')
        soup = BeautifulSoup(response.data.decode('utf-8'), 'html.parser')
        # The listings live in a JSON blob assigned to window.__INITIAL_STATE__
        # inside a <script> tag; strip the surrounding JavaScript and parse it.
        page_data = soup.find(
            lambda tag: tag.name == "script" and isinstance(tag.string, str) and tag.string.startswith("window"))
        json_string = page_data.string.replace("window.__INITIAL_STATE__=", "").replace(
            ";(function(){var s;(s=document.currentScript||document.scripts["
            "document.scripts.length-1]).parentNode.removeChild(s);}());",
            "")
        return json.loads(json_string)['results']['listings']
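
    # For orientation, an illustrative (trimmed) shape of the returned value;
    # only the fields consumed by __data_scraper are shown:
    #   [{'listing': {'pricingInfos': [...], 'usableAreas': [...],
    #                 'bedrooms': [...], 'address': {...}, ...},
    #     'link': {'href': '...', 'data': {'street': '...', ...}},
    #     'account': {'name': '...'}}, ...]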

    @staticmethod
    def __data_to_csv(data):
        """Write the collected data to a comma-separated values (CSV) file."""
        # Build the data frame, one row per listed item.
data_frame = pd.DataFrame([(item.Price,
item.Condominium_fee,
item.Floor_size,
item.Number_bedrooms,
item.Number_bathrooms,
item.Parking_spaces,
item.IPTU_fee,
item.Address,
item.Neighborhood,
item.City,
item.State,
item.Longitude,
item.Latitude,
item.Title,
item.Description,
item.Link,
item.Publisher,
item.Item_ID,
item.Created,
                                    item.Unit_types) for item in data],
columns=['Price',
'Condominium',
'FloorSize',
'NumberOfBedrooms',
'NumberOfBathrooms',
'ParkingSpaces',
'IPTU',
'Address',
'Neighborhood',
'City',
'State',
'Longitude',
'Latitude',
'Title',
'Description',
'Link',
'Publisher',
'ItemID',
'Created',
'UnitTypes'])
# Write data frame to a comma-separated values (csv) file.
data_frame.to_csv('data.csv', index=False)
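
    # A minimal sketch of reading the output back for analysis (assumes the
    # file written above):
    #   data_frame = pd.read_csv('data.csv')
    #   data_frame['Price'].describe()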

    def __data_scraper(self, data):
        """Process a single listing from zapimoveis.com.br into a ListedItem."""
        item = ListedItem()
item.Price = float(data['listing']['pricingInfos'][0].get('price', 0) if len(
data['listing']['pricingInfos']) > 0 else 0)
item.Condominium_fee = float(data['listing']['pricingInfos'][0].get('monthlyCondoFee', 0) if len(
data['listing']['pricingInfos']) > 0 else 0)
item.IPTU_fee = float(data['listing']['pricingInfos'][0].get('yearlyIptu', 0) if len(
data['listing']['pricingInfos']) > 0 else 0)
item.Floor_size = float(data['listing']['usableAreas'][0] if len(data['listing']['usableAreas']) > 0 else 0)
item.Number_bedrooms = int(data['listing']['bedrooms'][0] if len(data['listing']['bedrooms']) > 0 else 0)
item.Number_bathrooms = int(data['listing']['bathrooms'][0] if len(data['listing']['bathrooms']) > 0 else 0)
item.Parking_spaces = int(
data['listing']['parkingSpaces'][0] if len(data['listing']['parkingSpaces']) > 0 else 0)
item.Address = f"{data['link']['data']['street'].strip()}, {data['link']['data']['streetNumber'].strip()}"
item.Neighborhood = data['link']['data']['neighborhood'].strip()
item.City = data['link']['data']['city'].strip()
item.State = data['link']['data']['state'].strip()
        # 'lon' is assumed to be the longitude key in the address payload.
        item.Longitude = data['listing']['address']['point'].get('lon') if data['listing']['address'].get('point') else None
item.Latitude = data['listing']['address']['point']['lat'] if data['listing']['address'].get('point') else None
item.Title = data['listing']['title'].strip()
item.Description = self.__parser_description(data['listing']['description'].replace('<br>', '\n').strip())
item.Link = DOMAIN_NAME + data['link']['href']
item.Publisher = data['account']['name'].strip()
item.Item_ID = data['listing']['id'].strip()
item.Created = data['listing']['createdAt'].strip()
item.Unit_types = data['listing']['unitTypes'][0].strip()
return item
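
    # Note: pricingInfos is a list and only its first entry is read, with 0 as
    # the fallback. Illustrative input/output pairs:
    #   {'pricingInfos': [{'price': '350000'}]} -> item.Price == 350000.0
    #   {'pricingInfos': []}                    -> item.Price == 0.0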

    def __get_data(self):
        """Walk the paginated results for every town and unit type."""
        listed_items = []
        for query_town in TOWNS:
            for unit_type in UnitType:
                # range() excludes its upper bound; +1 fetches QUANTITY_TO_FETCH pages.
                for page in range(1, QUANTITY_TO_FETCH + 1):
sleep_time = randint(RANDINT_STARTING, RANDINT_FINAL)
results = self.__fetch_data(BusinessFilter.Comprar.value, unit_type.value, query_town, page)
for result in results:
listed_items.append(self.__data_scraper(result))
logging.info(f'Delay execution: {sleep_time}')
time.sleep(sleep_time)
return listed_items

    def execute(self):
        """Collect the listings and write them to a CSV file."""
self.__data_to_csv(self.__get_data())


def main():
    """Execute when the module is not initialized from an import statement."""
logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)
logging.info('Zap Imóveis Scraper --- Started')
data = DataScraper()
data.execute()
logging.info('Zap Imóveis Scraper --- Finished')


if __name__ == "__main__":
main()
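
# Typical invocation (writes data.csv to the working directory):
#   $ python scraper.py
#   INFO: Zap Imóveis Scraper --- Started
#   ...
#   INFO: Zap Imóveis Scraper --- Finished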