-
Notifications
You must be signed in to change notification settings - Fork 185
/
apify_zillow_scraper.py
100 lines (79 loc) · 3.77 KB
/
apify_zillow_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
"""This module defines the main entry point for the Apify Actor.
Feel free to modify this file to suit your specific needs.
To build Apify Actors, utilize the Apify SDK toolkit, read more at the official documentation:
https://docs.apify.com/sdk/python
"""
from apify import Actor
import httpx
from lxml import html
import asyncio
async def fetch_properties(url, headers):
async with httpx.AsyncClient() as client:
response = await client.get(url, headers=headers)
if response.status_code != 200:
Actor.log.error(f"Failed to fetch the HTML content. Status code: {response.status_code}")
return []
# Parsing the response content using lxml
tree = html.fromstring(response.content)
properties = []
# Using XPath to select property cards
property_cards = tree.xpath('//li[contains(@class, "ListItem-c11n-8-105-0")]')
for card in property_cards:
obj = {}
try:
# Address
obj["Address"] = card.xpath('.//a/address/text()')[0].strip()
except IndexError:
obj["Address"] = None
try:
# Price
obj["Price"] = card.xpath('.//span[@data-test="property-card-price"]/text()')[0].strip()
except IndexError:
obj["Price"] = None
# Extracting and splitting Bds, Baths, and Sqft data
try:
details = card.xpath('.//ul[contains(@class, "StyledPropertyCardHomeDetailsList-c11n-8-105-0__sc-1j0som5-0 ldtVy")]')
if details:
details_list = details[0].xpath('.//li/b/text()')
obj["Bds"] = details_list[0].strip() if len(details_list) > 0 else None
obj["Baths"] = details_list[1].strip() if len(details_list) > 1 else None
obj["Sqft"] = details_list[2].strip() if len(details_list) > 2 else None
else:
obj["Bds"] = obj["Baths"] = obj["Sqft"] = None
except IndexError:
obj["Bds"] = obj["Baths"] = obj["Sqft"] = None
properties.append(obj)
return properties
async def main() -> None:
"""Main entry point for the Apify Actor."""
async with Actor:
Actor.log.info('Starting the Zillow scraping actor...')
base_url = "https://www.zillow.com/new-york-ny/"
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36",
"Accept-Language": "en-US,en;q=0.9",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"Accept-Encoding": "gzip, deflate, br",
"upgrade-insecure-requests": "1"
}
all_properties = []
page_number = 1
properties_to_collect = 20
while True:
url = f"{base_url}?page={page_number}"
Actor.log.info(f"Fetching page {page_number}...")
properties = await fetch_properties(url, headers)
if not properties:
Actor.log.info("No more properties found or unable to fetch page.")
break
valid_properties = [p for p in properties if p["Address"] and p["Price"] and p["Bds"] and p["Baths"] and p["Sqft"]]
all_properties.extend(valid_properties)
if len(all_properties) >= properties_to_collect:
break
page_number += 1
await asyncio.sleep(2)
# Log the total number of properties scraped
Actor.log.info(f"Successfully scraped {len(all_properties)} properties.")
# Running the script
if __name__ == "__main__":
asyncio.run(main())