Thanks so much for the support. I am using Python with BeautifulSoup:
I just need to run this code over multiple pages (that is, scrape the same data from pages 1 to 1290). I am new to this, but I imagine it can't be that complicated, since the page number appears directly in the URL.
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
my_url = 'https://www.propertyfinder.eg/en/search?c=1&ob=mr&page=1&t=3000'
#Opening the connection and grabbing the page
uClient = uReq(my_url)
#offload page content into a variable
page_html = uClient.read()
uClient.close()
#html parsing
page_soup = soup(page_html, "html.parser")
cards = page_soup.find_all("div", {"class": "card__content"})
#file creation
filename = "propertyfinder.csv"
f = open(filename, "w")
headers = "title,address,area,bedrooms,bathrooms,price\n"
f.write(headers)
##DATA
for card in cards:
    #TITLE
    title_container = card.find_all("h2", {"class": "card__title card__title-link"})
    title = title_container[0].text
    #ADDRESS
    address_container = card.find_all("span", {"class": "card__location-text"})
    address = address_container[0].text
    #PRICE
    price_container = card.find_all("span", {"class": "card__price-value"})
    price = price_container[0].text.strip().replace("EGP", "")
    #BEDROOMS
    bedrooms_container = card.find_all("p", {"class": "card__property-amenity card__property-amenity--bedrooms"})
    bedrooms = bedrooms_container[0].text.strip()
    #BATHROOMS
    bathrooms_container = card.find_all("p", {"class": "card__property-amenity card__property-amenity--bathrooms"})
    bathrooms = bathrooms_container[0].text.strip()
    #AREA
    area_container = card.find_all("p", {"class": "card__property-amenity card__property-amenity--area"})
    area = area_container[0].text
    #CLOSING
    print(title)
    print(address)
    print(area)
    print(bedrooms)
    print(bathrooms)
    print(price)
    #commas inside fields are replaced so the CSV columns stay aligned
    f.write(title.replace(",", "|") + "," + address.replace(",", "|") + "," + area + "," + bedrooms + "," + bathrooms + "," + price.replace(",", "") + "\n")
f.close()
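As an aside, a sketch (not part of the original script, assuming the same six fields): Python's built-in csv module quotes fields containing commas automatically, which avoids the manual replace(",", "|") workaround, and opening the file with encoding="utf-8" also sidesteps the UnicodeEncodeError mentioned in the answer below:

import csv

with open("propertyfinder.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["title", "address", "area", "bedrooms", "bathrooms", "price"])
    # Fields with embedded commas are quoted automatically; no replace() needed
    writer.writerow([title, address, area, bedrooms, bathrooms, price])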
Try it like this:
Note: I used try and except to bypass some UnicodeEncodeError exceptions; I worked out how to run multiple pages in the script as below, for anyone's reference:
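A minimal sketch of that multi-page loop, assuming the search URL changes only in its page parameter and reusing the card-parsing logic from the question; the try/except around each card mirrors the UnicodeEncodeError workaround mentioned above:

from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq

filename = "propertyfinder.csv"
f = open(filename, "w")
f.write("title,address,area,bedrooms,bathrooms,price\n")

# Loop over all result pages; only the page parameter changes in the URL
for page in range(1, 1291):
    my_url = 'https://www.propertyfinder.eg/en/search?c=1&ob=mr&page=' + str(page) + '&t=3000'
    uClient = uReq(my_url)
    page_soup = soup(uClient.read(), "html.parser")
    uClient.close()

    for card in page_soup.find_all("div", {"class": "card__content"}):
        try:
            title = card.find_all("h2", {"class": "card__title card__title-link"})[0].text
            address = card.find_all("span", {"class": "card__location-text"})[0].text
            price = card.find_all("span", {"class": "card__price-value"})[0].text.strip().replace("EGP", "")
            bedrooms = card.find_all("p", {"class": "card__property-amenity card__property-amenity--bedrooms"})[0].text.strip()
            bathrooms = card.find_all("p", {"class": "card__property-amenity card__property-amenity--bathrooms"})[0].text.strip()
            area = card.find_all("p", {"class": "card__property-amenity card__property-amenity--area"})[0].text
            f.write(title.replace(",", "|") + "," + address.replace(",", "|") + "," + area + "," + bedrooms + "," + bathrooms + "," + price.replace(",", "") + "\n")
        except (UnicodeEncodeError, IndexError):
            # Skip cards that fail to encode or are missing a field
            continue

f.close()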