Adjusting Python (BeautifulSoup) code to scrape multiple pages

Posted 2024-04-23 08:26:02


Thanks in advance for your help. I'm working with Python and BeautifulSoup:

I just need to run this code across multiple pages (i.e., scrape the same data from pages 1 through 1290). I'm new to this, but I imagine it isn't that complicated, since the page number appears directly in the URL.

from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq

my_url = 'https://www.propertyfinder.eg/en/search?c=1&ob=mr&page=1&t=3000'

# Opening the connection and grabbing the page
uClient = uReq(my_url)

# offload page content into a variable
page_html = uClient.read()
uClient.close()

# html parsing
page_soup = soup(page_html, "html.parser")
cards = page_soup.findAll("div", {"class": "card__content"})
contain_cards = cards[0]

# file creation
filename = "propertyfinder.csv"
f = open(filename, "w")
headers = "title,address,area,bedrooms,bathrooms,price\n"
f.write(headers)

## DATA
for contain_cards in cards:
    # TITLE
    title_container = contain_cards.findAll("h2", {"class": "card__title card__title-link"})
    title = title_container[0].text

    # ADDRESS
    address_container = contain_cards.findAll("span", {"class": "card__location-text"})
    address = address_container[0].text

    # PRICE
    price_container = contain_cards.findAll("span", {"class": "card__price-value"})
    price = (price_container[0].text.strip()).replace("EGP", "")

    # BEDROOMS
    bedrooms_container = contain_cards.findAll("p", {"class": "card__property-amenity card__property-amenity--bedrooms"})
    bedrooms = bedrooms_container[0].text.strip()

    # BATHROOMS
    bathrooms_container = contain_cards.findAll("p", {"class": "card__property-amenity card__property-amenity--bathrooms"})
    bathrooms = bathrooms_container[0].text.strip()

    # AREA
    area_container = contain_cards.findAll("p", {"class": "card__property-amenity card__property-amenity--area"})
    area = area_container[0].text

    # CLOSING
    print(title)
    print(address)
    print(area)
    print(bedrooms)
    print(bathrooms)
    print(price)
    f.write(title.replace(",", "|") + "," + address.replace(",", "|") + "," + area + "," + bedrooms + "," + bathrooms + "," + price.replace(",", "") + "\n")

f.close()

Tags: text, title, address, container, page, area, property, card
2 Answers

Try something like this:

from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq

# file creation
num = 1
filename = "propertyfinder.csv"
with open(filename, 'w') as f:
    headers = "title,address,area,bedrooms,bathrooms,price\n"
    f.write(headers)
    while True:
        my_url = f'https://www.propertyfinder.eg/en/search?c=1&ob=mr&page={num}&t=3000'

        uClient = uReq(my_url)

        page_html = uClient.read()
        uClient.close()

        page_soup = soup(page_html, "html.parser")
        cards = page_soup.findAll("div", {"class": "card__content"})
        try:
            for contain_cards in cards:
                # TITLE
                title_container = contain_cards.findAll("h2", {"class": "card__title card__title-link"})
                title = title_container[0].text

                # ADDRESS
                address_container = contain_cards.findAll("span", {"class": "card__location-text"})
                address = address_container[0].text

                # PRICE
                price_container = contain_cards.findAll("span", {"class": "card__price-value"})
                price = (price_container[0].text.strip()).replace("EGP", "")

                # BEDROOMS
                bedrooms_container = contain_cards.findAll("p", {"class": "card__property-amenity card__property-amenity--bedrooms"})
                bedrooms = bedrooms_container[0].text.strip()

                # BATHROOMS
                bathrooms_container = contain_cards.findAll("p", {"class": "card__property-amenity card__property-amenity--bathrooms"})
                bathrooms = bathrooms_container[0].text.strip()

                # AREA
                area_container = contain_cards.findAll("p", {"class": "card__property-amenity card__property-amenity--area"})
                area = area_container[0].text

                # CLOSING
                print(title)
                print(address)
                print(area)
                print(bedrooms)
                print(bathrooms)
                print(price)
                f.write(title.replace(",", "|") + "," + address.replace(",", "|") + "," + area + "," + bedrooms + "," + bathrooms + "," + price.replace(",", "") + "\n")
        except:
            # skip cards that raise errors (e.g., UnicodeEncodeError); see note below
            pass
        num += 1
        if num > 1290:
            break

Note: I used try/except to get around some UnicodeEncodeError exceptions, but this should show you how to run the script across multiple pages.
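If the UnicodeEncodeError comes from writing non-ASCII listing text (the usual cause on Windows, where open() defaults to a legacy codec), a more targeted fix than the bare except is to open the CSV with an explicit encoding. A minimal sketch:

# Sketch: an explicit encoding lets non-ASCII characters (e.g., Arabic
# street names) be written directly, so the except no longer has to
# swallow UnicodeEncodeError.
with open("propertyfinder.csv", "w", encoding="utf-8") as f:
    f.write("title,address,area,bedrooms,bathrooms,price\n")
    # ... same scraping loop as above ...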

Worked out the following, for anyone's reference:

from bs4 import BeautifulSoup
import requests

def scrape_properties(page):
    my_url = f'https://www.propertyfinder.eg/en/search?c=1&ob=mr&page={page}&t=3000'

    #Opening the connection and grabbing the page
    headers = {
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
        'referer': 'https://google.com',
    }
    response = requests.get(my_url, headers=headers)

    #html parsing
    page_soup = BeautifulSoup(response.text, "html.parser")
    cards = page_soup.find_all("div",{"class":"card__content"})

    #file creation
    filename = "propertyfinder.csv"
    if page == 1:
        f = open(filename, "w")

        headers = "title,address,area,bedrooms,bathrooms,price,ptype\n"
        f.write(headers)
    else:
        f = open(filename, "a")
    ##DATA

    for contain_cards in cards:
        try:

            #TITLE
            title_container = contain_cards.find_all("h2",{"class":"card__title card__title-link"})
            title = title_container[0].text.strip()

            #ADDRESS
            address_container = contain_cards.find_all("span",{"class":"card__location-text"})
            address = address_container[0].text.strip()

            #PRICE
            price_container = contain_cards.find_all("span",{"class":"card__price-value"})
            price = (price_container[0].text.strip()).replace("EGP","").strip()

            #BEDROOMS
            bedrooms_container = contain_cards.find_all("p",{"class":"card__property-amenity card__property-amenity--bedrooms"})
            bedrooms = bedrooms_container[0].text.strip()

            #BATHROOMS
            bathrooms_container = contain_cards.find_all("p",{"class":"card__property-amenity card__property-amenity--bathrooms"})
            bathrooms = bathrooms_container[0].text.strip()

            #AREA
            area_container = contain_cards.find_all("p",{"class":"card__property-amenity card__property-amenity--area"})
            area = area_container[0].text.strip()

            #PTYPE
            ptype_container = contain_cards.find_all("p",{"class":"card__property-amenity card__property-amenity--property-type"})
            ptype = ptype_container[0].text.strip()

            #CLOSING
            print (title)
            print (address)
            print (area)
            print (bedrooms)
            print (bathrooms)
            print (price)
            print (ptype)
            f.write(title.replace(",","|") + "," + address.replace(",","|") + "," + area.replace(",","") + "," + bedrooms + "," + bathrooms + "," + price.replace(",","") + "," + ptype + "\n" )
        except:
            # skip cards with missing fields rather than aborting the whole page
            pass
    
    f.close()

for page in range(1, 100):  # pages 1-99; raise the upper bound (e.g., to 1291) to cover all 1290 pages
    scrape_properties(page)
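As a side note, the manual replace(",", "|") escaping can be dropped entirely by using Python's built-in csv module, which quotes fields containing commas automatically. A minimal sketch of the write path (variable names taken from the loop above):

import csv

# Sketch: csv.writer quotes fields that contain commas, so values can be
# written verbatim instead of replacing "," with "|".
with open("propertyfinder.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["title", "address", "area", "bedrooms", "bathrooms", "price", "ptype"])
    # inside the card loop, write each record directly:
    writer.writerow([title, address, area, bedrooms, bathrooms, price, ptype])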
