Adjusting Python (BeautifulSoup) code to scrape multiple pages

Posted 2024-04-23 08:26:02


Thanks in advance for your help. I'm working with Python and BeautifulSoup:

I just need to run this code across multiple pages (i.e., scrape the same data from pages 1 through 1290). I'm new to this, but I imagine it isn't that complicated, since the page number appears directly in the URL.

from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq

my_url = 'https://www.propertyfinder.eg/en/search?c=1&ob=mr&page=1&t=3000'

# Opening the connection and grabbing the page
uClient = uReq(my_url)

# offload page content into a variable
page_html = uClient.read()
uClient.close()

# html parsing
page_soup = soup(page_html, "html.parser")
cards = page_soup.findAll("div", {"class": "card__content"})
contain_cards = cards[0]

# file creation
filename = "propertyfinder.csv"
f = open(filename, "w")
headers = "title,address,area,bedrooms,bathrooms,price\n"
f.write(headers)

## DATA
for contain_cards in cards:
    # TITLE
    title_container = contain_cards.findAll("h2", {"class": "card__title card__title-link"})
    title = title_container[0].text

    # ADDRESS
    address_container = contain_cards.findAll("span", {"class": "card__location-text"})
    address = address_container[0].text

    # PRICE
    price_container = contain_cards.findAll("span", {"class": "card__price-value"})
    price = (price_container[0].text.strip()).replace("EGP", "")

    # BEDROOMS
    bedrooms_container = contain_cards.findAll("p", {"class": "card__property-amenity card__property-amenity--bedrooms"})
    bedrooms = bedrooms_container[0].text.strip()

    # BATHROOMS
    bathrooms_container = contain_cards.findAll("p", {"class": "card__property-amenity card__property-amenity--bathrooms"})
    bathrooms = bathrooms_container[0].text.strip()

    # AREA
    area_container = contain_cards.findAll("p", {"class": "card__property-amenity card__property-amenity--area"})
    area = area_container[0].text

    # CLOSING
    print(title)
    print(address)
    print(area)
    print(bedrooms)
    print(bathrooms)
    print(price)
    f.write(title.replace(",", "|") + "," + address.replace(",", "|") + "," + area + "," + bedrooms + "," + bathrooms + "," + price.replace(",", "") + "\n")

f.close()

Tags: text, title, address, container, page, area, property, card
2 Answers

Try something like this:

from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq

# file creation
num = 1
filename = "propertyfinder.csv"
with open(filename, 'w') as f:
    headers = "title,address,area,bedrooms,bathrooms,price\n"
    f.write(headers)
    while True:
        my_url = f'https://www.propertyfinder.eg/en/search?c=1&ob=mr&page={num}&t=3000'

        uClient = uReq(my_url)

        page_html = uClient.read()
        uClient.close()

        page_soup = soup(page_html, "html.parser")
        cards = page_soup.findAll("div", {"class": "card__content"})
        try:
            for contain_cards in cards:
                # TITLE
                title_container = contain_cards.findAll("h2", {"class": "card__title card__title-link"})
                title = title_container[0].text

                # ADDRESS
                address_container = contain_cards.findAll("span", {"class": "card__location-text"})
                address = address_container[0].text

                # PRICE
                price_container = contain_cards.findAll("span", {"class": "card__price-value"})
                price = (price_container[0].text.strip()).replace("EGP", "")

                # BEDROOMS
                bedrooms_container = contain_cards.findAll("p", {"class": "card__property-amenity card__property-amenity--bedrooms"})
                bedrooms = bedrooms_container[0].text.strip()

                # BATHROOMS
                bathrooms_container = contain_cards.findAll("p", {"class": "card__property-amenity card__property-amenity--bathrooms"})
                bathrooms = bathrooms_container[0].text.strip()

                # AREA
                area_container = contain_cards.findAll("p", {"class": "card__property-amenity card__property-amenity--area"})
                area = area_container[0].text

                # CLOSING
                print(title)
                print(address)
                print(area)
                print(bedrooms)
                print(bathrooms)
                print(price)
                f.write(title.replace(",", "|") + "," + address.replace(",", "|") + "," + area + "," + bedrooms + "," + bathrooms + "," + price.replace(",", "") + "\n")
        except:
            # skip cards that raise errors (e.g., UnicodeEncodeError); see note below
            pass
        num += 1
        if num > 1290:
            break

Note: I used try/except to get around some UnicodeEncodeError exceptions, but this should show you how to run the script across multiple pages.
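If the UnicodeEncodeError comes from writing non-ASCII listing text (the usual cause on Windows, where open() defaults to a legacy codec), a more targeted fix than the bare except is to open the CSV with an explicit encoding. A minimal sketch:

# Sketch: an explicit encoding lets non-ASCII characters (e.g., Arabic
# street names) be written directly, so the except no longer has to
# swallow UnicodeEncodeError.
with open("propertyfinder.csv", "w", encoding="utf-8") as f:
    f.write("title,address,area,bedrooms,bathrooms,price\n")
    # ... same scraping loop as above ...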

Worked out the following, for anyone's reference:

from bs4 import BeautifulSoup
import requests

def scrape_properties(page):
    my_url = f'https://www.propertyfinder.eg/en/search?c=1&ob=mr&page={page}&t=3000'

    #Opening the connection and grabbing the page
    headers = {
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
        'referer': 'https://google.com',
    }
    response = requests.get(my_url, headers=headers)

    #html parsing
    page_soup = BeautifulSoup(response.text, "html.parser")
    cards = page_soup.find_all("div",{"class":"card__content"})

    #file creation
    filename = "propertyfinder.csv"
    if page == 1:
        f = open(filename, "w")

        headers = "title,address,area,bedrooms,bathrooms,price,ptype\n"
        f.write(headers)
    else:
        f = open(filename, "a")
    ##DATA

    for contain_cards in cards:
        try:

            #TITLE
            title_container = contain_cards.find_all("h2",{"class":"card__title card__title-link"})
            title = title_container[0].text.strip()

            #ADDRESS
            address_container = contain_cards.find_all("span",{"class":"card__location-text"})
            address = address_container[0].text.strip()

            #PRICE
            price_container = contain_cards.find_all("span",{"class":"card__price-value"})
            price = (price_container[0].text.strip()).replace("EGP","").strip()

            #BEDROOMS
            bedrooms_container = contain_cards.find_all("p",{"class":"card__property-amenity card__property-amenity--bedrooms"})
            bedrooms = bedrooms_container[0].text.strip()

            #BATHROOMS
            bathrooms_container = contain_cards.find_all("p",{"class":"card__property-amenity card__property-amenity--bathrooms"})
            bathrooms = bathrooms_container[0].text.strip()

            #AREA
            area_container = contain_cards.find_all("p",{"class":"card__property-amenity card__property-amenity--area"})
            area = area_container[0].text.strip()

            #PTYPE
            ptype_container = contain_cards.find_all("p",{"class":"card__property-amenity card__property-amenity--property-type"})
            ptype = ptype_container[0].text.strip()

            #CLOSING
            print (title)
            print (address)
            print (area)
            print (bedrooms)
            print (bathrooms)
            print (price)
            print (ptype)
            f.write(title.replace(",","|") + "," + address.replace(",","|") + "," + area.replace(",","") + "," + bedrooms + "," + bathrooms + "," + price.replace(",","") + "," + ptype + "\n" )
        except:
            # skip cards with missing fields rather than aborting the whole page
            pass
    
    f.close()

for page in range(1, 100):  # pages 1-99; raise the upper bound (e.g., to 1291) to cover all 1290 pages
    scrape_properties(page)
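As a side note, the manual replace(",", "|") escaping can be dropped entirely by using Python's built-in csv module, which quotes fields containing commas automatically. A minimal sketch of the write path (variable names taken from the loop above):

import csv

# Sketch: csv.writer quotes fields that contain commas, so values can be
# written verbatim instead of replacing "," with "|".
with open("propertyfinder.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["title", "address", "area", "bedrooms", "bathrooms", "price", "ptype"])
    # inside the card loop, write each record directly:
    writer.writerow([title, address, area, bedrooms, bathrooms, price, ptype])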
