刮削视图函数会记住它以前的迭代

2024-06-07 05:47:39 发布

您现在位置:Python中文网/ 问答频道 /正文

我使用以下视图函数来刮取数据:

def results(request):
    if request.method == 'POST':
        form = RoomForm(request.POST)

        if form.is_valid():
            form_city = form.cleaned_data['city'].title()
            form_country = form.cleaned_data['country'].title()
            form_arrival_date = form.cleaned_data['arrival_date']
            form_departure_date = form.cleaned_data['departure_date']
            form_pages_to_scrape = form.cleaned_data['pages_to_scrape']

    #launch scraper   
    scraper = AIRBNB_scraper(city=form_city, country=form_country, arrival_date=str(form_arrival_date), departure_date=str(form_departure_date))
    scraped_dataframe = scraper.scrape_multiple_pages(last_page_selector_number=form_pages_to_scrape)
    scraped_dataframe_sorted = scraped_dataframe.sort_values('prices')
    print(scraped_dataframe_sorted)

    #convert scraped dataframe into lists
    prices = scraped_dataframe_sorted['prices'].tolist()
    listings_links = scraped_dataframe_sorted['listings_links'].tolist()
    listings_names = scraped_dataframe_sorted['listings_names'].tolist()
    photo_links = scraped_dataframe_sorted['photo_links'].tolist()

    dictionary = zip(prices, listings_links, listings_names, photo_links)

    context = {'dictionary': dictionary}
    return render(request, 'javascript/results.html', context)

在表单提交时,使用AJAX将post请求发送到此函数:

var frm = $('#login-form');
frm.submit(function () {
    $.ajax({
        type: "POST",
        url: "/results",
        data: frm.serialize(),
        success: function (data) {
            $("#table").html(data);
            $('#go_back').remove();
        },
        error: function(data) {
            $("#table").html("Something went wrong!");
        }
    });
    return false;
});

之后,在表单所在的同一页面上,所刮取的数据显示为HTML表

问题是每次表单提交完成时,被刮下的项目数量都会翻一番。因此,例如,如果第一次单击按钮时被刮下的项目数是16,那么输出将是16,但是第二次运行时,它将是32,然后是64,依此类推

这就像应用程序记得以前的表单提交,但我看不出任何原因。在这个函数的末尾,我尝试了clearin,pandas数据框用来存储被刮取的数据和作为上下文传递的字典,但是没有用

表格为:

class RoomForm(forms.Form):
    city = forms.CharField(max_length=100)
    country = forms.CharField(max_length=100)
    arrival_date = forms.DateField(widget=forms.DateInput(attrs=
                                {
                                    'class':'datepicker'
                                }), required=False)
    departure_date = forms.DateField(widget=forms.DateInput(attrs=
                                {
                                    'class':'datepicker'
                                }), required=False)
    pages_to_scrape = forms.IntegerField(label='Pages to scrape (max. 17)', min_value=0, max_value=17, widget=forms.NumberInput(attrs={'style':'width: 188px'}))

AIRBNB\u刮刀是:

import requests, bs4
import re
import pandas as pd

price_pattern = re.compile(r'\d*\s*?,?\s*?\d*\szł')
photo_link_pattern = re.compile(r'https.*\)')

prices = []
listings_links = []
photo_links = []
listings_names = []

class AIRBNB_scraper():

    def __init__(self, city, country, accomodation_type='homes', arrival_date='2018-03-25', departure_date='2018-04-10'):
        self.city = city
        self.country = country
        self.arrival_date = arrival_date
        self.departure_date = departure_date
        self.accomodation_type = accomodation_type

    def make_soup(self, page_number):
        url = 'https://www.airbnb.pl/s/'+ self.city +'--'+ self.country +'/'+ self.accomodation_type  +'?query='+ self.city +'%2C%20'+ self.country +'&refinement_paths%5B%5D=%2F'+ self.accomodation_type  +'&checkin=' + self.arrival_date + '&checkout=' + self.departure_date + '&section_offset=' + str(page_number)
        response = requests.get(url)  
        soup = bs4.BeautifulSoup(response.text, "html.parser")

        return soup

    def get_listings(self, page_number):

        soup = self.make_soup(page_number)
        listings = soup.select('._f21qs6')
        number_of_listings = len(listings)
        print('\n' + "Number of listings found: " + str(number_of_listings))

        while number_of_listings != 18:
            print('\n' + str(number_of_listings) + ' is not correct number of listings, it should be 18. Trying again now.')
            soup = self.make_soup(page_number)
            listings = soup.find_all('div', class_='_f21qs6')
            number_of_listings = len(listings)

        print('\n' + "All fine! The number of listings is: " + str(number_of_listings) + '. Starting scraping now')

        return listings

    def scrape_listings_per_page(self, page_number):

        listings_to_scrape = self.get_listings(page_number)

        for listing in listings_to_scrape:

            #get price
            price_container = listing.find_all('span', class_='_hylizj6')
            price_search = re.search(price_pattern, str(price_container))
            price = price_search.group()

            #get listing_link
            listing_link = 'https://www.airbnb.pl' + listing.find('a', class_='_15ns6vh')['href']

            #get photo_link
            photo_link_node = listing.find('div', class_="_1df8dftk")['style']
            photo_link_search = re.search(photo_link_pattern, str(photo_link_node))
            #~ if photo_link_search:
                #~ print('Is regex match')
            #~ else:
                #~ print('No regex match')
            photo_link_before_strip = photo_link_search.group()
            photo_link = photo_link_before_strip[:-1] #remove ") at the end of link 

            #get listing_name
            listing_name = listing.find('div', class_='_1rths372').text

            #append lists
            prices.append(price)
            listings_links.append(listing_link)
            photo_links.append(photo_link)
            listings_names.append(listing_name)

    def scrape_multiple_pages(self, last_page_selector_number):

        last_page_selector_number += 1
        for x in range(0, last_page_selector_number):#18
            self.scrape_listings_per_page(x)
            print('\n' + "INDEX OF PAGE BEING SCRAPED: " + str(x))
            scraped_data = pd.DataFrame({'prices': prices,
                                        'listings_links': listings_links,
                                        'photo_links': photo_links,
                                        'listings_names': listings_names})
        return scraped_data

Tags: selfformnumbercitydatadatepagelink
1条回答
网友
1楼 · 发布于 2024-06-07 05:47:39

您有模块级变量:priceslistings_links,等等。您在AIRBNB\u scraper实例中附加到这些变量,但它们不是该实例的一部分,并且在调用之间将保持不变。您应该使它们成为实例属性-在__init__方法中将它们定义为self.prices

相关问题 更多 >