从Vivino.com网站上抓取英文评论

#!/usr/bin/env python
# coding: utf-8
"""Collect wine listings and their user reviews from the Vivino web API.

The script queries the explore endpoint for one page of wines, flattens each
match into a row of a wine table, dumps the raw JSON of the first two matches
to disk, downloads every review page for every wine, and finally writes the
reviews joined with the wine table to ``data.csv``.
"""

# Import packages
import json

import pandas as pd
import requests


def get_wine_data(wine_id, year, page, timeout=30):
    """Fetch one page of reviews for a wine vintage and return the parsed JSON.

    Args:
        wine_id: Vivino wine identifier inserted into the URL path.
        year: Vintage year sent as the ``year`` query parameter.
        page: Page number sent as the ``page`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0",
    }

    api_url = f"https://www.vivino.com/api/wines/{wine_id}/reviews?per_page=50&year={year}&page={page}"
    print(api_url)

    # Without a timeout a stalled connection would block the scrape forever.
    response = requests.get(api_url, headers=headers, timeout=timeout)
    response.raise_for_status()
    return response.json()


def _extract_wine_row(t):
    """Flatten one explore match into the 40-field tuple used for the wine table."""
    vintage = t["vintage"]
    wine = vintage["wine"]
    style = wine["style"]
    region_country = wine["region"]["country"]
    structure = wine["taste"]["structure"]
    flavors = wine["taste"]["flavor"]

    row = [
        wine["winery"]["name"],
        vintage["year"],
        wine["id"],
        f'{wine["name"]} {vintage["year"]}',
        vintage["statistics"]["ratings_average"],
        vintage["statistics"]["ratings_count"],
        t["prices"][0]["amount"],
        style["acidity"],
        style["blurb"],
        style["body"],
        style["body_description"],
        region_country["name"],
        style["description"],
    ]
    # First four food pairings of the wine style.
    row += [style["food"][i]["name"] for i in range(4)]
    row += [
        # NOTE(review): the first grape is read from style -> country while the
        # other two come from region -> country; kept exactly as written.
        style["country"]["most_used_grapes"][0]["name"],
        region_country["most_used_grapes"][1]["name"],
        region_country["most_used_grapes"][2]["name"],
        structure["acidity"],
        structure["calculated_structure_count"],
        structure["intensity"],
        structure["sweetness"],
        structure["tannin"],
        structure["user_structure_count"],
    ]
    # First seven flavor groups, each followed by its count.
    for i in range(7):
        row += [flavors[i]["group"], flavors[i]["stats"]["count"]]
    return tuple(row)


# Get request from the Vivino website
r = requests.get(
    "https://www.vivino.com/api/explore/explore",
    params={
        "country_code": "FR",
        "country_codes[]": "pt",
        "currency_code": "EUR",
        "grape_filter": "varietal",
        "min_rating": "1",
        "order_by": "price",
        "order": "asc",
        "page": 1,
        "price_range_max": "500",
        "price_range_min": "0",
        "wine_type_ids[]": "1",
    },
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0"
    },
    timeout=30,
)
r.raise_for_status()

# Parse the response once and reuse the list of matches below.
matches = r.json()["explore_vintage"]["matches"]

# Variables to scrape from the Vivino website
results = [_extract_wine_row(t) for t in matches]

# Dump the raw JSON of the first two matches to disk.
for t in matches[0:2]:
    wine_id = t["vintage"]["wine"]["id"]
    # No trailing comma here: it would turn the year into a one-element tuple
    # and produce file names such as "output-wine1-year(2018,).json".
    year = t["vintage"]["year"]
    with open(f'output-wine{wine_id}-year{year}.json', 'w+') as f:
        json.dump(t, f, indent=4, sort_keys=True)

# Saving the results in a dataframe
dataframe = pd.DataFrame(
    results,
    columns=[
        "Winery", "Year", "Wine ID", "Wine", "Rating", "num_review", "price",
        "acidity", 'Blurb', 'Body', 'Body_des', 'country', 'wine_des',
        'food_1', 'food_2', 'food_3', 'food_4',
        'grape_1', 'grape_2', 'grape_3',
        'acidity_score', 'calculated_structure_count', 'intensity',
        'sweetness', 'tannin', 'user_structure_count',
        'flavor_1', 'flavor_1_count', 'flavor_2', 'flavor_2_count',
        'flavor_3', 'flavor_3_count', 'flavor_4', 'flavor_4_count',
        'flavor_5', 'flavor_5_count', 'flavor_6', 'flavor_6_count',
        'flavor_7', 'flavor_7_count',
    ],
)

# Scraping the reviews from the Vivino website
ratings = []

# NOTE: the original (Dutch) comment said head(2) had been added here for
# testing so that only two wines are processed; this loop visits every row.
for _, row in dataframe.iterrows():
    page = 1
    while True:
        print(
            f'Getting info about wine {row["Wine ID"]}-{row["Year"]} Page {page}'
        )

        d = get_wine_data(row["Wine ID"], row["Year"], page)

        # An empty review list ends the paging for this wine.
        if not d["reviews"]:
            break

        # Named "review" so the explore response bound to "r" is not overwritten.
        for review in d["reviews"]:
            ratings.append(
                [
                    row["Year"],
                    row["Wine ID"],
                    review["rating"],
                    review["note"],
                    review["created_at"],
                ]
            )

        page += 1

ratings = pd.DataFrame(
    ratings, columns=["Year", "Wine ID", "User Rating", "Note", "CreatedAt"]
)

# Merging the two datasets; results and ratings.
df_out = ratings.merge(dataframe)
df_out.to_csv("data.csv", index=False)

1条回答

网友

1楼 · 发布于 2024-05-29 11:06:46

要仅获取英文评论，可以按 "language" 键进行筛选：英文评论的该键值为 "en"。
要获取更多国家的葡萄酒，可以把国家代码放入 "country_codes[]" 键中，例如 ["pt", "es", "fr"]。

# Import packages
import requests
import json
import pandas as pd

# Get request from the Vivino website
def get_wine_data(wine_id, year, page, timeout=30):
    """Fetch one page of reviews for a wine vintage and return the parsed JSON.

    Args:
        wine_id: Vivino wine identifier inserted into the URL path.
        year: Vintage year sent as the ``year`` query parameter.
        page: Page number sent as the ``page`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0",
    }

    api_url = f"https://www.vivino.com/api/wines/{wine_id}/reviews?per_page=50&year={year}&page={page}"
    print(api_url)

    # Without a timeout a stalled connection would block the scrape forever.
    response = requests.get(api_url, headers=headers, timeout=timeout)
    # Fail with a clear HTTP error instead of a confusing JSON/key error later.
    response.raise_for_status()

    return response.json()


# Query the Vivino explore endpoint for one page of wines, ordered by ascending price
r = requests.get(
    "https://www.vivino.com/api/explore/explore",
    params={
        "country_codes[]": ["pt", "es", "fr"],  # <  put more country codes here
        "currency_code": "EUR",
        "grape_filter": "varietal",
        "min_rating": "1",
        "order_by": "price",
        "order": "asc",
        "page": 1,
        "price_range_max": "500",
        "price_range_min": "0",
        "wine_type_ids[]": "1",
    },
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0"
    },
    # Without a timeout a stalled connection would block the script forever.
    timeout=30,
)
# Stop on an HTTP error status rather than failing later while reading the JSON.
r.raise_for_status()

# Collect winery name, year, wine id and display name for every explore match
results = []
for match in r.json()["explore_vintage"]["matches"]:
    vintage = match["vintage"]
    wine = vintage["wine"]
    results.append(
        (
            wine["winery"]["name"],
            vintage["year"],
            wine["id"],
            f'{wine["name"]} {vintage["year"]}',
        )
    )

# Store the collected tuples in a table, one row per match
wine_columns = ["Winery", "Year", "Wine ID", "Wine"]
dataframe = pd.DataFrame(results, columns=wine_columns)

# Scraping the reviews from the Vivino website
ratings = []

for _, row in dataframe.iterrows():
    wine_id, year = row["Wine ID"], row["Year"]
    page = 1
    while True:
        print(
            f'Getting info about wine {wine_id}-{year} Page {page}'
        )

        d = get_wine_data(wine_id, year, page)

        # An empty review list ends the paging for this wine.
        if not d["reviews"]:
            break

        # Keep only English reviews. The generator variable stays local, so the
        # explore response bound to "r" is no longer overwritten, and .get()
        # skips a review without a "language" key instead of raising KeyError.
        ratings.extend(
            [
                year,
                wine_id,
                review["rating"],
                review["note"],
                review["created_at"],
            ]
            for review in d["reviews"]
            if review.get("language") == "en"  # <  get only english reviews
        )

        page += 1


# Turn the collected review rows into a table
review_columns = ["Year", "Wine ID", "User Rating", "Note", "CreatedAt"]
ratings = pd.DataFrame(ratings, columns=review_columns)

# Join each review with the wine table (default merge on the columns both
# tables share) and write the result to data.csv without the index column.
df_out = pd.merge(ratings, dataframe)
df_out.to_csv("data.csv", index=False)

相关问题更多 >

编程相关推荐

热门问题

热门文章