从Vivino.com网站上抓取英文评论

2024-05-29 11:06:46 发布

您现在位置:Python中文网/ 问答频道 /正文

关于从Vivino.com抓取信息,我有两个问题: 1.)使用下面的代码,我可以从Vivino网站上获取葡萄酒信息和评论,但我希望获取英文评论,或者说只获取英文评论。有办法做到这一点吗? 2.)目前我只抓取了葡萄牙的葡萄酒,但我还想抓取其他国家的葡萄酒。如果我删除 "country_codes[]": "pt" 这一项,代码就无法运行了。我怎样才能解决这个问题?

有人能帮我吗?多谢各位

#!/usr/bin/env python
# coding: utf-8

# Import packages
import requests
import json
import pandas as pd

# Get request from the Vivino website
def get_wine_data(wine_id, year, page, timeout=30):
    """Fetch one page of reviews for a wine vintage from the Vivino API.

    Args:
        wine_id: Wine identifier, interpolated into the URL path.
        year: Vintage year, sent as the ``year`` query parameter.
        page: Page number to request (50 reviews are requested per page).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0",
    }

    api_url = f"https://www.vivino.com/api/wines/{wine_id}/reviews?per_page=50&year={year}&page={page}"
    print(api_url)

    # Without a timeout a stalled connection would block the scrape forever;
    # raise_for_status() reports HTTP errors here rather than as a confusing
    # JSON decoding or key error further down.
    response = requests.get(api_url, headers=headers, timeout=timeout)
    response.raise_for_status()

    return response.json()

# Get request from the Vivino website
# Query the explore endpoint for the list of wines to process.
# NOTE(review): "country_code" (singular) and "country_codes[]" carry different
# values here; presumably one is the shopper's market and the other filters the
# wine's origin -- confirm against the API before changing either.
r = requests.get(
    "https://www.vivino.com/api/explore/explore",
    params={
        "country_code": "FR",
        "country_codes[]": "pt",
        "currency_code": "EUR",
        "grape_filter": "varietal",
        "min_rating": "1",
        "order_by": "price",
        "order": "asc",
        "page": 1,
        "price_range_max": "500",
        "price_range_min": "0",
        "wine_type_ids[]": "1",
    },
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0"
    },
    # Never wait indefinitely on a stalled connection.
    timeout=30,
)
# Fail right away on an HTTP error instead of on a later r.json() lookup.
r.raise_for_status()

# Variables to scrap from the Vivino website
def _dig(obj, *path):
    """Follow ``path`` (dict keys and list indices) into ``obj``.

    Returns None as soon as a step cannot be taken (absent key, index out of
    range, or a None / non-subscriptable intermediate value) instead of
    raising.
    """
    for step in path:
        try:
            obj = obj[step]
        except (KeyError, IndexError, TypeError):
            return None
    return obj


def _wine_row(match):
    """Flatten one explore match into a 40-field tuple.

    The vintage year, wine id and wine name are required (they identify the
    wine for the later review requests) and are read strictly. Every other
    field is optional and becomes None when the match does not provide it, so
    a wine with fewer than four foods or seven flavor groups no longer aborts
    the whole run.
    """
    vintage = match["vintage"]
    wine = vintage["wine"]
    style = _dig(wine, "style")
    country = _dig(wine, "region", "country")
    structure = _dig(wine, "taste", "structure")
    flavor = _dig(wine, "taste", "flavor")

    # Seven (group, count) pairs, flattened in order.
    flavor_fields = []
    for i in range(7):
        flavor_fields.append(_dig(flavor, i, "group"))
        flavor_fields.append(_dig(flavor, i, "stats", "count"))

    return (
        _dig(wine, "winery", "name"),
        vintage["year"],
        wine["id"],
        f'{wine["name"]} {vintage["year"]}',
        _dig(vintage, "statistics", "ratings_average"),
        _dig(vintage, "statistics", "ratings_count"),
        _dig(match, "prices", 0, "amount"),
        _dig(style, "acidity"),
        _dig(style, "blurb"),
        _dig(style, "body"),
        _dig(style, "body_description"),
        _dig(country, "name"),
        _dig(style, "description"),
        *(_dig(style, "food", i, "name") for i in range(4)),
        # NOTE(review): the first grape is read via "style" while the next two
        # are read via "region"; kept as in the original -- confirm intended.
        _dig(style, "country", "most_used_grapes", 0, "name"),
        _dig(country, "most_used_grapes", 1, "name"),
        _dig(country, "most_used_grapes", 2, "name"),
        _dig(structure, "acidity"),
        _dig(structure, "calculated_structure_count"),
        _dig(structure, "intensity"),
        _dig(structure, "sweetness"),
        _dig(structure, "tannin"),
        _dig(structure, "user_structure_count"),
        *flavor_fields,
    )


# One flattened row per wine returned by the explore request.
results = [_wine_row(t) for t in r.json()["explore_vintage"]["matches"]]

###
# Debug aid: write the raw JSON of the first two matches to disk for inspection.
for t in r.json()["explore_vintage"]["matches"][0:2]:
    wine_id = t["vintage"]["wine"]["id"]
    # No trailing comma: it would make `year` a 1-tuple and put "(2019,)"
    # style text into the file name below.
    year = t["vintage"]["year"]

    with open(f'output-wine{wine_id}-year{year}.json', 'w+') as f:
        json.dump(t, f, indent=4, sort_keys=True)
###

# Saving the results in a dataframe
# One row per wine; the column order mirrors the field order of each results tuple.
dataframe = pd.DataFrame(
    data=results,
    columns=[
        # identity and headline figures
        "Winery", "Year", "Wine ID", "Wine", "Rating", "num_review", "price",
        # style description
        "acidity", "Blurb", "Body", "Body_des", "country", "wine_des",
        # food pairings
        "food_1", "food_2", "food_3", "food_4",
        # most used grapes
        "grape_1", "grape_2", "grape_3",
        # taste structure
        "acidity_score", "calculated_structure_count", "intensity",
        "sweetness", "tannin", "user_structure_count",
        # flavor groups with their counts
        "flavor_1", "flavor_1_count",
        "flavor_2", "flavor_2_count",
        "flavor_3", "flavor_3_count",
        "flavor_4", "flavor_4_count",
        "flavor_5", "flavor_5_count",
        "flavor_6", "flavor_6_count",
        "flavor_7", "flavor_7_count",
    ],
)

# Scraping the reviews from the Vivino website
# Collect one [year, wine id, rating, note, created_at] row per review.
ratings = []
# NOTE: the original (Dutch) comment says dataframe.head(2) was used here while
# testing, to limit the run to two wines.
for _, row in dataframe.iterrows():
    page = 1
    while True:
        print(
            f'Getting info about wine {row["Wine ID"]}-{row["Year"]} Page {page}'
        )

        d = get_wine_data(row["Wine ID"], row["Year"], page)

        # The CSV is only written after every wine has been processed, so an
        # unexpected payload for one wine must not discard everything
        # collected so far: report it and move on to the next wine.
        if "reviews" not in d:
            print(
                f'No "reviews" key in response for wine {row["Wine ID"]}-{row["Year"]} Page {page}; skipping'
            )
            break

        reviews = d["reviews"]
        # An empty page marks the end of this wine's reviews.
        if not reviews:
            break

        # `review` rather than `r`: the module-level response `r` stays intact.
        for review in reviews:
            ratings.append(
                [
                    row["Year"],
                    row["Wine ID"],
                    review["rating"],
                    review["note"],
                    review["created_at"],
                ]
            )

        page += 1

# Turn the collected review rows into a table with named columns.
ratings = pd.DataFrame(
    data=ratings,
    columns=["Year", "Wine ID", "User Rating", "Note", "CreatedAt"],
)

# Attach the wine details to every review. No join key is given, so pandas
# joins on the columns the two frames have in common.
df_out = pd.merge(ratings, dataframe)
df_out.to_csv("data.csv", index=False)

Tags: the name food style count page group structure
1条回答
网友
1楼 · 发布于 2024-05-29 11:06:46
  1. 要仅获取英文评论,您可以按每条评论的"language"键进行筛选:英文评论的该键值为"en"。

  2. 要获得更多国家的葡萄酒,您可以将多个国家代码以列表形式放入"country_codes[]"键中,例如 ["pt", "es", "fr"]。

# Import packages
import requests
import json
import pandas as pd

# Get request from the Vivino website
def get_wine_data(wine_id, year, page, timeout=30):
    """Return one page of Vivino reviews for the given wine and vintage.

    Args:
        wine_id: Wine identifier placed in the URL path.
        year: Vintage year, passed as the ``year`` query parameter.
        page: Page number to fetch; each page asks for 50 reviews.
        timeout: Maximum number of seconds to wait for the server.

    Returns:
        The response body decoded from JSON.

    Raises:
        requests.HTTPError: On a 4xx/5xx response.
        requests.Timeout: When the server does not respond within ``timeout``.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0",
    }

    api_url = f"https://www.vivino.com/api/wines/{wine_id}/reviews?per_page=50&year={year}&page={page}"
    print(api_url)

    # A request without a timeout can hang the whole scrape; checking the
    # status makes HTTP failures explicit before the body is parsed.
    response = requests.get(api_url, headers=headers, timeout=timeout)
    response.raise_for_status()

    return response.json()


# Get request from the Vivino website
# Query the explore endpoint for the list of wines to process.
r = requests.get(
    "https://www.vivino.com/api/explore/explore",
    params={
        "country_codes[]": ["pt", "es", "fr"],  # <  put more country codes here
        "currency_code": "EUR",
        "grape_filter": "varietal",
        "min_rating": "1",
        "order_by": "price",
        "order": "asc",
        "page": 1,
        "price_range_max": "500",
        "price_range_min": "0",
        "wine_type_ids[]": "1",
    },
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0"
    },
    # Never wait indefinitely on a stalled connection.
    timeout=30,
)
# Fail right away on an HTTP error instead of on a later r.json() lookup.
r.raise_for_status()

# Variables to scrap from the Vivino website
# One (winery, year, wine id, display name) tuple per matched vintage.
results = [
    (
        vintage["wine"]["winery"]["name"],
        vintage["year"],
        vintage["wine"]["id"],
        f'{vintage["wine"]["name"]} {vintage["year"]}',
    )
    for vintage in (
        match["vintage"] for match in r.json()["explore_vintage"]["matches"]
    )
]

# Tabulate the tuples; the column order mirrors the tuple layout above.
dataframe = pd.DataFrame(
    data=results,
    columns=["Winery", "Year", "Wine ID", "Wine"],
)

# Scraping the reviews from the Vivino website
# Collect one [year, wine id, rating, note, created_at] row per English review.
ratings = []

for _, row in dataframe.iterrows():
    page = 1
    while True:
        print(
            f'Getting info about wine {row["Wine ID"]}-{row["Year"]} Page {page}'
        )

        d = get_wine_data(row["Wine ID"], row["Year"], page)

        # The CSV is only written after every wine has been processed, so an
        # unexpected payload for one wine must not discard everything
        # collected so far: report it and move on to the next wine.
        if "reviews" not in d:
            print(
                f'No "reviews" key in response for wine {row["Wine ID"]}-{row["Year"]} Page {page}; skipping'
            )
            break

        reviews = d["reviews"]
        # An empty page marks the end of this wine's reviews.
        if not reviews:
            break

        # `review` rather than `r`: the module-level response `r` stays intact.
        for review in reviews:
            # Keep English reviews only; a review that carries no language
            # field is skipped rather than raising KeyError.
            if review.get("language") != "en":
                continue

            ratings.append(
                [
                    row["Year"],
                    row["Wine ID"],
                    review["rating"],
                    review["note"],
                    review["created_at"],
                ]
            )

        page += 1


# Turn the collected review rows into a table with named columns.
ratings = pd.DataFrame(
    data=ratings,
    columns=["Year", "Wine ID", "User Rating", "Note", "CreatedAt"],
)

# Attach the wine details to every review. No join key is given, so pandas
# joins on the columns the two frames have in common.
df_out = pd.merge(ratings, dataframe)
df_out.to_csv("data.csv", index=False)

相关问题 更多 >

    热门问题