
2024-05-15 16:38:26 发布

您现在位置:Python中文网/ 问答频道 /正文

我试图将三个3个字段从webpage中分离出来,如nameunitmeasure。我使用BeautifulSoup解析成分容器,然后重新模块化以分离unitmeasure。这是the portion在那个网站上,我有兴趣从中获取这三个字段


import re
import requests
from bs4 import BeautifulSoup

link = 'https://www.delicious.com.au/recipes/gnocchi-walnut-rosemary-pecorino-pesto/1b0defa9-53c8-4e9c-8c93-fb96a5348b31?r=recipes/gallery/opvo6a3l'

def get_content(s,link):
    r = s.get(link)
    soup = BeautifulSoup(r.text,"lxml")
    for item in soup.select("ul.ingredient > li"):
        ingr_container = item.get_text(strip=True)
        ingr_unit_container = re.search(r"[\d.⁄a-z]+",ingr_container).group(0)
        ingr_name = re.sub(ingr_unit_container,"",ingr_container).strip()
        ingr_unit = re.sub(r"[a-z]+","",ingr_unit_container).strip()
        ingr_measure = re.sub(r"[\d.⁄]+","",ingr_unit_container).strip()
        yield ingr_name,ingr_unit,ingr_measure

if __name__ == '__main__':
    with requests.Session() as s:
        s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'
        for item in get_content(s,link):


500g potato gnocchi
2 tbs extra virgin olive oil
Finely grated zest and juice of 1 lemon
1⁄2 bunch basil, leaves picked
1 tbs finely chopped rosemary, plus fried rosemary leaves to serve
2 garlic cloves, crushed
50g grated pecorino, (or parmesan) plus extra to serve
50g roasted and chopped walnuts, plus extra to serve
100ml extra virgin olive oil


('potato gnocchi', '500', 'g')
('tbs extra virgin olive oil', '2', '')
('F grated zest and juice of 1 lemon', '', 'inely')
('bunch basil, leaves picked', '1⁄2', '')
('tbs finely chopped rosemary, plus fried rosemary leaves to serve', '1', '')
('garlic cloves, crushed', '2', '')
('grated pecorino, (or parmesan) plus extra to serve', '50', 'g')
('roasted and chopped walnuts, plus extra to serve', '50', 'g')
('extra virgin olive oil', '100', 'ml')


('potato gnocchi', '500', 'g')
('extra virgin olive oil', '2', 'tbs')
('Finely grated zest and juice of', '1', 'lemon')
('basil, leaves picked', '1⁄2', 'bunch')
('finely chopped rosemary, plus fried rosemary leaves to serve', '1', 'tbs')
('cloves, crushed', '2', 'garlic')
('grated pecorino, (or parmesan) plus extra to serve', '50', 'g')
('roasted and chopped walnuts, plus extra to serve', '50', 'g')
('extra virgin olive oil', '100', 'ml')

Tags: andtocontainerunitplusextraleavesoil


import re
import requests
from bs4 import BeautifulSoup

link = 'https://www.delicious.com.au/recipes/gnocchi-walnut-rosemary-pecorino-pesto/1b0defa9-53c8-4e9c-8c93-fb96a5348b31?r=recipes/gallery/opvo6a3l'

def get_content(s,link):
    r = s.get(link)
    soup = BeautifulSoup(r.text,"lxml")
    for item in soup.select("ul.ingredient > li"):
        ingr_container = item.get_text(strip=True)
        unit_container = re.search(r'[\d.⁄]+\s*?[a-zA-Z]+\s*?',ingr_container).group(0)
        ingr_name = ingr_container.replace(unit_container,"").strip()
        ingr_unit = re.search(r'[\d.⁄]+',unit_container).group(0)
        ingr_measure = unit_container.replace(ingr_unit,"").strip()
        yield ingr_name,ingr_unit,ingr_measure

if __name__ == '__main__':
    with requests.Session() as s:
        s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'
        for item in get_content(s,link):


('potato gnocchi', '500', 'g')
('extra virgin olive oil', '2', 'tbs')
('Finely grated zest and juice of', '1', 'lemon')
('basil, leaves picked', '1⁄2', 'bunch')
('finely chopped rosemary, plus fried rosemary leaves to serve', '1', 'tbs')
('cloves, crushed', '2', 'garlic')
('grated pecorino, (or parmesan) plus extra to serve', '50', 'g')
('roasted and chopped walnuts, plus extra to serve', '50', 'g')
('extra virgin olive oil', '100', 'ml')


import re
import requests
from bs4 import BeautifulSoup

link = 'https://www.delicious.com.au/recipes/gnocchi-walnut-rosemary-pecorino-pesto/1b0defa9-53c8-4e9c-8c93-fb96a5348b31?r=recipes/gallery/opvo6a3l'

def get_content(s,link):
    r = s.get(link)
    soup = BeautifulSoup(r.text,"lxml")
    for item in soup.select("ul.ingredient > li"):
        ingr_container = item.get_text(strip=True).split()

        for index, string in enumerate(ingr_container):
            if re.search(r'\d', string): #check for digits, or parts, that contain digits
                if not string.isdecimal(): #check if digits and characters are mixed
                    if not string.isalnum(): #check if it's a "backslash"-unit (e.g. 1/2)
                        ingr_measure = string
                        ingr_unit = ingr_container[index+1]     
                        to_remove = [index, index+1] #at this index (indices) the unit and measure is set   

                    else: #split digit and characters
                        for i, char in enumerate(string):
                            if char.isalpha():
                                ingr_measure = string[:i]
                                ingr_unit = string[i:]
                                to_remove = [index, index]  
                    ingr_measure = string
                    ingr_unit = ingr_container[index+1]
                    to_remove = [index, index+1]

        ingr_name = ' '.join(ingr_container[:to_remove[0]] + ingr_container[to_remove[1]+1:]) #ingr_name is the whole ingr_container without measure and unit

        yield ingr_name, ingr_measure, ingr_unit

if __name__ == '__main__':
    with requests.Session() as s:
        s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'
        for item in get_content(s,link):


('potato gnocchi', '500', 'g')
('extra virgin olive oil', '2', 'tbs')
('Finely grated zest and juice of', '1', 'lemon')
('basil, leaves picked', '1⁄2', 'bunch')
('finely chopped rosemary, plus fried rosemary leaves to serve', '1', 'tbs')
('cloves, crushed', '2', 'garlic')
('grated pecorino, (or parmesan) plus extra to serve', '50', 'g')
('roasted and chopped walnuts, plus extra to serve', '50', 'g')
('extra virgin olive oil', '100', 'ml')

相关问题 更多 >