无法从每个成分容器中分离某些字段

import re
import requests
from bs4 import BeautifulSoup

link = 'https://www.delicious.com.au/recipes/gnocchi-walnut-rosemary-pecorino-pesto/1b0defa9-53c8-4e9c-8c93-fb96a5348b31?r=recipes/gallery/opvo6a3l'


def get_content(s, link):
    """Yield one 3-tuple of strings per ``ul.ingredient > li`` element.

    The page at *link* is fetched through *s* and parsed with the lxml
    parser. For every list item, the first run of digits, ``.``, ``⁄`` or
    lowercase letters is taken as the quantity/unit token.

    Yields:
        ``(name, quantity, unit)`` where *name* is the item text with the
        token removed, *quantity* is the token without its lowercase
        letters and *unit* is the token without its digit characters.
    """
    token_pattern = re.compile(r"[\d.⁄a-z]+")
    letters_pattern = re.compile(r"[a-z]+")
    digits_pattern = re.compile(r"[\d.⁄]+")

    response = s.get(link)
    document = BeautifulSoup(response.text, "lxml")
    for entry in document.select("ul.ingredient > li"):
        text = entry.get_text(strip=True)
        token = token_pattern.search(text).group(0)
        # The token itself serves as the regex pattern for the removal.
        name = re.sub(token, "", text).strip()
        quantity = letters_pattern.sub("", token).strip()
        unit = digits_pattern.sub("", token).strip()
        yield name, quantity, unit


if __name__ == '__main__':
    with requests.Session() as session:
        session.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'
        for row in get_content(session, link):
            print(row)

500g potato gnocchi
2 tbs extra virgin olive oil
Finely grated zest and juice of 1 lemon
1⁄2 bunch basil, leaves picked
1 tbs finely chopped rosemary, plus fried rosemary leaves to serve
2 garlic cloves, crushed
50g grated pecorino, (or parmesan) plus extra to serve
50g roasted and chopped walnuts, plus extra to serve
100ml extra virgin olive oil

('potato gnocchi', '500', 'g')
('tbs extra virgin olive oil', '2', '')
('F grated zest and juice of 1 lemon', '', 'inely')
('bunch basil, leaves picked', '1⁄2', '')
('tbs finely chopped rosemary, plus fried rosemary leaves to serve', '1', '')
('garlic cloves, crushed', '2', '')
('grated pecorino, (or parmesan) plus extra to serve', '50', 'g')
('roasted and chopped walnuts, plus extra to serve', '50', 'g')
('extra virgin olive oil', '100', 'ml')

('potato gnocchi', '500', 'g')
('extra virgin olive oil', '2', 'tbs')
('Finely grated zest and juice of', '1', 'lemon')
('basil, leaves picked', '1⁄2', 'bunch')
('finely chopped rosemary, plus fried rosemary leaves to serve', '1', 'tbs')
('cloves, crushed', '2', 'garlic')
('grated pecorino, (or parmesan) plus extra to serve', '50', 'g')
('roasted and chopped walnuts, plus extra to serve', '50', 'g')
('extra virgin olive oil', '100', 'ml')

2条回答

网友

1楼 · 编辑于 2024-05-15 16:38:26

我并不擅长正则表达式，不过我发现下面的实现可以正常工作：

import re
import requests
from bs4 import BeautifulSoup

# URL of the recipe page; passed to get_content() in the __main__ block below.
link = 'https://www.delicious.com.au/recipes/gnocchi-walnut-rosemary-pecorino-pesto/1b0defa9-53c8-4e9c-8c93-fb96a5348b31?r=recipes/gallery/opvo6a3l'

def get_content(s, link):
    """Yield ``(name, quantity, unit)`` for each ingredient on the page.

    Args:
        s: Object whose ``get(url)`` returns a response with a ``text``
            attribute (a ``requests.Session`` in this script).
        link: URL of the recipe page to fetch.

    Yields:
        A 3-tuple of strings per ``ul.ingredient > li`` element:
        the ingredient text with the quantity/unit removed, the quantity
        (run of digits, ``.`` or ``⁄``) and the word that follows it.
        When an item contains no quantity followed by a word, the whole
        text is yielded as the name with empty quantity and unit instead
        of raising ``AttributeError``.
    """
    # Quantity run, optional whitespace, then the first following word.
    quantity_unit = re.compile(r'([\d.⁄]+)\s*([a-zA-Z]+)')

    r = s.get(link)
    soup = BeautifulSoup(r.text, "lxml")
    for item in soup.select("ul.ingredient > li"):
        text = item.get_text(strip=True)
        match = quantity_unit.search(text)
        if match is None:
            # No quantity in this item (e.g. "salt, to taste").
            yield text, "", ""
            continue
        quantity, unit = match.groups()
        # Cut out only the matched span; str.replace would also strip any
        # later identical occurrence from the ingredient name.
        name = (text[:match.start()] + text[match.end():]).strip()
        yield name, quantity, unit

if __name__ == '__main__':
    # Browser-style User-Agent string sent with the session's requests.
    user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'
    with requests.Session() as session:
        session.headers['User-Agent'] = user_agent
        # Print each parsed ingredient tuple on its own line.
        for ingredient in get_content(session, link):
            print(ingredient)

输出：

('potato gnocchi', '500', 'g')
('extra virgin olive oil', '2', 'tbs')
('Finely grated zest and juice of', '1', 'lemon')
('basil, leaves picked', '1⁄2', 'bunch')
('finely chopped rosemary, plus fried rosemary leaves to serve', '1', 'tbs')
('cloves, crushed', '2', 'garlic')
('grated pecorino, (or parmesan) plus extra to serve', '50', 'g')
('roasted and chopped walnuts, plus extra to serve', '50', 'g')
('extra virgin olive oil', '100', 'ml')

网友

2楼 · 编辑于 2024-05-15 16:38:26

一个解决方案是在文本中搜索数字，也就是数量。这有点棘手，因为有时单位与数量连在一起（例如 500g），有时二者之间有空格（例如 2 tbs）。不过可以通过条件判断来处理这种情况（也可能存在正则表达式的解法）：

import re
import requests
from bs4 import BeautifulSoup

# URL of the recipe page; passed to get_content() in the __main__ block below.
link = 'https://www.delicious.com.au/recipes/gnocchi-walnut-rosemary-pecorino-pesto/1b0defa9-53c8-4e9c-8c93-fb96a5348b31?r=recipes/gallery/opvo6a3l'

def get_content(s, link):
    """Yield ``(name, measure, unit)`` for each ingredient on the page.

    Each ``ul.ingredient > li`` text is split on whitespace and the first
    token containing a digit is treated as the measure. If that token mixes
    digits and letters (e.g. ``500g``) it is split at the first letter;
    otherwise the following token, if any, is taken as the unit.

    Args:
        s: Object whose ``get(url)`` returns a response with a ``text``
            attribute (a ``requests.Session`` in this script).
        link: URL of the recipe page to fetch.

    Yields:
        A 3-tuple of strings: the remaining words joined by single spaces,
        the measure and the unit. Items without any digit yield the whole
        text as the name with empty measure and unit, rather than raising
        or reusing values left over from the previous item.
    """
    r = s.get(link)
    soup = BeautifulSoup(r.text, "lxml")
    for item in soup.select("ul.ingredient > li"):
        tokens = item.get_text(strip=True).split()

        # Per-item defaults so nothing leaks in from an earlier iteration.
        ingr_measure = ingr_unit = ""
        name_tokens = tokens

        for index, token in enumerate(tokens):
            if not re.search(r'\d', token):
                continue

            split_at = None
            if token.isalnum() and not token.isdecimal():
                # Digits and letters glued together, e.g. "500g".
                split_at = next(
                    (i for i, char in enumerate(token) if char.isalpha()),
                    None,
                )

            if split_at is not None:
                ingr_measure, ingr_unit = token[:split_at], token[split_at:]
                consumed = 1
            else:
                # Plain number ("2") or fraction-like token ("1⁄2"): the
                # unit is the next token, when there is one.
                ingr_measure = token
                following = tokens[index + 1:index + 2]
                ingr_unit = following[0] if following else ""
                consumed = 1 + len(following)

            # The name is everything except the measure/unit tokens.
            name_tokens = tokens[:index] + tokens[index + consumed:]
            break

        yield ' '.join(name_tokens), ingr_measure, ingr_unit


if __name__ == '__main__':
    # Browser-style User-Agent string sent with the session's requests.
    user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'
    with requests.Session() as session:
        session.headers['User-Agent'] = user_agent
        # Print each parsed ingredient tuple on its own line.
        for ingredient in get_content(session, link):
            print(ingredient)

输出：

('potato gnocchi', '500', 'g')
('extra virgin olive oil', '2', 'tbs')
('Finely grated zest and juice of', '1', 'lemon')
('basil, leaves picked', '1⁄2', 'bunch')
('finely chopped rosemary, plus fried rosemary leaves to serve', '1', 'tbs')
('cloves, crushed', '2', 'garlic')
('grated pecorino, (or parmesan) plus extra to serve', '50', 'g')
('roasted and chopped walnuts, plus extra to serve', '50', 'g')
('extra virgin olive oil', '100', 'ml')

相关问题更多 >

编程相关推荐

热门问题

热门文章