如何使用html.parser

import urllib.request
import urllib.parse
import re
from html.parser import HTMLParser

# Regex-based extraction of the contents of every <div class="product-cost">
# element on the menu page.  The page is requested with a POST body carrying
# the form fields in `values`.
url = 'https://www.mcdelivery.com.pk/pk/browse/menu.html'
values = {'daypartId': '1', 'catId': '1'}
data = urllib.parse.urlencode(values).encode('utf-8')  # POST body must be bytes
req = urllib.request.Request(url, data)

# Use a context manager so the HTTP response is always closed.
with urllib.request.urlopen(req) as resp:
    # Decode the payload instead of calling str() on bytes: str(b'...') yields
    # the "b'...'" repr with escaped newlines and non-ASCII bytes, not the text.
    charset = resp.headers.get_content_charset() or 'utf-8'
    respData = resp.read().decode(charset, errors='replace')

# `[^>]*>` consumes the remainder of the opening tag so that only the element
# content is captured; DOTALL lets the content span several lines now that the
# text contains real newlines.
list1 = re.findall(
    r'<div class="product-cost"[^>]*>(.*?)</div>', respData, flags=re.DOTALL
)
for eachp in list1:
    print(eachp.strip())

from html.parser import HTMLParser
import urllib.request
import html.parser

MENU_URL = "https://www.mcdelivery.com.pk/pk/browse/menu.html"


class MyParser(HTMLParser):
    """Collect the text found inside ``<div class="product-cost">`` elements.

    After ``feed()`` has been called, ``matches`` holds the stripped,
    non-empty text fragments in document order and ``match_count`` holds
    how many fragments were collected.
    """

    def __init__(self, html=None):
        """Create an empty collector.

        ``html`` is accepted only so existing ``MyParser(html)`` calls keep
        working; it is not used. Pass the markup to ``feed()``.
        """
        super().__init__()
        self.matches = []
        self.match_count = 0
        # 0 while outside a product-cost div; otherwise the number of <div>
        # elements currently open starting from the product-cost div itself.
        self._div_depth = 0

    def handle_starttag(self, tag, attrs):
        """Enter (or go one level deeper into) a product-cost div."""
        if tag != "div":
            return
        if self._div_depth:
            # Nested div inside the target: count it so that its closing tag
            # does not end the capture too early.
            self._div_depth += 1
            return
        # `class` may list several names, or be present without a value.
        class_names = (dict(attrs).get("class") or "").split()
        if "product-cost" in class_names:
            self._div_depth = 1

    def handle_endtag(self, tag):
        """Leave one div level while inside a product-cost div."""
        if tag == "div" and self._div_depth:
            self._div_depth -= 1

    def handle_data(self, data):
        """Record text that appears inside a product-cost div."""
        text = data.strip()
        if self._div_depth and text:
            self.matches.append(text)
            self.match_count += 1


def main():
    """Download the menu page and print every product-cost text fragment."""
    with urllib.request.urlopen(MENU_URL) as response:
        # Named page_html, not html, so the `html` module is not shadowed.
        page_html = response.read().decode()

    parser = MyParser(page_html)
    parser.feed(page_html)
    parser.close()  # flush any text still buffered by the parser

    for item in parser.matches:
        print(item)


if __name__ == "__main__":
    main()

1条回答

网友

1楼 · 发布于 2024-04-24 09:09:31

这是一个良好的开端，可能需要进行特定的调整：

import html.parser

class MyParser(html.parser.HTMLParser):
    """Collect the text found inside ``<div class="product-cost">`` elements.

    After ``feed()`` has been called, ``matches`` holds the stripped,
    non-empty text fragments in document order and ``match_count`` holds
    how many fragments were collected.
    """

    def __init__(self, html=None):
        """Create an empty collector.

        ``html`` is optional and unused; it is kept so that both
        ``MyParser()`` and ``MyParser(html)`` work. Pass the markup to
        ``feed()``.
        """
        super().__init__()
        self.matches = []
        self.match_count = 0
        # 0 while outside a product-cost div; otherwise the number of <div>
        # elements currently open starting from the product-cost div itself.
        self._div_depth = 0

    def handle_starttag(self, tag, attrs):
        """Enter (or go one level deeper into) a product-cost div."""
        if tag != "div":
            return
        if self._div_depth:
            # Nested div inside the target: count it so that its closing tag
            # does not end the capture too early.
            self._div_depth += 1
            return
        # `class` may list several names, or be present without a value.
        class_names = (dict(attrs).get("class") or "").split()
        if "product-cost" in class_names:
            self._div_depth = 1

    def handle_endtag(self, tag):
        """Leave one div level while inside a product-cost div."""
        if tag == "div" and self._div_depth:
            self._div_depth -= 1

    def handle_data(self, data):
        """Record text that appears inside a product-cost div."""
        text = data.strip()
        if self._div_depth and text:
            self.matches.append(text)
            self.match_count += 1

用法大致如下：

# Fetch the page markup. `the_request_method` is a placeholder for whatever
# HTTP call is used -- presumably it should return the HTML as str; confirm.
request_html = the_request_method(url, ...)

# MyParser is constructed without arguments here, so its constructor must
# accept a no-argument call.
parser = MyParser()
parser.feed(request_html)

# Print everything the parser accumulated in `matches`.
for item in parser.matches:
    print(item)

相关问题更多 >

编程相关推荐

热门问题

热门文章