Exporting HTML data as JSON

Published 2024-05-20 22:16:30

Consider this simple example:

import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

df = pd.DataFrame({'link' : ['https://en.wikipedia.org/wiki/World%27s_funniest_joke',
                             'https://en.wikipedia.org/wiki/The_Funniest_Joke_in_the_World']})

def puller(mylink):
    doc = requests.get(mylink)
    return BeautifulSoup(doc.content, 'html5lib')

df['parsed'] = df.link.apply(lambda x: puller(x))

This correctly pulls the HTML of the Wikipedia pages into the dataframe. I need to save the dataframe for later use. Unfortunately, when I export it to JSON, the parsed column disappears:

 df
Out[32]: 
                                                           link  \
0         https://en.wikipedia.org/wiki/World%27s_funniest_joke   
1  https://en.wikipedia.org/wiki/The_Funniest_Joke_in_the_World   

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            parsed  
0  [html, [[\n, <meta charset="utf-8"/>, \n, <title>World's funniest joke - Wikipedia</title>, \n, <script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"X@pXWwpAIDEAAE2fgjIAAABR","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"World's_funniest_joke","wgTitle":"World's funniest joke","wgCurRevisionId":966811540,"wgRevisionId":966811540,"wgArticleId":647561,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["W...  
1  [html, [[\n, <meta charset="utf-8"/>, \n, <title>The Funniest Joke in the World - Wikipedia</title>, \n, <script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"X@pjbgpAICMAADkWE2kAAACQ","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"The_Funniest_Joke_in_the_World","wgTitle":"The Funniest Joke in the World","wgCurRevisionId":996508425,"wgRevisionId":996508425,"wgArticleId":19164,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups...  

However:

df.to_json('myjson.json')
df = pd.read_json('myjson.json')
df
Out[30]: 
                                                           link  parsed
0         https://en.wikipedia.org/wiki/World%27s_funniest_joke     NaN
1  https://en.wikipedia.org/wiki/The_Funniest_Joke_in_the_World     NaN

What is the problem here? Alternatively, what is the best format to store this dataframe in? CSV tends to get mangled by all the odd HTML characters.

Thanks

3 Answers

It looks like pandas does not handle the character escaping correctly for JSON in this case. If you don't need pandas, you can do the following:

import json
import requests


def write_json(data, path: str, indent: int = 4):
    with open(path, 'w') as file:
        json.dump(data, file, indent=indent)


def read_json(path: str):
    with open(path, 'r') as file:
        return json.load(file)


links = ['https://en.wikipedia.org/wiki/World%27s_funniest_joke',
         'https://en.wikipedia.org/wiki/The_Funniest_Joke_in_the_World']


html_data = {link: str(requests.get(link).content)
             for link in links}


write_json(html_data, './html_data.json')

print(json.dumps(read_json('./html_data.json'), indent=4))
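If the saved file later needs to come back into pandas, the link-to-HTML mapping converts directly into a two-column dataframe. A minimal sketch (the file name and sample data below are stand-ins, not the real fetched pages):

```python
import json
import pandas as pd

# Hypothetical stand-in for the html_data.json written above
sample = {'https://example.org/a': '<html>A</html>',
          'https://example.org/b': '<html>B</html>'}
with open('html_data.json', 'w') as f:
    json.dump(sample, f)

# Load the link -> html mapping back into a two-column DataFrame
with open('html_data.json') as f:
    data = json.load(f)
df = pd.DataFrame(list(data.items()), columns=['link', 'html'])
print(df.shape)  # -> (2, 2)
```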

Based on your code, you are doing the web scraping with a third-party library. In Python you can serialise each page yourself with json.dumps:

json.dumps({
    "url": str(url),
    "uid": str(uniqueID),
    "page_content": htmlContent.text,
    "date": finalDate
})

Convert the bs4.BeautifulSoup objects to str before exporting:

df['parsed'] = df['parsed'].map(str)
df.to_json('myjson.json')
df = pd.read_json('myjson.json')
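Serialised this way, the HTML survives the round trip and can be re-parsed on load. A minimal sketch, using tiny inline documents in place of the fetched Wikipedia pages:

```python
import pandas as pd
from bs4 import BeautifulSoup

# Tiny stand-in documents instead of pages fetched with requests
df = pd.DataFrame({
    'link': ['a', 'b'],
    'parsed': [BeautifulSoup('<html><title>A</title></html>', 'html.parser'),
               BeautifulSoup('<html><title>B</title></html>', 'html.parser')],
})

# Serialise the soup objects as their string form
df['parsed'] = df['parsed'].map(str)
df.to_json('myjson.json')

# Read back and re-parse when soup objects are needed again
df2 = pd.read_json('myjson.json')
df2['parsed'] = df2['parsed'].map(lambda s: BeautifulSoup(s, 'html.parser'))
print(df2['parsed'][0].title.text)  # -> A
```

The trade-off is that re-parsing happens on every load, but for a modest number of pages that is usually cheaper than fighting CSV escaping.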
