使用Python将JSON嵌套到数据帧

{ "subject": "Computer", "records": [ { "name": "Section A", "elements": [ { "type": "easy", "name": "Section A Module 1", "elements": [ { "type": "text", "name": "SectionQ1", "title": "Some question?", "readOnly": true }, { "type": "text", "name": "SectionQ2", "title": "some other question?", "isRequired": true } ] } ], "title": "Section A details" }, { "name": "Section B", "elements": [ { "type": "medium", "name": "Section B Module 1", "elements": [ { "type": "radiogroup", "name": "SectionQ1", "title": "some question?", "choices": [ { "value": "1", "text": "Choose 1" }, { "value": "2", "text": "Choose 2" }, { "value": "3", "text": "Choose 3" } ], "colCount": 3 }, { "type": "radiogroup", "name": "SectionQ2", "title": "some other question?", "description": "question 2", "isRequired": true, "choices": [ { "value": "value 1", "text": "choice 1" }, { "value": "value 2", "text": "choice 2" }, { "value": "value 3", "text": "choice 3" } ] } ], "title": "title" } ], "title": "Title" } ] }

dataframe records Name Title Section A Section A details Section B Section B details dataframe elements1 Key(records) Type Name Section A easy Section A Module 1 Section B medium Section B Module 1 dataframe elements2 Key(elements1) type name title readOnly Description isRequired colCount Section A Module 1 text SectionQ1 Some question? true Section A Module 1 text SectionQ2 some other question? true Section B Module 1 radiogroup SectionQ1 some question? 3 Section B Module 1 radiogroup SectionQ2 some other question? question 2 true dataframe choice Key(elements2) type name value text Section B Module 1 radiogroup SectionQ1 1 Choose 1 Section B Module 1 radiogroup SectionQ1 2 Choose 2 Section B Module 1 radiogroup SectionQ1 3 Choose 3 Section B Module 1 radiogroup SectionQ2 value 1 choice 1 Section B Module 1 radiogroup SectionQ2 value 2 choice 2 Section B Module 1 radiogroup SectionQ2 value 3 choice 3

1条回答

网友

1楼 · 发布于 2024-04-27 09:35:12

为了从JSON对象提取嵌套数组，我想做一些类似于MongoDB的^{}的事情。事实上，我会尝试模仿Mongo的聚合管道。包^{}真的能帮上忙

首先是explode操作，其中包含数组字段的记录被扩展为与原始记录相同的记录序列，但所讨论的键的值被相应的数组元素替换

大概是这样的：

from toolz.dicttoolz import assoc_in, get_in

def explode(d, keys):
    values = get_in(keys, d)
    if isinstance(values, list):
        for v in values:
            yield assoc_in(d, keys, v)
    else:
        yield d


record = {"x": 1, "y": [1, 2, 3]}
assert list(explode(record, ["y"])) == [{"x": 1, "y": 1}, {"x": 1, "y": 2}, {"x": 1, "y": 3}]

要创建unwind阶段，explode操作需要应用一系列记录：

from toolz.itertoolz import concat

def unwind(iterator, keys):
    return concat(map(lambda d: explode(d, keys), iterator))

records = [{"x": 1, "y": [1, 2, 3]}, {"x": 2, "y": [4, 5]}]
assert list(unwind(records, ["y"])) == [
    {"x": 1, "y": 1}, 
    {"x": 1, "y": 2}, 
    {"x": 1, "y": 3},
    {"x": 2, "y": 4},
    {"x": 2, "y": 5},
]

现在，您可以创建管道来提取所需的表：

from toolz import curried
from toolz.functoolz import pipe

sections = pipe(data["records"],
                curried.map(lambda d: {
                    "section": get_in(["name"], d),
                    "title": get_in(["title"], d),
                }),
                list,
)

modules = pipe(data["records"],
               lambda i: unwind(i, ["elements"]),
               curried.map(lambda d: {
                   "section": get_in(["name"], d),
                   "type": get_in(["elements", "type"], d),
                   "module": get_in(["elements", "name"], d),
               }),
               list,
)

questions = pipe(data["records"],
                 lambda i: unwind(i, ["elements"]),
                 lambda i: unwind(i, ["elements", "elements"]),
                 curried.map(lambda d: {
                     "module": get_in(["elements", "name"], d),
                     "type": get_in(["elements", "elements", "type"], d),
                     "question": get_in(["elements", "elements", "name"], d),
                     "title": get_in(["elements", "elements", "title"], d),
                     "readOnly": get_in(["elements", "elements", "readOnly"], d),
                     "description": get_in(["elements", "elements", "description"], d),
                     "isRequired": get_in(["elements", "elements", "isRequired"], d),
                     "colCount": get_in(["elements", "elements", "colCount"], d),
                 }),
                 list,
)

choices = pipe(data["records"],
               lambda i: unwind(i, ["elements"]),
               lambda i: unwind(i, ["elements", "elements"]),
               lambda i: unwind(i, ["elements", "elements", "choices"]),
               curried.filter(lambda d: get_in(["elements", "elements", "choices"], d) is not None),
               curried.map(lambda d: {
                   "module": get_in(["elements", "name"], d),
                   "type": get_in(["elements", "elements", "type"], d),
                   "question": get_in(["elements", "elements", "name"], d),
                   "value": get_in(["elements", "elements", "choices", "value"], d),
                   "text": get_in(["elements", "elements", "choices", "text"], d),
               }),
               list,
)

现在，您可以创建数据帧：

import pandas as pd

df_sections = pd.DataFrame.from_records(sections)
df_modules = pd.DataFrame.from_records(modules)
df_questions = pd.DataFrame.from_records(questions)
df_choices = pd.DataFrame.from_records(choices)

相关问题更多 >

编程相关推荐

热门问题

热门文章