将URL拆分到目录并转换为json

https://example.com/ https://example.com/page1.html https://example.com/cocktails/receipe/page1.html https://example.com/cocktails/receipe/page2.html https://example.com/cocktails/page3.html https://example.com/article/magazine https://example.com/article/mood/page1.html

{ "name": "/", "children": [{ "name": "page1.html" }, { "name": "cocktails", "children": [{ "name": "receipe", "children": [{ "name": "page1.html" }, { "name": "page2.html" } ] }, { "name": "page3.html" } ] }, { "name": "article", "children": [{ "name": "mood", "children": [{ "name": "page1.html" }] }, { "name": "magazine" } ] } ] }

import csv
import json
import re
from collections import OrderedDict


def run():
    """Convert the URLs listed in ``test.csv`` into a directory tree in ``readme.json``.

    Every row's ``url`` column is stripped of its scheme and host, and the
    remainder is split on ``/`` and ``?``.  All segments but the last are
    treated as folders, which are created on first use and reused afterwards;
    the last segment becomes a page node built by ``create_row``.  A URL with
    no path segments (the bare site root) is skipped because ``root`` already
    stands for it.
    """
    root = OrderedDict({
        "name": "/",
        "children": [],
    })

    # newline="" is what the csv module asks for when it is handed a file object.
    with open("test.csv", newline="") as source:
        for row in csv.DictReader(source):
            link = row['url']
            # Drop "http(s)://host" so that only the path (and query) remains.
            suffix = re.sub(r"https?://[^/]*", "", link)
            parts = [x for x in re.split(r"[/?]", suffix) if x != ""]
            if not parts:
                continue

            # Walk down the folder segments, then attach the page to the
            # folder reached (the root's children for a single-segment URL).
            folders = root["children"]
            for folder in parts[:-1]:
                folders = _enter_folder(folders, folder)

            page = parts[-1]
            if not key_exists(folders, page):
                folders.append(create_row(page, row))

    data = json.dumps(root, indent=4, sort_keys=False)
    with open("readme.json", "w") as target:
        target.write(data)


def _enter_folder(folders, key):
    """Return the ``children`` list of the node named *key*, creating the node if absent."""
    for node in folders:
        if node['name'] == key:
            # A node first seen as a page may later turn out to be a folder.
            return node.setdefault("children", [])
    node = {"name": key, "children": []}
    folders.append(node)
    return node["children"]


def create_row(key, row):
    """Build a page node for *key* from a CSV *row*.

    Args:
        key: Name of the node (the last path segment).
        row: Mapping for one CSV row.  The address is taken from its
            ``link`` entry when present, otherwise from ``url`` (the column
            ``run`` reads), so rows carrying only ``url`` no longer fail.

    Returns:
        ``{"name": key, "url": <address without surrounding whitespace>}``.

    Raises:
        KeyError: If *row* has neither a ``link`` nor a ``url`` entry.
    """
    address = row['link'] if 'link' in row else row['url']
    return {"name": key, "url": address.strip()}


def key_exists(folders, key):
    """Return True if any node dict in *folders* has ``'name'`` equal to *key*."""
    return any(x['name'] == key for x in folders)


if __name__ == "__main__":
    run()

2条回答

网友

1楼 · 编辑于 2024-05-19 01:40:33

这里不需要递归。您可以通过遍历路径并在运行时附加子级来构建树。

伪代码：

roots = []
For each row:
    parse the URL into its path parts
    if part[0] is not in roots:
        create a root node for part[0] in roots
    cur_node = the root node in roots that corresponds to part[0]
    for each remaining part of the path in the url:
        if part is not in the children list of cur_node:
             create a child entry for part in cur_node
        cur_node = the child entry for part (existing or just created)

网友

2楼 · 编辑于 2024-05-19 01:40:33

下面的程序给出了您的预期输出，我希望它对您来说不太复杂。

import json
from pprint import pprint
# Sample URL paths (scheme and host already removed) that the tree is built from.
a = [
    "/",
    "/page1.html",
    "/cocktails/receipe/page1.html",
    "/cocktails/receipe/page2.html",
    "/cocktails/page3.html",
    "/article/magazine",
    "/article/mood/page1.html",
]

def create(path, dictionaryarray):
    """Insert one path into a forest of ``{'name': ..., 'children': [...]}`` nodes.

    Args:
        path: Sequence of path segments, outermost first, e.g.
            ``['cocktails', 'receipe', 'page1.html']``.  An empty sequence
            leaves the forest untouched.
        dictionaryarray: List of top-level node dicts; it is modified in place.

    A node whose name matches a segment is reused; otherwise a new node is
    appended at that level.  A node only receives a ``'children'`` list once
    something has to be placed beneath it, so inserting the same leaf path
    twice does not turn the leaf into an empty folder.
    """
    headarray = dictionaryarray
    last = len(path) - 1
    for index, element in enumerate(path):
        node = next((head for head in headarray if head['name'] == element), None)
        if node is None:
            node = {'name': element}
            headarray.append(node)
        if index != last:
            # Descend only when more segments follow; the final segment must
            # not gain an empty 'children' list.
            headarray = node.setdefault('children', [])
# Insert every sample path into one shared list of top-level nodes.
d = []
for raw_path in a:
    segments = [piece for piece in raw_path.split('/') if piece]
    create(segments, d)

# Hang the nodes under a "/" root node, serialise to JSON and print it.
data = json.dumps({'name': '/', 'children': d}, indent=4, sort_keys=False)
print(data)

输出

{
    "name": "/",
    "children": [
        {
            "name": "page1.html"
        },
        {
            "name": "cocktails",
            "children": [
                {
                    "name": "receipe",
                    "children": [
                        {
                            "name": "page1.html"
                        },
                        {
                            "name": "page2.html"
                        }
                    ]
                },
                {
                    "name": "page3.html"
                }
            ]
        },
        {
            "name": "article",
            "children": [
                {
                    "name": "magazine"
                },
                {
                    "name": "mood",
                    "children": [
                        {
                            "name": "page1.html"
                        }
                    ]
                }
            ]
        }
    ]
}

相关问题更多 >

编程相关推荐

热门问题

热门文章