Exporting scraped data to a single continuous JSON file

Published 2022-08-10 13:35:17


I wrote a web-scraping script, and it scrapes the data successfully. The only problem is exporting the data to a JSON file.

import json

def scrape_post_info(url):
    content = get_page_content(url)
    title, description, post_url = get_post_details(content, url)
    job_dict = {}
    job_dict['title'] = title
    job_dict['Description'] = description
    job_dict['url'] = post_url

    # JSON export mechanism (this part produces invalid JSON)
    json_job = json.dumps(job_dict)
    with open('data.json', 'r+') as f:
        f.write("[")
        f.seek(0)
        f.write(json_job)
        txt = f.readline()
        if txt.endswith("}"):
            f.write(",")

def crawl_web(url):
    while True:
        post_url = get_post_url(url)
        for urls in post_url:
            scrape_post_info(urls)

# Execute the main function 'crawl_web'
if __name__ == '__main__':
    crawl_web('www.examp....com')

The data gets exported as JSON, but it is not valid JSON. I want the data to look like this:

[
{
    "title": "this is title",
    "Description": " Fendi is an Italian luxury labelarin. ",
    "url": "https:/~"
},

{
    "title": " - Furrocious Elegant Style", 
    "Description": " the Italian luxare vast. ", 
    "url": "https://www.s"
},

{
    "title": "Rome, Fountains and Fendi Sunglasses",
    "Description": " Fendi started off as a store. ",
    "url": "https://www.~"
},

{
    "title": "Tipsnglasses",
    "Description": "Whether irregular orn season.", 
    "url": "https://www.sooic"
}
]

How can I achieve this?


1 Answer

How about this:

import json

def scrape_post_info(url):
    content = get_page_content(url)
    title, description, post_url = get_post_details(content, url)
    return {"title": title, "Description": description, "url": post_url}


def crawl_web(url):
    while True:
        jobs = []
        post_urls = get_post_url(url)
        for url in post_urls:
            jobs.append(scrape_post_info(url))
            with open("data.json", "w") as f:
                json.dump(jobs, f)  # json.dump writes to the file; json.dumps only returns a string


# Execute the main function 'crawl_web'
if __name__ == "__main__":
    crawl_web("www.examp....com")

Note that this rewrites the entire file on every iteration over `post_urls`, so it can become very slow with large files or slow I/O.
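If you want to append each record as it is scraped instead of rewriting the whole file, one common workaround (not part of the answer above; the `data.jsonl` path and helper names are illustrative) is the JSON Lines format: one JSON object per line, written in append mode.

```python
import json

def append_job(job, path="data.jsonl"):
    # Append one JSON object per line (JSON Lines); nothing is rewritten.
    with open(path, "a") as f:
        f.write(json.dumps(job) + "\n")

def read_jobs(path="data.jsonl"):
    # Read the records back into a list of dicts.
    with open(path) as f:
        return [json.loads(line) for line in f]
```

Each call to `append_job` is a single append, so a crash mid-run leaves every previously written record intact.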

Depending on how long the job runs and how much memory it uses, you may want to move the file write out of the for loop and write the file only once.

Note: if you really want to write a stream of JSON, you might want to look at this package: https://pypi.org/project/jsonstreams/, but I'd suggest picking another format better suited to streaming writes, such as CSV.
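A CSV version of the streaming idea can be sketched with the standard library's `csv.DictWriter` (the `data.csv` path and helper name are illustrative; the field names mirror the job dict above):

```python
import csv
import os

FIELDS = ["title", "Description", "url"]

def append_job_csv(job, path="data.csv"):
    # Write the header only when the file is first created, then append rows.
    new_file = not os.path.exists(path)
    with open(path, "a", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=FIELDS)
        if new_file:
            writer.writeheader()
        writer.writerow(job)
```

Like the JSON Lines approach, each record is a single append, so earlier rows are never rewritten.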