Python: web scraping from a list of URLs

Published 2024-04-27 15:34:09


I'm new to both asks and Python, and I was given this sample code. Let me explain: I have a list of URLs, each one a news URL, and each of those pages contains sub-URLs. The first URL is requested, every other href on it is collected and added to a list, and then the article behind each href in that list is fetched. The problem is that sometimes the articles come back empty.

The sample code below works when I try it with a single URL:

import asks
import trio
from goose3 import Goose
import logging as log
from goose3.configuration import ArticleContextPattern
from pprint import pprint
import json
import time

asks.init('trio') 


async def extractor(path, htmls, paths, session):

    try:
        r = await session.get(path, timeout=2)
        out = r.content
        htmls.append(out)
        paths.append(path)
    except Exception as e:
        out = str(e)
        htmls.append(out)
        paths.append(path)


async def main(path_list, session):    
    htmls = []
    paths = []
    async with trio.open_nursery() as n:
        for path in path_list:
            n.start_soon(extractor, path, htmls, paths, session)

    return htmls, paths


async def run(urls, conns=50): 


    s = asks.Session(connections=conns)
    g = Goose()

    htmls, paths = await main(urls, s)
    print(htmls,"       ",paths)
    cleaned = []
    for html, path in zip(htmls, paths):
        dic = {}
        dic['url'] = path
        if html is not None:                            
            try:
                #g.config.known_context_pattern = ArticleContextPattern(attr='class', value='the-post')
                article = g.extract(raw_html=html)
                author=article.authors
                dic['goose_text'] = article.cleaned_text
                #print(article.cleaned_text)
                #dic['goose_date'] = article.publish_datetime
                dic['goose_title'] = article.title
                if author:
                    dic['authors']=author[0]
                else:
                    dic['authors'] =''
            except Exception as e:
                # note: a bare `raise` here would make the fallback below unreachable
                log.info('goose found no text using html: %s', e)
                dic['goose_html'] = html
                dic['goose_text'] = ''
                dic['goose_date'] = None
                dic['goose_title'] = None
                dic['authors'] = ''
            cleaned.append(dic)
    return cleaned




async def real_main():
    sss= '[{"crawl_delay_sec": 0, "name": "mining","goose_text":"","article_date":"","title":"", "story_url": "http://www.mining.com/canalaska-start-drilling-west-mcarthur-uranium-project","url": "http://www.mining.com/tag/latin-america/page/1/"},{"crawl_delay_sec": 0, "name": "mining", "story_url": "http://www.mining.com/web/tesla-fires-sound-alarms-safety-electric-car-batteries", "url": "http://www.mining.com/tag/latin-america/page/1/"}]'

    obj = json.loads(sss)
    pprint(obj)

    articles=[]
    for l in obj:
      articles.append(await run([l['story_url']]))
      #await trio.sleep(3)

    pprint(articles)

if __name__ == "__main__":
    trio.run(real_main)

How can I fetch the article data without anything going missing?


Tags: path, text, import, url, async, trio, main, html
2 Answers

"asks" does not always raise an exception when the status code is != 200. You need to check the response's status code before using its content. You probably also want to increase the timeout; 2 seconds is not enough, particularly when you fire off up to 50 connections in parallel.

In any case, here is a simplified program. All the Goose stuff is entirely unnecessary for showing the actual error, keeping two separate result arrays is not a good idea, and appending error messages to the result array is asking for trouble.

You should also look into running the URL fetching and the processing in parallel; trio.open_memory_channel is your friend (see the sketch after the program below).


import asks
asks.init('trio')

import trio
from pprint import pprint

async def extractor(path, session, results):
    try:
        r = await session.get(path, timeout=2)
        if r.status_code != 200:
            raise asks.errors.BadStatus("Not OK",r.status_code)
        out = r.content
    except Exception as e:
        # do some reasonable error handling
        print(path, repr(e))
    else:
        results.append((out, path))

async def main(path_list, session):
    results = []
    async with trio.open_nursery() as n:
        for path in path_list:
            n.start_soon(extractor, path, session, results)
    return results


async def run(conns=50):
    s = asks.Session(connections=conns)

    urls = [
            "http://www.mining.com/web/tesla-fires-sound-alarms-safety-electric-car-batteries",
            "http://www.mining.com/canalaska-start-drilling-west-mcarthur-uranium-project",
            "https://www.google.com",  # just for testing more parallel connections
            "https://www.debian.org",
            ]

    results = await main(urls, s)
    for content, path in results:
        pass  # analyze this result
    print("OK")

if __name__ == "__main__":
    trio.run(run)
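
To illustrate the open_memory_channel suggestion above, here is a minimal sketch of the producer/consumer layout. It is only a rough outline, not part of the original answer: the two test URLs, the buffer size and the bare print() in the consumer are placeholders you would replace with your own URL list and the Goose extraction.

import asks
asks.init('trio')

import trio

async def fetcher(url, session, send_channel):
    # producer: fetch one URL and hand the body over for processing
    try:
        r = await session.get(url, timeout=10)
        if r.status_code != 200:
            raise asks.errors.BadStatus("Not OK", r.status_code)
    except Exception as e:
        print(url, repr(e))  # do some reasonable error handling
    else:
        await send_channel.send((url, r.content))

async def processor(receive_channel):
    # consumer: handle each page as soon as its fetch finishes
    async for url, content in receive_channel:
        print(url, len(content), "bytes")  # replace with the Goose extraction etc.

async def run(urls, conns=50):
    session = asks.Session(connections=conns)
    send_channel, receive_channel = trio.open_memory_channel(conns)
    async with trio.open_nursery() as nursery:
        nursery.start_soon(processor, receive_channel)
        async with send_channel:
            async with trio.open_nursery() as fetchers:
                for url in urls:
                    fetchers.start_soon(fetcher, url, session, send_channel)
        # leaving the `async with send_channel` block closes the channel,
        # which in turn ends the `async for` loop in processor

if __name__ == "__main__":
    trio.run(run, ["https://www.google.com", "https://www.debian.org"])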

I'm missing some further information to answer your question in depth, but it most likely has to do with how goose looks for the text inside the HTML. For details see this answer: https://stackoverflow.com/a/30408761/8867146
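
As a rough illustration of that suggestion (not something from the original post): goose3 can be told which element contains the article body via known_context_patterns on its configuration. The attr/value pair below reuses the 'the-post' class from the commented-out line in the question and is only an example; the right values depend on the target site's markup, and `html` stands for the raw page content fetched earlier.

from goose3 import Goose
from goose3.configuration import Configuration, ArticleContextPattern

config = Configuration()
# point goose at the element that holds the article text (site-specific)
config.known_context_patterns = [ArticleContextPattern(attr='class', value='the-post')]

g = Goose(config)
article = g.extract(raw_html=html)  # `html` is the raw page content fetched earlier
print(article.title)
print(article.cleaned_text)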
