Python: web scraping from a list of URLs

Published 2024-04-27 15:34:09


I'm new to both asks and Python, and I was given this sample code. Let me explain: I have a list of URLs, each one a news URL, and each of those pages contains sub-URLs. The first URL is requested, every other href on it is collected and added to a list, and then the article behind each href in that list is fetched. The problem is that sometimes the articles come back empty.

The sample code below works when I try it with a single URL:

import asks
import trio
from goose3 import Goose
import logging as log
from goose3.configuration import ArticleContextPattern
from pprint import pprint
import json
import time

asks.init('trio') 


async def extractor(path, htmls, paths, session):

    try:
        r = await session.get(path, timeout=2)
        out = r.content
        htmls.append(out)
        paths.append(path)
    except Exception as e:
        out = str(e)
        htmls.append(out)
        paths.append(path)


async def main(path_list, session):    
    htmls = []
    paths = []
    async with trio.open_nursery() as n:
        for path in path_list:
            n.start_soon(extractor, path, htmls, paths, session)

    return htmls, paths


async def run(urls, conns=50): 


    s = asks.Session(connections=conns)
    g = Goose()

    htmls, paths = await main(urls, s)
    print(htmls,"       ",paths)
    cleaned = []
    for html, path in zip(htmls, paths):
        dic = {}
        dic['url'] = path
        if html is not None:                            
            try:
                #g.config.known_context_pattern = ArticleContextPattern(attr='class', value='the-post')
                article = g.extract(raw_html=html)
                author=article.authors
                dic['goose_text'] = article.cleaned_text
                #print(article.cleaned_text)
                #dic['goose_date'] = article.publish_datetime
                dic['goose_title'] = article.title
                if author:
                    dic['authors']=author[0]
                else:
                    dic['authors'] =''
            except Exception as e:
                # note: a bare `raise` here would make the fallback below unreachable
                log.info('goose found no text using html: %s', e)
                dic['goose_html'] = html
                dic['goose_text'] = ''
                dic['goose_date'] = None
                dic['goose_title'] = None
                dic['authors'] = ''
            cleaned.append(dic)
    return cleaned




async def real_main():
    sss= '[{"crawl_delay_sec": 0, "name": "mining","goose_text":"","article_date":"","title":"", "story_url": "http://www.mining.com/canalaska-start-drilling-west-mcarthur-uranium-project","url": "http://www.mining.com/tag/latin-america/page/1/"},{"crawl_delay_sec": 0, "name": "mining", "story_url": "http://www.mining.com/web/tesla-fires-sound-alarms-safety-electric-car-batteries", "url": "http://www.mining.com/tag/latin-america/page/1/"}]'

    obj = json.loads(sss)
    pprint(obj)

    articles=[]
    for l in obj:
      articles.append(await run([l['story_url']]))
      #await trio.sleep(3)

    pprint(articles)

if __name__ == "__main__":
    trio.run(real_main)

How can I fetch the article data without anything going missing?


Tags: path, text, import, url, async, trio, main, html
2 Answers

"asks" does not always raise an exception when the status code is != 200. You need to check the response's status code before using its content. You probably also want to increase the timeout; 2 seconds is not enough, particularly when you fire off up to 50 connections in parallel.

In any case, here is a simplified program. All the Goose stuff is entirely unnecessary for showing the actual error, keeping two separate result arrays is not a good idea, and appending error messages to the result array is asking for trouble.

You should also look into running the URL fetching and the processing in parallel; trio.open_memory_channel is your friend (see the sketch after the program below).


import asks
asks.init('trio')

import trio
from pprint import pprint

async def extractor(path, session, results):
    try:
        r = await session.get(path, timeout=2)
        if r.status_code != 200:
            raise asks.errors.BadStatus("Not OK",r.status_code)
        out = r.content
    except Exception as e:
        # do some reasonable error handling
        print(path, repr(e))
    else:
        results.append((out, path))

async def main(path_list, session):
    results = []
    async with trio.open_nursery() as n:
        for path in path_list:
            n.start_soon(extractor, path, session, results)
    return results


async def run(conns=50):
    s = asks.Session(connections=conns)

    urls = [
            "http://www.mining.com/web/tesla-fires-sound-alarms-safety-electric-car-batteries",
            "http://www.mining.com/canalaska-start-drilling-west-mcarthur-uranium-project",
            "https://www.google.com",  # just for testing more parallel connections
            "https://www.debian.org",
            ]

    results = await main(urls, s)
    for content, path in results:
        pass  # analyze this result
    print("OK")

if __name__ == "__main__":
    trio.run(run)
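
To illustrate the open_memory_channel suggestion above, here is a minimal sketch of the producer/consumer layout. It is only a rough outline, not part of the original answer: the two test URLs, the buffer size and the bare print() in the consumer are placeholders you would replace with your own URL list and the Goose extraction.

import asks
asks.init('trio')

import trio

async def fetcher(url, session, send_channel):
    # producer: fetch one URL and hand the body over for processing
    try:
        r = await session.get(url, timeout=10)
        if r.status_code != 200:
            raise asks.errors.BadStatus("Not OK", r.status_code)
    except Exception as e:
        print(url, repr(e))  # do some reasonable error handling
    else:
        await send_channel.send((url, r.content))

async def processor(receive_channel):
    # consumer: handle each page as soon as its fetch finishes
    async for url, content in receive_channel:
        print(url, len(content), "bytes")  # replace with the Goose extraction etc.

async def run(urls, conns=50):
    session = asks.Session(connections=conns)
    send_channel, receive_channel = trio.open_memory_channel(conns)
    async with trio.open_nursery() as nursery:
        nursery.start_soon(processor, receive_channel)
        async with send_channel:
            async with trio.open_nursery() as fetchers:
                for url in urls:
                    fetchers.start_soon(fetcher, url, session, send_channel)
        # leaving the `async with send_channel` block closes the channel,
        # which in turn ends the `async for` loop in processor

if __name__ == "__main__":
    trio.run(run, ["https://www.google.com", "https://www.debian.org"])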

I'm missing some further information to answer your question in depth, but it most likely has to do with how goose looks for the text inside the HTML. For details see this answer: https://stackoverflow.com/a/30408761/8867146
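
As a rough illustration of that suggestion (not something from the original post): goose3 can be told which element contains the article body via known_context_patterns on its configuration. The attr/value pair below reuses the 'the-post' class from the commented-out line in the question and is only an example; the right values depend on the target site's markup, and `html` stands for the raw page content fetched earlier.

from goose3 import Goose
from goose3.configuration import Configuration, ArticleContextPattern

config = Configuration()
# point goose at the element that holds the article text (site-specific)
config.known_context_patterns = [ArticleContextPattern(attr='class', value='the-post')]

g = Goose(config)
article = g.extract(raw_html=html)  # `html` is the raw page content fetched earlier
print(article.title)
print(article.cleaned_text)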
