我对 asks 和 Python 都是新手,拿到了一段示例代码。情况是这样的:我有一个新闻网址列表,每个网址页面里还包含若干子链接。程序先请求第一个 URL,抓取页面上所有的 href 并加入列表,然后再抓取列表中所有链接对应的文章。问题是:有时抓取到的文章内容是空的。
下面的示例代码在只处理单个 URL 时可以正常工作:
import asks
import trio
from goose3 import Goose
import logging as log
from goose3.configuration import ArticleContextPattern
from pprint import pprint
import json
import time
asks.init('trio')  # select trio as the async backend for asks; must run before any Session is used
async def extractor(path, htmls, paths, session):
    """Fetch *path* with *session* and append the result to the shared lists.

    Appends the response body to *htmls* on success, or ``None`` on a
    non-200 status or request error, and always appends *path* to *paths*
    so the two lists stay index-aligned for ``zip()`` in the consumer.

    BUG FIX: asks does not raise for HTTP error statuses, so the status
    code must be checked explicitly; the original also appended the error
    *string* to ``htmls``, which the consumer then fed to Goose as HTML.
    Appending ``None`` instead lets the consumer's ``if html is not None``
    guard actually work. Timeout raised from 2s — too short when up to 50
    connections are started in parallel.
    """
    try:
        r = await session.get(path, timeout=10)
        if r.status_code == 200:
            htmls.append(r.content)
        else:
            log.info('request for %s returned status %s', path, r.status_code)
            htmls.append(None)
    except Exception as e:
        log.info('request for %s failed: %s', path, e)
        htmls.append(None)
    paths.append(path)
async def main(path_list, session):
    """Fetch every URL in *path_list* concurrently.

    Spawns one ``extractor`` task per URL inside a trio nursery and waits
    for them all to finish. Returns two index-aligned lists: the fetched
    bodies and the corresponding URLs (completion order, not input order).
    """
    bodies, fetched = [], []
    async with trio.open_nursery() as nursery:
        for url in path_list:
            nursery.start_soon(extractor, url, bodies, fetched, session)
    return bodies, fetched
async def run(urls, conns=50):
    """Fetch *urls* concurrently and extract article data with Goose.

    Parameters:
        urls:  iterable of article URLs to fetch.
        conns: maximum number of parallel connections for the asks session.

    Returns a list of dicts, one per URL. Each dict always has a ``'url'``
    key; when the fetch succeeded and Goose parsed the page it also has
    ``'goose_text'``, ``'goose_title'`` and ``'authors'``.

    BUG FIX: the original ``except`` block started with a bare ``raise``,
    which made the ``print(e)`` and the entire fallback-dict branch below
    it unreachable dead code — any Goose failure crashed the whole run.
    The handler now logs and falls back as originally intended.
    """
    s = asks.Session(connections=conns)
    g = Goose()
    htmls, paths = await main(urls, s)
    print(htmls, " ", paths)  # debug: raw fetch results
    cleaned = []
    for html, path in zip(htmls, paths):
        dic = {'url': path}
        # html is None when the fetch failed or returned a non-200 status.
        if html is not None:
            try:
                article = g.extract(raw_html=html)
                authors = article.authors
                dic['goose_text'] = article.cleaned_text
                dic['goose_title'] = article.title
                dic['authors'] = authors[0] if authors else ''
            except Exception as e:
                log.info('goose found no text using html: %s', e)
                dic['goose_html'] = html
                dic['goose_text'] = ''
                dic['goose_date'] = None
                dic['goose_title'] = None
                dic['authors'] = ''
        cleaned.append(dic)
    return cleaned
async def real_main():
    """Demo driver: parse a hard-coded feed description and fetch each story.

    Decodes the embedded JSON, then awaits ``run`` once per entry's
    ``story_url`` (sequentially), collecting and pretty-printing the
    per-entry result lists.
    """
    sss= '[{"crawl_delay_sec": 0, "name": "mining","goose_text":"","article_date":"","title":"", "story_url": "http://www.mining.com/canalaska-start-drilling-west-mcarthur-uranium-project","url": "http://www.mining.com/tag/latin-america/page/1/"},{"crawl_delay_sec": 0, "name": "mining", "story_url": "http://www.mining.com/web/tesla-fires-sound-alarms-safety-electric-car-batteries", "url": "http://www.mining.com/tag/latin-america/page/1/"}]'
    entries = json.loads(sss)
    pprint(entries)
    articles = [await run([entry['story_url']]) for entry in entries]
    pprint(articles)


if __name__ == "__main__":
    trio.run(real_main)
获取文章数据而不丢失
当状态码不是 200 时,asks 并不总是会抛出异常!在使用响应内容之前,需要先检查它的状态码。你可能还需要增大超时时间——2 秒是不够的,尤其是在并行发起多达 50 个连接的情况下。
无论如何,这里给出一个简化的程序——Goose 相关的部分对于展示实际错误完全没有必要;用两个并行的结果数组不是好主意,把错误信息追加到结果数组里看起来也是一个糟糕的做法。
您还应该研究并行运行URL获取和处理。
trio.open_memory_channel
会是你的好帮手。此外,我还缺少一些进一步的信息,无法更深入地回答你的问题,但这很可能与 Goose 在 HTML 中搜索正文的方式有关。详情请参见这个回答:https://stackoverflow.com/a/30408761/8867146
相关问题 更多 >
编程相关推荐