在Python上不使用API键获取大量URL的Lighthouse度量数据

2024-05-29 11:04:53 发布

您现在位置:Python中文网/ 问答频道 /正文

我正在尝试使用Google Page speed insights API捕获7k URL的lighthouse度量数据,而不使用API密钥。我已经创建了一个python脚本来执行这个过程。我能够使用此方法捕获前1000个URL数据。Python脚本对所有7k URL运行,没有任何错误,但在1000个URL之后停止写入数据。我是否需要API密钥或某些权限才能对大量URL执行此过程?。获取7k URL数据是否有任何先决条件。我甚至在python脚本中加入了多线程,以加快进程

以下是python脚本:

start = time.perf_counter()
urls= df_final['URL'].unique().tolist()
a= ['desktop','mobile']
def role_session(url,device):

    lighthouse=pd.DataFrame()
    lighthouse= pd.DataFrame(columns=['Date','URL','First Contentful Paint','Time to Interactive','Cumulative layout Shift','First Meaningful Paint','Largest Contentful Paint','Speed Index','Total Blocking Time','Java Execution Time','Remove Unused JavaScript','Server Initial Response Time','DOM_size','device'])

    x = f'https://www.googleapis.com/pagespeedonline/v5/runPagespeed?url={url}&strategy={device}'
    r=''
    while r == '' :
        try:
            r = requests.get(x, verify= True)
            final = r.json()
            date = datetime.today().date()
            date = str(date)
            urlid = final['id']

            split = urlid.split('?') # This splits the absolute url from the api key parameter
            urlid = split[0] # This reassigns urlid to the absolute url
            ID = str(urlid)
            urlfcp = final['lighthouseResult']['audits']['first-contentful-paint']['displayValue']
            FCP = str(urlfcp)
            urltti = final['lighthouseResult']['audits']['interactive']['displayValue']
            TTI = str(urltti)
            urlcls= final['lighthouseResult']['audits']['cumulative-layout-shift']['displayValue']
            CLS= str(urlcls)
            urlfmp= final['lighthouseResult']['audits']['first-meaningful-paint']['displayValue']
            FMP = str(urlfmp)
            urllcp= final['lighthouseResult']['audits']['largest-contentful-paint']['displayValue']
            LCP = str(urllcp)
            urlspeedidx= final['lighthouseResult']['audits']['speed-index']['displayValue']
            SPEED_INDEX = str(urlspeedidx)
            totalblocktime= final['lighthouseResult']['audits']['total-blocking-time']['displayValue']
            TOTAL_BLOCKING_TIME = str(totalblocktime)
            TOTAL_BLOCKING_TIME = TOTAL_BLOCKING_TIME.replace(',', '')
            executiontime=  final['lighthouseResult']['audits']['bootup-time']['displayValue']
            JAVA_EXECUTION_TIME = str(executiontime)
            JAVA_EXECUTION_TIME = JAVA_EXECUTION_TIME.replace(',','')
            unused_js = final['lighthouseResult']['audits']['unused-javascript']['displayValue']
            REMOVE_UNUSED_JS = str(unused_js)
            REMOVE_UNUSED_JS = REMOVE_UNUSED_JS.replace(',','').replace('Potential savings of ','')
            server_responsetime = final['lighthouseResult']['audits']['server-response-time']['displayValue']
            SERVER_INITIAL_RESPONSE_TIME = str(server_responsetime)
            SERVER_INITIAL_RESPONSE_TIME= SERVER_INITIAL_RESPONSE_TIME.replace(',','').replace('Root document took ','')
            dom_size = final['lighthouseResult']['audits']['dom-size']['displayValue']
            DOM_SIZE = str(dom_size)
            DOM_SIZE= DOM_SIZE.replace(',','').replace(' elements','')

            lighthouse = lighthouse.append({"Date": date, 'URL': ID, 'First Contentful Paint': FCP,'Time to Interactive': TTI,'Cumulative layout Shift': CLS,'First Meaningful Paint': FMP,'Largest Contentful Paint': LCP,'Speed Index': SPEED_INDEX,'Total Blocking Time':TOTAL_BLOCKING_TIME,'Java Execution Time':JAVA_EXECUTION_TIME,'Remove Unused JavaScript':REMOVE_UNUSED_JS,'Server Initial Response Time':SERVER_INITIAL_RESPONSE_TIME,'DOM_size':DOM_SIZE,'device':device}, ignore_index=True)
            lighthouse.drop_duplicates(keep='first',inplace=True)

            midtime = time.perf_counter()
            print("query complete Time: %s" % (midtime-start))
            break

        except requests.ConnectionError as e:
            print(f'Error is {url} and strategy {device}')
            continue
        except requests.Timeout as e:
            print(f'OOPS!! Timeout Error" {url}')
            continue
        except requests.RequestException as e:
            print(f'OOPS!! General Error" {url}')
            continue
        except KeyboardInterrupt:
            print(f'Someone closed the program" {url}')
            time.sleep(5)
            print("Was a nice sleep, now let me continue...")
            continue

    lighthouse.to_csv('testing1.csv',index = False,mode='a',header=False)


threads = [threading.Thread(target= role_session, args=(url,device)) for url,device in itertools.product(urls,a)]
print(threads)
print(len(threads))
for thread in threads:
    thread.start()
    print("\n Live long and prosper!")
    sleep(5)               # Conventional sleep() Method.
    print("\n Just let that soak in..")   
    Event().wait(5.0) # wait() Method, useable sans thread.
    print("\n Make it So! = )\n")
for thread in threads:
    thread.join()
    print("\n Live long and prosper!")
    sleep(5)               # Conventional sleep() Method.
    print("\n Just let that soak in..")   
    Event().wait(5.0) # wait() Method, useable sans thread.
    print("\n Make it So! = )\n")


end= time.perf_counter()
print("Elapsed Time: %s" % (end-start))

Tags: urltimedevicereplacefinalprintpaintstr

热门问题