用BeautifulSoup解析网站JSON表中的数据

2024-06-12 03:00:46 发布

您现在位置:Python中文网/ 问答频道 /正文

我相信对于熟悉用BeautifulSoup做网页抓取的人来说,这会是一个很快就能解决的问题。我试图从一个表中获取数据,但由于某些原因,它没有给我预期的输出。以下是我的代码:

from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time
import json

def main():
    """Fetch Reuters currency-pair data for each source currency.

    Builds the JSONP endpoint URL for every currency symbol and passes
    each one to get_data_from_page() for scraping.
    """
    # BASE AND EXTENTIONS FOR EACH CURRENCY COLUMNWISE
    base_cols_url = 'https://uk.reuters.com/assets/'
    # One endpoint per currency; only the srcCurr query parameter varies,
    # so build the mapping instead of repeating nine near-identical lines.
    symbols = ('GBP', 'EUR', 'USD', 'JPY', 'CHF', 'AUD', 'CAD', 'CNY', 'HKD')
    forex_cols = {
        sym: 'jsonCurrencyPairs?callback=drawCurrencyPairs&srcCurr=' + sym
        for sym in symbols
    }

    # loop through the pages
    for sym in forex_cols:
        print(sym)
        print(base_cols_url + forex_cols[sym])
        get_data_from_page(sym, base_cols_url + forex_cols[sym])


def get_data_from_page(SYMBOL, PAGE):
    """Load PAGE in a headless browser and print any <td> cells found.

    NOTE(review): this endpoint returns JSONP (a JavaScript callback
    wrapping JSON), not an HTML table, so the <td> search below finds
    nothing -- the markup appears only as escaped text ('&lt;tr&gt;...').
    Parsing the JSON payload directly is the real fix.
    """
    browser = webdriver.PhantomJS()
    try:
        # PARSE THE HTML
        browser.get(PAGE)
        soup = BeautifulSoup(browser.page_source, "lxml")
        # find_all is the non-deprecated spelling of findAll.
        rows = soup.find_all('td')

        # PARSE ALL THE COLUMN DATA
        for r in rows:
            print(r)      # prints nothing: the payload has no real <td> nodes

        print(soup)       # prints the page, but markup is escaped ('<tr><td>&lt;')
    finally:
        # Always shut the headless browser down; otherwise every call
        # leaks a PhantomJS process.
        browser.quit()
    return

if __name__ == '__main__':
   main()

如果我手动加载该页面,是可以在浏览器里看到这些数据的。但是,如果我只打印整个soup对象,标记似乎丢失了,这就解释了print(r)什么也不返回的原因。但是,我不知道如何解析出我需要的部分?(数据显示在基本网页的表上:https://uk.reuters.com/business/currencies)。

真的希望有人解释一下这里发生了什么事?它看起来像是JSON格式,但我从来没有真正使用过JSON。json.loads(soup)报错说不能加载soup对象,所以我尝试了json.loads(soup.text()),但我得到一个ValueError:期望值:行1列1(char 0)。

如果有人能帮我解析这些数据,我会非常感激。非常感谢您的阅读!


Tags: fromimportjsonbasemaincallbackpageprint
1条回答
网友
1楼 · 发布于 2024-06-12 03:00:46

好吧,在json的一些尝试失败之后,我尝试了一个非常粗糙的基本字符串解析方法,但是它确实起到了作用,只是为了防止其他人想做类似的事情。在

from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time
import json

def main():
    """Fetch Reuters currency-pair data for each source currency.

    Builds the JSONP endpoint URL for every currency symbol and passes
    each one to get_data_from_page() for scraping.
    """
    # BASE AND EXTENTIONS FOR EACH CURRENCY COLUMNWISE
    base_cols_url = 'https://uk.reuters.com/assets/'
    # Only the srcCurr query parameter differs between endpoints, so
    # derive the mapping from the symbol list instead of nine
    # copy-pasted assignments.
    symbols = ('GBP', 'EUR', 'USD', 'JPY', 'CHF', 'AUD', 'CAD', 'CNY', 'HKD')
    forex_cols = {
        sym: 'jsonCurrencyPairs?callback=drawCurrencyPairs&srcCurr=' + sym
        for sym in symbols
    }

    for sym in forex_cols:
        print(sym)
        print(base_cols_url + forex_cols[sym])
        get_data_from_page(sym, base_cols_url + forex_cols[sym])


def get_data_from_page(SYMBOL, PAGE):
    """Fetch PAGE and print the value and currency-pair columns.

    The endpoint returns JSONP rather than real HTML, so the data is
    extracted with plain string splitting on the '"row"' markers
    embedded in the escaped payload.
    """
    browser = webdriver.PhantomJS()
    try:
        # PARSE THE HTML
        browser.get(PAGE)
        soup = BeautifulSoup(browser.page_source, "lxml")
        rows = str(soup).split('"row"')

        # PARSE ALL THE COLUMN DATA
        for r in rows:
            # PARSE OUT VALUE COL
            # BUG FIX: the original 'except: IndexError' was a bare
            # except followed by a no-op expression -- it silently
            # swallowed *every* exception.  Catch IndexError (raised
            # when the split markers are absent) specifically.
            try:
                print(r.split('</a></td><td>')[1].split('</td><td class=')[0])
            except IndexError:
                pass
            # PARSE OUT CURRENCY PAIR
            try:
                print(r.split('sparkchart?symbols=')[1].split('=X&')[0])
            except IndexError:
                pass
    finally:
        # Shut the headless browser down so each call does not leak a
        # PhantomJS process.
        browser.quit()

    return


if __name__ == '__main__':
   main()

相关问题 更多 >