从HTML中的var提取字典

2024-06-01 02:16:07 发布

您现在位置:Python中文网/ 问答频道 /正文

第一次做网页抓取。我需要从HTML脚本中的两个变量中提取一些字典。以下是我提取HTML的步骤:

# Download the game page, then hand the raw HTML to BeautifulSoup for parsing.
url = "https://www.backstabbr.com/game/Nexus-Season6-Game37/5466300639084544#"
response = requests.get(url)
page = response.text
soup = BeautifulSoup(page, features='html.parser')

聚焦到我实际需要的那部分HTML,内容如下:

<script>
  // NEW JAVSCRIPT!;
  var stage = "NEEDS_ORDERS";
  var orders = {};
  var unitsByPlayer = {"Austria": {"Ven": "F"}, "England": {"BAL": "F", "Den": "A", "HEL": "F", "IRI": "F", "Lon": "F", "NAO": "F", "NTH": "F", "Stp": "A", "War": "A"}, "France": {"Ber": "A", "Bre": "F", "Bur": "A", "ENG": "F", "Hol": "F", "Kie": "A", "MAO": "F", "Naf": "A", "Ruh": "A", "Tyr": "A"}, "Italy": {"Mar": "F", "Tus": "A", "TYS": "F"}, "Turkey": {"Ank": "A", "Bud": "A", "Con": "F", "Gal": "A", "ION": "F", "LYO": "F", "Mos": "A", "Ser": "A", "Smy": "F", "Tri": "A", "Ukr": "A"}};
  var territories = {"Lon": "England", "Lvp": "England", "Edi": "England", "Tri": "Turkey", "Bud": "Turkey", "Vie": "England", "Con": "Turkey", "Ank": "Turkey", "Smy": "Turkey", "Rom": "Italy", "Nap": "Italy", "Ven": "Austria", "Par": "France", "Mar": "Italy", "Bre": "France", "Sev": "Turkey", "Stp": "England", "Mos": "Turkey", "War": "England", "Ber": "France", "Mun": "France", "Kie": "France", "Den": "England", "Nwy": "England", "Bul": "Turkey", "Tun": "France", "Spa": "France", "Por": "France", "Rum": "Turkey", "Hol": "France", "Swe": "England", "Ser": "Turkey", "Gre": "Turkey", "Bel": "France"};
  var activePlayer = null;
  var unitChangeCount = {};
  var buildableTerritories = [];
  var unbuildableTerritories = [];
  var retreatOptions = {};
  var playerRetreatOrders = {}; // not sure this is used
  var disable_engine = true;
  var base_url = '/game/Nexus-Season6-Game37/5466300639084544';
  var session_id = '';
  var want_shaded_territories = none;
  var gameType = 'game';
  var nextAdjudicationTime = '2021-05-12 20:26:20.762615+00:00';
  var gapi_p1 = 'MjAyMS0wNS0xMSAyMDoyNjoyMC43NjMwMDQ=';
</script>

我需要提取 unitsByPlayer 和 territories 这两个变量中的字典。有人知道如何直接使用变量 soup 来实现吗?提前非常感谢!


Tags: nexusgameurl字典varhtmlpagescript
1条回答
网友
1楼 · 发布于 2024-06-01 02:16:07

您可以使用re/json模块来解析数据:

import re
import json
import requests

def extract_js_object(html, name):
    """Return the object literal assigned to ``var <name>`` in ``html`` as a dict.

    The literal must be valid JSON. Parsing starts at the opening brace and
    stops at the matching closing brace, so trailing text on the same line
    (``;``, comments, further statements) is ignored and literals that span
    several lines are handled as well.

    Args:
        html: Page source containing the JavaScript declaration.
        name: Name of the JavaScript variable to read.

    Returns:
        The decoded JSON object.

    Raises:
        ValueError: If no ``var <name> = {`` declaration exists, or if the
            literal that follows is not valid JSON.
    """
    # The lookahead leaves the match ending exactly on the opening brace,
    # which is where raw_decode has to start (it does not skip whitespace).
    declaration = re.search(
        r"\bvar\s+" + re.escape(name) + r"\s*=\s*(?=\{)", html
    )
    if declaration is None:
        raise ValueError(f"no object literal assigned to 'var {name}' was found")

    try:
        # raw_decode parses exactly one JSON value and ignores what follows it,
        # unlike a greedy regex that runs up to the last '}' on the line.
        value, _ = json.JSONDecoder().raw_decode(html, declaration.end())
    except json.JSONDecodeError as exc:
        raise ValueError(f"'var {name}' does not hold valid JSON: {exc}") from exc
    return value


if __name__ == "__main__":
    url = "https://www.backstabbr.com/game/Nexus-Season6-Game37/5466300639084544#"

    # Fail fast on a hung connection or an HTTP error page instead of trying
    # to parse whatever came back.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    page = response.text

    unitsByPlayer = extract_js_object(page, "unitsByPlayer")
    territories = extract_js_object(page, "territories")

    # pretty print:
    print(json.dumps(unitsByPlayer, indent=4))
    print()
    print(json.dumps(territories, indent=4))

输出:

{
    "Austria": {
        "Ven": "F"
    },
    "England": {
        "BAL": "F",
        "Den": "A",
        "HEL": "F",
        "IRI": "F",
        "Lon": "F",
        "NAO": "F",
        "NTH": "F",
        "Stp": "A",
        "War": "A"
    },
    "France": {
        "Ber": "A",
        "Bre": "F",
        "Bur": "A",
        "ENG": "F",
        "Hol": "F",
        "Kie": "A",
        "MAO": "F",
        "Naf": "A",
        "Ruh": "A",
        "Tyr": "A"
    },
    "Italy": {
        "Mar": "F",
        "Tus": "A",
        "TYS": "F"
    },
    "Turkey": {
        "Ank": "A",
        "Bud": "A",
        "Con": "F",
        "Gal": "A",
        "ION": "F",
        "LYO": "F",
        "Mos": "A",
        "Ser": "A",
        "Smy": "F",
        "Tri": "A",
        "Ukr": "A"
    }
}

{
    "Lon": "England",
    "Lvp": "England",
    "Edi": "England",
    "Tri": "Turkey",
    "Bud": "Turkey",
    "Vie": "England",
    "Con": "Turkey",
    "Ank": "Turkey",
    "Smy": "Turkey",
    "Rom": "Italy",
    "Nap": "Italy",
    "Ven": "Austria",
    "Par": "France",
    "Mar": "Italy",
    "Bre": "France",
    "Sev": "Turkey",
    "Stp": "England",
    "Mos": "Turkey",
    "War": "England",
    "Ber": "France",
    "Mun": "France",
    "Kie": "France",
    "Den": "England",
    "Nwy": "England",
    "Bul": "Turkey",
    "Tun": "France",
    "Spa": "France",
    "Por": "France",
    "Rum": "Turkey",
    "Hol": "France",
    "Swe": "England",
    "Ser": "Turkey",
    "Gre": "Turkey",
    "Bel": "France"
}

相关问题 更多 >