从相互对应的嵌套列表(列表的列表)创建JSON文件

2024-05-15 06:01:55 发布

您现在位置:Python中文网/ 问答频道 /正文

我有以下清单:

# Each entry groups three parallel lists for one token sequence:
# loss values, the tokens themselves, and their B-/I-/E-/S-/O tags.
data = [
    [
        [0.025],
        ['-DOCSTART-'],
        ['O'],
    ],
    [
        [0.166, 0.001, 4.354, 4.366, 7.668],
        ['Summary', 'of', 'Consolidated', 'Financial', 'Data'],
        ['O', 'O', 'B-ORG', 'I-ORG', 'E-ORG'],
    ],
    [
        [0.195, 0.1, 0.0, 3.561, 3.793, 6.741, 4.0, 0.05],
        ['Port', 'conditions', 'from', 'Lloyds', 'Shipping', 'Intelligence', 'Service', '--'],
        ['S-PER', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'E-ORG', 'O'],
    ],
]

注意:data[i]内的每个列表都有相同的长度,i in [0, 1, 2]

我想创建一个JSON文件,如下所示:

[{
  "sentence": "-DOCSTART- Summary of Consolidated Financial Data Port conditions from Lloyds Shipping Intelligence Service --",
  "annotations": [
    {
      "decision": "Consolidated Financial Data",
      "category": "ORG",
      "token_loss": [4.354, 4.366, 7.668],
      "totalloss": 4.354+4.366+7.668 # Here, I consider the sum of "token_loss"
    },
    {
      "decision": "Port",
      "category": "PER",
      "token_loss": [0.195],
      "totalloss": 0.195
    },
    {
      "decision": "Lloyds Shipping Intelligence Service",
      "category": "ORG",
      "token_loss": [3.561, 3.793, 6.741, 4.0],
      "totalloss": 3.561+3.793+6.741+4.0
    }]
}]

在标签列表中,多词实体总是按“B-”(开始)、“I-”(内部)、“E-”(结束)的顺序出现;单个词构成的实体则带有“S-”(single)标签。我不考虑带“O”(外部)标签的词。


以下是我为解决这个问题所做的初步尝试:

# NOTE(review): `tags` and `tokens` are not defined in this snippet --
# presumably the tag list and the token list of one entry; confirm.
startIdx = 0
endIdx = 10  # assigned but never read in this snippet
decisions = []
for tag in tags:
    if tag.startswith('B'):
        # list.index returns the FIRST position equal to `tag`, so a repeated
        # tag value (e.g. a second 'B-ORG') resolves to the first occurrence.
        start = tags.index(tag)
        startIdx = start
        # The bound 10 is hard-coded rather than derived from len(tags).
        while startIdx<10:
            # tags[startIdx+1] raises IndexError when startIdx is the last index.
            if tags[startIdx+1].startswith('I'):
                # The slice is a one-element list holding the token at startIdx.
                decisions.append(tokens[startIdx:startIdx+1])
                startIdx += 1
            if tags[startIdx+1].startswith('E'):
                # Appends the token BEFORE the 'E' position; the 'E' token
                # itself is never appended.
                decisions.append(tokens[startIdx:startIdx+1])
                startIdx = 11  # pushes startIdx past the bound to leave the loop
            # If the next tag starts with neither 'I' nor 'E', startIdx is
            # left unchanged and the while loop never terminates.

Tags: of, org, token, data, port, service, tags, intelligence
1条回答
网友
1楼 · 发布于 2024-05-15 06:01:55

您可以使用生成器函数生成分组:

import json, collections
# Three parallel lists per entry: loss values, tokens, and tags.
data = [
    [
        [0.025],
        ['-DOCSTART-'],
        ['O'],
    ],
    [
        [0.166, 0.001, 4.354, 4.366, 7.668],
        ['Summary', 'of', 'Consolidated', 'Financial', 'Data'],
        ['O', 'O', 'B-ORG', 'I-ORG', 'E-ORG'],
    ],
    [
        [0.195, 0.1, 0.0, 3.561, 3.793, 6.741, 4.0, 0.05],
        ['Port', 'conditions', 'from', 'Lloyds', 'Shipping', 'Intelligence', 'Service', ' '],
        ['S-PER', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'E-ORG', 'O'],
    ],
]
def p_ranges(s):
    """Yield ``(indices, category)`` for every tagged span in *s*.

    *s* is an iterable of tag strings such as ``'O'``, ``'S-PER'``,
    ``'B-ORG'``, ``'I-ORG'`` and ``'E-ORG'``:

    * ``'O'`` tags are skipped.
    * A tag starting with ``'S'`` yields a one-index span immediately.
    * A tag starting with ``'B'`` opens a new span.
    * A tag starting with ``'E'`` closes the open span and yields it.
    * Any other tag extends the open span, or is ignored when no span
      is open.

    The category is the text after the last ``'-'`` of the yielding tag.

    An ``'E'`` tag that arrives with no open span (no preceding ``'B'``)
    is yielded as a one-index span instead of raising ``TypeError``.
    """
    r = None
    for i, a in enumerate(s):
        if a == 'O':
            continue
        if a.startswith('S'):
            yield ([i], a.split('-')[-1])
        elif a.startswith('B'):
            r = [i]
        elif a.startswith('E'):
            # r is None when no 'B' tag opened a span; fall back to a
            # single-token span rather than failing on None + list.
            yield ((r or []) + [i], a.split('-')[-1])
            r = None
        elif r:
            r.append(i)

def get_pairings(d):
    """Yield, for each entry of *d*, its joined tokens and then one dict per span.

    Every entry of *d* is unpacked into ``(losses, tokens, tags)``.  The
    tokens are yielded first as one space-joined string.  After that, each
    span reported by ``p_ranges(tags)`` is yielded as a dict with the
    space-joined span tokens (``"decision"``), the span category
    (``"category"``), the span's loss values (``"token_loss"``) and their
    sum (``"totalloss"``).
    """
    for losses, tokens, tags in d:
        yield ' '.join(tokens)
        for span, category in p_ranges(tags):
            phrase = ' '.join(tokens[k] for k in span)
            span_losses = [losses[k] for k in span]
            yield {
                "decision": phrase,
                "category": category,
                "token_loss": span_losses,
                "totalloss": sum(span_losses),
            }

# get_pairings yields both str items (joined tokens) and dict items
# (annotations); bucket them by their Python type.
d = collections.defaultdict(list)
for item in get_pairings(data):
    d[type(item)].append(item)

# One output record: all string pieces joined into a single sentence,
# plus every annotation dict in the order it was produced.
sentence = ' '.join(d[str])
result = [{'sentence': sentence, 'annotations': d[dict]}]
print(json.dumps(result, indent=4))

输出:

[
    {
        "sentence": "-DOCSTART- Summary of Consolidated Financial Data Port conditions from Lloyds Shipping Intelligence Service  ",
        "annotations": [
            {
                "decision": "Consolidated Financial Data",
                "category": "ORG",
                "token_loss": [
                    4.354,
                    4.366,
                    7.668
                ],
                "totalloss": 16.387999999999998
            },
            {
                "decision": "Port",
                "category": "PER",
                "token_loss": [
                    0.195
                ],
                "totalloss": 0.195
            },
            {
                "decision": "Lloyds Shipping Intelligence Service",
                "category": "ORG",
                "token_loss": [
                    3.561,
                    3.793,
                    6.741,
                    4.0
                ],
                "totalloss": 18.095
            }
        ]
    }
]

在新样本上运行时:

# A single entry: loss values, tokens, and tags as three parallel lists.
data = [
    [
        [0.036, 0.937, 0.032, 2.985, 0.0, 0.044, 0.033, 0.539, 0.01, 0.009, 0.628, 0.706],
        ['At', 'Colchester', ':', 'Gloucestershire', '280', '(', 'J.', 'Russell', '63', ',', 'A.', 'Symonds'],
        ['O', 'S-LOC', 'O', 'S-ORG', 'O', 'O', 'B-PER', 'E-PER', 'O', 'O', 'B-PER', 'E-PER'],
    ],
]
# Bucket the generator output by Python type: str items are the joined
# tokens, dict items are the annotations.
d = collections.defaultdict(list)
for item in get_pairings(data):
    d[type(item)].append(item)

# Assemble the single output record and print it as indented JSON.
sentence = ' '.join(d[str])
result = [{'sentence': sentence, 'annotations': d[dict]}]
print(json.dumps(result, indent=4))

输出:

[
    {
        "sentence": "At Colchester : Gloucestershire 280 ( J. Russell 63 , A. Symonds",
        "annotations": [
            {
                "decision": "Colchester",
                "category": "LOC",
                "token_loss": [
                    0.937
                ],
                "totalloss": 0.937
            },
            {
                "decision": "Gloucestershire",
                "category": "ORG",
                "token_loss": [
                    2.985
                ],
                "totalloss": 2.985
            },
            {
                "decision": "J. Russell",
                "category": "PER",
                "token_loss": [
                    0.033,
                    0.539
                ],
                "totalloss": 0.5720000000000001
            },
            {
                "decision": "A. Symonds",
                "category": "PER",
                "token_loss": [
                    0.628,
                    0.706
                ],
                "totalloss": 1.334
            }
        ]
    }
]

相关问题 更多 >

    热门问题