根据相互对应的嵌套列表创建JSON文件

# Sample input: one entry per sentence, each holding three parallel lists --
# per-token loss values, the tokens themselves, and their B-/I-/E-/S-/O tags.
data = [
    [
        [0.025],
        ['-DOCSTART-'],
        ['O'],
    ],
    [
        [0.166, 0.001, 4.354, 4.366, 7.668],
        ['Summary', 'of', 'Consolidated', 'Financial', 'Data'],
        ['O', 'O', 'B-ORG', 'I-ORG', 'E-ORG'],
    ],
    [
        [0.195, 0.1, 0.0, 3.561, 3.793, 6.741, 4.0, 0.05],
        ['Port', 'conditions', 'from', 'Lloyds', 'Shipping', 'Intelligence', 'Service', '--'],
        ['S-PER', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'E-ORG', 'O'],
    ],
]

[{ "sentence": "-DOCSTART- Summary of Consolidated Financial Data Port conditions from Lloyds Shipping Intelligence Service --", "annotations": [ { "decision": "Consolidated Financial Data", "category": "ORG", "token_loss": [4.354, 4.366, 7.668], "totalloss": 4.354+4.366+7.668 # Here, I consider the sum of "token_loss" }, { "decision": "Port", "category": "PER", "token_loss": 18.44, "totalloss": 18.44 }, { "decision": "Lloyds Shipping Intelligence Service", "category": "ORG", "token_loss": [3.561, 3.793, 6.741, 4.0], "totalloss": 3.561+3.793+6.741+4.0 }] }]

# Collect the tokens of every tagged entity span into `decisions`.
# `tags` and `tokens` are parallel lists that must already be defined; the
# same index is used to address both.
startIdx = None  # index of the B- tag that opened the current span, if any
endIdx = None    # index of the E- tag that closed the most recent span
decisions = []
for idx, tag in enumerate(tags):
    if tag.startswith('S'):
        # A single-token entity is a complete span on its own.
        decisions.append(tokens[idx:idx + 1])
    elif tag.startswith('B'):
        # Track the position from the loop index; tags.index(tag) would
        # always return the first occurrence of a repeated tag.
        startIdx = idx
    elif tag.startswith('E') and startIdx is not None:
        endIdx = idx
        # Slice the whole span, closing token included.
        decisions.append(tokens[startIdx:endIdx + 1])
        startIdx = None

1条回答

网友

1楼 · 发布于 2024-05-15 06:01:55

您可以使用生成器函数生成分组：

import json, collections
# Sample input: one entry per sentence, each holding three parallel lists --
# per-token loss values, the tokens themselves, and their B-/I-/E-/S-/O tags.
data = [
    [
        [0.025],
        ['-DOCSTART-'],
        ['O'],
    ],
    [
        [0.166, 0.001, 4.354, 4.366, 7.668],
        ['Summary', 'of', 'Consolidated', 'Financial', 'Data'],
        ['O', 'O', 'B-ORG', 'I-ORG', 'E-ORG'],
    ],
    [
        [0.195, 0.1, 0.0, 3.561, 3.793, 6.741, 4.0, 0.05],
        ['Port', 'conditions', 'from', 'Lloyds', 'Shipping', 'Intelligence', 'Service', ' '],
        ['S-PER', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'E-ORG', 'O'],
    ],
]
def p_ranges(s):
    """Yield ``(indices, category)`` for each entity span in a tag sequence.

    Tags are matched by their first letter: ``S`` is a one-token span, ``B``
    opens a span, ``E`` closes it, and any other non-``'O'`` tag extends the
    span that is currently open. The category is the text after the last
    ``'-'`` in the tag that completes the span.

    Args:
        s: Iterable of tag strings such as ``'B-ORG'``, ``'E-ORG'``, ``'O'``.

    Yields:
        Tuples ``(indices, category)`` where ``indices`` is the list of
        positions in ``s`` that belong to the span.

    Notes:
        ``'O'`` tags are skipped without closing an open span, and a span
        that is opened but never closed is not yielded.
    """
    r = None  # indices of the span being built, or None when none is open
    for i, a in enumerate(s):
        if a == 'O':
            continue
        category = a.split('-')[-1]
        if a.startswith('S'):
            yield ([i], category)
        elif a.startswith('B'):
            r = [i]
        elif a.startswith('E'):
            # An E tag with no open span would otherwise evaluate
            # ``None + [i]`` and raise TypeError; emit it as a one-token span.
            yield ((r or []) + [i], category)
            r = None
        elif r:
            r.append(i)

def get_pairings(d):
    """Yield each sentence string followed by its annotation dicts.

    Every entry of ``d`` is unpacked into three sequences that are indexed
    with the same positions: loss values, tokens and tags. For each entry
    the space-joined tokens are yielded first, then one dict per span
    reported by ``p_ranges`` for the entry's tags.

    Args:
        d: Iterable of ``(losses, tokens, tags)`` triples.

    Yields:
        A ``str`` (the joined tokens of an entry) or a ``dict`` with the
        keys ``decision``, ``category``, ``token_loss`` and ``totalloss``.
    """
    for losses, words, tags in d:
        yield ' '.join(words)
        for span, label in p_ranges(tags):
            span_words = [words[k] for k in span]
            span_losses = [losses[k] for k in span]
            yield {
                "decision": ' '.join(span_words),
                "category": label,
                "token_loss": span_losses,
                "totalloss": sum(span_losses),
            }

# get_pairings yields strings and dicts through one generator; bucket the
# items by their Python type so the two kinds can be read back separately.
d = collections.defaultdict(list)
for piece in get_pairings(data):
    bucket = d[type(piece)]
    bucket.append(piece)

# All sentence strings are joined into one text; the dicts become the
# annotation list of the single output record.
sentence_text = ' '.join(d[str])
annotation_list = d[dict]
result = [{'sentence': sentence_text, 'annotations': annotation_list}]
print(json.dumps(result, indent=4))

输出：

[
    {
        "sentence": "-DOCSTART- Summary of Consolidated Financial Data Port conditions from Lloyds Shipping Intelligence Service  ",
        "annotations": [
            {
                "decision": "Consolidated Financial Data",
                "category": "ORG",
                "token_loss": [
                    4.354,
                    4.366,
                    7.668
                ],
                "totalloss": 16.387999999999998
            },
            {
                "decision": "Port",
                "category": "PER",
                "token_loss": [
                    0.195
                ],
                "totalloss": 0.195
            },
            {
                "decision": "Lloyds Shipping Intelligence Service",
                "category": "ORG",
                "token_loss": [
                    3.561,
                    3.793,
                    6.741,
                    4.0
                ],
                "totalloss": 18.095
            }
        ]
    }
]

在新样本上运行时：

# Second sample: a single sentence entry holding three parallel lists --
# per-token loss values, the tokens themselves, and their tags.
data = [
    [
        [0.036, 0.937, 0.032, 2.985, 0.0, 0.044, 0.033, 0.539, 0.01, 0.009, 0.628, 0.706],
        ['At', 'Colchester', ':', 'Gloucestershire', '280', '(', 'J.', 'Russell', '63', ',', 'A.', 'Symonds'],
        ['O', 'S-LOC', 'O', 'S-ORG', 'O', 'O', 'B-PER', 'E-PER', 'O', 'O', 'B-PER', 'E-PER'],
    ],
]
# Re-run the grouping on the new sample: items coming out of get_pairings
# are bucketed by their Python type (str for sentences, dict for annotations).
d = collections.defaultdict(list)
for piece in get_pairings(data):
    bucket = d[type(piece)]
    bucket.append(piece)

# Build the single output record and print it as indented JSON.
sentence_text = ' '.join(d[str])
annotation_list = d[dict]
result = [{'sentence': sentence_text, 'annotations': annotation_list}]
print(json.dumps(result, indent=4))

输出：

[
    {
        "sentence": "At Colchester : Gloucestershire 280 ( J. Russell 63 , A. Symonds",
        "annotations": [
            {
                "decision": "Colchester",
                "category": "LOC",
                "token_loss": [
                    0.937
                ],
                "totalloss": 0.937
            },
            {
                "decision": "Gloucestershire",
                "category": "ORG",
                "token_loss": [
                    2.985
                ],
                "totalloss": 2.985
            },
            {
                "decision": "J. Russell",
                "category": "PER",
                "token_loss": [
                    0.033,
                    0.539
                ],
                "totalloss": 0.5720000000000001
            },
            {
                "decision": "A. Symonds",
                "category": "PER",
                "token_loss": [
                    0.628,
                    0.706
                ],
                "totalloss": 1.334
            }
        ]
    }
]

相关问题更多 >

编程相关推荐

热门问题

热门文章